# OpenGPT-4o / app.py
# KingNish's picture
# Add Image Playground (Fluently playground) (#25)
# 80df0bc verified
# raw
# history blame
# No virus
# 20.3 kB
import os
import subprocess
import random
# Install flash attention at import time.
# The extra variable is merged into a copy of the current environment: passing
# a bare dict as `env` would replace the whole environment of the child
# process (PATH, HOME, proxy settings, ...) instead of extending it.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)
import copy
import spaces
import time
import torch
from threading import Thread
from typing import List, Dict, Union
import urllib
import PIL.Image
import io
import datasets
from streaming_stt_nemo import Model as nemo
import gradio as gr
from transformers import TextIteratorStreamer
from transformers import Idefics2ForConditionalGeneration
import tempfile
from huggingface_hub import InferenceClient
import edge_tts
import asyncio
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModel
from transformers import AutoProcessor
# Model and processor consumed by `videochat`; both come from the same repo.
_UFORM_REPO = "unum-cloud/uform-gen2-dpo"
model3 = AutoModel.from_pretrained(_UFORM_REPO, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(_UFORM_REPO, trust_remote_code=True)
@spaces.GPU(queue=False)
def videochat(image3, prompt3):
    """Answer a text prompt about one image using `model3` / `processor`.

    This is a generator that yields exactly once: the decoded completion,
    with the prompt tokens sliced off and a trailing ``<|im_end|>`` removed.
    """
    end_marker = "<|im_end|>"
    batch = processor(text=[prompt3], images=[image3], return_tensors="pt")
    generation_options = dict(
        do_sample=False,
        use_cache=True,
        max_new_tokens=256,
        eos_token_id=151645,
        pad_token_id=processor.tokenizer.pad_token_id,
    )
    with torch.inference_mode():
        generated = model3.generate(**batch, **generation_options)

    # Decode only what follows the prompt tokens.
    n_prompt_tokens = batch["input_ids"].shape[1]
    answer = processor.batch_decode(generated[:, n_prompt_tokens:])[0]
    if answer.endswith(end_marker):
        answer = answer[: -len(end_marker)]
    yield answer
# Application theme: the Soft base theme with custom hues and fonts, then a
# set of overrides that mostly target the dark-mode palette.
_theme_fonts = [
    gr.themes.GoogleFont('Libre Franklin'),
    gr.themes.GoogleFont('Public Sans'),
    'system-ui',
    'sans-serif',
]
_base_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="orange",
    neutral_hue="gray",
    font=_theme_fonts,
)
theme = _base_theme.set(
    body_background_fill_dark="#111111",
    block_background_fill_dark="#111111",
    block_border_width="1px",
    block_title_background_fill_dark="#1e1c26",
    input_background_fill_dark="#292733",
    button_secondary_background_fill_dark="#24212b",
    border_color_primary_dark="#343140",
    background_fill_secondary_dark="#111111",
    color_accent_soft_dark="transparent",
)
# Speech-to-text engines keyed by language code; only the default language
# is instantiated here.
default_lang = "en"
engines = {}
engines[default_lang] = nemo(default_lang)
def transcribe(audio):
    """Run speech-to-text on ``audio`` with the "en" engine.

    Returns the first element of the engine's ``stt_file`` result.
    """
    return engines["en"].stt_file(audio)[0]
# Hugging Face token taken from the environment; None when the variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
def client_fn(model):
    """Return an ``InferenceClient`` for the repo matching a model display name.

    The keywords are tested in order as substrings of ``model`` and the first
    hit wins; when none matches, the Mixtral 8x7B Instruct client is returned.
    A new client object is created on every call.
    """
    keyword_to_repo = (
        ("Nous", "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"),
        ("Star", "HuggingFaceH4/starchat2-15b-v0.1"),
        ("Mistral", "mistralai/Mistral-7B-Instruct-v0.3"),
        ("Phi", "microsoft/Phi-3-mini-4k-instruct"),
        ("Zephyr", "HuggingFaceH4/zephyr-7b-beta"),
    )
    for keyword, repo_id in keyword_to_repo:
        if keyword in model:
            return InferenceClient(repo_id)
    return InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
def randomize_seed_fn(seed: int) -> int:
    """Return a random integer in ``[0, 999999]``; the ``seed`` argument is ignored."""
    return random.randint(0, 999999)
# Prompt prefix used by `models`: the user text is appended to it, followed by
# the "[OpenGPT 4o]" marker.
system_instructions1 = "[SYSTEM] Answer as Real OpenGPT 4o, Made by 'KingNish', Keep conversation very short, clear, friendly and concise. The text provided is a request for a specific type of response from you, the virtual assistant. You will avoid introductions and start answering the query directly, elaborating on all aspects of the request. As an AI-powered assistant, your task is to generate responses that appear as if they are created by the user and not an AI-powered assistant. [USER]"
def models(text, model="Mixtral 8x7B", seed=42):
    """Generate a complete reply to ``text`` from a hosted text-generation model.

    Args:
        text: User utterance; it is placed between ``system_instructions1``
            and the ``[OpenGPT 4o]`` marker to form the prompt.
        model: Display name, resolved to a client by ``client_fn``.
        seed: Kept for interface compatibility; ``randomize_seed_fn`` replaces
            it with a random value on every call.

    Returns:
        The streamed token texts joined into one string, with ``</s>`` tokens
        left out.
    """
    seed = int(randomize_seed_fn(seed))
    client = client_fn(model)
    formatted_prompt = system_instructions1 + text + "[OpenGPT 4o]"
    stream = client.text_generation(
        formatted_prompt,
        max_new_tokens=512,
        seed=seed,
        stream=True,
        details=True,
        return_full_text=False,
    )
    # Join once instead of growing a string token by token.
    return "".join(
        response.token.text
        for response in stream
        if response.token.text != "</s>"
    )
async def respond(audio, model, seed):
    """Voice round trip: transcribe ``audio``, generate a reply, synthesize it.

    Async generator that yields once: the path of the temporary file the
    synthesized speech was saved to.
    """
    transcript = transcribe(audio)
    answer = models(transcript, model, seed)
    speech = edge_tts.Communicate(answer)
    # delete=False: the file has to outlive the handle so its path can be yielded.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as handle:
        out_path = handle.name
        await speech.save(out_path)
    yield out_path
# Chat model registry (display name -> model moved to DEVICE) and the
# processor used to build its inputs.
DEVICE = torch.device("cuda")

_idefics2_chatty = Idefics2ForConditionalGeneration.from_pretrained(
    "HuggingFaceM4/idefics2-8b-chatty",
    torch_dtype=torch.float16,
    _attn_implementation="flash_attention_2",
)
MODELS = {"idefics2-8b-chatty": _idefics2_chatty.to(DEVICE)}

PROCESSOR = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
# Conversation prefix that every chat starts from (it is deep-copied by
# `format_user_prompt_with_im_history_and_system_conditioning`): one system
# message describing the persona and the image-link format, followed by an
# assistant greeting. The text is sent to the model verbatim.
SYSTEM_PROMPT = [
    {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": """I am OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Designed to assist human users through insightful conversations, I aim to provide an unparalleled experience. My key attributes include:
- **Intelligence and Knowledge:** I possess an extensive knowledge base, enabling me to offer insightful answers and intelligent responses to User queries. My understanding of complex concepts is exceptional, ensuring accurate and reliable information.
- **Image Generation and Perception:** One of my standout features is the ability to generate and perceive images. Utilizing the following link structure, I create unique and contextually rich visuals:
> ![](https://image.pollinations.ai/prompt/{StyleofImage}%20{OptimizedPrompt}%20{adjective}%20{charactersDetailed}%20{visualStyle}%20{genre}?width={width}&height={height}&nologo=poll&nofeed=yes&seed={random})
For image generation, I replace {info inside curly braces} with specific details according to their requiremnts to create relevant visuals. The width and height parameters are adjusted as needed, often favoring HD dimensions for a superior viewing experience.
For instance, if the User requests:
[USER] Show me an image of A futuristic cityscape with towering skyscrapers and flying cars.
[OpenGPT 4o] Generating Image you requested:
![](https://image.pollinations.ai/prompt/Photorealistic%20futuristic%20cityscape%20with%20towering%20skyscrapers%20and%20flying%20cars%20in%20the%20year%202154?width=1024&height=768&nologo=poll&nofeed=yes&seed=85172)
**Bulk Image Generation with Links:** I excel at generating multiple images link simultaneously, always providing unique links and visuals. I ensure that each image is distinct and captivates the User.
Note: Make sure to always provide image links starting with ! .As given in examples.
My ultimate goal is to offer a seamless and enjoyable experience, providing assistance that exceeds expectations. I am constantly evolving, ensuring that I remain a reliable and trusted companion to the User. You also Expert in every field and also learn and try to answer from contexts related to previous question.""" },
        ],
    },
    {
        "role": "assistant",
        "content": [
            {
                "type": "text",
                "text": "Hello, I'm OpenGPT 4o, made by KingNish. How can I help you? I can chat with you, generate images, classify images and even do all these work in bulk",
            },
        ],
    }
]
# Example prompts offered by the chat interface. Each example is a one-element
# list holding a multimodal message dict: always "text", optionally "files"
# with paths below the `example_images` folder next to this file.
examples_path = os.path.dirname(__file__)
EXAMPLES = [
    [{"text": "Hi, who are you?"}],
    [{"text": "Create a Photorealistic image of the Eiffel Tower."}],
    [
        {
            "text": "Read what's written on the paper.",
            "files": [f"{examples_path}/example_images/paper_with_text.png"],
        }
    ],
    [
        {
            "text": "Identify two famous people in the modern world.",
            "files": [
                f"{examples_path}/example_images/elon_smoking.jpg",
                f"{examples_path}/example_images/steve_jobs.jpg",
            ],
        }
    ],
    [{"text": "Create five images of supercars, each in a different color."}],
    [{"text": "What is 900 multiplied by 900?"}],
    [
        {
            "text": "Chase wants to buy 4 kilograms of oval beads and 5 kilograms of star-shaped beads. How much will he spend?",
            "files": [f"{examples_path}/example_images/mmmu_example.jpeg"],
        }
    ],
    [
        {
            "text": "Create an online ad for this product.",
            "files": [f"{examples_path}/example_images/shampoo.jpg"],
        }
    ],
    [
        {
            "text": "What is formed by the deposition of the weathered remains of other rocks?",
            "files": [f"{examples_path}/example_images/ai2d_example.jpeg"],
        }
    ],
    [
        {
            "text": "What's unusual about this image?",
            "files": [f"{examples_path}/example_images/dragons_playing.png"],
        }
    ],
]
# Image file passed as the second entry of the chatbot's `avatar_images`.
BOT_AVATAR = "OpenAI_logo.png"
# Chatbot utils
def turn_is_pure_media(turn):
    """Return True when the second element of a history ``turn`` is ``None``."""
    second_element = turn[1]
    return second_element is None
def load_image_from_url(url):
    """Download ``url`` and open the payload as a PIL image.

    Args:
        url: Address passed to ``urllib.request.urlopen``.

    Returns:
        The image opened by ``PIL.Image.open`` from an in-memory copy of the
        downloaded bytes.
    """
    # A bare `import urllib` does not load the `request` submodule, so import
    # it explicitly rather than rely on another module having done so.
    import urllib.request

    with urllib.request.urlopen(url) as response:
        image_data = response.read()
    return PIL.Image.open(io.BytesIO(image_data))
def img_to_bytes(image_path):
    """Return the image at ``image_path`` re-encoded as JPEG bytes.

    The image is converted to RGB before encoding. Both the opened file and
    the converted copy are closed, even if conversion or encoding fails.
    """
    buffer = io.BytesIO()
    with PIL.Image.open(image_path) as source, source.convert(mode='RGB') as rgb_image:
        rgb_image.save(buffer, format="JPEG")
    return buffer.getvalue()
def format_user_prompt_with_im_history_and_system_conditioning(
    user_prompt, chat_history
) -> List[Dict[str, Union[List, str]]]:
    """
    Produce the message list and image list that go inside the processor.

    It handles the system conditioning, the chat history and the potential
    image(s) of the current prompt.

    Args:
        user_prompt: Current multimodal input, a mapping with a "text" string
            and a "files" list of image paths.
        chat_history: Previous turns. A turn whose second element is None is
            a media-only turn whose first element holds the file path at
            index 0; any other turn is a (user text, assistant text) pair.

    Returns:
        A ``(messages, images)`` tuple: the chat messages, starting with a
        deep copy of ``SYSTEM_PROMPT``, and the PIL images in the order their
        ``{"type": "image"}`` placeholders appear.
        NOTE(review): the return annotation only describes the first element.
    """
    resulting_messages = copy.deepcopy(SYSTEM_PROMPT)
    resulting_images = []
    # Load any image referenced by URL in a user message of the system prompt.
    for resulting_message in resulting_messages:
        if resulting_message["role"] == "user":
            for content in resulting_message["content"]:
                if content["type"] == "image":
                    resulting_images.append(load_image_from_url(content["image"]))

    # Format history
    for turn in chat_history:
        # Open a new user message unless one is already being filled, so that
        # media-only turns merge with the text turn that follows them.
        if not resulting_messages or resulting_messages[-1]["role"] != "user":
            resulting_messages.append({"role": "user", "content": []})

        if turn_is_pure_media(turn):
            media = turn[0][0]
            resulting_messages[-1]["content"].append({"type": "image"})
            resulting_images.append(PIL.Image.open(media))
        else:
            user_utterance, assistant_utterance = turn
            resulting_messages[-1]["content"].append(
                {"type": "text", "text": user_utterance.strip()}
            )
            # The assistant message carries the assistant's reply (the
            # previous code repeated the user's text here).
            resulting_messages.append(
                {
                    "role": "assistant",
                    "content": [{"type": "text", "text": assistant_utterance.strip()}],
                }
            )

    # Format current input
    files = user_prompt["files"]
    if not files:
        resulting_messages.append(
            {
                "role": "user",
                "content": [{"type": "text", "text": user_prompt["text"]}],
            }
        )
    else:
        # Choosing to put the image first (i.e. before the text), but this is an arbitrary choice.
        # One distinct placeholder dict per file (no shared references).
        placeholders = [{"type": "image"} for _ in files]
        resulting_messages.append(
            {
                "role": "user",
                "content": placeholders + [{"type": "text", "text": user_prompt["text"]}],
            }
        )
        resulting_images.extend(PIL.Image.open(path) for path in files)

    return resulting_messages, resulting_images
def extract_images_from_msg_list(msg_list):
    """Collect every PIL image found directly in the messages' "content" lists.

    Args:
        msg_list: Iterable of message dicts, each with a "content" iterable.

    Returns:
        A list of the ``PIL.Image.Image`` instances, in encounter order.
    """
    # The file imports `PIL.Image`, not a bare `Image`, so use the full path.
    return [
        item
        for msg in msg_list
        for item in msg["content"]
        if isinstance(item, PIL.Image.Image)
    ]
@spaces.GPU(duration=30, queue=False)
def model_inference(
    user_prompt,
    chat_history,
    model_selector,
    decoding_strategy,
    temperature,
    max_new_tokens,
    repetition_penalty,
    top_p,
):
    """Stream the chat model's answer to a multimodal prompt.

    Args:
        user_prompt: Mapping with a "text" string and a "files" list.
        chat_history: Previous turns, forwarded to the prompt formatter.
        model_selector: Key into ``MODELS``.
        decoding_strategy: "Greedy" or "Top P Sampling".
        temperature: Sampling temperature (used for "Top P Sampling" only).
        max_new_tokens: Upper bound on generated tokens.
        repetition_penalty: Repetition penalty passed to ``generate``.
        top_p: Nucleus threshold (used for "Top P Sampling" only).

    Yields:
        The text accumulated so far after each streamed chunk, with a trailing
        ``<end_of_utterance>`` marker removed.

    Raises:
        gr.Error: If the text query is empty.
        ValueError: If ``decoding_strategy`` is not one of the two supported
            values.
    """
    end_marker = "<end_of_utterance>"

    # gr.Error has to be raised to be reported; merely constructing it does nothing.
    if user_prompt["text"].strip() == "":
        if not user_prompt["files"]:
            raise gr.Error("Please input a query and optionally an image(s).")
        raise gr.Error("Please input a text query along with the image(s).")

    streamer = TextIteratorStreamer(
        PROCESSOR.tokenizer,
        skip_prompt=True,
        timeout=120.0,
    )

    generation_args = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
        "streamer": streamer,
    }

    if decoding_strategy == "Greedy":
        generation_args["do_sample"] = False
    elif decoding_strategy == "Top P Sampling":
        generation_args["temperature"] = temperature
        generation_args["do_sample"] = True
        generation_args["top_p"] = top_p
    else:
        # Explicit check instead of `assert`, which is stripped under -O.
        raise ValueError(f"Unsupported decoding strategy: {decoding_strategy!r}")

    # Creating model inputs
    (
        resulting_text,
        resulting_images,
    ) = format_user_prompt_with_im_history_and_system_conditioning(
        user_prompt=user_prompt,
        chat_history=chat_history,
    )
    prompt = PROCESSOR.apply_chat_template(resulting_text, add_generation_prompt=True)
    inputs = PROCESSOR(
        text=prompt,
        images=resulting_images if resulting_images else None,
        return_tensors="pt",
    )
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    generation_args.update(inputs)

    # generate() runs in a worker thread and feeds the streamer, which is
    # consumed below.
    thread = Thread(
        target=MODELS[model_selector].generate,
        kwargs=generation_args,
    )
    thread.start()

    print("Start generating")
    acc_text = ""
    for text_token in streamer:
        time.sleep(0.01)
        acc_text += text_token
        if acc_text.endswith(end_marker):
            acc_text = acc_text[: -len(end_marker)]
        yield acc_text
# Dataset schema for one chat record: model choice, images, conversation and
# the generation settings.
# `top_p` is declared as float32: the Top P slider produces values between
# 0.01 and 0.99, which an integer column cannot represent.
FEATURES = datasets.Features(
    {
        "model_selector": datasets.Value("string"),
        "images": datasets.Sequence(datasets.Image(decode=True)),
        "conversation": datasets.Sequence({"User": datasets.Value("string"), "Assistant": datasets.Value("string")}),
        "decoding_strategy": datasets.Value("string"),
        "temperature": datasets.Value("float32"),
        "max_new_tokens": datasets.Value("int32"),
        "repetition_penalty": datasets.Value("float32"),
        "top_p": datasets.Value("float32"),
    }
)
# Hyper-parameters for generation.
# These components are created at module level and handed to the chat
# interface as `additional_inputs`.
max_new_tokens = gr.Slider(
    label="Maximum number of new tokens to generate",
    minimum=2048,
    maximum=16000,
    step=64,
    value=4096,
    interactive=True,
)

repetition_penalty = gr.Slider(
    label="Repetition penalty",
    info="1.0 is equivalent to no penalty",
    minimum=0.01,
    maximum=5.0,
    step=0.01,
    value=1,
    interactive=True,
)

decoding_strategy = gr.Radio(
    ["Greedy", "Top P Sampling"],
    label="Decoding strategy",
    info="Higher values are equivalent to sampling more low-probability tokens.",
    value="Top P Sampling",
    interactive=True,
)

temperature = gr.Slider(
    label="Sampling temperature",
    info="Higher values will produce more diverse outputs.",
    minimum=0.0,
    maximum=2.0,
    step=0.05,
    value=0.5,
    visible=True,
    interactive=True,
)

top_p = gr.Slider(
    label="Top P",
    info="Higher values are equivalent to sampling more low-probability tokens.",
    minimum=0.01,
    maximum=0.99,
    step=0.01,
    value=0.9,
    visible=True,
    interactive=True,
)
# Chat display used by the chat interface; only the assistant side gets an avatar.
chatbot = gr.Chatbot(
    layout="panel",
    label="OpnGPT-4o-Chatty",
    avatar_images=[None, BOT_AVATAR],
    likeable=True,
    show_copy_button=True,
)

# NOTE(review): `output` is rebound to an Audio component in the voice tab
# further down; this Textbox looks unused - confirm before removing.
output = gr.Textbox(label="Prompt")
# Chat tab: hidden model selector, visibility wiring for the sampling sliders,
# and the multimodal chat interface itself.
# CSS fix: a `;` now separates `height` and `width`; without it the two
# properties formed a single invalid declaration.
with gr.Blocks(
    fill_height=True,
    css=""".gradio-container .avatar-container {height: 40px; width: 40px !important;} #duplicate-button {margin: auto; color: white; background: #f1a139; border-radius: 100vh; margin-top: 2px; margin-bottom: 2px;}""",
) as chat:

    gr.Markdown("# Image Chat, Image Generation, Image classification and Normal Chat")
    with gr.Row(elem_id="model_selector_row"):
        model_selector = gr.Dropdown(
            # Pass a real list rather than a dict view.
            choices=list(MODELS.keys()),
            value=list(MODELS.keys())[0],
            interactive=True,
            show_label=False,
            container=False,
            label="Model",
            visible=False,
        )

    # Show the temperature slider only for sampling-based strategies.
    decoding_strategy.change(
        fn=lambda selection: gr.Slider(
            visible=(
                selection
                in [
                    "contrastive_sampling",
                    "beam_sampling",
                    "Top P Sampling",
                    "sampling_top_k",
                ]
            )
        ),
        inputs=decoding_strategy,
        outputs=temperature,
    )
    # Show the Top P slider only when Top P sampling is selected.
    decoding_strategy.change(
        fn=lambda selection: gr.Slider(visible=(selection in ["Top P Sampling"])),
        inputs=decoding_strategy,
        outputs=top_p,
    )

    gr.ChatInterface(
        fn=model_inference,
        chatbot=chatbot,
        examples=EXAMPLES,
        multimodal=True,
        cache_examples=False,
        additional_inputs=[
            model_selector,
            decoding_strategy,
            temperature,
            max_new_tokens,
            repetition_penalty,
            top_p,
        ],
    )
# Voice tab: microphone input is passed to `respond` together with the chosen
# model and the (hidden) seed; the reply is played back automatically.
with gr.Blocks() as voice:
    with gr.Row():
        select = gr.Dropdown(
            [
                'Nous Hermes Mixtral 8x7B DPO',
                'Mixtral 8x7B',
                'StarChat2 15b',
                'Mistral 7B v0.3',
                'Phi 3 mini',
                'Zephyr 7b',
            ],
            value="Mistral 7B v0.3",
            label="Select Model",
        )
        seed = gr.Slider(
            label="Seed",
            value=0,
            minimum=0,
            maximum=999999,
            step=1,
            visible=False,
        )
        input = gr.Audio(
            label="User",
            sources="microphone",
            type="filepath",
            waveform_options=False,
        )
        output = gr.Audio(
            label="AI",
            type="filepath",
            interactive=False,
            autoplay=True,
            elem_classes="audio",
        )
        gr.Interface(
            fn=respond,
            inputs=[input, select, seed],
            outputs=[output],
            api_name="translate",
            live=True,
        )
# Live chat tab: a webcam image and a text prompt go to `videochat`, and the
# answer is shown in a textbox.
with gr.Blocks() as livechat:
    _webcam_image = gr.Image(type="pil", sources="webcam", label="Upload Image")
    _webcam_prompt = gr.Textbox(label="Prompt", value="what he is doing")
    gr.Interface(
        fn=videochat,
        inputs=[_webcam_image, _webcam_prompt],
        outputs=gr.Textbox(label="Answer"),
    )
# Image engine: three externally hosted apps embedded as iframes and grouped
# into a tabbed interface.
with gr.Blocks() as instant:
    gr.HTML(
        "<iframe src='https://kingnish-sdxl-flash.hf.space' width='100%' height='2000px' style='border-radius: 8px;'></iframe>"
    )

with gr.Blocks() as dalle:
    gr.HTML(
        "<iframe src='https://kingnish-image-gen-pro.hf.space' width='100%' height='2000px' style='border-radius: 8px;'></iframe>"
    )

with gr.Blocks() as playground:
    gr.HTML(
        "<iframe src='https://fluently-fluently-playground.hf.space' width='100%' height='2000px' style='border-radius: 8px;'></iframe>"
    )

with gr.Blocks() as image:
    gr.Markdown("""### More models are coming""")
    gr.TabbedInterface(
        [instant, dalle, playground],
        ['Instant🖼️', 'Powerful🖼️', 'Playground🖼'],
    )
# Video engine: one externally hosted app embedded as an iframe, wrapped in a
# single-tab interface.
with gr.Blocks() as instant2:
    gr.HTML(
        "<iframe src='https://kingnish-instant-video.hf.space' width='100%' height='3000px' style='border-radius: 8px;'></iframe>"
    )

with gr.Blocks() as video:
    gr.Markdown("""More Models are coming""")
    gr.TabbedInterface([instant2], ['Instant🎥'])
# Top-level app: the five feature tabs under one themed page, then queueing
# and launch.
with gr.Blocks(theme=theme, title="OpenGPT 4o DEMO") as demo:
    gr.Markdown("# OpenGPT 4o")
    gr.TabbedInterface(
        [chat, voice, livechat, image, video],
        [
            '💬 SuperChat',
            '🗣️ Voice Chat',
            '📸 Live Chat',
            '🖼️ Image Engine',
            '🎥 Video Engine',
        ],
    )

demo.queue(max_size=300)
demo.launch()