SunderAli17's picture
Update app.py
e337c90 verified
raw
history blame contribute delete
No virus
3.23 kB
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
from PIL import Image
import subprocess
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
models = {
"microsoft/Phi-3.5-vision-instruct": AutoModelForCausalLM.from_pretrained("microsoft/Phi-3.5-vision-instruct", trust_remote_code=True, torch_dtype="auto", _attn_implementation="flash_attention_2").cuda().eval()
}
processors = {
"microsoft/Phi-3.5-vision-instruct": AutoProcessor.from_pretrained("microsoft/Phi-3.5-vision-instruct", trust_remote_code=True)
}
MARKDOWN = """
This demo utilizes <a href="https://huggingface.co/microsoft/Phi-3.5-vision-instruct">Phi-3.5-Vision Instruct</a> by @Microsoft.
Try out with different images and generate captions. Do provide your feedback.
Model Card is acquired from <a href="https://huggingface.co/microsoft/Phi-3.5-vision-instruct"> Microsoft's Phi Vision Instruct</a>
**Demo by [Sunder Ali Khowaja](https://sander-ali.github.io) - [X](https://x.com/SunderAKhowaja) -[Github](https://github.com/sander-ali) -[Hugging Face](https://huggingface.co/SunderAli17)**
"""
kwargs = {}
kwargs['torch_dtype'] = torch.bfloat16
promptu = '<|user|>\n'
prompta = '<|assistant|>\n'
prompts = "<|end|>\n"
@spaces.GPU
def run_example(image, text_input=None, model_id="microsoft/Phi-3.5-vision-instruct"):
model = models[model_id]
processor = processors[model_id]
prompt = f"{promptu}<|image_1|>\n{text_input}{prompts}{prompta}"
image = Image.fromarray(image).convert("RGB")
inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
generate_ids = model.generate(**inputs,
max_new_tokens=1000,
eos_token_id=processor.tokenizer.eos_token_id,
)
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(generate_ids,
skip_special_tokens=True,
clean_up_tokenization_spaces=False)[0]
return response
theme = gr.themes.Soft(
font=[gr.themes.GoogleFont('Pacifico'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
)
js_func = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'dark') {
url.searchParams.set('__theme', 'dark');
window.location.href = url.href;
}
}
"""
with gr.Blocks(js=js_func, theme=theme) as demo:
gr.Markdown(MARKDOWN)
with gr.Tab(label="Phi-3.5 Input"):
with gr.Row():
with gr.Column():
input_img = gr.Image(label="Input Picture")
model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="microsoft/Phi-3.5-vision-instruct")
text_input = gr.Textbox(label="Question")
submit_btn = gr.Button(value="Submit")
with gr.Column():
output_text = gr.Textbox(label="Output Text")
submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text])
demo.launch(debug=True)