Aashi's picture
Update app.py
2f8eff4 verified
raw
history blame
No virus
2.35 kB
# import gradio as gr
# def greet(name):
# return "Hello " + name + "!!"
# demo = gr.Interface(fn=greet, inputs="text", outputs="text")
# demo.launch()
import gradio as gr
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor
# Load the Llama 3.2 Vision Model
def load_llama_model():
model_id = "meta-llama/Llama-3.2-11B-Vision"
# Load model and processor
model = MllamaForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)
return model, processor
# Function to generate predictions for text and image
def process_input(text, image=None):
model, processor = load_llama_model()
if image:
# If an image is uploaded, process it as a PIL Image object
vision_input = image.convert("RGB").resize((224, 224))
prompt = f"<|image|><|begin_of_text|>{text}"
# Process image and text together
inputs = processor(vision_input, prompt, return_tensors="pt").to(model.device)
else:
# If no image is uploaded, just process the text
prompt = f"<|begin_of_text|>{text}"
inputs = processor(prompt, return_tensors="pt").to(model.device)
# Generate output from the model
outputs = model.generate(**inputs, max_new_tokens=100)
# Decode the output to return a readable text
decoded_output = processor.decode(outputs[0], skip_special_tokens=True)
return decoded_output
# Gradio Interface Setup
def demo():
# Define Gradio input and output components
text_input = gr.Textbox(label="Text Input", placeholder="Enter text here", lines=5)
# Use type="pil" to work with PIL Image objects
image_input = gr.Image(label="Upload an Image", type="pil")
output = gr.Textbox(label="Model Output", lines=5)
# Define the interface layout
interface = gr.Interface(
fn=process_input,
inputs=[text_input, image_input],
outputs=output,
title="Llama 3.2 Multimodal Text-Image Analyzer",
description="Upload an image and/or provide text for analysis using the Llama 3.2 Vision Model."
)
# Launch the demo
interface.launch()
# Run the demo
if __name__ == "__main__":
demo()