Spaces:

Aashi
/

Text-Image-Analyzer

Running on Zero

App Files Files Community

Text-Image-Analyzer / app.py

Aashi

Update app.py

2f8eff4 verified 2 days ago

raw

history blame

No virus

2.35 kB

	# import gradio as gr

	# def greet(name):
	# return "Hello " + name + "!!"

	# demo = gr.Interface(fn=greet, inputs="text", outputs="text")
	# demo.launch()


	import gradio as gr
	import requests
	import torch
	from PIL import Image
	from transformers import MllamaForConditionalGeneration, AutoProcessor

	# Load the Llama 3.2 Vision Model
	def load_llama_model():
	model_id = "meta-llama/Llama-3.2-11B-Vision"

	# Load model and processor
	model = MllamaForConditionalGeneration.from_pretrained(
	model_id,
	torch_dtype=torch.bfloat16,
	device_map="auto",
	)
	processor = AutoProcessor.from_pretrained(model_id)

	return model, processor

	# Function to generate predictions for text and image
	def process_input(text, image=None):
	model, processor = load_llama_model()

	if image:
	# If an image is uploaded, process it as a PIL Image object
	vision_input = image.convert("RGB").resize((224, 224))

	prompt = f"<\|image\|><\|begin_of_text\|>{text}"

	# Process image and text together
	inputs = processor(vision_input, prompt, return_tensors="pt").to(model.device)
	else:
	# If no image is uploaded, just process the text
	prompt = f"<\|begin_of_text\|>{text}"
	inputs = processor(prompt, return_tensors="pt").to(model.device)

	# Generate output from the model
	outputs = model.generate(**inputs, max_new_tokens=100)

	# Decode the output to return a readable text
	decoded_output = processor.decode(outputs[0], skip_special_tokens=True)

	return decoded_output

	# Gradio Interface Setup
	def demo():
	# Define Gradio input and output components
	text_input = gr.Textbox(label="Text Input", placeholder="Enter text here", lines=5)

	# Use type="pil" to work with PIL Image objects
	image_input = gr.Image(label="Upload an Image", type="pil")

	output = gr.Textbox(label="Model Output", lines=5)

	# Define the interface layout
	interface = gr.Interface(
	fn=process_input,
	inputs=[text_input, image_input],
	outputs=output,
	title="Llama 3.2 Multimodal Text-Image Analyzer",
	description="Upload an image and/or provide text for analysis using the Llama 3.2 Vision Model."
	)

	# Launch the demo
	interface.launch()

	# Run the demo
	if __name__ == "__main__":
	demo()