Aashi committed on
Commit
2f8eff4
1 Parent(s): 99f09a0

Update app.py

Files changed (1)
  1. app.py +76 -4
app.py CHANGED
@@ -1,7 +1,79 @@
  import gradio as gr

- def greet(name):
-     return "Hello " + name + "!!"

- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ # import gradio as gr
+
+ # def greet(name):
+ #     return "Hello " + name + "!!"
+
+ # demo = gr.Interface(fn=greet, inputs="text", outputs="text")
+ # demo.launch()
+
+
  import gradio as gr
+ import requests
+ import torch
+ from PIL import Image
+ from transformers import MllamaForConditionalGeneration, AutoProcessor
+
+ # Load the Llama 3.2 Vision model and its processor
+ def load_llama_model():
+     model_id = "meta-llama/Llama-3.2-11B-Vision"
+
+     # Load model and processor
+     model = MllamaForConditionalGeneration.from_pretrained(
+         model_id,
+         torch_dtype=torch.bfloat16,
+         device_map="auto",
+     )
+     processor = AutoProcessor.from_pretrained(model_id)
+
+     return model, processor
+
+ # Generate a prediction from text, optionally conditioned on an image
+ def process_input(text, image=None):
+     model, processor = load_llama_model()
+
+     if image is not None:
+         # If an image is uploaded, process it as a PIL Image object
+         vision_input = image.convert("RGB").resize((224, 224))
+
+         prompt = f"<|image|><|begin_of_text|>{text}"
+
+         # Process image and text together; keyword arguments make the
+         # processor's argument order explicit
+         inputs = processor(images=vision_input, text=prompt, return_tensors="pt").to(model.device)
+     else:
+         # If no image is uploaded, just process the text
+         prompt = f"<|begin_of_text|>{text}"
+         inputs = processor(text=prompt, return_tensors="pt").to(model.device)
+
+     # Generate output from the model
+     outputs = model.generate(**inputs, max_new_tokens=100)
+
+     # Decode the output into readable text
+     decoded_output = processor.decode(outputs[0], skip_special_tokens=True)
+
+     return decoded_output
+
+ # Gradio interface setup
+ def demo():
+     # Define Gradio input and output components
+     text_input = gr.Textbox(label="Text Input", placeholder="Enter text here", lines=5)
+
+     # Use type="pil" so the handler receives a PIL Image object
+     image_input = gr.Image(label="Upload an Image", type="pil")
+
+     output = gr.Textbox(label="Model Output", lines=5)
+
+     # Define the interface layout
+     interface = gr.Interface(
+         fn=process_input,
+         inputs=[text_input, image_input],
+         outputs=output,
+         title="Llama 3.2 Multimodal Text-Image Analyzer",
+         description="Upload an image and/or provide text for analysis using the Llama 3.2 Vision Model."
+     )

+     # Launch the demo
+     interface.launch()

+ # Run the demo
+ if __name__ == "__main__":
+     demo()
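
One thing to note about the new app.py: process_input calls load_llama_model() on every request, so the 11B checkpoint is re-instantiated each time the interface is used. A minimal sketch of one way to avoid this, caching the load with functools.lru_cache; it reuses the commit's model_id and assumes access to the gated meta-llama checkpoint has already been granted:

from functools import lru_cache

import torch
from transformers import MllamaForConditionalGeneration, AutoProcessor

@lru_cache(maxsize=1)
def load_llama_model():
    # Same checkpoint as in the commit; the repo is gated, so an
    # authorized Hugging Face token is assumed to be configured
    model_id = "meta-llama/Llama-3.2-11B-Vision"
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(model_id)
    # lru_cache returns this same (model, processor) pair on every later call
    return model, processor

With this drop-in version, process_input can stay unchanged. A quick smoke test of the handler outside Gradio might look like the following, where example.jpg is a placeholder path, not something from the commit:

from PIL import Image

print(process_input("Describe this image.", image=Image.open("example.jpg")))
print(process_input("Write a haiku about autumn."))  # exercises the text-only path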