yasserrmd committed on
Commit
cf2a851
1 Parent(s): f22a6e1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -0
app.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import gradio as gr
3
+ from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
4
+ from PIL import Image
5
+ import torch
6
+ import requests
7
+
8
# Load the Molmo processor and model (weights are downloaded on first run).
# Both loaders share the same settings: remote code is required because Molmo
# ships custom modeling code, and dtype/device placement are resolved automatically.
_MOLMO_ID = 'allenai/Molmo-7B-D-0924'
_LOAD_KWARGS = dict(
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto',
)

processor = AutoProcessor.from_pretrained(_MOLMO_ID, **_LOAD_KWARGS)
model = AutoModelForCausalLM.from_pretrained(_MOLMO_ID, **_LOAD_KWARGS)
22
+
23
@spaces.GPU
def describe_image(image):
    """Generate a natural-language description of an image with Molmo.

    Args:
        image: A PIL image as provided by the Gradio UI.

    Returns:
        The model's generated description as a plain string.
    """
    # Preprocess: tokenize the fixed prompt and encode the image.
    batch = processor.process(images=[image], text="Describe this image.")

    # Add a leading batch dimension of 1 and move every tensor onto the
    # device the model was placed on by device_map='auto'.
    batch = {name: tensor.to(model.device).unsqueeze(0)
             for name, tensor in batch.items()}

    # Generate at most 200 new tokens, halting at the end-of-text marker.
    generation = model.generate_from_batch(
        batch,
        GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer,
    )

    # The output contains the prompt followed by new tokens; decode only
    # the newly generated portion, dropping special tokens.
    prompt_len = batch['input_ids'].size(1)
    return processor.tokenizer.decode(
        generation[0, prompt_len:], skip_special_tokens=True
    )
43
+
44
# Gradio interface: upload an image, get Molmo's description back.
# FIX: the legacy `gr.inputs.Image` namespace was removed in Gradio 3.x+;
# input components are now constructed directly as `gr.Image`.
gr.Interface(
    fn=describe_image,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Visual Language Model - Molmo",
    description="Upload an image, and the model will generate a detailed description of it."
).launch()