# MolmoVision / app.py
# yasserrmd's picture
# Create app.py
# cf2a851 verified
# raw
# history blame
# No virus
# 1.51 kB
import spaces
import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image
import torch
import requests
# Checkpoint shared by the processor and the model, plus the loading options
# that both `from_pretrained` calls receive.
_MODEL_ID = 'allenai/Molmo-7B-D-0924'
_LOAD_KWARGS = {
    "trust_remote_code": True,
    "torch_dtype": 'auto',
    "device_map": 'auto',
}

# Load the processor first, then the causal LM, both once at import time so
# every request reuses the same objects.
processor = AutoProcessor.from_pretrained(_MODEL_ID, **_LOAD_KWARGS)
model = AutoModelForCausalLM.from_pretrained(_MODEL_ID, **_LOAD_KWARGS)
@spaces.GPU
def describe_image(image):
    """Generate a text description of an uploaded image.

    Args:
        image: The uploaded picture as a PIL image (the Gradio input is
            declared with ``type="pil"``), or ``None`` when the user
            submitted the form without uploading anything.

    Returns:
        The text decoded from the newly generated tokens only (the prompt
        tokens are sliced off), with special tokens skipped.

    Raises:
        gr.Error: If no image was provided.
    """
    # Gradio passes None when nothing was uploaded; surface a readable
    # message in the UI instead of an opaque failure inside the processor.
    if image is None:
        raise gr.Error("Please upload an image before submitting.")

    # The Molmo model card recommends converting non-RGB inputs (grayscale,
    # palette, RGBA / transparent PNGs) to RGB before processing.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Process the image together with the fixed prompt.
    inputs = processor.process(images=[image], text="Describe this image.")

    # Move inputs to the model's device and add a leading batch dimension.
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

    # Generate at most 200 new tokens, stopping at the end-of-text string.
    output = model.generate_from_batch(
        inputs,
        GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer,
    )

    # Keep only the tokens produced after the prompt, then decode them.
    generated_tokens = output[0, inputs['input_ids'].size(1):]
    return processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
# Gradio interface
# NOTE: `gr.inputs.Image` belongs to the deprecated `gr.inputs` namespace,
# which was removed in Gradio 4; the top-level `gr.Image` component is the
# supported equivalent and also works on Gradio 3.
demo = gr.Interface(
    fn=describe_image,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Visual Language Model - Molmo",
    description="Upload an image, and the model will generate a detailed description of it.",
)
demo.launch()