Spaces:

cantuncok
/

meta-Llama-3.2-11B-Vision-Instruct

Running

File size: 1,731 Bytes

b1fd6cc
52abee0
f660379
6205fd1
 
b1fd6cc
ae30d65
 
 
 
 
f660379
6f75ab3
6205fd1
52abee0
 
6205fd1
52abee0
 
6205fd1
f660379
 
6205fd1
 
 
 
 
 
 
 
 
 
 
f660379
52abee0
f660379
6205fd1
f660379
 
 
 
 
52abee0
 
 
 
 
f660379
 
 
 
52abee0

import gradio as gr
import os
import torch
from transformers import AutoProcessor, MllamaForConditionalGeneration
from PIL import Image

# Read the Hugging Face access token from the environment and refuse to
# start without it, so a missing token fails here with a clear message
# rather than later during the download.
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN çevresel değişkeni ayarlanmamış. Lütfen Hugging Face token'ınızı ayarlayın.")

# Load the model and its processor once at import time so every request
# handled by `predict` reuses the same instances.
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_name,
    # `token` is the supported keyword; `use_auth_token` is deprecated.
    token=hf_token,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_name, token=hf_token)

def predict(image, text):
    """Generate the model's reply to a text prompt about an image.

    Args:
        image: The image to ask about (the Gradio input is configured with
            ``type="pil"``), or ``None`` when the user uploaded nothing.
        text: The user's prompt text.

    Returns:
        The decoded text of the newly generated tokens only; the prompt
        that was fed to the model is not echoed back.

    Raises:
        gr.Error: If no image was supplied.
    """
    # The chat message below always contains an image slot, so a missing
    # image would otherwise fail deep inside the processor with an opaque
    # error. Surface a readable message in the UI instead.
    if image is None:
        raise gr.Error("Lütfen bir görüntü yükleyin.")

    # Build a single-turn conversation: one image placeholder plus the text.
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": text}
        ]}
    ]
    # Render the conversation into the model's prompt format.
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    # The chat template already emits the begin-of-text token, so the
    # tokenizer must not add its own special tokens a second time.
    inputs = processor(
        images=image,
        text=input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(model.device)
    # Ask the model for a reply.
    outputs = model.generate(**inputs, max_new_tokens=100)
    # `generate` returns prompt + continuation; keep only the continuation
    # so the user sees the answer rather than their own prompt repeated.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_tokens = outputs[0][prompt_length:]
    response = processor.decode(generated_tokens, skip_special_tokens=True)
    return response

# Define the Gradio UI: an image plus a text prompt in, generated text out.
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Görüntü Girdisi"),
        gr.Textbox(label="Metin Girdisi")
    ],
    outputs=gr.Textbox(label="Çıktı"),
    # The title names the 11B checkpoint, matching the model this app loads.
    title="Llama 3.2 11B Vision Instruct Demo",
    description="Bir görüntü ve metin girdisi alarak yanıt üreten model."
)

# Start the web server for the interface.
interface.launch()