import streamlit as st
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

# Title of the app
st.title("OCR with Qwen2-VL-7B-Instruct")


@st.cache_resource(show_spinner="Loading model...")
def load_model(model_id: str = "Qwen/Qwen2-VL-7B-Instruct"):
    """Load and cache the processor/model pair once per server process.

    Without st.cache_resource, the multi-GB checkpoint would be
    re-instantiated on every Streamlit rerun (i.e. every widget change).
    """
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForVision2Seq.from_pretrained(model_id)
    model.eval()  # inference only — disable dropout etc.
    return processor, model


processor, model = load_model()
st.write("Model loaded successfully!")

# Upload image section
uploaded_image = st.file_uploader(
    "Upload an image for OCR", type=["jpg", "jpeg", "png"]
)

if uploaded_image is not None:
    # Normalize palette/alpha images to RGB — the vision processor
    # expects 3-channel input.
    image = Image.open(uploaded_image).convert("RGB")

    # Display the uploaded image
    st.image(image, caption="Uploaded Image", use_column_width=True)

    st.write("Processing the image...")

    # Qwen2-VL is an instruction-tuned chat model: the processor needs a
    # text prompt (built via the chat template, containing the image
    # placeholder token) alongside the image.  Calling
    # processor(images=image) alone, as the original code did, does not
    # produce an OCR prompt the model can answer.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Extract all text from this image."},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(images=image, text=prompt, return_tensors="pt")

    # Generate text (OCR) from the image; cap new tokens so generation
    # terminates promptly even on dense pages.
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=512)

    # generate() returns prompt + completion; slice off the echoed prompt
    # tokens so only the model's answer is decoded and shown.
    trimmed_ids = generated_ids[:, inputs["input_ids"].shape[1]:]
    generated_text = processor.batch_decode(trimmed_ids, skip_special_tokens=True)[0]

    # Display the OCR result
    st.write("Extracted Text:")
    st.text(generated_text)