import streamlit as st
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

# Title of the app
st.title("OCR with Qwen2-VL-7B-Instruct")


@st.cache_resource(show_spinner="Loading model...")
def load_model(model_id: str = "Qwen/Qwen2-VL-7B-Instruct"):
    """Load and cache the processor/model pair once per server process.

    Without st.cache_resource, the multi-GB checkpoint would be
    re-instantiated on every Streamlit rerun (i.e. every widget change).
    """
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForVision2Seq.from_pretrained(model_id)
    model.eval()  # inference only — disable dropout etc.
    return processor, model


processor, model = load_model()
st.write("Model loaded successfully!")

# Upload image section
uploaded_image = st.file_uploader(
    "Upload an image for OCR", type=["jpg", "jpeg", "png"]
)

if uploaded_image is not None:
    # Normalize palette/alpha images to RGB — the vision processor
    # expects 3-channel input.
    image = Image.open(uploaded_image).convert("RGB")

    # Display the uploaded image
    st.image(image, caption="Uploaded Image", use_column_width=True)

    st.write("Processing the image...")

    # Qwen2-VL is an instruction-tuned chat model: the processor needs a
    # text prompt (built via the chat template, containing the image
    # placeholder token) alongside the image.  Calling
    # processor(images=image) alone, as the original code did, does not
    # produce an OCR prompt the model can answer.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Extract all text from this image."},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(images=image, text=prompt, return_tensors="pt")

    # Generate text (OCR) from the image; cap new tokens so generation
    # terminates promptly even on dense pages.
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=512)

    # generate() returns prompt + completion; slice off the echoed prompt
    # tokens so only the model's answer is decoded and shown.
    trimmed_ids = generated_ids[:, inputs["input_ids"].shape[1]:]
    generated_text = processor.batch_decode(trimmed_ids, skip_special_tokens=True)[0]

    # Display the OCR result
    st.write("Extracted Text:")
    st.text(generated_text)