import streamlit as st
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image
import torch

# Load your model and tokenizer
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Streamlit UI
st.title("Image Caption Generator")
st.write("Upload an image and click 'Generate' to get a caption.")

# File uploader for image
uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

if uploaded_image is not None:
    # Display the uploaded image
    image = Image.open(uploaded_image)
    st.image(image, caption='Uploaded Image', use_column_width=True)

    # Generate caption when button is clicked
    if st.button('Generate'):
        # Preprocess the image
        pixel_values = processor(images=image, return_tensors="pt").pixel_values

        # Generate captions
        output_ids = model.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True).sequences
        caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Display the generated caption
        st.write(f"**Generated Caption:** {caption}")