import streamlit as st from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer from PIL import Image import torch # Load your model and tokenizer model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning") processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning") tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning") # Streamlit UI st.title("Image Caption Generator") st.write("Upload an image and click 'Generate' to get a caption.") # File uploader for image uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"]) if uploaded_image is not None: # Display the uploaded image image = Image.open(uploaded_image) st.image(image, caption='Uploaded Image', use_column_width=True) # Generate caption when button is clicked if st.button('Generate'): # Preprocess the image pixel_values = processor(images=image, return_tensors="pt").pixel_values # Generate captions output_ids = model.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True).sequences caption = tokenizer.decode(output_ids[0], skip_special_tokens=True) # Display the generated caption st.write(f"**Generated Caption:** {caption}")