# Hugging Face Spaces page header captured with the code (Space status: Sleeping).
import gradio as gr
import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
# The encoder-decoder model, the image processor and the tokenizer are all
# loaded from the same checkpoint, so the identifier is named once.
_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(_CHECKPOINT)
processor = ViTImageProcessor.from_pretrained(_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
def generate_caption(image):
    """Generate a short text caption for an image.

    Args:
        image: A PIL image, or ``None`` when nothing was uploaded.

    Returns:
        The decoded caption with special tokens removed, or an empty
        string when ``image`` is ``None``.
    """
    # The UI can be submitted without an image, in which case the input
    # arrives as None; return an empty caption instead of crashing.
    if image is None:
        return ""

    # Uploaded files may be RGBA, greyscale, palettised, etc.; normalise to
    # three-channel RGB before handing the image to the processor.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Preprocess the image into the tensor the model consumes.
    pixel_values = processor(images=image, return_tensors="pt").pixel_values

    # Beam search (4 beams) capped at 16 tokens.
    output_ids = model.generate(
        pixel_values,
        max_length=16,
        num_beams=4,
        return_dict_in_generate=True,
    ).sequences

    # Only one image is processed, so decode the first returned sequence.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
# Create a Gradio Interface: one image input, one text output, both wired to
# generate_caption.
interface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),  # hand the callback a PIL image
    outputs=gr.Textbox(),
    title="Image Caption Generator",
    # gr.Interface labels its run button "Submit" by default and no custom
    # label is configured here, so the instructions name that button.
    description="Upload an image and click 'Submit' to get a caption.",
)

# Launch the app in Hugging Face Spaces
interface.launch()