import gradio as gr
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Load StreetCLIP, a CLIP model fine-tuned for street-level image
# geolocalization, along with its matching processor
model = CLIPModel.from_pretrained("geolocal/StreetCLIP")
processor = CLIPProcessor.from_pretrained("geolocal/StreetCLIP")

def classify_image(image):
    # Example candidate labels for zero-shot classification; CLIP-style models
    # accept any set of text prompts here
    labels = ["a photo of a cat", "a photo of a dog", "a photo of a car", "a photo of a tree"]
    
    # Preprocess the image and text
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    
    # Perform the inference; no_grad skips gradient tracking we don't need
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Postprocess the outputs
    logits_per_image = outputs.logits_per_image  # image-text similarity scores
    probs = logits_per_image.softmax(dim=1)      # softmax over labels gives probabilities
    
    # Convert the probabilities for the single input image to a Python list
    probs_list = probs[0].tolist()
    
    # Create a dictionary of labels and probabilities
    result = {label: prob for label, prob in zip(labels, probs_list)}
    
    return result

# Define Gradio interface
iface = gr.Interface(
    fn=classify_image,
    inputs=gr.Image(type="pil"),
    outputs="label",
    title="Geolocal StreetCLIP Classification",
    description="Upload an image to classify it with the geolocal/StreetCLIP model"
)

# Launch the interface
iface.launch()
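
# Usage note: Gradio serves the UI locally, by default at http://127.0.0.1:7860.
# Since StreetCLIP is trained for geolocalization, place names would be a more
# natural zero-shot label set than the placeholder labels above, e.g.:
#
#     labels = ["a photo taken in France", "a photo taken in Japan",
#               "a photo taken in Brazil", "a photo taken in the United States"]
#
# Any candidate captions work with the zero-shot pipeline in classify_image.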