import io
import json

import torch
from PIL import Image
from transformers import DetrImageProcessor, DetrForObjectDetection

# Load the DETR processor and model once at import time so that every call to
# predict() reuses the same objects instead of reloading the weights.
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50", revision="no_timm")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50", revision="no_timm")


def predict(inputs, *, threshold=0.9):
    """Run DETR object detection on an encoded image.

    Args:
        inputs: Mapping with an ``"image"`` entry holding the encoded image
            file contents as a bytes-like object (anything ``io.BytesIO``
            accepts and Pillow can decode).
        threshold: Minimum confidence score a detection must reach to be
            kept. Defaults to 0.9.

    Returns:
        A list with one dict per retained detection, each with the keys
        ``"label"`` (class name from ``model.config.id2label``),
        ``"confidence"`` (float score) and ``"box"`` (list of four floats as
        returned by ``post_process_object_detection``, i.e. absolute
        ``[x_min, y_min, x_max, y_max]`` pixel coordinates).

    Raises:
        KeyError: If ``inputs`` has no ``"image"`` entry.
        PIL.UnidentifiedImageError: If the bytes are not a decodable image.
    """
    # Decode the upload and force three channels: grayscale, palette and RGBA
    # files would otherwise reach the processor with the wrong channel count.
    # convert() returns a fully loaded copy, so the file handle can be closed.
    with Image.open(io.BytesIO(inputs["image"])) as raw_image:
        image = raw_image.convert("RGB")

    # Use a separate name so the request payload is not shadowed.
    encoding = processor(images=image, return_tensors="pt")

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**encoding)

    # PIL reports (width, height); the post-processor expects (height, width)
    # per image so it can rescale boxes to absolute pixel coordinates.
    target_sizes = torch.tensor([image.size[::-1]])
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=threshold
    )[0]

    # Turn tensors into plain Python values so the result is JSON-serializable.
    return [
        {
            "label": model.config.id2label[label.item()],
            "confidence": score.item(),
            "box": box.tolist(),
        }
        for score, label, box in zip(results["scores"], results["labels"], results["boxes"])
    ]


def huggingface_spaces_endpoint(inputs):
    """API endpoint for Hugging Face Spaces.

    Args:
        inputs: Request payload forwarded unchanged to :func:`predict`.

    Returns:
        The detections from :func:`predict`, serialized as a JSON string.
    """
    detections = predict(inputs)
    return json.dumps(detections)