luodian committed
Commit
2079d37
1 Parent(s): 9f5457a

Update README.md

Files changed (1)
  1. README.md +52 -35
README.md CHANGED
````diff
@@ -83,7 +83,9 @@ def get_image(url: str) -> Union[Image.Image, list]:
     if "://" not in url:  # Local file
         content_type = get_content_type(url)
     else:  # Remote URL
-        content_type = requests.head(url, stream=True, verify=False).headers.get("Content-Type")
+        content_type = requests.head(url, stream=True, verify=False).headers.get(
+            "Content-Type"
+        )
 
     if "image" in content_type:
         if "://" not in url:  # Local file
@@ -114,11 +116,23 @@ def get_formatted_prompt(prompt: str) -> str:
 
 def get_response(input_data, prompt: str, model=None, image_processor=None) -> str:
     if isinstance(input_data, Image.Image):
-        vision_x = image_processor.preprocess([input_data], return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
+        vision_x = (
+            image_processor.preprocess([input_data], return_tensors="pt")[
+                "pixel_values"
+            ]
+            .unsqueeze(1)
+            .unsqueeze(0)
+        )
     elif isinstance(input_data, list):  # list of video frames
-        vision_x = image_processor.preprocess(input_data, return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
+        vision_x = (
+            image_processor.preprocess(input_data, return_tensors="pt")["pixel_values"]
+            .unsqueeze(1)
+            .unsqueeze(0)
+        )
     else:
-        raise ValueError("Invalid input data. Expected PIL Image or list of video frames.")
+        raise ValueError(
+            "Invalid input data. Expected PIL Image or list of video frames."
+        )
 
     lang_x = model.text_tokenizer(
         [
@@ -148,36 +162,39 @@ def get_response(input_data, prompt: str, model=None, image_processor=None) -> str:
     )
     return parsed_output
 
+
 if __name__ == "__main__":
-    # ------------------- Main Function -------------------
-    load_bit = "fp16"
-    if load_bit == "fp16":
-        precision = {"torch_dtype": torch.float16}
-    elif load_bit == "bf16":
-        precision = {"torch_dtype": torch.bfloat16}
-    elif load_bit == "fp32":
-        precision = {"torch_dtype": torch.float32}
-
-    # This model version is trained on MIMIC-IT DC dataset.
-    model = OtterForConditionalGeneration.from_pretrained("luodian/otter-9b-dc-hf", device_map="auto", **precision)
-    model.text_tokenizer.padding_side = "left"
-    tokenizer = model.text_tokenizer
-    image_processor = transformers.CLIPImageProcessor()
-    model.eval()
-
-    while True:
-        video_url = "demo.mp4"  # Replace with the path to your video file
-
-        frames_list = get_image(video_url)
-
-        prompts_input = input("Enter prompts (comma-separated): ")
-        prompts = [prompt.strip() for prompt in prompts_input.split(",")]
-
-        for prompt in prompts:
-            print(f"\nPrompt: {prompt}")
-            response = get_response(frames_list, prompt, model, image_processor)
-            print(f"Response: {response}")
-
-        if prompts_input.lower() == "quit":
-            break
+    # ------------------- Main Function -------------------
+    load_bit = "fp16"
+    if load_bit == "fp16":
+        precision = {"torch_dtype": torch.float16}
+    elif load_bit == "bf16":
+        precision = {"torch_dtype": torch.bfloat16}
+    elif load_bit == "fp32":
+        precision = {"torch_dtype": torch.float32}
+
+    # This model version is trained on MIMIC-IT DC dataset.
+    model = OtterForConditionalGeneration.from_pretrained(
+        "luodian/otter-9b-dc-hf", device_map="auto", **precision
+    )
+    model.text_tokenizer.padding_side = "left"
+    tokenizer = model.text_tokenizer
+    image_processor = transformers.CLIPImageProcessor()
+    model.eval()
+
+    while True:
+        video_url = "demo.mp4"  # Replace with the path to your video file
+
+        frames_list = get_image(video_url)
+
+        prompts_input = input("Enter prompts (comma-separated): ")
+        prompts = [prompt.strip() for prompt in prompts_input.split(",")]
+
+        for prompt in prompts:
+            print(f"\nPrompt: {prompt}")
+            response = get_response(frames_list, prompt, model, image_processor)
+            print(f"Response: {response}")
+
+        if prompts_input.lower() == "quit":
+            break
 ```
````
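
For reference, here is a minimal sketch (not part of the commit) that checks what the reformatted preprocessing lines compute. It assumes the default `CLIPImageProcessor` settings (224×224 output) and uses dummy frames in place of a decoded video:

```python
# Hypothetical shape check, not code from this commit.
import torch
import transformers
from PIL import Image

image_processor = transformers.CLIPImageProcessor()
frames = [Image.new("RGB", (224, 224)) for _ in range(4)]  # 4 dummy video frames

# Same chain as in the README: preprocess -> pixel_values -> unsqueeze twice.
vision_x = (
    image_processor.preprocess(frames, return_tensors="pt")["pixel_values"]
    .unsqueeze(1)
    .unsqueeze(0)
)
print(vision_x.shape)  # torch.Size([1, 4, 1, 3, 224, 224])
```

With a single image wrapped in a list, the same chain yields `torch.Size([1, 1, 1, 3, 224, 224])`, matching the leading batch/frame axes the model's `vision_x` input expects.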
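
The `load_bit` ladder in the example maps a string flag to the `torch_dtype` keyword that is forwarded to `from_pretrained` via `**precision`. As a hypothetical refactor (not part of the commit), the same mapping fits in a single dictionary lookup; the model load itself is left commented out here because it downloads the full 9B checkpoint:

```python
# Hypothetical refactor of the README's precision selection, not from the commit.
import torch

load_bit = "fp16"  # one of "fp16", "bf16", "fp32"
precision = {
    "fp16": {"torch_dtype": torch.float16},
    "bf16": {"torch_dtype": torch.bfloat16},
    "fp32": {"torch_dtype": torch.float32},
}[load_bit]

# model = OtterForConditionalGeneration.from_pretrained(
#     "luodian/otter-9b-dc-hf", device_map="auto", **precision
# )
```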