luodian committed on
Commit 9f5457a
1 Parent(s): a133174

Update README.md

Files changed (1)
  1. README.md +32 -33
README.md CHANGED
@@ -38,16 +38,12 @@ An example of using this model to run on your video. Please first clone [Otter](
 ```python
 import mimetypes
 import os
-from io import BytesIO
 from typing import Union
 import cv2
 import requests
 import torch
 import transformers
 from PIL import Image
-from torchvision.transforms import Compose, Resize, ToTensor
-from tqdm import tqdm
-import sys
 
 from otter.modeling_otter import OtterForConditionalGeneration
 
@@ -118,9 +114,7 @@ def get_formatted_prompt(prompt: str) -> str:
 
 def get_response(input_data, prompt: str, model=None, image_processor=None) -> str:
     if isinstance(input_data, Image.Image):
-        vision_x = (
-            image_processor.preprocess([input_data], return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
-        )
+        vision_x = image_processor.preprocess([input_data], return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
     elif isinstance(input_data, list):  # list of video frames
         vision_x = image_processor.preprocess(input_data, return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
     else:
@@ -154,31 +148,36 @@ def get_response(input_data, prompt: str, model=None, image_processor=None) -> str:
     )
     return parsed_output
 
-
-# ------------------- Main Function -------------------
-
 if __name__ == "__main__":
-    model = OtterForConditionalGeneration.from_pretrained(
-        "luodian/otter-9b-dc-hf",
-    )
-    model.text_tokenizer.padding_side = "left"
-    tokenizer = model.text_tokenizer
-    image_processor = transformers.CLIPImageProcessor()
-    model.eval()
-
-    while True:
-        video_url = "dc_demo.mp4"  # Replace with the path to your video file
-
-        frames_list = get_image(video_url)
-
-        prompts_input = input("Enter prompts (comma-separated): ")
-        prompts = [prompt.strip() for prompt in prompts_input.split(",")]
-
-        for prompt in prompts:
-            print(f"\nPrompt: {prompt}")
-            response = get_response(frames_list, prompt, model, image_processor)
-            print(f"Response: {response}")
-
-        if prompts_input.lower() == "quit":
-            break
+    # ------------------- Main Function -------------------
+    load_bit = "fp16"
+    if load_bit == "fp16":
+        precision = {"torch_dtype": torch.float16}
+    elif load_bit == "bf16":
+        precision = {"torch_dtype": torch.bfloat16}
+    elif load_bit == "fp32":
+        precision = {"torch_dtype": torch.float32}
+
+    # This model version is trained on the MIMIC-IT DC dataset.
+    model = OtterForConditionalGeneration.from_pretrained("luodian/otter-9b-dc-hf", device_map="auto", **precision)
+    model.text_tokenizer.padding_side = "left"
+    tokenizer = model.text_tokenizer
+    image_processor = transformers.CLIPImageProcessor()
+    model.eval()
+
+    while True:
+        video_url = "demo.mp4"  # Replace with the path to your video file
+
+        frames_list = get_image(video_url)
+
+        prompts_input = input("Enter prompts (comma-separated): ")
+        prompts = [prompt.strip() for prompt in prompts_input.split(",")]
+
+        for prompt in prompts:
+            print(f"\nPrompt: {prompt}")
+            response = get_response(frames_list, prompt, model, image_processor)
+            print(f"Response: {response}")
+
+        if prompts_input.lower() == "quit":
+            break
 ```
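
For readers applying this change: the `**precision` unpacking in the new code is just a keyword-argument dictionary, so the default `load_bit = "fp16"` branch expands to a direct `torch_dtype` argument. A minimal sketch of the equivalent load call (assuming the `accelerate` package is installed, which `device_map="auto"` requires):

```python
import torch
import transformers
from otter.modeling_otter import OtterForConditionalGeneration

# Equivalent to the new README code when load_bit == "fp16":
# **{"torch_dtype": torch.float16} becomes torch_dtype=torch.float16.
model = OtterForConditionalGeneration.from_pretrained(
    "luodian/otter-9b-dc-hf",
    device_map="auto",          # lets accelerate place shards across GPUs/CPU
    torch_dtype=torch.float16,  # halves weight memory vs. fp32
)
model.text_tokenizer.padding_side = "left"  # left-padding for batched generation
image_processor = transformers.CLIPImageProcessor()
model.eval()
```

Loading in fp16 or bf16 roughly halves the weight memory of the 9B checkpoint relative to fp32, which is presumably the motivation for making fp16 the default in this commit.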