Update README.md
Browse files
README.md
CHANGED
@@ -38,16 +38,12 @@ An example of using this model to run on your video. Please first clone [Otter](
|
|
38 |
```python
|
39 |
import mimetypes
|
40 |
import os
|
41 |
-
from io import BytesIO
|
42 |
from typing import Union
|
43 |
import cv2
|
44 |
import requests
|
45 |
import torch
|
46 |
import transformers
|
47 |
from PIL import Image
|
48 |
-
from torchvision.transforms import Compose, Resize, ToTensor
|
49 |
-
from tqdm import tqdm
|
50 |
-
import sys
|
51 |
|
52 |
from otter.modeling_otter import OtterForConditionalGeneration
|
53 |
|
@@ -118,9 +114,7 @@ def get_formatted_prompt(prompt: str) -> str:
|
|
118 |
|
119 |
def get_response(input_data, prompt: str, model=None, image_processor=None) -> str:
|
120 |
if isinstance(input_data, Image.Image):
|
121 |
-
vision_x = (
|
122 |
-
image_processor.preprocess([input_data], return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
|
123 |
-
)
|
124 |
elif isinstance(input_data, list): # list of video frames
|
125 |
vision_x = image_processor.preprocess(input_data, return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
|
126 |
else:
|
@@ -154,31 +148,36 @@ def get_response(input_data, prompt: str, model=None, image_processor=None) -> s
|
|
154 |
)
|
155 |
return parsed_output
|
156 |
|
157 |
-
|
158 |
-
# ------------------- Main Function -------------------
|
159 |
-
|
160 |
if __name__ == "__main__":
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
```
|
|
|
38 |
```python
|
39 |
import mimetypes
|
40 |
import os
|
|
|
41 |
from typing import Union
|
42 |
import cv2
|
43 |
import requests
|
44 |
import torch
|
45 |
import transformers
|
46 |
from PIL import Image
|
|
|
|
|
|
|
47 |
|
48 |
from otter.modeling_otter import OtterForConditionalGeneration
|
49 |
|
|
|
114 |
|
115 |
def get_response(input_data, prompt: str, model=None, image_processor=None) -> str:
|
116 |
if isinstance(input_data, Image.Image):
|
117 |
+
vision_x = image_processor.preprocess([input_data], return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
|
|
|
|
|
118 |
elif isinstance(input_data, list): # list of video frames
|
119 |
vision_x = image_processor.preprocess(input_data, return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
|
120 |
else:
|
|
|
148 |
)
|
149 |
return parsed_output
|
150 |
|
|
|
|
|
|
|
151 |
if __name__ == "__main__":
    # ------------------- Main Function -------------------
    # Interactive demo: load the Otter model once, load the demo video once,
    # then answer comma-separated prompts about it until the user types "quit".

    # Select the weight precision; must be one of the keys below.
    load_bit = "fp16"
    dtype_by_load_bit = {
        "fp16": torch.float16,
        "bf16": torch.bfloat16,
        "fp32": torch.float32,
    }
    if load_bit not in dtype_by_load_bit:
        # Fail with a clear message instead of a NameError on `precision` below.
        raise ValueError(
            f"Unsupported load_bit {load_bit!r}; expected one of {sorted(dtype_by_load_bit)}"
        )
    precision = {"torch_dtype": dtype_by_load_bit[load_bit]}

    # This model version is trained on MIMIC-IT DC dataset.
    model = OtterForConditionalGeneration.from_pretrained("luodian/otter-9b-dc-hf", device_map="auto", **precision)
    model.text_tokenizer.padding_side = "left"
    # Kept as a module-level name in case other parts of the script read it.
    tokenizer = model.text_tokenizer
    image_processor = transformers.CLIPImageProcessor()
    model.eval()

    video_url = "demo.mp4"  # Replace with the path to your video file

    # The path is constant, so decode the video once instead of once per
    # prompt round. NOTE(review): assumes get_image returns the list of video
    # frames that get_response accepts -- confirm against its definition.
    frames_list = get_image(video_url)

    while True:
        prompts_input = input("Enter prompts (comma-separated): ")

        # Check for the exit command BEFORE querying the model, so "quit"
        # itself is never sent to the model as a prompt.
        if prompts_input.strip().lower() == "quit":
            break

        # Drop empty entries produced by blank input or stray commas.
        prompts = [prompt.strip() for prompt in prompts_input.split(",") if prompt.strip()]

        for prompt in prompts:
            print(f"\nPrompt: {prompt}")
            response = get_response(frames_list, prompt, model, image_processor)
            print(f"Response: {response}")
|
183 |
```
|