---
license: mit
---
<p align="center" width="100%">
<img src="https://i.postimg.cc/MKmyP9wH/new-banner.png" width="80%" height="80%">
</p>
<div>
<div align="center">
<a href='https://brianboli.com/' target='_blank'>Bo Li*<sup>1</sup></a>&emsp;
<a href='https://zhangyuanhan-ai.github.io/' target='_blank'>Yuanhan Zhang*<sup>,1</sup></a>&emsp;
<a href='https://cliangyu.com/' target='_blank'>Liangyu Chen*<sup>,1</sup></a>&emsp;
<a href='https://king159.github.io/' target='_blank'>Jinghao Wang*<sup>,1</sup></a>&emsp;
<a href='https://pufanyi.github.io/' target='_blank'>Fanyi Pu*<sup>,1</sup></a>&emsp;
<br>
<a href='https://jingkang50.github.io/' target='_blank'>Jingkang Yang<sup>1</sup></a>&emsp;
<a href='https://chunyuan.li/' target='_blank'>Chunyuan Li<sup>2</sup></a>&emsp;
<a href='https://liuziwei7.github.io/' target='_blank'>Ziwei Liu<sup>1</sup></a>
</div>
<div>
<div align="center">
<sup>1</sup>S-Lab, Nanyang Technological University&emsp;
<sup>2</sup>Microsoft Research, Redmond
</div>
-----------------
![](https://img.shields.io/badge/otter-v0.2-darkcyan)
![](https://img.shields.io/github/stars/luodian/otter?style=social)
[![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FLuodian%2Fotter&count_bg=%23FFA500&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=visitors&edge_flat=false)](https://hits.seeyoufarm.com)
![](https://black.readthedocs.io/en/stable/_static/license.svg)
![](https://img.shields.io/badge/code%20style-black-000000.svg)

Below is an example of running this model on your own video. First clone [Otter](https://github.com/Luodian/Otter) to your local disk and place the following script inside the `Otter` folder so that it has access to `otter/modeling_otter.py`.
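If you prefer to keep the script outside the repository, a common Python workaround (not part of the original instructions) is to put your local `Otter` clone on `sys.path` before the import; the path below is a placeholder:

```python
import sys

# Placeholder path -- replace with the location of your local Otter clone.
sys.path.insert(0, "/path/to/Otter")

from otter.modeling_otter import OtterForConditionalGeneration  # now importable from any directory
```

The full demo script: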
```python
import mimetypes
import os
from io import BytesIO
from typing import Union
import cv2
import requests
import torch
import transformers
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor
from tqdm import tqdm
import sys
from otter.modeling_otter import OtterForConditionalGeneration
# Disable warnings
requests.packages.urllib3.disable_warnings()
# ------------------- Utility Functions -------------------
def get_content_type(file_path):
    content_type, _ = mimetypes.guess_type(file_path)
    return content_type

# ------------------- Image and Video Handling Functions -------------------
def extract_frames(video_path, num_frames=128):
    video = cv2.VideoCapture(video_path)
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_step = total_frames // num_frames
    frames = []
    for i in range(num_frames):
        video.set(cv2.CAP_PROP_POS_FRAMES, i * frame_step)
        ret, frame = video.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame).convert("RGB")
            frames.append(frame)
    video.release()
    return frames

def get_image(url: str) -> Union[Image.Image, list]:
if "://" not in url: # Local file
content_type = get_content_type(url)
else: # Remote URL
content_type = requests.head(url, stream=True, verify=False).headers.get("Content-Type")
if "image" in content_type:
if "://" not in url: # Local file
return Image.open(url)
else: # Remote URL
return Image.open(requests.get(url, stream=True, verify=False).raw)
elif "video" in content_type:
video_path = "temp_video.mp4"
if "://" not in url: # Local file
video_path = url
else: # Remote URL
with open(video_path, "wb") as f:
f.write(requests.get(url, stream=True, verify=False).content)
frames = extract_frames(video_path)
if "://" in url: # Only remove the temporary video file if it was downloaded
os.remove(video_path)
return frames
else:
raise ValueError("Invalid content type. Expected image or video.")
# ------------------- OTTER Prompt and Response Functions -------------------
def get_formatted_prompt(prompt: str) -> str:
    return f"<image>User: {prompt} GPT:<answer>"

def get_response(input_data, prompt: str, model=None, image_processor=None) -> str:
    # Preprocess the visual input and add the extra batch/frame dimensions the model expects.
    if isinstance(input_data, Image.Image):
        vision_x = (
            image_processor.preprocess([input_data], return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
        )
    elif isinstance(input_data, list):  # list of video frames
        vision_x = image_processor.preprocess(input_data, return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
    else:
        raise ValueError("Invalid input data. Expected PIL Image or list of video frames.")

    # Tokenize the formatted prompt.
    lang_x = model.text_tokenizer(
        [
            get_formatted_prompt(prompt),
        ],
        return_tensors="pt",
    )

    # Generate an answer with beam search.
    generated_text = model.generate(
        vision_x=vision_x.to(model.device),
        lang_x=lang_x["input_ids"].to(model.device),
        attention_mask=lang_x["attention_mask"].to(model.device),
        max_new_tokens=512,
        num_beams=3,
        no_repeat_ngram_size=3,
    )

    # Strip the prompt scaffolding and special tokens from the decoded output.
    parsed_output = (
        model.text_tokenizer.decode(generated_text[0])
        .split("<answer>")[-1]
        .lstrip()
        .rstrip()
        .split("<|endofchunk|>")[0]
        .lstrip()
        .rstrip()
        .lstrip('"')
        .rstrip('"')
    )
    return parsed_output

# ------------------- Main Function -------------------
if __name__ == "__main__":
    model = OtterForConditionalGeneration.from_pretrained(
        "luodian/otter-9b-dc-hf",
    )
    model.text_tokenizer.padding_side = "left"
    tokenizer = model.text_tokenizer
    image_processor = transformers.CLIPImageProcessor()
    model.eval()

    while True:
        video_url = "dc_demo.mp4"  # Replace with the path to your video file
        frames_list = get_image(video_url)

        prompts_input = input("Enter prompts (comma-separated): ")
        prompts = [prompt.strip() for prompt in prompts_input.split(",")]

        for prompt in prompts:
            print(f"\nPrompt: {prompt}")
            response = get_response(frames_list, prompt, model, image_processor)
            print(f"Response: {response}")

        if prompts_input.lower() == "quit":
            break
```
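
If you only need a single answer rather than the interactive loop, you could replace the `__main__` block above with a minimal non-interactive sketch like the one below; the video path and prompt are placeholders to substitute with your own:

```python
if __name__ == "__main__":
    # Load the model and processor once (same setup as in the full script above).
    model = OtterForConditionalGeneration.from_pretrained("luodian/otter-9b-dc-hf")
    model.text_tokenizer.padding_side = "left"
    image_processor = transformers.CLIPImageProcessor()
    model.eval()

    # Placeholders: point these at your own video (local path or URL) and question.
    frames = get_image("dc_demo.mp4")
    answer = get_response(frames, "What is happening in this video?", model, image_processor)
    print(f"Response: {answer}")
```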