Build

Paused

App Files Files Community

ManishThota committed on Mar 9

Commit

0a6288f

•

1 Parent(s): ae8423a

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -66

app.py CHANGED Viewed

@@ -58,95 +58,77 @@ def extract_frames(frame):
     return image_bgr
-def predict_answer(video, question, max_tokens=100):
     text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
     input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
-    frames = video_to_frames(video)
-    answers = []
-    for i in range(len(frames)):
-        image = extract_frames(frames[i])
-        image_tensor = model.image_preprocess([image])
-        # Generate the answer
-        output_ids = model.generate(
-                input_ids,
-                max_new_tokens=max_tokens,
-                images=image_tensor,
-                use_cache=True)[0]
-        answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
-        answers.append(answer)
-        return answers
-    # if image:
-    #     # Process as an image
-    #     image = image.convert("RGB")
-    #     image_tensor = model.image_preprocess(image)
-    #     #Generate the answer
-    #     output_ids = model.generate(
-    #         input_ids,
-    #         max_new_tokens=max_tokens,
-    #         images=image_tensor,
-    #         use_cache=True)[0]
-    #     return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
-    # elif video:
-    #     # Process as a video
-    #     frames = video_to_frames(video)
-    #     answers = []
-    #     for frame in frames:
-    #         image = extract_frames(frame)
-    #         image_tensor = model.image_preprocess(image)
-    #         # Generate the answer
-    #         output_ids = model.generate(
-    #             input_ids,
-    #             max_new_tokens=max_tokens,
-    #             images=image_tensor,
-    #             use_cache=True)[0]
-    #         answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
-    #         answers.append(answer)
-    #     return answers
-    # else:
-    #     return "Unsupported file type. Please upload an image or video."
-# def gradio_predict(image, video, question, max_tokens):
-#     answer = predict_answer(image, video, question, max_tokens)
-#     return answer
-# iface = gr.Interface(
-#     fn=gradio_predict,
-#     inputs=[
-#         gr.Image(type="pil", label="Upload or Drag an Image"),
-#         gr.Video(label="Upload your video here"),
-#         gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
-#         gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
-#     outputs=gr.TextArea(label="Answer"),
-#     # outputs=gr.Image(label="Output"),
-#     title="Video/Image Viewer",
-#     description="Upload an image or video to view it or extract frames from the video.",
-# )
-# iface.launch(debug=True)
-def gradio_predict(video, question, max_tokens):
-    answer = predict_answer(video, question, max_tokens)
     return answer
 iface = gr.Interface(
     fn=gradio_predict,
     inputs=[
         gr.Video(label="Upload your video here"),
         gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
         gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
@@ -156,4 +138,4 @@ iface = gr.Interface(
     description="Upload an image or video to view it or extract frames from the video.",
 )
-iface.launch(debug=True)

     return image_bgr
+def predict_answer(image, video, question, max_tokens=100):
     text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
     input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
+    # frames = video_to_frames(video)
+    # answers = []
+    # for i in range(len(frames)):
+    #     image = extract_frames(frames[i])
+    #     image_tensor = model.image_preprocess([image])
+    #     # Generate the answer
+    #     output_ids = model.generate(
+    #             input_ids,
+    #             max_new_tokens=max_tokens,
+    #             images=image_tensor,
+    #             use_cache=True)[0]
+    #     answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+    #     answers.append(answer)
+    #     return answers
+    if image:
+        # Process as an image
+        image = image.convert("RGB")
+        image_tensor = model.image_preprocess(image)
+        #Generate the answer
+        output_ids = model.generate(
+            input_ids,
+            max_new_tokens=max_tokens,
+            images=image_tensor,
+            use_cache=True)[0]
+        return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+    elif video:
+        # Process as a video
+        frames = video_to_frames(video)
+        answers = []
+        for frame in frames:
+            image = extract_frames(frame)
+            image_tensor = model.image_preprocess([image])
+            # Generate the answer
+            output_ids = model.generate(
+                input_ids,
+                max_new_tokens=max_tokens,
+                images=image_tensor,
+                use_cache=True)[0]
+            answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+            answers.append(answer)
+        return answers
+    else:
+        return "Unsupported file type. Please upload an image or video."
+def gradio_predict(image, video, question, max_tokens):
+    answer = predict_answer(image, video, question, max_tokens)
     return answer
 iface = gr.Interface(
     fn=gradio_predict,
     inputs=[
+        gr.Image(type="pil", label="Upload or Drag an Image"),
         gr.Video(label="Upload your video here"),
         gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
         gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
     description="Upload an image or video to view it or extract frames from the video.",
 )
+iface.launch(debug=True)