import functools

import gradio as gr
import numpy as np
import imageio
import tensorflow as tf
from tensorflow import keras

from utils import TubeMaskingGenerator
from utils import read_video, frame_sampling, denormalize, reconstrunction
from utils import IMAGENET_MEAN, IMAGENET_STD, num_frames, patch_size, input_size
from labels import K400_label_map, SSv2_label_map, UCF_label_map

# Checkpoint locations per dataset: index 0 is the fine-tuned (FT) model,
# index 1 is the pre-trained (PT) model. Datasets with an empty list have
# no checkpoints registered yet.
MODELS = {
    'K400': [
        'innat/videomae/TFVideoMAE_S_K400_16x224_FT',
        'innat/videomae/TFVideoMAE_S_K400_16x224_PT'
    ],
    'SSv2': [],
    'UCF': []
}

# Label table to use for each dataset key of MODELS.
LABEL_MAPS = {
    'K400': K400_label_map,
    'SSv2': SSv2_label_map,
    'UCF': UCF_label_map,
}


def tube_mask_generator():
    """Build a boolean tube mask tensor with a leading batch axis of size 1.

    The mask is produced by ``TubeMaskingGenerator`` over a window of
    ``(num_frames // 2, input_size // patch_size[0],
    input_size // patch_size[1])`` with a mask ratio of 0.70.

    Returns:
        A ``tf.bool`` tensor: the generator output with one extra axis
        inserted at position 0.
    """
    window_size = (
        num_frames // 2,
        input_size // patch_size[0],
        input_size // patch_size[1]
    )
    tube_mask = TubeMaskingGenerator(
        input_size=window_size,
        mask_ratio=0.70
    )
    make_bool = tube_mask()
    # Go through int32 first, then add the batch axis and cast to bool.
    bool_masked_pos_tf = tf.constant(make_bool, dtype=tf.int32)
    bool_masked_pos_tf = tf.expand_dims(bool_masked_pos_tf, axis=0)
    bool_masked_pos_tf = tf.cast(bool_masked_pos_tf, tf.bool)
    return bool_masked_pos_tf


def video_to_gif(video_array, gif_filename):
    """Write ``video_array`` to ``gif_filename`` via ``imageio.mimsave``.

    Args:
        video_array: Sequence of frames accepted by ``imageio.mimsave``.
        gif_filename: Destination path of the GIF file.
    """
    imageio.mimsave(
        gif_filename, video_array, duration=100
    )


@functools.lru_cache(maxsize=None)
def get_model(data_type):
    """Load the fine-tuned and pre-trained models for a dataset.

    Results are cached per ``data_type`` so the checkpoints are read from
    disk only once per process instead of on every request.

    Args:
        data_type: Dataset key, one of the keys of ``MODELS``.

    Returns:
        A tuple ``(ft_model, pt_model, label_map)`` where ``label_map`` is
        the inverse of the dataset's label table (values become keys).

    Raises:
        ValueError: If ``data_type`` is unknown or has no fine-tuned and
            pre-trained checkpoints registered in ``MODELS``.
    """
    checkpoints = MODELS.get(data_type)
    if checkpoints is None or data_type not in LABEL_MAPS:
        raise ValueError(
            f"Unknown dataset {data_type!r}; expected one of {sorted(MODELS)}."
        )
    if len(checkpoints) < 2:
        raise ValueError(
            f"No model checkpoints are available for dataset {data_type!r}."
        )

    ft_model = keras.models.load_model(checkpoints[0])
    pt_model = keras.models.load_model(checkpoints[1])
    # Invert the dataset's own label table; previously the K400 table was
    # used for every dataset.
    label_map = {v: k for k, v in LABEL_MAPS[data_type].items()}
    return ft_model, pt_model, label_map


def inference(video_file, dataset_type):
    """Classify a video and render a masked-reconstruction GIF.

    Args:
        video_file: Video input as delivered by the ``gr.Video`` component;
            it is passed straight to ``read_video``.
        dataset_type: Dataset key selecting which models and labels to use.

    Returns:
        A tuple ``(confidences, combined_gif)``: a dict mapping each label
        to its softmax probability (in descending probability order) and
        the path of the written GIF. Each GIF frame is the horizontal
        concatenation of the input, masked input and reconstruction.

    Raises:
        ValueError: Propagated from ``get_model`` when the dataset has no
            usable checkpoints.
    """
    container = read_video(video_file)
    frames = frame_sampling(container, num_frames=num_frames)
    bool_masked_pos_tf = tube_mask_generator()
    ft_model, pt_model, label_map = get_model(dataset_type)
    ft_model.trainable = False
    pt_model.trainable = False

    # Both models consume the clip with a leading batch axis.
    batch = frames[None, ...]

    # inference on fine-tune model
    outputs_ft = ft_model(batch, training=False)
    probabilities = tf.nn.softmax(outputs_ft).numpy().squeeze(0)
    confidences = {
        label_map[i]: float(probabilities[i])
        for i in np.argsort(probabilities)[::-1]
    }

    # inference on pre-trained model
    outputs_pt = pt_model(batch, bool_masked_pos_tf, training=False)
    reconstruct_output, mask = reconstrunction(
        batch, bool_masked_pos_tf, outputs_pt
    )

    input_frame = denormalize(frames)
    input_mask = denormalize(mask[0] * frames)
    output_frame = denormalize(reconstruct_output)

    # Side-by-side panels: original | masked input | reconstruction.
    combined_frames = [
        np.hstack([frame_a, frame_b, frame_c])
        for frame_a, frame_b, frame_c in zip(
            input_frame, input_mask, output_frame
        )
    ]

    combined_gif = 'combined.gif'
    imageio.mimsave(combined_gif, combined_frames, duration=300, loop=0)
    return confidences, combined_gif


demo = gr.Interface(
    fn=inference,
    inputs=[
        gr.Video(type="file"),
        gr.Radio(
            ['K400', 'SSv2', 'UCF'],
            label='Dataset',
            value='K400'
        ),
    ],
    outputs=[
        gr.Label(num_top_classes=3, label='confidence scores'),
        gr.Image(type="filepath", label='reconstructed masked autoencoder')
    ],
    examples=[
        ["examples/k400.mp4"],
        ["examples/k400.mp4"],
        ["examples/k400.mp4"],
    ],
    title="VideoMAE",
)
demo.launch()