"""Gradio demo for VideoMAE: video classification plus masked-frame reconstruction."""

from huggingface_hub import hf_hub_download
import gradio as gr
import numpy as np
import imageio
import tensorflow as tf
from tensorflow import keras

from utils import TubeMaskingGenerator
from utils import read_video, frame_sampling, denormalize, reconstrunction
from utils import IMAGENET_MEAN, IMAGENET_STD, num_frames, patch_size, input_size
from labels import K400_label_map, SSv2_label_map, UCF_label_map

# NOTE(review): every dataset entry points at the K400 checkpoints and nothing
# in the code visible here reads this table -- confirm whether it is still
# needed before relying on it.
MODELS = {
    'K400': [
        './TFVideoMAE_S_K400_16x224_FT',
        './TFVideoMAE_S_K400_16x224_PT'
    ],
    'SSv2': [
        './TFVideoMAE_S_K400_16x224_FT',
        './TFVideoMAE_S_K400_16x224_PT'
    ],
    'UCF' : [
        './TFVideoMAE_S_K400_16x224_FT',
        './TFVideoMAE_S_K400_16x224_PT'
    ]
}

# Dataset tag -> label map imported from ``labels``.
LABEL_MAPS = {
    'K400': K400_label_map,
    'SSv2': SSv2_label_map,
    'UCF' : UCF_label_map
}


def tube_mask_generator(mask_ratio):
    """Create a boolean tube mask for the pre-trained model.

    Args:
        mask_ratio: Masking ratio forwarded to ``TubeMaskingGenerator``.

    Returns:
        A ``tf.bool`` tensor: the generator's output with one extra leading
        axis of size 1.
    """
    # Token grid handed to the generator: (temporal, height, width).
    # Presumably ``num_frames // 2`` reflects a tubelet size of 2 -- TODO confirm.
    window_size = (
        num_frames // 2,
        input_size // patch_size[0],
        input_size // patch_size[1]
    )
    tube_mask = TubeMaskingGenerator(
        input_size=window_size,
        mask_ratio=mask_ratio
    )
    make_bool = tube_mask()
    bool_masked_pos_tf = tf.constant(make_bool, dtype=tf.int32)
    bool_masked_pos_tf = tf.expand_dims(bool_masked_pos_tf, axis=0)
    bool_masked_pos_tf = tf.cast(bool_masked_pos_tf, tf.bool)
    return bool_masked_pos_tf


def get_model(model_type):
    """Download and load the fine-tuned and pre-trained models for ``model_type``.

    The files ``<model_type>_FT`` and ``<model_type>_PT`` are fetched from the
    ``innat/videomae`` Hugging Face model repo and loaded with
    ``keras.models.load_model``.

    Args:
        model_type: Model identifier; the dataset is inferred from whether it
            contains ``'K400'`` or ``'SSv2'`` (anything else is treated as UCF).

    Returns:
        A ``(ft_model, pt_model, label_map)`` tuple, where ``label_map`` is the
        selected dataset's label map with keys and values swapped.
    """
    print('---------------------')
    print('innat/videomae/' + model_type + '_FT')

    ft_path = hf_hub_download(
        repo_id='innat/videomae',
        filename=model_type + '_FT',
        repo_type="model"
    )
    pt_path = hf_hub_download(
        repo_id='innat/videomae',
        filename=model_type + '_PT',
        repo_type="model"
    )

    ft_model = keras.models.load_model(ft_path)
    pt_model = keras.models.load_model(pt_path)

    if 'K400' in model_type:
        data_type = 'K400'
    elif 'SSv2' in model_type:
        data_type = 'SSv2'
    else:
        data_type = 'UCF'

    # Use the label map of the detected dataset. The previous code overwrote
    # this with K400_label_map unconditionally, so SSv2/UCF predictions were
    # reported with Kinetics-400 class names.
    label_map = LABEL_MAPS[data_type]
    # Swap keys and values so the class indices used in ``inference`` can be
    # looked up (assumes the maps are stored as name -> index -- TODO confirm).
    label_map = {v: k for k, v in label_map.items()}

    return ft_model, pt_model, label_map


def inference(video_file, model_type, mask_ratio):
    """Classify a video and render its masked reconstruction as a GIF.

    Args:
        video_file: Video input passed to ``read_video``.
        model_type: Model identifier forwarded to ``get_model``.
        mask_ratio: Masking ratio forwarded to ``tube_mask_generator``.

    Returns:
        A ``(confidences, combined_gif)`` tuple: ``confidences`` maps each
        label to its softmax probability (highest first), and ``combined_gif``
        is the path of the written GIF whose frames show the input, the masked
        input and the reconstruction side by side.
    """
    # get sample data
    container = read_video(video_file)
    frames = frame_sampling(container, num_frames=num_frames)

    # get models
    bool_masked_pos_tf = tube_mask_generator(mask_ratio)
    ft_model, pt_model, label_map = get_model(model_type)
    ft_model.trainable = False
    pt_model.trainable = False

    # inference on fine-tune model
    outputs_ft = ft_model(frames[None, ...], training=False)
    probabilities = tf.nn.softmax(outputs_ft).numpy().squeeze(0)
    confidences = {
        label_map[i]: float(probabilities[i])
        for i in np.argsort(probabilities)[::-1]
    }

    # inference on pre-trained model
    outputs_pt = pt_model(frames[None, ...], bool_masked_pos_tf, training=False)
    reconstruct_output, mask = reconstrunction(
        frames[None, ...], bool_masked_pos_tf, outputs_pt
    )

    # post process
    input_frame = denormalize(frames)
    input_mask = denormalize(mask[0] * frames)
    output_frame = denormalize(reconstruct_output)

    # One GIF frame per time step: input | masked input | reconstruction.
    combined_frames = [
        np.hstack([frame_a, frame_b, frame_c])
        for frame_a, frame_b, frame_c in zip(input_frame, input_mask, output_frame)
    ]

    combined_gif = 'combined.gif'
    imageio.mimsave(combined_gif, combined_frames, duration=300, loop=0)
    return confidences, combined_gif


def main():
    """Build the Gradio interface around ``inference`` and launch it."""
    all_models = [
        'TFVideoMAE_L_K400_16x224',
        'TFVideoMAE_B_SSv2_16x224',
        'TFVideoMAE_B_UCF_16x224',
    ]
    sample_example = [
        ["examples/k400.mp4", all_models[0], 0.9],
        ["examples/k400.mp4", all_models[1], 0.8],
        ["examples/ucf.mp4", all_models[2], 0.7],
    ]
    iface = gr.Interface(
        fn=inference,
        inputs=[
            # NOTE(review): ``type`` does not appear to be a documented
            # gr.Video parameter in recent Gradio releases -- confirm against
            # the pinned Gradio version.
            gr.Video(type="file", label="Input Video"),
            gr.Dropdown(
                choices=all_models,
                # The default must be one of the offered choices.
                value=all_models[0],
                label="Model"
            ),
            gr.Slider(
                0.5,
                1.0,
                step=0.1,
                # ``value`` is the initial-value keyword of gr.Slider.
                value=0.5,
                label='Mask Ratio'
            )
        ],
        outputs=[
            gr.Label(num_top_classes=3, label='scores'),
            gr.Image(type="filepath", label='reconstructed')
        ],
        examples=sample_example,
        title="VideoMAE",
        description="Keras reimplementation of VideoMAE is presented here."
    )
    iface.launch()


if __name__ == '__main__':
    main()