"""Gradio demo for VideoMAE (Keras reimplementation).

For an input video the app runs a fine-tuned model to produce class scores
and a pre-trained model with a tube mask to produce a reconstruction, then
returns the scores together with a GIF that places the input, the masked
input and the reconstruction next to each other.
"""

import functools

import gradio as gr
import numpy as np
import imageio
import tensorflow as tf
from tensorflow import keras

from utils import TubeMaskingGenerator
from utils import read_video, frame_sampling, denormalize, reconstrunction
from utils import IMAGENET_MEAN, IMAGENET_STD, num_frames, patch_size, input_size
from labels import K400_label_map, SSv2_label_map, UCF_label_map

# Per dataset: [fine-tuned model path, pre-trained model path].
# NOTE(review): the SSv2 and UCF entries point at the K400 checkpoints; this
# looks like a placeholder -- confirm and replace once the real paths exist.
MODELS = {
    'K400': [
        './TFVideoMAE_S_K400_16x224_FT',
        './TFVideoMAE_S_K400_16x224_PT',
    ],
    'SSv2': [
        './TFVideoMAE_S_K400_16x224_FT',
        './TFVideoMAE_S_K400_16x224_PT',
    ],
    'UCF': [
        './TFVideoMAE_S_K400_16x224_FT',
        './TFVideoMAE_S_K400_16x224_PT',
    ],
}

# Per dataset: the label map imported from `labels`.
LABEL_MAPS = {
    'K400': K400_label_map,
    'SSv2': SSv2_label_map,
    'UCF': UCF_label_map,
}


def tube_mask_generator(mask_ratio):
    """Build a boolean mask tensor for the pre-trained model.

    Args:
        mask_ratio: Forwarded unchanged to ``TubeMaskingGenerator``
            (presumably the fraction of patches to hide -- confirm in utils).

    Returns:
        A ``tf.bool`` tensor: the generator output with a leading axis of
        size 1 added.
    """
    window_size = (
        num_frames // 2,
        input_size // patch_size[0],
        input_size // patch_size[1],
    )
    tube_mask = TubeMaskingGenerator(
        input_size=window_size,
        mask_ratio=mask_ratio,
    )
    make_bool = tube_mask()
    bool_masked_pos_tf = tf.constant(make_bool, dtype=tf.int32)
    bool_masked_pos_tf = tf.expand_dims(bool_masked_pos_tf, axis=0)
    return tf.cast(bool_masked_pos_tf, tf.bool)


@functools.lru_cache(maxsize=None)
def _load_model(model_path):
    """Load the Keras model stored at *model_path*, once per distinct path.

    Loading from disk on every request is slow; caching by path also lets
    datasets that share a checkpoint directory share one loaded model.
    """
    return keras.models.load_model(model_path)


def get_model(data_type):
    """Return the models and label lookup for a dataset.

    Args:
        data_type: A key of ``MODELS`` / ``LABEL_MAPS``.

    Returns:
        A tuple ``(ft_model, pt_model, label_map)`` where ``label_map`` is
        the dataset's label map with keys and values swapped, so that it can
        be indexed by the class indices produced in ``inference``.

    Raises:
        KeyError: If ``data_type`` is not a known dataset key.
    """
    ft_path, pt_path = MODELS[data_type]
    # Index directly (not .get) so an unknown key fails as KeyError rather
    # than as an AttributeError on None.
    label_map = {v: k for k, v in LABEL_MAPS[data_type].items()}
    return _load_model(ft_path), _load_model(pt_path), label_map


def inference(video_file, data_type, mask_ratio):
    """Classify a video and write a reconstruction GIF.

    Args:
        video_file: Value of the video input; passed to ``read_video``.
        data_type: Dataset key selecting the models and label map.
        mask_ratio: Mask ratio passed to ``tube_mask_generator``.

    Returns:
        A tuple ``(confidences, combined_gif)``: a dict mapping each label to
        its softmax probability (inserted from most to least probable), and
        the path of the written GIF (``'combined.gif'``).

    Raises:
        KeyError: If ``data_type`` is not a known dataset key.
    """
    print('---------------------------')
    print(video_file)
    print(data_type)
    print(mask_ratio)
    print('---------------------------')

    # get sample data
    container = read_video(video_file)
    frames = frame_sampling(container, num_frames=num_frames)
    batch = frames[None, ...]  # add a leading batch axis once, reuse below

    # get models
    bool_masked_pos_tf = tube_mask_generator(mask_ratio)
    ft_model, pt_model, label_map = get_model(data_type)
    ft_model.trainable = False
    pt_model.trainable = False

    # inference on fine-tune model
    outputs_ft = ft_model(batch, training=False)
    probabilities = tf.nn.softmax(outputs_ft).numpy().squeeze(0)
    confidences = {
        label_map[i]: float(probabilities[i])
        for i in np.argsort(probabilities)[::-1]
    }

    # inference on pre-trained model
    outputs_pt = pt_model(batch, bool_masked_pos_tf, training=False)
    reconstruct_output, mask = reconstrunction(
        batch, bool_masked_pos_tf, outputs_pt
    )

    # post process
    input_frame = denormalize(frames)
    input_mask = denormalize(mask[0] * frames)
    output_frame = denormalize(reconstruct_output)

    # One GIF frame per time step: input | masked input | reconstruction.
    combined_frames = [
        np.hstack([frame_a, frame_b, frame_c])
        for frame_a, frame_b, frame_c in zip(input_frame, input_mask, output_frame)
    ]

    combined_gif = 'combined.gif'
    imageio.mimsave(combined_gif, combined_frames, duration=300, loop=0)
    return confidences, combined_gif


def main():
    """Build the Gradio interface around ``inference`` and launch it."""
    datasets = ['K400', 'SSv2', 'UCF']
    # One value per input component, in order: video, dataset, mask ratio.
    # The dataset value must be one of `datasets`, because inference() uses
    # it as a key into MODELS and LABEL_MAPS.
    sample_example = [
        ["examples/k400.mp4", 'K400', 0.5],
        ["examples/k400.mp4", 'SSv2', 0.5],
        ["examples/k400.mp4", 'UCF', 0.5],
    ]

    # NOTE(review): the `type=` and `default=` keyword arguments below are
    # kept exactly as written; whether they are honoured depends on the
    # installed Gradio version -- confirm against the pinned release.
    iface = gr.Interface(
        fn=inference,
        inputs=[
            gr.Video(type="file", label="Input Video"),
            gr.Radio(
                datasets,
                type='value',
                default=datasets[0],
                label='Dataset',
            ),
            gr.Slider(
                0.5,
                1.0,
                step=0.1,
                default=0.5,
                label='Mask Ratio'
            )
        ],
        outputs=[
            gr.Label(num_top_classes=3, label='scores'),
            gr.Image(type="filepath", label='reconstructed')
        ],
        examples=sample_example,
        title="VideoMAE",
        description="Keras reimplementation of VideoMAE is presented here."
    )
    iface.launch()


if __name__ == '__main__':
    main()