import functools

import gradio as gr
import numpy as np
import imageio

import tensorflow as tf
from tensorflow import keras

from utils import TubeMaskingGenerator
from utils import read_video, frame_sampling, denormalize, reconstrunction
from utils import IMAGENET_MEAN, IMAGENET_STD, num_frames, patch_size, input_size
from labels import K400_label_map, SSv2_label_map, UCF_label_map

# Saved-model directories, listed per dataset key as
# [fine-tuned model, pre-trained model].
# NOTE(review): every dataset currently resolves to the K400 directories;
# confirm whether SSv2 and UCF are meant to use their own checkpoints.
_K400_FINETUNED = './TFVideoMAE_S_K400_16x224_FT'
_K400_PRETRAINED = './TFVideoMAE_S_K400_16x224_PT'

MODELS = {
    'K400': [_K400_FINETUNED, _K400_PRETRAINED],
    'SSv2': [_K400_FINETUNED, _K400_PRETRAINED],
    'UCF': [_K400_FINETUNED, _K400_PRETRAINED],
}

# Label table for each dataset key; the keys match those used in MODELS.
LABEL_MAPS = dict(
    K400=K400_label_map,
    SSv2=SSv2_label_map,
    UCF=UCF_label_map,
)

def tube_mask_generator(mask_ratio):
    """Create a boolean mask tensor for the given mask ratio.

    The mask is produced by ``TubeMaskingGenerator`` over a window of
    ``(num_frames // 2, input_size // patch_size[0],
    input_size // patch_size[1])``.

    Args:
        mask_ratio: Forwarded unchanged to ``TubeMaskingGenerator``.

    Returns:
        A ``tf.bool`` tensor: the generator output with an extra leading
        axis of size one.
    """
    rows = input_size // patch_size[0]
    cols = input_size // patch_size[1]
    window = (num_frames // 2, rows, cols)

    generator = TubeMaskingGenerator(input_size=window, mask_ratio=mask_ratio)
    raw_mask = generator()

    # The generator output is materialised as int32 before the bool cast.
    batched_mask = tf.expand_dims(
        tf.constant(raw_mask, dtype=tf.int32), axis=0
    )
    return tf.cast(batched_mask, tf.bool)


@functools.lru_cache(maxsize=None)
def get_model(data_type):
    """Load the two models and the label lookup for one dataset key.

    Results are memoised per ``data_type``, so the models are read from disk
    only on the first request for a dataset instead of on every call. The
    cache cannot grow beyond the keys of ``MODELS``: an unknown key raises
    before anything is stored.

    Args:
        data_type: Dataset key; must be present in ``MODELS`` and
            ``LABEL_MAPS``.

    Returns:
        A ``(ft_model, pt_model, label_map)`` tuple. ``ft_model`` and
        ``pt_model`` are loaded from the first and second path listed for the
        dataset, and ``label_map`` is the dataset's label table with keys and
        values swapped. The same objects are returned on repeated calls, so
        callers should treat ``label_map`` as read-only.

    Raises:
        KeyError: If ``data_type`` is not a key of ``MODELS``.
    """
    model_paths = MODELS[data_type]
    ft_model = keras.models.load_model(model_paths[0])
    pt_model = keras.models.load_model(model_paths[1])

    # Invert the table so that the values of the original map become the keys.
    label_map = {v: k for k, v in LABEL_MAPS[data_type].items()}

    return ft_model, pt_model, label_map


def inference(video_file, data_type, mask_ratio):
    """Classify a video and write a GIF comparing input and reconstruction.

    Args:
        video_file: Video reference handed to ``read_video``.
        data_type: Dataset key handed to ``get_model``.
        mask_ratio: Ratio handed to ``tube_mask_generator``.

    Returns:
        A ``(confidences, gif_path)`` tuple. ``confidences`` maps labels to
        softmax probabilities, inserted from highest to lowest probability.
        ``gif_path`` is ``'combined.gif'``, which this call (over)writes.
    """
    separator = '---------------------------'
    print(separator)
    print(video_file)
    print(data_type)
    print(mask_ratio)
    print(separator)

    # Read the video and sample the frames that are fed to the models.
    clip = frame_sampling(read_video(video_file), num_frames=num_frames)
    batch = clip[None, ...]  # same frames with one extra leading axis

    # Build the mask, then fetch both models and mark them non-trainable.
    mask_positions = tube_mask_generator(mask_ratio)
    finetuned, pretrained, index_to_label = get_model(data_type)
    finetuned.trainable = False
    pretrained.trainable = False

    # Classification with the fine-tuned model.
    scores = finetuned(batch, training=False)
    probs = tf.nn.softmax(scores).numpy().squeeze(0)
    confidences = {}
    for idx in np.argsort(probs)[::-1]:
        confidences[index_to_label[idx]] = float(probs[idx])

    # Reconstruction with the pre-trained model.
    pt_outputs = pretrained(batch, mask_positions, training=False)
    reconstructed, mask = reconstrunction(batch, mask_positions, pt_outputs)

    # Post-process and lay the three views of each frame side by side:
    # sampled input, input multiplied by the mask, reconstruction.
    originals = denormalize(clip)
    masked = denormalize(mask[0] * clip)
    rebuilt = denormalize(reconstructed)
    gif_frames = [
        np.hstack([original, hidden, restored])
        for original, hidden, restored in zip(originals, masked, rebuilt)
    ]

    gif_path = 'combined.gif'
    imageio.mimsave(gif_path, gif_frames, duration=300, loop=0)
    return confidences, gif_path


def main():
    """Build the Gradio interface around ``inference`` and launch it.

    The interface has three inputs (video, dataset choice, mask ratio) that
    are passed to ``inference`` in that order, and two outputs for the values
    it returns (label scores and the path of the generated GIF).
    """
    datasets = ['K400', 'SSv2', 'UCF']
    default_mask_ratio = 0.5

    # Each example row supplies one value per input component. The dataset
    # entry must be one of the radio choices, because ``inference`` uses it
    # directly as a lookup key.
    # NOTE(review): every row uses examples/k400.mp4; swap in dataset-specific
    # clips if they exist.
    sample_example = [
        ["examples/k400.mp4", dataset, default_mask_ratio]
        for dataset in datasets
    ]

    iface = gr.Interface(
        fn=inference,
        inputs=[
            gr.Video(label="Input Video"),
            gr.Radio(
                datasets,
                type='value',
                # ``value`` sets the initial selection, so a dataset is
                # always chosen when the form is submitted.
                value=datasets[0],
                label='Dataset',
            ),
            gr.Slider(
                0.5,
                1.0,
                step=0.1,
                value=default_mask_ratio,
                label='Mask Ratio',
            ),
        ],
        outputs=[
            gr.Label(num_top_classes=3, label='scores'),
            gr.Image(type="filepath", label='reconstructed'),
        ],
        examples=sample_example,
        title="VideoMAE",
        description="Keras reimplementation of <a href='https://github.com/innat/VideoMAE'>VideoMAE</a> is presented here."
    )

    iface.launch()

# Launch the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()