model:
  transformer_config:
    checkpoint_path: ../../../checkpoints/kny_video_light/checkpoint
    # vocab_size: 50257
    # n_positions: 1024
    # n_embd: 1024 #1280 #768
    # n_layer: 24 #36 #12
    # n_head: 16 #20 #12
    # resid_pdrop: 0.1
    # embd_pdrop: 0.1
    # attn_pdrop: 0.1
    # remaining_frames_method: "concat"
    # remaining_frames_method: "token_type_ids"
    remaining_frames_method: "own_embeddings"
  first_stage_config:
    checkpoint_path: ../../../checkpoints/kny_image_light_discriminator/checkpoint
    vqvae_config:
      beta: 0.25
      num_embeddings: 64
      embedding_dim: 256
    autoencoder_config:
      z_channels: 128
      channels: 64
      channels_multiplier:
        - 1
        - 1
        - 2
        - 2
        - 4
      num_res_blocks: 1
      attention_resolution:
        - 16
      resolution: 128
      dropout: 0.0
    discriminator_config:
      num_layers: 3
      filters: 64
    loss_config:
      discriminator:
        loss: "hinge"
        factor: 1.0
        iter_start: 16200
        weight: 0.3
      vqvae:
        codebook_weight: 1.0
        perceptual_weight: 4.0
        perceptual_loss: "style"
train:
  batch_size: 8
  accumulation_size: 8
  n_epochs: 2000
  len_x_train: 631
  warmup_epoch_percentage: 0.15
  lr_start: 1e-5
  lr_max: 2.5e-4
  perceptual_loss_weight: 1.0
  n_frames_before: 5
  stop_ground_truth_after_epoch: 100