# Parent configuration file (relative path). Presumably the loader reads it
# first and lets the keys below override it - confirm against the config loader.
base_config: ./fs.yaml

###########################
# models
###########################
# encoders
hidden_size: 192
ffn_hidden_size: 768
enc_ffn_kernel_size: 5
enc_layers: 4
dur_level: word
encoder_type: rel_fft
use_word_encoder: true

# mix ling encoder
word_enc_layers: 4
word_encoder_type: rel_fft
use_pitch_embed: false
enc_prenet: true
enc_pre_ln: true
text_encoder_postnet: true
dropout: 0.0
add_word_pos: true

# dur predictor
dur_predictor_layers: 3
dur_predictor_kernel: 5
predictor_dropout: 0.2

# fvae
use_fvae: true
latent_size: 16
fvae_encoder_type: conv
fvae_decoder_type: conv
fvae_enc_dec_hidden: 192
fvae_kernel_size: 5
fvae_enc_n_layers: 8
fvae_dec_n_layers: 4
fvae_strides: 4
fvae_noise_scale: 1.0
post_decoder: false
post_decoder_detach_ling: false

# prior flow
use_prior_flow: true
prior_glow_hidden: 64
glow_kernel_size: 3
prior_glow_n_blocks: 4

###########################
# training and inference
###########################
lambda_kl: 1.0
kl_min: 0.0
lambda_sent_dur: 0.0
kl_start_steps: 10000
posterior_start_steps: 0
frames_multiple: 4
num_valid_plots: 10
lr: 0.0002
warmup_updates: 8000
max_tokens: 40000
valid_infer_interval: 10000
max_sentences: 80
max_updates: 480000