---
# Training configuration in Lightning CLI layout: every component is given as
# a `class_path` plus the `init_args` passed to it.
# Top-level sections: seed_everything, data, model, trainer.

# Global random seed.
seed_everything: 3407

# ---------------------------------------------------------------------------
# Data module: separate parameter sets for the training and validation loaders.
# ---------------------------------------------------------------------------
data:
  class_path: decoder.dataset.VocosDataModule
  init_args:
    train_params:
      filelist_path: ./WavTokenizer/data/train/libritts_train
      sampling_rate: 24000
      # 72000 / 24000 = 3; presumably 3-second crops at 24 kHz -- confirm.
      num_samples: 72000
      batch_size: 40  # 20
      num_workers: 8

    val_params:
      # NOTE(review): "librttts" (three t's) differs from "libritts" used in
      # the training path above -- confirm it matches the file name on disk.
      filelist_path: ./WavTokenizer/data/infer/librttts_val
      sampling_rate: 24000
      num_samples: 72000
      batch_size: 5  # 10
      num_workers: 8

# ---------------------------------------------------------------------------
# Model: experiment wrapper composed of feature extractor, backbone and head.
# ---------------------------------------------------------------------------
model:
  class_path: decoder.experiment.WavTokenizer
  init_args:
    sample_rate: 24000
    # Written with an explicit decimal point so that YAML 1.1 loaders (which
    # require a "." in floats) resolve it as a number rather than a string.
    initial_learning_rate: 2.0e-4
    mel_loss_coeff: 45
    mrd_loss_coeff: 1.0
    num_warmup_steps: 0  # Optimizers warmup steps
    pretrain_mel_steps: 0  # 0 means GAN objective from the first iteration

    # Automatic evaluation
    evaluate_utmos: true
    evaluate_pesq: true
    # NOTE(review): key is spelled "periodicty"; it must match the constructor
    # argument name of the model class -- verify before renaming.
    evaluate_periodicty: true

    # Resume settings.
    # NOTE(review): resume_config names a code16384/kmeans800 config while this
    # file uses vq_bins 4096 / vq_kmeans 200 -- check before enabling resume.
    resume: false
    resume_config: ./WavTokenizer/configs/wavtokenizer_smalldata_frame40_3s_nq1_code16384_dim512_kmeans800_attn.yaml
    resume_model: ./version_3/checkpoints/example.ckpt

    feature_extractor:
      class_path: decoder.feature_extractors.EncodecFeatures
      init_args:
        encodec_model: encodec_24khz
        bandwidths: [6.6, 6.6, 6.6, 6.6]
        train_codebooks: true
        num_quantizers: 1
        # 6 * 5 * 5 * 4 = 600, the same value as head.init_args.hop_length.
        # NOTE(review): key is spelled "dowmsamples"; it must match the
        # feature extractor's argument name -- verify before renaming.
        dowmsamples: [6, 5, 5, 4]
        vq_bins: 4096
        vq_kmeans: 200

    backbone:
      class_path: decoder.models.VocosBackbone
      init_args:
        input_channels: 512
        dim: 768
        intermediate_dim: 2304
        num_layers: 12
        # Same count as the entries in feature_extractor bandwidths above;
        # presumably these must agree -- confirm.
        adanorm_num_embeddings: 4

    head:
      class_path: decoder.heads.ISTFTHead
      init_args:
        dim: 768
        n_fft: 2400
        # 24000 / 600 = 40 frames per second at the configured sample rate.
        hop_length: 600
        padding: same

# ---------------------------------------------------------------------------
# Trainer: logging, callbacks, schedule length and hardware.
# ---------------------------------------------------------------------------
trainer:
  logger:
    class_path: pytorch_lightning.loggers.TensorBoardLogger
    init_args:
      save_dir: ./WavTokenizer/result/train/wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn/
  callbacks:
    - class_path: pytorch_lightning.callbacks.LearningRateMonitor
    - class_path: pytorch_lightning.callbacks.ModelSummary
      init_args:
        max_depth: 2
    - class_path: pytorch_lightning.callbacks.ModelCheckpoint
      init_args:
        monitor: val_loss
        filename: wavtokenizer_checkpoint_{epoch}_{step}_{val_loss:.4f}
        save_top_k: 10
        save_last: true
    - class_path: decoder.helpers.GradNormCallback

  # Lightning counts max_steps across all optimizer steps (rather than the
  # number of batches). With a generator and a discriminator optimizer,
  # 20000000 total equals 10M steps for the generator and 10M for the
  # discriminator.
  max_steps: 20000000
  # You might want to limit validation batches when evaluating all the
  # metrics, as they are time-consuming.
  limit_val_batches: 200
  accelerator: gpu
  strategy: ddp
  devices: [0, 1, 2, 3, 4, 5, 6, 7]
  log_every_n_steps: 1000