{ "train": { "log_interval": 200, "eval_interval": 1000, "seed": 1234, "port": 8001, "epochs": 10000, "learning_rate": 2e-4, "betas": [0.8, 0.99], "eps": 1e-9, "batch_size": 8, "accumulation_steps": 1, "fp16_run": false, "lr_decay": 0.998, "segment_size": 10240, "init_lr_ratio": 1, "warmup_epochs": 0, "c_mel": 45, "save_dir": "logdir/visinger2" }, "data": { "data_dir":"../../data", "dataset_type": "SingDataset", "collate_type": "SingCollate", "training_filelist":"train.list", "training_labellist":"transcriptions.txt", "validation_filelist":"test.list", "validation_labellist":"transcriptions.txt", "max_wav_value": 32768.0, "sample_rate": 44100, "n_fft": 2048, "fmin": 0, "fmax": 22050, "hop_size": 512, "win_size": 2048, "acoustic_dim": 80, "min_level_db": -115, "ref_level_db": 20, "min_db": -115, "max_abs_value": 4.0, "n_speakers": 200, "spk2id": {"opencpop": 0, "taffy": 1, "otto": 2, "nanami": 3} }, "model": { "hidden_channels": 192, "spk_channels": 192, "filter_channels": 768, "n_heads": 2, "n_layers": 4, "kernel_size": 3, "p_dropout": 0.1, "prior_hidden_channels": 192, "prior_filter_channels": 768, "prior_n_heads": 2, "prior_n_layers": 4, "prior_kernel_size": 3, "prior_p_dropout": 0.1, "resblock": "1", "use_spectral_norm": false, "resblock_kernel_sizes": [3,7,11], "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "upsample_rates": [8,8,4,2], "upsample_initial_channel": 256, "upsample_kernel_sizes": [16,16,8,4], "n_harmonic": 64, "n_bands": 65 } }