log_directory: "./log/latent_diffusion"
project: "audioldm"
precision: "high"

# TODO: change this to your project path
base_root: "/content/qa-mdt"

# TODO: change these to your pretrained checkpoint paths
# TODO: the pretrained paths are also needed in "base_root/offset_pretrained_checkpoints.json"
pretrained:
 clap_music: "/content/qa-mdt/checkpoints/clap_music"
 flan_t5: "/content/qa-mdt/checkpoints/flant5"
 hifi-gan: "/content/qa-mdt/checkpoints/hifi-gan/checkpoints"
 roberta-base: "/content/qa-mdt/checkpoints/robertabase"

# TODO: lmdb dataset that stores the pMOS labels of the training dataset
# (not needed at inference time!)
mos_path: "" 

train_path:
  train_lmdb_path: [] # path list of training lmdb folders (hypothetical example below)

val_path:
  val_lmdb_path: [] # path list of validation lmdb folders
  val_key_path: [] # path list of validation lmdb key files
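# Hypothetical example of filled-in path lists (placeholder paths, not part of
# the original config):
#   train_lmdb_path: ["/data/lmdb/music_train"]
#   val_lmdb_path: ["/data/lmdb/music_val"]
#   val_key_path: ["/data/lmdb/music_val_keys"]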

variables:
  sampling_rate: &sampling_rate 16000 
  mel_bins: &mel_bins 64
  latent_embed_dim: &latent_embed_dim 8
  latent_t_size: &latent_t_size 256 # TODO might need to change
  latent_f_size: &latent_f_size 16 # TODO might need to change
  in_channels: &unet_in_channels 8 # TODO might need to change
  optimize_ddpm_parameter: &optimize_ddpm_parameter true
  optimize_gpt: &optimize_gpt true
  warmup_steps: &warmup_steps 2000

# the dataset code has been rewritten, so this block may no longer be needed
data: 
  train: ["audiocaps"]
  val: "audiocaps"
  test: "audiocaps"
  class_label_indices: "audioset_eval_subset"
  dataloader_add_ons: ["waveform_rs_48k"] 

step:
  validation_every_n_epochs: 10000
  save_checkpoint_every_n_steps: 1000
  # limit_val_batches: 2
  max_steps: 8000000
  save_top_k: 1000

preprocessing:
  audio:
    sampling_rate: *sampling_rate
    max_wav_value: 32768.0
    duration: 10.24
  stft:
    filter_length: 1024
    hop_length: 160
    win_length: 1024
  mel:
    n_mel_channels: *mel_bins
    mel_fmin: 0
    mel_fmax: 8000 
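  # With hop_length 160 at 16 kHz the mel spectrogram runs at 100 frames per
  # second, so a 10.24 s clip gives 1024 frames x 64 mel bins, matching the
  # latent_t_size / latent_f_size values above after 4x downsampling.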

augmentation:
  mixup: 0.0

model:
  target: audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion
  params: 
    # Autoencoder
    first_stage_config:
      base_learning_rate: 8.0e-06
      target: audioldm_train.modules.latent_encoder.autoencoder.AutoencoderKL
      params: 
        # TODO: change this to your VAE checkpoint
        reload_from_ckpt: "/content/qa-mdt/checkpoints/hifi-gan/checkpoints/vae_mel_16k_64bins.ckpt"
        sampling_rate: *sampling_rate
        batchsize: 1
        monitor: val/rec_loss
        image_key: fbank
        subband: 1
        embed_dim: *latent_embed_dim
        time_shuffle: 1
        lossconfig:
          target: audioldm_train.losses.LPIPSWithDiscriminator
          params:
            disc_start: 50001
            kl_weight: 1000.0
            disc_weight: 0.5
            disc_in_channels: 1
        ddconfig: 
          double_z: true
          mel_bins: *mel_bins
          z_channels: 8
          resolution: 256
          downsample_time: false
          in_channels: 1
          out_ch: 1
          ch: 128 
          ch_mult:
          - 1
          - 2
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
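        # Note: a ch_mult of length 3 gives two downsampling stages in this kind
        # of encoder, i.e. a 4x reduction in both time and frequency, which is
        # what the 256 x 16 latent size above assumes.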
    
    # Other parameters
    base_learning_rate: 8.0e-5
    warmup_steps: *warmup_steps
    optimize_ddpm_parameter: *optimize_ddpm_parameter
    sampling_rate: *sampling_rate
    batchsize: 16
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    unconditional_prob_cfg: 0.1
    parameterization: eps # [eps, x0, v]
    first_stage_key: fbank
    latent_t_size: *latent_t_size
    latent_f_size: *latent_f_size
    channels: *latent_embed_dim
    monitor: val/loss_simple_ema
    scale_by_std: true
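    # parameterization "eps" trains the model to predict the added noise;
    # unconditional_prob_cfg 0.1 drops the text condition for roughly 10% of
    # training examples so classifier-free guidance can be used at sampling time.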

    unet_config:
      # TODO: choose your model class. Default: MDT_MOS_AS_TOKEN
      # (Note: the 2D-RoPE/SwiGLU variant and the MDT variant are implemented as two
      # separate classes; to train with all of these features, the classes need to be merged.)
      target: audioldm_train.modules.diffusionmodules.PixArt.PixArt_MDT_MOS_AS_TOKEN
      params:
        input_size: [256, 16]
        # patch_size: [16, 4]
        patch_size: [4, 1]
        overlap_size: [0, 0]
        in_channels: 8
        hidden_size: 1152
        depth: 28
        num_heads: 16
        mlp_ratio: 4.0
        class_dropout_prob: 0.1
        pred_sigma: true
        drop_path: 0.0
        window_size: 0
        window_block_indexes: None
        use_rel_pos: false
        cond_dim: 1024
        lewei_scale: 1.0
        overlap: [0, 0]
        use_cfg: true
        mask_ratio: 0.30
        decode_layer: 8
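        # mask_ratio / decode_layer follow the masked diffusion transformer (MDT)
        # training scheme: roughly 30% of latent patches are masked during
        # training and the final decode_layer blocks act as the decoder over the
        # full token set (interpretation of the MDT design; verify against the
        # PixArt_MDT_MOS_AS_TOKEN implementation).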
    
    cond_stage_config:
      crossattn_flan_t5:
        cond_stage_key: text
        conditioning_key: crossattn
        target: audioldm_train.conditional_models.FlanT5HiddenState
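    # FlanT5HiddenState provides frozen text-encoder hidden states for
    # cross-attention; cond_dim 1024 in the unet_config above should match the
    # hidden size of the Flan-T5 checkpoint (1024 for flan-t5-large).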

    evaluation_params:
      unconditional_guidance_scale: 3.5
      ddim_sampling_steps: 200
      n_candidates_per_samples: 3
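    # Each prompt is sampled with 200 DDIM steps at a classifier-free guidance
    # scale of 3.5; n_candidates_per_samples generates 3 candidates per prompt
    # (in the AudioLDM-style evaluation pipeline the best candidate is typically
    # selected by CLAP text-audio similarity).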