jadechoghari committed on
Commit
2a97bc5
1 Parent(s): a705269

Update audioldm_train/config/mos_as_token/qa_mdt.yaml

Browse files
audioldm_train/config/mos_as_token/qa_mdt.yaml CHANGED
@@ -3,7 +3,7 @@ project: "audioldm"
3
  precision: "high"
4
 
5
  # TODO: change this with your project path
6
- base_root: "/content/qa-mdt"
7
 
8
  # TODO: change this with your pretrained path
9
  # TODO: pretrained path is also needed in "base_root/offset_pretrained_checkpoints.json"
@@ -70,12 +70,12 @@ augmentation:
70
  mixup: 0.0
71
 
72
  model:
73
- target: audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion
74
  params:
75
  # Autoencoder
76
  first_stage_config:
77
  base_learning_rate: 8.0e-06
78
- target: audioldm_train.modules.latent_encoder.autoencoder.AutoencoderKL
79
  params:
80
  # TODO: change it with your VAE checkpoint
81
  reload_from_ckpt: "./qa-mdt/checkpoints/hifi-gan/checkpoints/vae_mel_16k_64bins.ckpt"
@@ -87,7 +87,7 @@ model:
87
  embed_dim: *latent_embed_dim
88
  time_shuffle: 1
89
  lossconfig:
90
- target: audioldm_train.losses.LPIPSWithDiscriminator
91
  params:
92
  disc_start: 50001
93
  kl_weight: 1000.0
@@ -133,7 +133,7 @@ model:
133
  unet_config:
134
  # TODO: choose your class, Default: MDT_MOS_AS_TOKEN
135
  # (Noted: the 2D-Rope, SwiGLU and the MDT are in two classes, when training with all of them, they should be changed and merged)
136
- target: audioldm_train.modules.diffusionmodules.PixArt.PixArt_MDT_MOS_AS_TOKEN
137
  params:
138
  input_size : [256, 16]
139
  # patch_size: [16,4]
@@ -161,7 +161,7 @@ model:
161
  crossattn_flan_t5:
162
  cond_stage_key: text
163
  conditioning_key: crossattn
164
- target: audioldm_train.conditional_models.FlanT5HiddenState
165
 
166
  evaluation_params:
167
  unconditional_guidance_scale: 3.5
 
3
  precision: "high"
4
 
5
  # TODO: change this with your project path
6
+ base_root: "./qa-mdt"
7
 
8
  # TODO: change this with your pretrained path
9
  # TODO: pretrained path is also needed in "base_root/offset_pretrained_checkpoints.json"
 
70
  mixup: 0.0
71
 
72
  model:
73
+ target: qa_mdt.audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion
74
  params:
75
  # Autoencoder
76
  first_stage_config:
77
  base_learning_rate: 8.0e-06
78
+ target: qa_mdt.audioldm_train.modules.latent_encoder.autoencoder.AutoencoderKL
79
  params:
80
  # TODO: change it with your VAE checkpoint
81
  reload_from_ckpt: "./qa-mdt/checkpoints/hifi-gan/checkpoints/vae_mel_16k_64bins.ckpt"
 
87
  embed_dim: *latent_embed_dim
88
  time_shuffle: 1
89
  lossconfig:
90
+ target: qa_mdt.audioldm_train.losses.LPIPSWithDiscriminator
91
  params:
92
  disc_start: 50001
93
  kl_weight: 1000.0
 
133
  unet_config:
134
  # TODO: choose your class, Default: MDT_MOS_AS_TOKEN
135
  # (Noted: the 2D-Rope, SwiGLU and the MDT are in two classes, when training with all of them, they should be changed and merged)
136
+ target: qa_mdt.audioldm_train.modules.diffusionmodules.PixArt.PixArt_MDT_MOS_AS_TOKEN
137
  params:
138
  input_size : [256, 16]
139
  # patch_size: [16,4]
 
161
  crossattn_flan_t5:
162
  cond_stage_key: text
163
  conditioning_key: crossattn
164
+ target: qa_mdt.audioldm_train.conditional_models.FlanT5HiddenState
165
 
166
  evaluation_params:
167
  unconditional_guidance_scale: 3.5