Ocelotr committed on
Commit d5a768b
1 Parent(s): 76e4014

Update hyperparams.yaml

Files changed (1)
  1. hyperparams.yaml +47 -160
hyperparams.yaml CHANGED
@@ -1,174 +1,61 @@
- # Generated 2023-10-04 from:
- # /home/wakeb/Abdulrahman-tts/speechbrain/speechbrain/recipes/VoxCeleb/SpeakerRec/hparams/train_x_vectors.yaml
- # yamllint disable
- # ################################
- # Model: Speaker identification with ECAPA
- # Authors: Hwidong Na & Mirco Ravanelli
- # ################################
-
- # Basic parameters
- seed: 10
- __set_seed: !apply:torch.manual_seed [10]
- output_folder: /media/wakeb/T7 Touch/speechbrain10
- save_folder: /media/wakeb/T7 Touch/speechbrain10/save
- train_log: /media/wakeb/T7 Touch/speechbrain10/train_log.txt
-
- # Data files
- data_folder: /media/wakeb/T7 Touch/data_qasr/ # e.g. /path/to/Voxceleb
- train_annotation: /media/wakeb/T7 Touch/data_qasr/train.csv
- valid_annotation: /media/wakeb/T7 Touch/data_qasr/dev.csv
-
- # Folder to extract data augmentation files
- rir_folder: /media/wakeb/T7 Touch/data_qasr/ # Change it if needed
-
- # Use the following links for the official voxceleb splits:
- # VoxCeleb1 (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt
- # VoxCeleb1-H (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_hard2.txt
- # VoxCeleb1-E (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_all2.txt.
- # VoxCeleb1-E and VoxCeleb1-H lists are drawn from the VoxCeleb1 training set.
- # Therefore you cannot use any files in VoxCeleb1 for training if you are using these lists for testing.
- verification_file: /home/wakeb/Abdulrahman-tts/clovaai/voxceleb_trainer/data/test_list.txt
-
- split_ratio: [90, 10]
- skip_prep: false
- ckpt_interval_minutes: 15 # save checkpoint every N min
-
- # Training parameters
- number_of_epochs: 20
- batch_size: 140
- lr: 0.0008
- lr_final: 0.0001
-
- sample_rate: 16000
- sentence_len: 3 # seconds
- shuffle: false
- random_chunk: true

  # Feature parameters
  n_mels: 24
- left_frames: 0
- right_frames: 0
- deltas: false
-
- # Number of speakers
- out_n_neurons: 4112
- emb_dim: 512
-
- dataloader_options:
-     batch_size: 140
-     shuffle: false
-     num_workers: 0
-
- # Functions
- compute_features: &id006 !new:speechbrain.lobes.features.Fbank
-     n_mels: 24
-     left_frames: 0
-     right_frames: 0
-     deltas: false

- embedding_model: &id007 !new:speechbrain.lobes.models.Xvector.Xvector
-     in_channels: 24
-     activation: !name:torch.nn.LeakyReLU
-     tdnn_blocks: 5
-     tdnn_channels: [512, 512, 512, 512, 1500]
-     tdnn_kernel_sizes: [5, 3, 3, 1, 1]
-     tdnn_dilations: [1, 2, 3, 1, 1]
-     lin_neurons: 512
+ # Pretrain folder (HuggingFace)
+ pretrained_path: Ocelotr/xvec-qasr

- classifier: &id008 !new:speechbrain.lobes.models.Xvector.Classifier
-     input_shape: [null, null, 512]
-     activation: !name:torch.nn.LeakyReLU
-     lin_blocks: 1
-     lin_neurons: 512
-     out_neurons: 4112
+ # Output parameters
+ out_n_neurons: 1734

- epoch_counter: &id010 !new:speechbrain.utils.epoch_loop.EpochCounter
-     limit: 20


- augment_wavedrop: &id001 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
-     sample_rate: 16000
-     speeds: [100]
+ # Model params
+ compute_features: !new:speechbrain.lobes.features.Fbank
+     n_mels: !ref <n_mels>

- augment_speed: &id002 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
-     sample_rate: 16000
-     speeds: [95, 100, 105]
+ mean_var_norm: !new:speechbrain.processing.features.InputNormalization
+     norm_type: sentence
+     std_norm: False

- add_rev: &id003 !new:speechbrain.lobes.augment.EnvCorrupt
-     openrir_folder: /media/wakeb/T7 Touch/data_qasr/
-     openrir_max_noise_len: 3.0 # seconds
-     reverb_prob: 1.0
-     noise_prob: 0.0
-     noise_snr_low: 0
-     noise_snr_high: 15
-     rir_scale_factor: 1.0
+ embedding_model: !new:speechbrain.lobes.models.Xvector.Xvector
+     in_channels: !ref <n_mels>
+     activation: !name:torch.nn.LeakyReLU
+     tdnn_blocks: 5
+     tdnn_channels: [512, 512, 512, 512, 1500]
+     tdnn_kernel_sizes: [5, 3, 3, 1, 1]
+     tdnn_dilations: [1, 2, 3, 1, 1]
+     lin_neurons: 512

- add_noise: &id004 !new:speechbrain.lobes.augment.EnvCorrupt
-     openrir_folder: /media/wakeb/T7 Touch/data_qasr/
-     openrir_max_noise_len: 3.0 # seconds
-     reverb_prob: 0.0
-     noise_prob: 1.0
-     noise_snr_low: 0
-     noise_snr_high: 15
-     rir_scale_factor: 1.0
+ classifier: !new:speechbrain.lobes.models.Xvector.Classifier
+     input_shape: [null, null, 512]
+     activation: !name:torch.nn.LeakyReLU
+     lin_blocks: 1
+     lin_neurons: 512
+     out_neurons: !ref <out_n_neurons>

- add_rev_noise: &id005 !new:speechbrain.lobes.augment.EnvCorrupt
-     openrir_folder: /media/wakeb/T7 Touch/data_qasr/
-     openrir_max_noise_len: 3.0 # seconds
-     reverb_prob: 1.0
-     noise_prob: 1.0
-     noise_snr_low: 0
-     noise_snr_high: 15
-     rir_scale_factor: 1.0
-
-
- # Definition of the augmentation pipeline.
- # If concat_augment = False, the augmentation techniques are applied
- # in sequence. If concat_augment = True, all the augmented signals
- # are concatenated in a single big batch.
- augment_pipeline: [*id001, *id002, *id003, *id004, *id005]
- concat_augment: true
-
- mean_var_norm: &id009 !new:speechbrain.processing.features.InputNormalization
-
- # Cost + optimization
-     norm_type: sentence
-     std_norm: false
+ mean_var_norm_emb: !new:speechbrain.processing.features.InputNormalization
+     norm_type: global
+     std_norm: False

  modules:
-     compute_features: *id006
-     augment_wavedrop: *id001
-     augment_speed: *id002
-     add_rev: *id003
-     add_noise: *id004
-     add_rev_noise: *id005
-     embedding_model: *id007
-     classifier: *id008
-     mean_var_norm: *id009
- compute_cost: !name:speechbrain.nnet.losses.nll_loss
- # compute_error: !name:speechbrain.nnet.losses.classification_error
-
- opt_class: !name:torch.optim.Adam
-     lr: 0.0008
-     weight_decay: 0.000002
-
- lr_annealing: !new:speechbrain.nnet.schedulers.LinearScheduler
-     initial_value: 0.0008
-     final_value: 0.0001
-     epoch_count: 20
-
- # Logging + checkpoints
- train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
-     save_file: /media/wakeb/T7 Touch/speechbrain10/train_log.txt
-
- error_stats: !name:speechbrain.utils.metric_stats.MetricStats
-     metric: !name:speechbrain.nnet.losses.classification_error
-     reduction: batch
-
- checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
-     checkpoints_dir: /media/wakeb/T7 Touch/speechbrain10/save
-     recoverables:
-         embedding_model: *id007
-         classifier: *id008
-         normalizer: *id009
-         counter: *id010
+     compute_features: !ref <compute_features>
+     mean_var_norm: !ref <mean_var_norm>
+     embedding_model: !ref <embedding_model>
+     mean_var_norm_emb: !ref <mean_var_norm_emb>
+     classifier: !ref <classifier>
+
+ label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
+
+
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+     loadables:
+         embedding_model: !ref <embedding_model>
+         mean_var_norm_emb: !ref <mean_var_norm_emb>
+         classifier: !ref <classifier>
+         label_encoder: !ref <label_encoder>
+     paths:
+         embedding_model: !ref <pretrained_path>/embedding_model.ckpt
+         mean_var_norm_emb: !ref <pretrained_path>/mean_var_norm_emb.ckpt
+         classifier: !ref <pretrained_path>/classifier.ckpt
+         label_encoder: !ref <pretrained_path>/label_encoder.txt
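With the training-only sections removed and a pretrainer block added, this hyperparams.yaml now follows the layout SpeechBrain's pretrained-model interfaces expect (modules, label_encoder, pretrainer), so the repo should be loadable straight from the Hub. A minimal usage sketch, assuming the file is meant to be consumed through speechbrain.pretrained.EncoderClassifier; the interface choice and the sample.wav path are illustrative assumptions, not part of this commit:

import torchaudio
from speechbrain.pretrained import EncoderClassifier  # SpeechBrain 0.5.x namespace

# from_hparams fetches hyperparams.yaml plus the files listed under `paths:`
# (embedding_model.ckpt, mean_var_norm_emb.ckpt, classifier.ckpt,
# label_encoder.txt) and runs the Pretrainer to populate the modules.
classifier = EncoderClassifier.from_hparams(
    source="Ocelotr/xvec-qasr",
    savedir="pretrained_models/xvec-qasr",
)

signal, fs = torchaudio.load("sample.wav")  # hypothetical 16 kHz mono clip

# 512-dim x-vector from `embedding_model` (lin_neurons: 512).
embeddings = classifier.encode_batch(signal)

# Closed-set speaker ID over the 1734 classes declared by `out_n_neurons`,
# decoded to string labels via label_encoder.txt.
out_prob, score, index, text_lab = classifier.classify_batch(signal)
print(text_lab)

Note the classifier head only yields meaningful labels for speakers seen in training; for open-set verification, compare `encode_batch` embeddings (e.g. with cosine similarity) instead.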