# Configuration file: dataset, models, hyperparameters, evaluation, trainer, inference, bitsandbytes.
# Dataset settings.
dataset:
  # Data source lists; all three are empty in this config.
  training: []
  validation: []
  noise: []

  # Python lambda source stored as a string; it joins the 3rd-from-last and
  # 2nd-from-last entries of `p.parts` with an underscore.
  # NOTE(review): presumably evaluated by the consumer -- only load this file
  # from a trusted source; confirm against the loader.
  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"

  use_hdf5: true
  use_metadata: true
  hdf5_flag: r  # NOTE(review): looks like a file-open mode -- confirm
  validate: true

  workers: 2
  cache: true

  # Two-element ranges; presumably [min, max] bounds -- units and inclusivity
  # are not shown here, confirm against the consumer.
  phones_range: [4, 256]
  duration_range: [1.0, 16.0]

  random_utterance: 1.0
  max_prompts: 3
  prompt_duration: 6.0

  sample_type: speaker

  # Only "tts" is active. The original inline comment also listed these names:
  # "tts", "tts-c", "ns", "sr", "tse", "cse", "nse", "tts"
  tasks_list:
    - "tts"

# Model definitions.
models:
  _prom_levels: 8
  _max_levels: 8

  # A single model entry is defined.
  _models:
    - name: 'ar+nar'
      size: 'full'
      resp_levels: 8
      prom_levels: 8  # same value as _prom_levels above
      tasks: 8
      arch_type: 'retnet'
      training: true
      version: 2

# Optimisation hyperparameters.
hyperparameters:
  batch_size: 8
  gradient_accumulation_steps: 32
  gradient_clipping: 100

  optimizer: Prodigy
  torch_optimizer: true
  learning_rate: 1.0

  # Empty string: no scheduler type is named. A OneCycle configuration is kept
  # below, commented out, for reference.
  scheduler_type: ''
  # scheduler_type: OneCycle
  # scheduler_params:
  #   cycle_first_step_size: 10_000
  #   cycle_first_stair_count: 10_000
  #
  #   cycle_second_step_size: 15_000
  #   cycle_second_stair_count: 15_000
  #
  #   decay_step_size: 5_000
  #
  #   cycle_min_lr: 2.5e-4  # 1.0e-5
  #   cycle_max_lr: 2.5e-4  # 1.0e-4
  #   decay_lr_rate: 0.0
  #
  #   cycle_min_mom: 0.90
  #   cycle_max_mom: 0.99
  #   decay_mom_rate: 0.0

# Evaluation settings.
evaluation:
  batch_size: 16
  frequency: 250
  size: 16

  steps: 450
  ar_temperature: 0.95
  nar_temperature: 0.25
  # Enabled here; the trainer section sets a key of the same name to false.
  load_disabled_engines: true

# Training loop, checkpointing and backend settings.
trainer:
  # Written without `_` digit separators: `1_000_000` is an integer only under
  # YAML 1.1 resolution; a YAML 1.2 core-schema loader reads it as a string.
  iterations: 1000000

  save_tag: step
  save_on_oom: true
  save_on_quit: true
  save_frequency: 100
  export_on_save: true

  keep_last_checkpoints: 4

  aggressive_optimizations: false
  load_disabled_engines: false

  # Commented-out options kept for reference.
  # load_state_dict: True
  # strict_loading: False
  # load_tag: "9500"
  # load_states: False
  # restart_step_count: True

  # NOTE(review): YAML reads a bare `None` as the string "None", not as null.
  # Left unchanged here; switch to `null` if a real null is intended. The
  # original inline comment listed "global_step" as an alternative value.
  gc_mode: None

  weight_dtype: bfloat16
  amp: false

  backend: deepspeed
  deepspeed:
    zero_optimization_level: 0
    use_compression_training: true

  activation_checkpointing: true

# Inference settings.
inference:
  use_vocos: true
  normalize: false

  weight_dtype: bfloat16
  amp: false

# bitsandbytes settings; `enabled` is off in this config.
bitsandbytes:
  enabled: false
  injects: true
  linear: true
  embedding: true