---
# Configuration file: audio settings, model definition, hyperparameters,
# evaluation, trainer, inference, optimizations, and dataset options.
# Top-level audio settings.
# The sample rate is written with plain digits: `24_000` is an integer only
# under YAML 1.1 rules, while a YAML 1.2 core-schema parser loads it as the
# string "24_000".
sample_rate: 24000
audio_backend: "vocos"

# Model definitions. This file declares a single entry, named "ar+nar".
models:
  - name: "ar+nar"
    size: "full"
    resp_levels: 8
    prom_levels: 8
    tasks: 8
    langs: 2
    tones: 1
    arch_type: llama
    training: false
    version: 5
    attention: auto
    dropout: 0.1
    # Per-stream loss weights. The key names suggest text / prompt / response
    # streams -- TODO confirm against the consuming trainer.
    loss_factors:
      text: 0.01
      prom: 0.5
      resp: 1.0
    capabilities: ["ar", "nar"]
    # Feature toggles grouped under "experimental" by the author.
    experimental:
      audio_embedding_sums: false
      unified_position_ids: false
      split_classifiers: true

# Optimizer, batching, and schedule settings.
hyperparameters:
  autotune: false
  # Autotune is disabled above; presumably these values are only consulted
  # when it is enabled -- TODO confirm.
  autotune_params:
    start_profile_step: 1
    end_profile_step: 50
    num_tuning_micro_batch_sizes: 8

  batch_size: 16
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
  warmup_steps: 250

  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: true

  # Scheduler is set to the empty string; the author's note names
  # "ScheduleFree" next to it.
  scheduler: ""  # ScheduleFree
  torch_scheduler: true

# Evaluation settings.
evaluation:
  batch_size: 16
  frequency: 1000
  size: 16

  steps: 500
  ar_temperature: 0.95
  nar_temperature: 0.25
  # NOTE(review): trainer.load_disabled_engines is false while this one is
  # true -- confirm the difference is intentional.
  load_disabled_engines: true

# Training-loop, checkpointing, and backend settings.
trainer:
  # no_logger: true
  ddp: false
  check_for_oom: false
  # Plain digits: `1_000_000` is an integer only under YAML 1.1 rules; a
  # YAML 1.2 core-schema parser would load it as a string.
  iterations: 1000000

  # Checkpoint saving.
  save_tag: step
  save_on_oom: true
  save_on_quit: true
  save_frequency: 500
  export_on_save: true

  keep_last_checkpoints: 8

  aggressive_optimizations: false
  load_disabled_engines: false
  gradient_checkpointing: true

  # Checkpoint loading. The commented-out keys are alternatives the author
  # left disabled.
  # load_state_dict: true
  strict_loading: false
  # load_tag: "9500"
  # load_states: false
  # restart_step_count: true

  # YAML has no `None` keyword: a bare `None` loads as the 4-character string
  # "None", not as a null value, so the null is spelled explicitly here.
  # Alternative value noted by the author: "global_step".
  gc_mode: null

  weight_dtype: bfloat16
  amp: true

  backend: deepspeed
  deepspeed:
    inferencing: true
    zero_optimization_level: 0
    use_compression_training: false

    # NOTE(review): differs from trainer.amp (true) above -- confirm which
    # setting the backend honours.
    amp: false

  load_webui: false

# Inference-time settings. The backend, dtype, and amp values match the ones
# given in the trainer section; audio_backend matches the top-level value.
inference:
  backend: deepspeed
  audio_backend: "vocos"
  normalize: false

  weight_dtype: bfloat16
  amp: true

# Optional optimization toggles. Only `replace` and `optimizers` are enabled.
optimizations:
  injects: false
  replace: true

  linear: false
  embedding: false
  optimizers: true

  bitsandbytes: false
  dadaptation: false
  bitnet: false
  fp8: false

# Dataset loading and sampling settings.
dataset:
  # Python lambda source stored as strings. Both index into the `.parts` of a
  # path-like argument `p`; presumably evaluated by the loader -- TODO confirm.
  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
  speaker_group_getter: "lambda p: f'{p.parts[-3]}'"
  speaker_languages:
    ja: []

  use_hdf5: true
  use_metadata: true
  hdf5_flag: r
  validate: true

  workers: 6
  cache: true

  # Presumably [min, max] in seconds -- TODO confirm units.
  duration_range: [24.0, 32.0]

  random_utterance: 1.0
  max_prompts: 1
  prompt_duration_range: [3.0, 9.0]

  max_resps: 1
  p_resp_append: 0.25

  sample_type: path  # path # speaker

  tasks_list: ["tts"]  # , [ "tts", "tts-c", "ns", "sr", "tse", "cse", "nse", "tts"]

  # Left empty in this file.
  training: []
  validation: []
  noise: []