# source commit: 3fe76d3
# (removed copy/paste residue — a "File size" header and a line-number gutter —
#  that made this file unparseable as YAML)
# NOTE(review): `44_000` relies on YAML 1.1 digit-separator parsing (PyYAML reads
# it as the int 44000); strict YAML 1.2 loaders read it as a string — confirm the
# loader. Also, DAC codecs commonly run at 44.1 kHz — verify 44000 vs 44100.
sample_rate: 44_000
audio_backend: "dac"

# model definitions consumed by the trainer
models:
- name: "ar+nar-dac"
  size:
    audio_tokens: 1024
    text_tokens: 256
    dim: 1024
    heads: 16
    layers: 16
  resp_levels: 8
  prom_levels: 8
  tasks: 8
  langs: 2
  tones: 1
  arch_type: llama
  training: true  # canonical lowercase boolean (yamllint `truthy`); same parse as `True`
  version: 5
  attention: auto
  dropout: 0.1

  # relative weights of the per-stream loss terms
  loss_factors:
    text: 0.01
    prom: 0.5
    resp: 1.0

  capabilities: ["ar", "nar"]

  experimental:
    hf: false
    interleave: false
    audio_embedding_sums: false
    rvq_level_range: []

hyperparameters:
  autotune: false
  autotune_params:
    start_profile_step: 1
    end_profile_step: 50
    num_tuning_micro_batch_sizes: 8

  batch_size: 16
  gradient_accumulation_steps: 2
  gradient_clipping: 1.0
  warmup_steps: 10

  # NOTE(review): learning_rate of 1.0 presumably matches Prodigy's self-adapting
  # step-size convention — confirm against the optimizer's documentation.
  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: true

  scheduler: "" # ScheduleFree
  torch_scheduler: true

evaluation:
  batch_size: 4
  frequency: 1000
  size: 4

  steps: 500
  ar_temperature: 1.0
  nar_temperature: 0.0
  load_disabled_engines: true

trainer:
  #no_logger: True
  ddp: false
  check_for_oom: false
  # NOTE(review): `1_000_000` parses as the int 1000000 under YAML 1.1 loaders
  # (e.g. PyYAML); strict YAML 1.2 parsers read it as a string — confirm the loader.
  iterations: 1_000_000

  save_tag: step
  save_on_oom: true
  save_on_quit: true
  save_frequency: 500
  export_on_save: true

  keep_last_checkpoints: 4

  aggressive_optimizations: false
  load_disabled_engines: false
  gradient_checkpointing: true

  #load_state_dict: True
  strict_loading: false
  #load_tag: "9500"
  #load_states: False
  #restart_step_count: True

  # NOTE(review): plain `None` is the *string* "None" in YAML, not a null — if the
  # consumer expects a real null here, write `null` instead; verify against the loader.
  gc_mode: None # "global_step"

  weight_dtype: bfloat16
  amp: false

  backend: deepspeed
  deepspeed:
    inferencing: false
    zero_optimization_level: 0
    use_compression_training: false

    amp: false

  load_webui: false

inference:
  backend: local
  normalize: false

  weight_dtype: bfloat16
  amp: false

optimizations:
  injects: false
  replace: true

  linear: false
  embedding: false
  optimizers: true

  bitsandbytes: false
  dadaptation: false
  bitnet: false
  fp8: false

dataset:
  # NOTE(review): these values are Python lambda source strings, presumably
  # eval'd by the data loader to derive speaker/group names from path parts —
  # verify against the consumer before editing.
  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
  speaker_group_getter: "lambda p: f'{p.parts[-3]}'"
  speaker_languages:
    ja: []

  use_hdf5: true
  use_metadata: true
  hdf5_flag: r
  validate: true

  workers: 6
  cache: true

  duration_range: [2.0, 60.0]

  random_utterance: 1.0
  max_prompts: 1
  prompt_duration_range: [3.0, 3.0]

  max_resps: 1
  p_resp_append: 0.25

  sample_type: path # path | speaker | group
  sample_order: duration
  sample_max_duration_batch: 0

  tasks_list: [ "tts" ] # , [ "tts", "tts-c", "ns", "sr", "tse", "cse", "nse", "tts"]

  training: []
  validation: []
  noise: []