File size: 2,165 Bytes
67ce2fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Global audio settings.
# The rate is written without a `_` digit separator: `24_000` only resolves to
# an integer under YAML 1.1 rules, while a YAML 1.2 core-schema loader reads it
# as the string "24_000". The plain form is an integer under both.
sample_rate: 24000
# Name of the audio backend to use.
audio_backend: "vocos"

# Model definitions: a list holding a single entry.
models:
- name: "ar+nar-tts+stt"
  size: "full"
  resp_levels: 8
  prom_levels: 8
  tasks: 9
  langs: 2
  tones: 1
  arch_type: llama
  # Training is switched off for this entry; the entry under `loras` has it
  # switched on.
  training: false
  version: 5
  attention: auto
  dropout: 0.1
  # Disabled `loss_factors` override, kept for reference:
  # loss_factors:
  #   text: 0.01
  #   prom: 0.5
  #   resp: 1.0
  capabilities: ["ar", "nar"]
  # Flags grouped by the config under `experimental`.
  experimental:
    p_rvq_levels: "auto"
    audio_embedding_sums: true
    unified_position_ids: false
    split_classifiers: true
    causal_size: 1
    interleave: false
    # Empty list: no explicit range is given here.
    rvq_level_range: []
    tie_classifier_to_embedding: false

# LoRA adapter definitions: a list holding a single entry.
loras:
- name: "lora-max"
  # `rank` and `alpha` are set to the same value.
  rank: 128
  alpha: 128
  # The adapter is flagged for training, whereas the entry under `models` has
  # training disabled.
  training: true
  # Empty list: no explicit levels are given here.
  rvq_levels: []

# Batching, gradient and optimizer settings.
hyperparameters:
  batch_size: 32
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
  warmup_steps: 10

  # Optimizer selection and its learning rate.
  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: true

  # The scheduler name is an empty string; the trailing note presumably records
  # an alternative value (TODO confirm against the consumer).
  scheduler: ""  # ScheduleFree
  torch_scheduler: true

# Evaluation settings.
evaluation:
  batch_size: 4
  # Same value as `trainer.save_frequency`.
  frequency: 250
  size: 4

  steps: 500
  # Separate temperatures for the two capabilities listed on the model
  # ("ar" and "nar"); how 0.0 is interpreted depends on the consumer.
  ar_temperature: 1.0
  nar_temperature: 0.0

# Training-loop settings.
trainer:
  # Written without `_` digit separators: `1_000_000` is only an integer under
  # YAML 1.1 rules; a YAML 1.2 core-schema loader reads it as a string.
  iterations: 1000000
  # Same value as `evaluation.frequency`.
  save_frequency: 250
  keep_last_checkpoints: 4

  resize_modules: true

  check_for_oom: false
  gradient_checkpointing: true

  weight_dtype: bfloat16
  # Trainer-level `amp` is on; the separate `deepspeed.amp` flag below is off.
  amp: true

  backend: deepspeed
  # Options nested under the selected backend's name.
  deepspeed:
    inferencing: false
    amp: false

  load_webui: false

# Inference-time settings.
inference:
  # Differs from `trainer.backend`, which is set to `deepspeed`.
  backend: local
  normalize: false

  # Same dtype and `amp` values as the `trainer` section.
  weight_dtype: bfloat16
  amp: true

# Optimization toggles; every value in this section is a boolean.
optimizations:
  injects: false
  replace: true

  # Per-component switches.
  linear: false
  embedding: false
  optimizers: true

  # Optional backends/features, all disabled here.
  bitsandbytes: false
  dadaptation: false
  bitnet: false
  fp8: false

# Dataset loading and sampling settings.
dataset:
  use_hdf5: true
  hdf5_flag: r

  use_metadata: true
  validate: true

  workers: 1
  cache: true

  # [min, max] pair.
  duration_range: [3.0, 12.0]

  random_utterance: 1.0
  max_prompts: 1
  # Both bounds are equal, so the range collapses to the single value 3.0.
  prompt_duration_range: [3.0, 3.0]

  max_resps: 1
  p_resp_append: 0.25

  # Values noted by the author for this key: path, speaker.
  sample_type: path
  sample_order: duration
  sample_max_duration_batch: 300
  sample_shuffle: false

  # Matches the "tts+stt" suffix of the model name under `models`.
  tasks_list: ["tts", "stt"]

  # All three lists are empty in this file; presumably they are meant to be
  # filled in per deployment (TODO confirm).
  training: []
  validation: []
  noise: []