ecker committed
Commit 3fe76d3
1 parent: bdb59bd

Upload 2 files

.gitattributes CHANGED
@@ -1,3 +1,4 @@
  model/ckpt/ar-llama-1/fp32.pth filter=lfs diff=lfs merge=lfs -text
  model/ckpt/ar+nar-llama-8/fp32.pth filter=lfs diff=lfs merge=lfs -text
  model/ckpt/nar-llama-8/fp32.pth filter=lfs diff=lfs merge=lfs -text
+ model/ckpt/ar+nar-dac-llama-9/ckpt/fp32.pth filter=lfs diff=lfs merge=lfs -text
model/ckpt/ar+nar-dac-llama-9/ckpt/fp32.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5c3d6445b433edf576c4f48670035f4bf1054c4b4bb78fae3b77c3757f406b46
+ size 445272958
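
The new fp32.pth entry is a Git LFS pointer: only the version/oid/size triplet above is committed, while the ~445 MB weight blob itself lives in LFS storage and is fetched separately (e.g. with git lfs pull). A minimal sketch, assuming the blob has already been downloaded to an illustrative local path, of checking it against the sha256 and byte size recorded in the pointer:

import hashlib
from pathlib import Path

def parse_lfs_pointer(text: str) -> dict:
    # A Git LFS pointer is a short text file with one "key value" pair per line.
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

def verify_checkpoint(pointer_path: Path, blob_path: Path) -> bool:
    # Compare the downloaded blob against the oid (sha256) and size from the pointer.
    fields = parse_lfs_pointer(pointer_path.read_text())
    expected_oid = fields["oid"].removeprefix("sha256:")
    expected_size = int(fields["size"])

    digest = hashlib.sha256()
    actual_size = 0
    with blob_path.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
            actual_size += len(chunk)
    return digest.hexdigest() == expected_oid and actual_size == expected_size

# Hypothetical paths: the pointer as committed here, and wherever the real blob was pulled to.
print(verify_checkpoint(
    Path("model/ckpt/ar+nar-dac-llama-9/ckpt/fp32.pth"),
    Path("downloads/fp32.pth"),
))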
model/config.dac.yaml ADDED
@@ -0,0 +1,155 @@
+ sample_rate: 44_000
+ audio_backend: "dac"
+
+ models:
+ - name: "ar+nar-dac"
+   size:
+     audio_tokens: 1024
+     text_tokens: 256
+     dim: 1024
+     heads: 16
+     layers: 16
+   resp_levels: 8
+   prom_levels: 8
+   tasks: 8
+   langs: 2
+   tones: 1
+   arch_type: llama
+   training: True
+   version: 5
+   attention: auto
+   dropout: 0.1
+
+   loss_factors:
+     text: 0.01
+     prom: 0.5
+     resp: 1.0
+
+   capabilities: ["ar", "nar"]
+
+   experimental:
+     hf: False
+     interleave: False
+     audio_embedding_sums: False
+     rvq_level_range: []
+
+ hyperparameters:
+   autotune: False
+   autotune_params:
+     start_profile_step: 1
+     end_profile_step: 50
+     num_tuning_micro_batch_sizes: 8
+
+   batch_size: 16
+   gradient_accumulation_steps: 2
+   gradient_clipping: 1.0
+   warmup_steps: 10
+
+   optimizer: Prodigy
+   learning_rate: 1.0
+   torch_optimizer: True
+
+   scheduler: "" # ScheduleFree
+   torch_scheduler: True
+
+ evaluation:
+   batch_size: 4
+   frequency: 1000
+   size: 4
+
+   steps: 500
+   ar_temperature: 1.0
+   nar_temperature: 0.0
+   load_disabled_engines: True
+
+ trainer:
+   #no_logger: True
+   ddp: False
+   check_for_oom: False
+   iterations: 1_000_000
+
+   save_tag: step
+   save_on_oom: True
+   save_on_quit: True
+   save_frequency: 500
+   export_on_save: True
+
+   keep_last_checkpoints: 4
+
+   aggressive_optimizations: False
+   load_disabled_engines: False
+   gradient_checkpointing: True
+
+   #load_state_dict: True
+   strict_loading: False
+   #load_tag: "9500"
+   #load_states: False
+   #restart_step_count: True
+
+   gc_mode: None # "global_step"
+
+   weight_dtype: bfloat16
+   amp: False
+
+   backend: deepspeed
+   deepspeed:
+     inferencing: False
+     zero_optimization_level: 0
+     use_compression_training: False
+
+     amp: False
+
+   load_webui: False
+
+ inference:
+   backend: local
+   normalize: False
+
+   weight_dtype: bfloat16
+   amp: False
+
+ optimizations:
+   injects: False
+   replace: True
+
+   linear: False
+   embedding: False
+   optimizers: True
+
+   bitsandbytes: False
+   dadaptation: False
+   bitnet: False
+   fp8: False
+
+ dataset:
+   speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
+   speaker_group_getter: "lambda p: f'{p.parts[-3]}'"
+   speaker_languages:
+     ja: []
+
+   use_hdf5: True
+   use_metadata: True
+   hdf5_flag: r
+   validate: True
+
+   workers: 6
+   cache: True
+
+   duration_range: [2.0, 60.0]
+
+   random_utterance: 1.0
+   max_prompts: 1
+   prompt_duration_range: [3.0, 3.0]
+
+   max_resps: 1
+   p_resp_append: 0.25
+
+   sample_type: path # path | speaker | group
+   sample_order: duration
+   sample_max_duration_batch: 0
+
+   tasks_list: [ "tts" ] # , [ "tts", "tts-c", "ns", "sr", "tse", "cse", "nse", "tts"]
+
+   training: []
+   validation: []
+   noise: []
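
The training framework consumes config.dac.yaml through its own config loader, but a plain PyYAML load is enough to sanity-check the file. A minimal sketch; the field paths follow the nesting shown above and the file path is the one added in this commit:

import yaml  # pip install pyyaml

with open("model/config.dac.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

model = cfg["models"][0]
print("audio backend:", cfg["audio_backend"])            # "dac"
print("sample rate:  ", cfg["sample_rate"])              # 44_000 parses as the int 44000
print("model name:   ", model["name"])                   # "ar+nar-dac"
print("dim/heads/layers:",
      model["size"]["dim"], model["size"]["heads"], model["size"]["layers"])
print("RVQ levels:   ", model["resp_levels"], "/", model["prom_levels"])
print("optimizer:    ", cfg["hyperparameters"]["optimizer"])  # "Prodigy"

Note that learning_rate: 1.0 is not a typo here: Prodigy adapts its step size, so 1.0 is its conventional setting.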