ecker committed
Commit
822037a
1 Parent(s): 9ddb793

Upload 2 files

.gitattributes CHANGED
@@ -6,3 +6,4 @@ models/experiments/ckpt/ar+nar-llama-8/fp32.pth filter=lfs diff=lfs merge=lfs -t
  models/experiments/ckpt/nar-len-llama-9/fp32.pth filter=lfs diff=lfs merge=lfs -text
  models/ckpt/ar+nar-llama-8/fp32.sft filter=lfs diff=lfs merge=lfs -text
  models/ckpt/ar+nar-retnet-8/fp32.sft filter=lfs diff=lfs merge=lfs -text
+ models/ckpt/ar+nar-tts+stt-llama-8/fp32.sft filter=lfs diff=lfs merge=lfs -text
models/ckpt/ar+nar-tts+stt-llama-8/fp32.sft ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cbf25cb391bbbb79ee05b5ecb9b219dce8785d9561473001e0e041a70d0e634b
+ size 456272602
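
The entry above is a Git LFS pointer rather than the weights themselves; the actual fp32.sft blob is fetched on checkout/pull. As a minimal sketch (assuming the blob has already been pulled to the same relative path in a local clone), the pointer's oid and size can be checked against the local file like this:

# Hedged sketch: verify a pulled LFS blob against the pointer shown above.
# The path below assumes the repository layout in this commit.
import hashlib
from pathlib import Path

path = Path("models/ckpt/ar+nar-tts+stt-llama-8/fp32.sft")

digest = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)

assert path.stat().st_size == 456272602
assert digest.hexdigest() == "cbf25cb391bbbb79ee05b5ecb9b219dce8785d9561473001e0e041a70d0e634b"
print("fp32.sft matches the LFS pointer")
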
models/config.llama.tts+stt.yaml ADDED
@@ -0,0 +1,134 @@
+ sample_rate: 24_000
+ audio_backend: "vocos"
+
+ models:
+ - name: "ar+nar-tts+stt"
+   size: "full"
+   resp_levels: 8
+   prom_levels: 8
+   tasks: 9
+   langs: 2
+   tones: 1
+   arch_type: llama
+   training: False
+   version: 5
+   attention: auto
+   dropout: 0.1
+   #loss_factors:
+   #  text: 0.01
+   #  prom: 0.5
+   #  resp: 1.0
+   capabilities: ["ar", "nar"]
+   experimental:
+     audio_embedding_sums: False
+     interleave: False
+     unified_position_ids: False
+     rvq_level_range: []
+     split_classifiers: True
+     tie_classifier_to_embedding: False
+     causal_size: 1
+     p_rvq_levels: "auto"
+
+ #loras:
+ #- name : "lora"
+ #  rank: 128
+ #  alpha: 128
+ #  training: True
+ #  rvq_levels: []
+
+ hyperparameters:
+   batch_size: 32
+   gradient_accumulation_steps: 8
+   gradient_clipping: 1.0
+   warmup_steps: 10
+
+   optimizer: Prodigy
+   learning_rate: 1.0
+   torch_optimizer: True
+
+   scheduler: "" # ScheduleFree
+   torch_scheduler: True
+
+ evaluation:
+   batch_size: 4
+   frequency: 250
+   size: 4
+
+   steps: 500
+   ar_temperature: 1.0
+   nar_temperature: 0.0
+
+ trainer:
+   iterations: 1_000_000
+   save_frequency: 250
+   keep_last_checkpoints: 4
+
+   resize_modules: True
+
+   check_for_oom: False
+   gradient_checkpointing: True
+
+   weight_dtype: bfloat16
+   amp: True
+
+   backend: deepspeed
+   deepspeed:
+     inferencing: False
+     amp: False
+
+   load_webui: False
+
+ inference:
+   backend: local
+   normalize: False
+
+   weight_dtype: bfloat16
+   amp: True
+
+ optimizations:
+   injects: False
+   replace: True
+
+   linear: False
+   embedding: False
+   optimizers: True
+
+   bitsandbytes: False
+   dadaptation: False
+   bitnet: False
+   fp8: False
+
+ dataset:
+   speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
+   speaker_group_getter: "lambda p: f'{p.parts[-3]}'"
+   speaker_languages:
+     ja: [
+       "housamo"
+     ]
+
+   use_hdf5: True
+   hdf5_flag: r
+
+   use_metadata: True
+   validate: True
+
+   workers: 1
+   cache: True
+
+   duration_range: [3.0, 12.0]
+
+   random_utterance: 1.0
+   max_prompts: 1
+   prompt_duration_range: [3.0, 3.0]
+
+   max_resps: 1
+   p_resp_append: 0.25
+
+   sample_type: path # path # speaker
+   sample_order: duration
+   sample_max_duration_batch: 300
+   sample_shuffle: False
+
+   training: []
+   validation: []
+   noise: []
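
A note on the dataset section above: speaker_name_getter and speaker_group_getter are stored as Python lambda source strings that take an utterance path, so speaker and group labels are derived from fixed path components. The sketch below illustrates that behaviour only; the example path is made up, and eval-ing the strings directly mirrors the config format rather than any particular loader API:

# Hedged sketch: how the getter lambdas resolve speaker name and group.
# Only the last three path components matter; the path itself is hypothetical.
from pathlib import Path

speaker_name_getter = eval("lambda p: f'{p.parts[-3]}_{p.parts[-2]}'")
speaker_group_getter = eval("lambda p: f'{p.parts[-3]}'")

p = Path("training/dataset/librispeech/1034/utterance_0001.enc")
print(speaker_name_getter(p))   # -> "librispeech_1034"
print(speaker_group_getter(p))  # -> "librispeech"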