ecker committed
Commit: f0fb314
Parent: 2b739a9

So far so good

README.md CHANGED
@@ -1,3 +1,9 @@
 ---
 license: agpl-3.0
 ---
+
+This repo contains the necessary weights and configuration file for use with my VALL-E implementation: [mrq/vall-e](https://git.ecker.tech/mrq/vall-e)
+
+The models are currently in a *semi-usable* state, and I'm only releasing them now in the hope that they help jumpstart anyone else who wants to use them.
+
+In the future, I'll release my dataset as well, so anyone can grab it and train from scratch or continue from the weights in this repo.
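
For a quick sanity check outside the trainer, the two `fp32.pth` files below load as ordinary PyTorch checkpoints. A minimal sketch, assuming only that PyTorch is installed and the repo was cloned with its LFS files pulled; the wrapper-key handling is a guess about the checkpoint layout, not something this commit documents:

```python
# Sketch: peek inside the released checkpoints with plain PyTorch.
# Assumes the repo was cloned (and `git lfs pull` run) in the working directory.
import torch

for path in ("ckpt/ar-retnet-4/fp32.pth", "ckpt/nar-retnet-4/fp32.pth"):
    ckpt = torch.load(path, map_location="cpu")
    # The checkpoint may be a raw state_dict or a wrapper dict (e.g. one with a
    # "module" key) -- handle both rather than assuming the exact layout.
    state = ckpt.get("module", ckpt) if isinstance(ckpt, dict) else ckpt
    print(f"{path}: {len(state)} top-level entries")
```

Actual inference should go through [mrq/vall-e](https://git.ecker.tech/mrq/vall-e) itself, which maps these weights onto its AR and NAR models.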
ckpt/ar-retnet-4/fp32.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ddbb2dc8049ccfc5547d8dcfb5c6c47dc82b7bcdb3014a3bcf193e21588f254a
+size 418040447
ckpt/nar-retnet-4/fp32.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0170d5e6862cfb5871de952e93ff848e457f631a2cddd2975407c9d4031d2f46
+size 422230591
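
Both `.pth` files are stored through Git LFS, so the blobs above are just pointers recording each file's sha256 and byte size. After `git lfs pull`, those values can be checked against the downloaded files; a standard-library sketch, assuming it is run from the repo root:

```python
# Sketch: verify the LFS-tracked weights against the pointer files'
# sha256 oids and byte sizes shown in this commit.
import hashlib
import os

expected = {
    "ckpt/ar-retnet-4/fp32.pth":
        ("ddbb2dc8049ccfc5547d8dcfb5c6c47dc82b7bcdb3014a3bcf193e21588f254a", 418040447),
    "ckpt/nar-retnet-4/fp32.pth":
        ("0170d5e6862cfb5871de952e93ff848e457f631a2cddd2975407c9d4031d2f46", 422230591),
}

for path, (oid, size) in expected.items():
    assert os.path.getsize(path) == size, f"{path}: size mismatch"
    h = hashlib.sha256()
    with open(path, "rb") as f:
        # Hash in 1 MiB chunks so the ~400 MB files aren't read into memory at once.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    assert h.hexdigest() == oid, f"{path}: checksum mismatch"
    print(path, "OK")
```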
config.yaml ADDED
@@ -0,0 +1,121 @@
+dataset:
+  training: [
+  ]
+
+  validation: [
+  ]
+  noise: [
+  ]
+
+  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
+
+  use_hdf5: True
+  hdf5_flag: r
+  validate: True
+
+  workers: 4
+  cache: True
+
+  phones_range: [4, 512]
+  duration_range: [1.0, 24.0]
+
+  random_utterance: 1.0
+  max_prompts: 3
+  prompt_duration: 3.0
+
+  sample_type: speaker
+
+  tasks_list: ["tts"] # ["tts", "ns", "sr", "tse", "cse", "nse", "tts"]
+
+models:
+  _max_levels: 8
+  _models:
+  - name: "ar"
+    size: "full"
+    resp_levels: 1
+    prom_levels: 2
+    tasks: 8
+    arch_type: "retnet"
+
+  - name: "nar"
+    size: "full"
+    resp_levels: 3
+    prom_levels: 4
+    tasks: 8
+    arch_type: "retnet"
+
+
+hyperparameters:
+  batch_size: 32
+  gradient_accumulation_steps: 4
+  gradient_clipping: 100
+
+  optimizer: AdamW
+  learning_rate: 1.0e-6
+
+  scheduler_type: ""
+  #scheduler_type: OneCycle
+  #scheduler_params:
+  #  cycle_first_step_size: 10_000
+  #  cycle_first_stair_count: 10_000
+
+  #  cycle_second_step_size: 15_000
+  #  cycle_second_stair_count: 15_000
+
+  #  decay_step_size: 5_000
+
+  #  cycle_min_lr: 2.5e-4 # 1.0e-5
+  #  cycle_max_lr: 2.5e-4 # 1.0e-4
+  #  decay_lr_rate: 0.0
+
+  #  cycle_min_mom: 0.90
+  #  cycle_max_mom: 0.99
+  #  decay_mom_rate: 0.0
+
+evaluation:
+  batch_size: 64
+  frequency: 500
+  size: 64
+
+  steps: 300
+  ar_temperature: 0.95
+  nar_temperature: 0.25
+
+trainer:
+  iterations: 1_000_000
+
+  save_tag: step
+  save_on_oom: True
+  save_on_quit: True
+  save_frequency: 25
+
+  keep_last_checkpoints: 2
+
+  aggressive_optimizations: False
+
+  load_state_dict: True
+  strict_loading: False
+  #load_tag: "9500"
+  #load_states: False
+  #restart_step_count: True
+
+  gc_mode: None # "global_step"
+
+  weight_dtype: bfloat16
+
+  backend: deepspeed
+  deepspeed:
+    zero_optimization_level: 2
+    use_compression_training: True
+
+inference:
+  use_vocos: True
+  normalize: False
+
+  weight_dtype: float32
+
+bitsandbytes:
+  enabled: False
+  injects: True
+  linear: True
+  embedding: True
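
For reference, the config parses with stock PyYAML, which also resolves underscored integers like `1_000_000`. A small sketch of pulling out a few derived values; the example utterance path fed to `speaker_name_getter` is made up purely for illustration:

```python
# Sketch: load config.yaml and report a few derived values. Assumes PyYAML.
from pathlib import Path

import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

# Effective batch size per optimizer step: 32 * 4 = 128 samples.
hp = cfg["hyperparameters"]
print("effective batch size:", hp["batch_size"] * hp["gradient_accumulation_steps"])

# Both models are full-size RetNets; the AR handles 1 response level, the NAR 3.
for m in cfg["models"]["_models"]:
    print(m["name"], m["arch_type"], "resp_levels:", m["resp_levels"])

# The speaker name is derived from the two directories above each utterance.
# NOTE: this example path is hypothetical, only to show the lambda's behavior.
get_speaker = eval(cfg["dataset"]["speaker_name_getter"])
print(get_speaker(Path("LibriTTS/100/121669/100_121669_000001.qnt.pt")))
# -> "100_121669"
```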