Spaces:
Running
Running
内存优化
Browse files
- .gitignore +2 -0
- app.py +8 -13
- inference/infer_tool.py +3 -7
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
|
2 |
+
*.pyc
|
app.py
CHANGED
@@ -2,16 +2,17 @@ import argparse
|
|
2 |
import logging
|
3 |
import os
|
4 |
import re
|
5 |
-
import
|
6 |
-
|
|
|
7 |
import gradio as gr
|
|
|
8 |
import librosa
|
9 |
import numpy as np
|
10 |
import soundfile
|
11 |
from scipy.io import wavfile
|
12 |
-
import edge_tts
|
13 |
-
import tempfile
|
14 |
|
|
|
15 |
from inference.infer_tool import Svc
|
16 |
|
17 |
logging.getLogger('numba').setLevel(logging.WARNING)
|
@@ -28,6 +29,8 @@ tts_voice = {
|
|
28 |
"英文女": "en-US-AnaNeural"
|
29 |
}
|
30 |
|
|
|
|
|
31 |
|
32 |
def create_fn(model, spk):
|
33 |
def svc_fn(input_audio, vc_transform, auto_f0, f0p):
|
@@ -39,6 +42,7 @@ def create_fn(model, spk):
|
|
39 |
audio = librosa.to_mono(audio.transpose(1, 0))
|
40 |
temp_path = "temp.wav"
|
41 |
soundfile.write(temp_path, audio, sr, format="wav")
|
|
|
42 |
out_audio = model.slice_inference(raw_audio_path=temp_path,
|
43 |
spk=spk,
|
44 |
slice_db=-40,
|
@@ -58,15 +62,6 @@ def create_fn(model, spk):
|
|
58 |
input_text = re.sub(r"[\n\,\(\) ]", "", input_text)
|
59 |
voice = tts_voice[gender]
|
60 |
ratestr = "+{:.0%}".format(tts_rate) if tts_rate >= 0 else "{:.0%}".format(tts_rate)
|
61 |
-
# temp_path = "temp.wav"
|
62 |
-
# p = subprocess.Popen("edge-tts " +
|
63 |
-
# " --text " + input_text +
|
64 |
-
# " --write-media " + temp_path +
|
65 |
-
# " --voice " + voice +
|
66 |
-
# " --rate=" + ratestr, shell=True,
|
67 |
-
# stdout=subprocess.PIPE,
|
68 |
-
# stdin=subprocess.PIPE)
|
69 |
-
# p.wait()
|
70 |
communicate = edge_tts.Communicate(text=input_text,
|
71 |
voice=voice,
|
72 |
rate=ratestr)
|
|
|
2 |
import logging
|
3 |
import os
|
4 |
import re
|
5 |
+
import tempfile
|
6 |
+
|
7 |
+
import edge_tts
|
8 |
import gradio as gr
|
9 |
+
import gradio.processing_utils as gr_pu
|
10 |
import librosa
|
11 |
import numpy as np
|
12 |
import soundfile
|
13 |
from scipy.io import wavfile
|
|
|
|
|
14 |
|
15 |
+
import utils
|
16 |
from inference.infer_tool import Svc
|
17 |
|
18 |
logging.getLogger('numba').setLevel(logging.WARNING)
|
|
|
29 |
"英文女": "en-US-AnaNeural"
|
30 |
}
|
31 |
|
32 |
+
hubert_model = utils.get_speech_encoder("vec768l12", device="cpu")
|
33 |
+
|
34 |
|
35 |
def create_fn(model, spk):
|
36 |
def svc_fn(input_audio, vc_transform, auto_f0, f0p):
|
|
|
42 |
audio = librosa.to_mono(audio.transpose(1, 0))
|
43 |
temp_path = "temp.wav"
|
44 |
soundfile.write(temp_path, audio, sr, format="wav")
|
45 |
+
model.hubert_model = hubert_model
|
46 |
out_audio = model.slice_inference(raw_audio_path=temp_path,
|
47 |
spk=spk,
|
48 |
slice_db=-40,
|
|
|
62 |
input_text = re.sub(r"[\n\,\(\) ]", "", input_text)
|
63 |
voice = tts_voice[gender]
|
64 |
ratestr = "+{:.0%}".format(tts_rate) if tts_rate >= 0 else "{:.0%}".format(tts_rate)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
communicate = edge_tts.Communicate(text=input_text,
|
66 |
voice=voice,
|
67 |
rate=ratestr)
|
inference/infer_tool.py
CHANGED
@@ -172,13 +172,9 @@ class Svc(object):
|
|
172 |
self.shallow_diffusion = self.only_diffusion = False
|
173 |
|
174 |
# load hubert and model
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
self.volume_extractor = utils.Volume_Extractor(self.hop_size)
|
179 |
-
else:
|
180 |
-
self.hubert_model = utils.get_speech_encoder(self.diffusion_args.data.encoder, device=self.dev)
|
181 |
-
self.volume_extractor = utils.Volume_Extractor(self.diffusion_args.data.block_size)
|
182 |
|
183 |
if os.path.exists(cluster_model_path):
|
184 |
if self.feature_retrieval:
|
|
|
172 |
self.shallow_diffusion = self.only_diffusion = False
|
173 |
|
174 |
# load hubert and model
|
175 |
+
self.load_model(spk_mix_enable)
|
176 |
+
# self.hubert_model = utils.get_speech_encoder(self.speech_encoder, device=self.dev)
|
177 |
+
self.volume_extractor = utils.Volume_Extractor(self.hop_size)
|
|
|
|
|
|
|
|
|
178 |
|
179 |
if os.path.exists(cluster_model_path):
|
180 |
if self.feature_retrieval:
|