Katock committed on
Commit
3ad3198
1 Parent(s): fe495ec

内存优化 (memory optimization)

Browse files
Files changed (3) hide show
  1. .gitignore +2 -0
  2. app.py +8 -13
  3. inference/infer_tool.py +3 -7
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+
2
+ *.pyc
app.py CHANGED
@@ -2,16 +2,17 @@ import argparse
2
  import logging
3
  import os
4
  import re
5
- import subprocess
6
- import gradio.processing_utils as gr_pu
 
7
  import gradio as gr
 
8
  import librosa
9
  import numpy as np
10
  import soundfile
11
  from scipy.io import wavfile
12
- import edge_tts
13
- import tempfile
14
 
 
15
  from inference.infer_tool import Svc
16
 
17
  logging.getLogger('numba').setLevel(logging.WARNING)
@@ -28,6 +29,8 @@ tts_voice = {
28
  "英文女": "en-US-AnaNeural"
29
  }
30
 
 
 
31
 
32
  def create_fn(model, spk):
33
  def svc_fn(input_audio, vc_transform, auto_f0, f0p):
@@ -39,6 +42,7 @@ def create_fn(model, spk):
39
  audio = librosa.to_mono(audio.transpose(1, 0))
40
  temp_path = "temp.wav"
41
  soundfile.write(temp_path, audio, sr, format="wav")
 
42
  out_audio = model.slice_inference(raw_audio_path=temp_path,
43
  spk=spk,
44
  slice_db=-40,
@@ -58,15 +62,6 @@ def create_fn(model, spk):
58
  input_text = re.sub(r"[\n\,\(\) ]", "", input_text)
59
  voice = tts_voice[gender]
60
  ratestr = "+{:.0%}".format(tts_rate) if tts_rate >= 0 else "{:.0%}".format(tts_rate)
61
- # temp_path = "temp.wav"
62
- # p = subprocess.Popen("edge-tts " +
63
- # " --text " + input_text +
64
- # " --write-media " + temp_path +
65
- # " --voice " + voice +
66
- # " --rate=" + ratestr, shell=True,
67
- # stdout=subprocess.PIPE,
68
- # stdin=subprocess.PIPE)
69
- # p.wait()
70
  communicate = edge_tts.Communicate(text=input_text,
71
  voice=voice,
72
  rate=ratestr)
 
2
  import logging
3
  import os
4
  import re
5
+ import tempfile
6
+
7
+ import edge_tts
8
  import gradio as gr
9
+ import gradio.processing_utils as gr_pu
10
  import librosa
11
  import numpy as np
12
  import soundfile
13
  from scipy.io import wavfile
 
 
14
 
15
+ import utils
16
  from inference.infer_tool import Svc
17
 
18
  logging.getLogger('numba').setLevel(logging.WARNING)
 
29
  "英文女": "en-US-AnaNeural"
30
  }
31
 
32
+ hubert_model = utils.get_speech_encoder("vec768l12", device="cpu")
33
+
34
 
35
  def create_fn(model, spk):
36
  def svc_fn(input_audio, vc_transform, auto_f0, f0p):
 
42
  audio = librosa.to_mono(audio.transpose(1, 0))
43
  temp_path = "temp.wav"
44
  soundfile.write(temp_path, audio, sr, format="wav")
45
+ model.hubert_model = hubert_model
46
  out_audio = model.slice_inference(raw_audio_path=temp_path,
47
  spk=spk,
48
  slice_db=-40,
 
62
  input_text = re.sub(r"[\n\,\(\) ]", "", input_text)
63
  voice = tts_voice[gender]
64
  ratestr = "+{:.0%}".format(tts_rate) if tts_rate >= 0 else "{:.0%}".format(tts_rate)
 
 
 
 
 
 
 
 
 
65
  communicate = edge_tts.Communicate(text=input_text,
66
  voice=voice,
67
  rate=ratestr)
inference/infer_tool.py CHANGED
@@ -172,13 +172,9 @@ class Svc(object):
172
  self.shallow_diffusion = self.only_diffusion = False
173
 
174
  # load hubert and model
175
- if not self.only_diffusion:
176
- self.load_model(spk_mix_enable)
177
- self.hubert_model = utils.get_speech_encoder(self.speech_encoder, device=self.dev)
178
- self.volume_extractor = utils.Volume_Extractor(self.hop_size)
179
- else:
180
- self.hubert_model = utils.get_speech_encoder(self.diffusion_args.data.encoder, device=self.dev)
181
- self.volume_extractor = utils.Volume_Extractor(self.diffusion_args.data.block_size)
182
 
183
  if os.path.exists(cluster_model_path):
184
  if self.feature_retrieval:
 
172
  self.shallow_diffusion = self.only_diffusion = False
173
 
174
  # load hubert and model
175
+ self.load_model(spk_mix_enable)
176
+ # self.hubert_model = utils.get_speech_encoder(self.speech_encoder, device=self.dev)
177
+ self.volume_extractor = utils.Volume_Extractor(self.hop_size)
 
 
 
 
178
 
179
  if os.path.exists(cluster_model_path):
180
  if self.feature_retrieval: