pragnakalp committed on
Commit
33d1219
1 Parent(s): 68c6e5c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -6
app.py CHANGED
@@ -14,8 +14,8 @@ import imageio
14
  import os
15
  import ffmpeg
16
 
17
- # from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
18
- # from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
19
 
20
  block = gr.Blocks()
21
 
@@ -126,8 +126,9 @@ def audio_video():
126
  input_video = ffmpeg.input('/content/video_output.mp4')
127
 
128
  input_audio = ffmpeg.input('/content/audio.wav')
129
-
130
  ffmpeg.concat(input_video, input_audio, v=1, a=1).output('/content/final_output.mp4').run()
 
131
  return "/content/final_output.mp4"
132
 
133
  def one_shot_talking(image_in,audio_in):
@@ -141,7 +142,7 @@ def one_shot_talking(image_in,audio_in):
141
  # time.sleep(60)
142
  image_in_one_shot='/content/results/restored_imgs/image_pre.png'
143
  #One Shot Talking Face algorithm
144
- return calculate(image_in_one_shot,audio_in)
145
 
146
  #Video Quality Improvement
147
 
@@ -169,7 +170,31 @@ def one_shot(image_in,input_text,gender):
169
  sound.export("/content/audio.wav", format="wav")
170
  audio_in="/content/audio.wav"
171
  return one_shot_talking(image_in,audio_in)
172
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
 
175
  def run():
@@ -179,7 +204,7 @@ def run():
179
  with gr.Box():
180
  with gr.Row().style(equal_height=True):
181
  image_in = gr.Image(show_label=True, type="filepath",label="Input Image")
182
- input_text = gr.Textbox(show_label=False,label="Text")
183
  gender = gr.Radio(["Female","Male"],value="Female",label="Gender")
184
  video_out = gr.Video(show_label=True,label="Output")
185
  with gr.Row().style(equal_height=True):
 
14
  import os
15
  import ffmpeg
16
 
17
+ from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
18
+ from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
19
 
20
  block = gr.Blocks()
21
 
 
126
  input_video = ffmpeg.input('/content/video_output.mp4')
127
 
128
  input_audio = ffmpeg.input('/content/audio.wav')
129
+ os.system(f"rm -rf /content/final_output.mp4")
130
  ffmpeg.concat(input_video, input_audio, v=1, a=1).output('/content/final_output.mp4').run()
131
+
132
  return "/content/final_output.mp4"
133
 
134
  def one_shot_talking(image_in,audio_in):
 
142
  # time.sleep(60)
143
  image_in_one_shot='/content/results/restored_imgs/image_pre.png'
144
  #One Shot Talking Face algorithm
145
+ calculate(image_in_one_shot,audio_in)
146
 
147
  #Video Quality Improvement
148
 
 
170
  sound.export("/content/audio.wav", format="wav")
171
  audio_in="/content/audio.wav"
172
  return one_shot_talking(image_in,audio_in)
173
+ elif gender == 'Male':
174
+
175
+ models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
176
+ "Voicemod/fastspeech2-en-male1",
177
+ arg_overrides={"vocoder": "hifigan", "fp16": False}
178
+ )
179
+
180
+ model = models[0]
181
+ TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
182
+ generator = task.build_generator([model], cfg)
183
+ # next(model.parameters()).device
184
+
185
+ sample = TTSHubInterface.get_model_input(task, input_text)
186
+ sample["net_input"]["src_tokens"] = sample["net_input"]["src_tokens"]
187
+ sample["net_input"]["src_lengths"] = sample["net_input"]["src_lengths"]
188
+ sample["speaker"] = sample["speaker"]
189
+
190
+ wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
191
+ # soundfile.write("/content/audio_before.wav", wav, rate)
192
+ soundfile.write("/content/audio_before.wav", wav.cpu().clone().numpy(), rate)
193
+ cmd='ffmpeg -i /content/audio_before.wav -filter:a "atempo=0.7" -vn /content/audio.wav'
194
+ os.system(cmd)
195
+ audio_in="/content/audio.wav"
196
+
197
+ return one_shot_talking(image_in,audio_in)
198
 
199
 
200
  def run():
 
204
  with gr.Box():
205
  with gr.Row().style(equal_height=True):
206
  image_in = gr.Image(show_label=True, type="filepath",label="Input Image")
207
+ input_text = gr.Textbox(show_label=True,label="Input Text")
208
  gender = gr.Radio(["Female","Male"],value="Female",label="Gender")
209
  video_out = gr.Video(show_label=True,label="Output")
210
  with gr.Row().style(equal_height=True):