pragnakalp committed
Commit 9e02b18
1 Parent(s): 9462b41

Update app.py

Files changed (1)
  1. app.py +20 -69
app.py CHANGED
@@ -1,30 +1,24 @@
 import gradio as gr
-# import os, subprocess, torchaudio
-# import torch
-from PIL import Image
-from gtts import gTTS
-import tempfile
-from pydub import AudioSegment
-from pydub.generators import Sine
-# from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
-# from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
-import soundfile
-
-import dlib
-import cv2
-import imageio
-import os
-import gradio as gr
 import os, subprocess, torchaudio
+import torch
 from PIL import Image
-import ffmpeg
-
-
 
 block = gr.Blocks()
 
+def pad_image(image):
+    w, h = image.size
+    if w == h:
+        return image
+    elif w > h:
+        new_image = Image.new(image.mode, (w, w), (0, 0, 0))
+        new_image.paste(image, (0, (w - h) // 2))
+        return new_image
+    else:
+        new_image = Image.new(image.mode, (h, h), (0, 0, 0))
+        new_image.paste(image, ((h - w) // 2, 0))
+        return new_image
+
 def calculate(image_in, audio_in):
-    print("in calculate")
     waveform, sample_rate = torchaudio.load(audio_in)
     waveform = torch.mean(waveform, dim=0, keepdim=True)
     torchaudio.save("/content/audio.wav", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
@@ -40,66 +34,23 @@ def calculate(image_in, audio_in):
     os.system(f"cd /content/one-shot-talking-face && python3 -B test_script.py --img_path /content/image.png --audio_path /content/audio.wav --phoneme_path /content/test.json --save_dir /content/train")
     return "/content/train/image_audio.mp4"
 
-
-def one_shot(image,input_text,gender):
-    if gender == 'Female' or gender == 'female':
-        tts = gTTS(input_text)
-        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
-            tts.write_to_fp(f)
-            f.seek(0)
-            sound = AudioSegment.from_file(f.name, format="mp3")
-            sound.export("/content/audio.wav", format="wav")
-            return calculate(image,"/content/audio.wav")
-    elif gender == 'Male' or gender == 'male':
-        print(gender)
-        models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
-            "Voicemod/fastspeech2-en-male1",
-            arg_overrides={"vocoder": "hifigan", "fp16": False}
-        )
-        model = models[0].cuda()
-        TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
-        generator = task.build_generator([model], cfg)
-        # next(model.parameters()).device
-
-        sample = TTSHubInterface.get_model_input(task, input_text)
-        sample["net_input"]["src_tokens"] = sample["net_input"]["src_tokens"].cuda()
-        sample["net_input"]["src_lengths"] = sample["net_input"]["src_lengths"].cuda()
-        sample["speaker"] = sample["speaker"].cuda()
-
-        wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
-        # soundfile.write("/content/audio_before.wav", wav, rate)
-        soundfile.write("/content/audio_before.wav", wav.cpu().clone().numpy(), rate)
-        cmd='ffmpeg -i /content/audio_before.wav -filter:a "atempo=0.7" -vn /content/audio.wav'
-        os.system(cmd)
-        calculate(image,'audio.wav')
-
-
-
-def generate_ocr(method,image,gender):
-    return "Hello"
-
 def run():
     with block:
-
+
         with gr.Group():
             with gr.Box():
                 with gr.Row().style(equal_height=True):
                     image_in = gr.Image(show_label=False, type="filepath")
-                    # audio_in = gr.Audio(show_label=False, type='filepath')
-                    input_text=gr.Textbox(lines=3, value="Hello How are you?", label="Input Text")
-                    gender = gr.Radio(["Female","Male"],value="Female",label="Gender")
-                    video_out = gr.Video(label="output")
-                    # video_out = gr.Video(show_label=False)
+                    audio_in = gr.Audio(show_label=False, type='filepath')
+                    video_out = gr.Video(show_label=False)
                 with gr.Row().style(equal_height=True):
                     btn = gr.Button("Generate")
 
-        btn.click(one_shot, inputs=[image_in, input_text,gender], outputs=[video_out])
-        # block.queue()
+
+        btn.click(calculate, inputs=[image_in, audio_in], outputs=[video_out])
+        block.queue()
         block.launch(server_name="0.0.0.0", server_port=7860)
 
 if __name__ == "__main__":
     run()
-
-
-
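
What changed, in short: the commit strips the gTTS/fairseq `one_shot` text-to-speech path and the stub `generate_ocr`, wires the UI straight to `calculate` with an uploaded audio file, and adds a `pad_image` helper that letterboxes a non-square input onto a square black canvas. The call site of `pad_image` is not visible in these hunks; it presumably sits in the elided middle of `calculate`. A quick usage sketch of the new helper, with a hypothetical 300x200 test image:

from PIL import Image

def pad_image(image):
    # Copied from the commit: pad the shorter side with black
    # so the result is square, keeping the original pixels centered.
    w, h = image.size
    if w == h:
        return image
    elif w > h:
        new_image = Image.new(image.mode, (w, w), (0, 0, 0))
        new_image.paste(image, (0, (w - h) // 2))
        return new_image
    else:
        new_image = Image.new(image.mode, (h, h), (0, 0, 0))
        new_image.paste(image, ((h - w) // 2, 0))
        return new_image

img = Image.new("RGB", (300, 200), (255, 255, 255))  # hypothetical white landscape image
square = pad_image(img)
print(square.size)  # (300, 300): 50px black bars top and bottom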
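
On the audio side, `calculate` first downmixes whatever the user uploads to mono 16-bit PCM before shelling out to the one-shot-talking-face script. A self-contained sketch of just that preprocessing step, assuming torch and torchaudio are installed; the helper name `to_mono_pcm16` and the relative `audio.wav` output path are ours for illustration, the app itself hard-codes /content/audio.wav:

import torch
import torchaudio

def to_mono_pcm16(audio_in, out_path="audio.wav"):
    # Decode the uploaded file, average all channels down to mono,
    # and write a 16-bit signed PCM WAV, mirroring calculate().
    waveform, sample_rate = torchaudio.load(audio_in)
    waveform = torch.mean(waveform, dim=0, keepdim=True)
    torchaudio.save(out_path, waveform, sample_rate,
                    encoding="PCM_S", bits_per_sample=16)
    return out_path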