pragnakalp committed
Commit b632575 • 1 Parent(s): 9019b20

Update app.py

Files changed (1)
  1. app.py +40 -97
app.py CHANGED
@@ -1,116 +1,59 @@
 import gradio as gr
-# import os, subprocess, torchaudio
-# import torch
-from PIL import Image
-from gtts import gTTS
-import tempfile
-from pydub import AudioSegment
-from pydub.generators import Sine
-# from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
-# from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
-import soundfile
-
-import dlib
-import cv2
-import imageio
-import os
-import gradio as gr
 import os, subprocess, torchaudio
+import torch
 from PIL import Image
-import ffmpeg
-
-
 
 block = gr.Blocks()
 
-
-
-def one_shot(image,input_text,gender):
-    if gender == 'Female' or gender == 'female':
-        tts = gTTS(input_text)
-        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
-            tts.write_to_fp(f)
-            f.seek(0)
-            sound = AudioSegment.from_file(f.name, format="mp3")
-            sound.export("/content/audio.wav", format="wav")
-        waveform, sample_rate = torchaudio.load("/content/audio.wav")
-
-
-        torchaudio.save("/content/audio.wav", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
-        image = Image.open(image_in)
-        image = pad_image(image)
-        image.save("/content/image_pre.png")
-        pocketsphinx_run = subprocess.run(['pocketsphinx', '-phone_align', 'yes', 'single', '/content/audio.wav'], check=True, capture_output=True)
-        jq_run = subprocess.run(['jq', '[.w[]|{word: (.t | ascii_upcase | sub("<S>"; "sil") | sub("<SIL>"; "sil") | sub("\\\(2\\\)"; "") | sub("\\\(3\\\)"; "") | sub("\\\(4\\\)"; "") | sub("\\\[SPEECH\\\]"; "SIL") | sub("\\\[NOISE\\\]"; "SIL")), phones: [.w[]|{ph: .t | sub("\\\+SPN\\\+"; "SIL") | sub("\\\+NSN\\\+"; "SIL"), bg: (.b*100)|floor, ed: (.b*100+.d*100)|floor}]}]'], input=pocketsphinx_run.stdout, capture_output=True)
-        with open("test.json", "w") as f:
-            f.write(jq_run.stdout.decode('utf-8').strip())
-        import json
-
-        with open('test.json') as user_file:
-            file_contents = user_file.read()
-
-
-
-        parsed_json = json.loads(file_contents)
-        return parsed_json
-        exit()
-        os.system(f"cd /content/one-shot-talking-face && python3 -B test_script.py --img_path /content/image_pre.png --audio_path /content/audio.wav --phoneme_path /content/test.json --save_dir /content/train")
-
-
-
-
-    elif gender == 'Male' or gender == 'male':
-        print(gender)
-        models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
-            "Voicemod/fastspeech2-en-male1",
-            arg_overrides={"vocoder": "hifigan", "fp16": False}
-        )
-
-        model = models[0].cuda()
-        TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
-        generator = task.build_generator([model], cfg)
-        # next(model.parameters()).device
-
-        sample = TTSHubInterface.get_model_input(task, input_text)
-        sample["net_input"]["src_tokens"] = sample["net_input"]["src_tokens"].cuda()
-        sample["net_input"]["src_lengths"] = sample["net_input"]["src_lengths"].cuda()
-        sample["speaker"] = sample["speaker"].cuda()
-
-        wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
-        # soundfile.write("/content/audio_before.wav", wav, rate)
-        soundfile.write("/content/audio_before.wav", wav.cpu().clone().numpy(), rate)
-        cmd='ffmpeg -i /content/audio_before.wav -filter:a "atempo=0.7" -vn /content/audio.wav'
-        os.system(cmd)
-        one_shot_talking(image,'audio.wav')
-
-
-
-
-def generate_ocr(method,image,gender):
-    return "Hello"
+def pad_image(image):
+    w, h = image.size
+    if w == h:
+        return image
+    elif w > h:
+        new_image = Image.new(image.mode, (w, w), (0, 0, 0))
+        new_image.paste(image, (0, (w - h) // 2))
+        return new_image
+    else:
+        new_image = Image.new(image.mode, (h, h), (0, 0, 0))
+        new_image.paste(image, ((h - w) // 2, 0))
+        return new_image
+
+def calculate(image_in, audio_in):
+    waveform, sample_rate = torchaudio.load(audio_in)
+    waveform = torch.mean(waveform, dim=0, keepdim=True)
+    torchaudio.save("/content/audio.wav", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
+    image = Image.open(image_in)
+    image = pad_image(image)
+    image.save("image.png")
+
+    pocketsphinx_run = subprocess.run(['pocketsphinx', '-phone_align', 'yes', 'single', '/content/audio.wav'], check=True, capture_output=True)
+    jq_run = subprocess.run(['jq', '[.w[]|{word: (.t | ascii_upcase | sub("<S>"; "sil") | sub("<SIL>"; "sil") | sub("\\\(2\\\)"; "") | sub("\\\(3\\\)"; "") | sub("\\\(4\\\)"; "") | sub("\\\[SPEECH\\\]"; "SIL") | sub("\\\[NOISE\\\]"; "SIL")), phones: [.w[]|{ph: .t | sub("\\\+SPN\\\+"; "SIL") | sub("\\\+NSN\\\+"; "SIL"), bg: (.b*100)|floor, ed: (.b*100+.d*100)|floor}]}]'], input=pocketsphinx_run.stdout, capture_output=True)
+    with open("test.json", "w") as f:
+        f.write(jq_run.stdout.decode('utf-8').strip())
+
+    os.system(f"cd /content/one-shot-talking-face && python3 -B test_script.py --img_path /content/image.png --audio_path /content/audio.wav --phoneme_path /content/test.json --save_dir /content/train")
+    return "/content/train/image_audio.mp4"
 
 def run():
     with block:
-
+        gr.Markdown(
+        """
+        <style> body { text-align: right} </style>
+        map: 📄 [arxiv](https://arxiv.org/abs/2112.02749) &nbsp; ⇨ 👩‍💻 [github](https://github.com/FuxiVirtualHuman/AAAI22-one-shot-talking-face) &nbsp; ⇨ 🦒 [colab](https://github.com/camenduru/one-shot-talking-face-colab) &nbsp; ⇨ 🤗 [huggingface](https://huggingface.co/spaces/camenduru/one-shot-talking-face) &nbsp; | &nbsp; tools: 🌀 [duplicate this space](https://huggingface.co/spaces/camenduru/sandbox?duplicate=true) &nbsp; | 🐢 [tortoise tts](https://huggingface.co/spaces/mdnestor/tortoise) &nbsp; | 📺 [video upscaler](https://huggingface.co/spaces/kadirnar/Anime4k) &nbsp; | 🎨 [text-to-image](https://huggingface.co/models?pipeline_tag=text-to-image&sort=downloads) &nbsp; | 🐣 [twitter](https://twitter.com/camenduru) &nbsp; | ☕ [buy-a-coffee](https://ko-fi.com/camenduru) &nbsp;
+        """)
         with gr.Group():
            with gr.Box():
                with gr.Row().style(equal_height=True):
                    image_in = gr.Image(show_label=False, type="filepath")
-                   # audio_in = gr.Audio(show_label=False, type='filepath')
-                   input_text=gr.Textbox(lines=3, value="Hello How are you?", label="Input Text")
-                   gender = gr.Radio(["Female","Male"],value="Female",label="Gender")
-                   video_out = gr.Textbox(label="output")
-                   # video_out = gr.Video(show_label=False)
+                   audio_in = gr.Audio(show_label=False, type='filepath')
+                   video_out = gr.Video(show_label=False)
                with gr.Row().style(equal_height=True):
                    btn = gr.Button("Generate")
 
-        btn.click(one_shot, inputs=[image_in, input_text,gender], outputs=[video_out])
-        # block.queue()
+
+        btn.click(calculate, inputs=[image_in, audio_in], outputs=[video_out])
+        block.queue()
    block.launch(server_name="0.0.0.0", server_port=7860)
 
 if __name__ == "__main__":
-    run()
-
-
-
-
+    run()
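
For reference, a minimal sketch of exercising the new calculate() pipeline outside the Gradio UI. This assumes the Space's runtime layout (the one-shot-talking-face repo cloned under /content, pocketsphinx and jq installed); the input file names are hypothetical placeholders, not part of this commit.

# sketch: call the updated pipeline directly on one image/audio pair
from app import calculate

if __name__ == "__main__":
    # "face.png" and "speech.wav" are placeholder inputs for illustration only
    video_path = calculate("face.png", "speech.wav")
    print(video_path)  # expected: /content/train/image_audio.mp4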