pragnakalp committed on
Commit 68c6e5c
1 Parent(s): 5d54c3f

Update app.py

Files changed (1): app.py (+124 −8)
app.py CHANGED
@@ -3,18 +3,74 @@ import os, subprocess, torchaudio
 import torch
 from PIL import Image
 import gradio as gr
-import os, subprocess, torchaudio
-import torch
-from PIL import Image
 import soundfile
 from gtts import gTTS
 import tempfile
 from pydub import AudioSegment
 from pydub.generators import Sine
+import dlib
+import cv2
+import imageio
+import os
+import ffmpeg
 
+# from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
+# from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
 
 block = gr.Blocks()
 
+def compute_aspect_preserved_bbox(bbox, increase_area, h, w):
+    left, top, right, bot = bbox
+    width = right - left
+    height = bot - top
+
+    width_increase = max(increase_area, ((1 + 2 * increase_area) * height - width) / (2 * width))
+    height_increase = max(increase_area, ((1 + 2 * increase_area) * width - height) / (2 * height))
+
+    left_t = int(left - width_increase * width)
+    top_t = int(top - height_increase * height)
+    right_t = int(right + width_increase * width)
+    bot_t = int(bot + height_increase * height)
+
+    left_oob = -min(0, left_t)
+    right_oob = right - min(right_t, w)
+    top_oob = -min(0, top_t)
+    bot_oob = bot - min(bot_t, h)
+
+    if max(left_oob, right_oob, top_oob, bot_oob) > 0:
+        max_w = max(left_oob, right_oob)
+        max_h = max(top_oob, bot_oob)
+        if max_w > max_h:
+            return left_t + max_w, top_t + max_w, right_t - max_w, bot_t - max_w
+        else:
+            return left_t + max_h, top_t + max_h, right_t - max_h, bot_t - max_h
+
+    else:
+        return (left_t, top_t, right_t, bot_t)
+
+def crop_src_image(src_img, detector=None):
+    if detector is None:
+        detector = dlib.get_frontal_face_detector()
+    save_img='/content/image_pre.png'
+    img = cv2.imread(src_img)
+    faces = detector(img, 0)
+    h, width, _ = img.shape
+    if len(faces) > 0:
+        bbox = [faces[0].left(), faces[0].top(), faces[0].right(), faces[0].bottom()]
+        l = bbox[3]-bbox[1]
+        bbox[1]= bbox[1]-l*0.1
+        bbox[3]= bbox[3]-l*0.1
+        bbox[1] = max(0,bbox[1])
+        bbox[3] = min(h,bbox[3])
+        bbox = compute_aspect_preserved_bbox(tuple(bbox), 0.5, img.shape[0], img.shape[1])
+        img = img[bbox[1]:bbox[3], bbox[0]:bbox[2]]
+        img = cv2.resize(img, (256, 256))
+        cv2.imwrite(save_img, img)
+    else:
+        img = cv2.resize(img, (256, 256))
+        cv2.imwrite(save_img, img)
+    return save_img
+
 def pad_image(image):
     w, h = image.size
     if w == h:
@@ -44,6 +100,65 @@ def calculate(image_in, audio_in):
     os.system(f"cd /content/one-shot-talking-face && python3 -B test_script.py --img_path /content/image.png --audio_path /content/audio.wav --phoneme_path /content/test.json --save_dir /content/train")
     return "/content/train/image_audio.mp4"
 
+def merge_frames():
+
+
+    path = '/content/video_results/restored_imgs'
+    image_folder = os.fsencode(path)
+    print(image_folder)
+    filenames = []
+
+    for file in os.listdir(image_folder):
+        filename = os.fsdecode(file)
+        if filename.endswith( ('.jpg', '.png', '.gif') ):
+            filenames.append(filename)
+
+    filenames.sort()  # this iteration technique has no built in order, so sort the frames
+    print(filenames)
+    images = list(map(lambda filename: imageio.imread("/content/video_results/restored_imgs/"+filename), filenames))
+
+
+    imageio.mimsave('/content/video_output.mp4', images, fps=25.0)  # modify the frame duration as needed
+    return "/content/video_output.mp4"
+
+def audio_video():
+
+    input_video = ffmpeg.input('/content/video_output.mp4')
+
+    input_audio = ffmpeg.input('/content/audio.wav')
+
+    ffmpeg.concat(input_video, input_audio, v=1, a=1).output('/content/final_output.mp4').run()
+    return "/content/final_output.mp4"
+
+def one_shot_talking(image_in,audio_in):
+
+
+    # Pre-processing of image
+    crop_img=crop_src_image(image_in)
+
+    #Improve quality of input image
+    os.system(f"python /content/GFPGAN/inference_gfpgan.py --upscale 2 -i /content/image_pre.png -o /content/results --bg_upsampler realesrgan")
+    # time.sleep(60)
+    image_in_one_shot='/content/results/restored_imgs/image_pre.png'
+    #One Shot Talking Face algorithm
+    return calculate(image_in_one_shot,audio_in)
+
+    #Video Quality Improvement
+
+    #1. Extract the frames from the video file using PyVideoFramesExtractor
+    os.system(f"python /content/PyVideoFramesExtractor/extract.py --video=/content/train/image_audio.mp4")
+
+    #2. Improve image quality using GFPGAN on each frames
+    os.system(f"python /content/GFPGAN/inference_gfpgan.py --upscale 2 -i /content/extracted_frames/image_audio_frames -o /content/video_results --bg_upsampler realesrgan")
+
+    #3. Merge all the frames to a one video using imageio
+    merge_frames()
+    return audio_video()
+
+
+
+
+
 def one_shot(image_in,input_text,gender):
     if gender == "Female":
         tts = gTTS(input_text)
@@ -53,7 +168,9 @@ def one_shot(image_in,input_text,gender):
         sound = AudioSegment.from_file(f.name, format="mp3")
         sound.export("/content/audio.wav", format="wav")
         audio_in="/content/audio.wav"
-        return calculate(image_in,audio_in)
+        return one_shot_talking(image_in,audio_in)
+
+
 
 def run():
     with block:
@@ -61,10 +178,10 @@ def run():
         with gr.Group():
            with gr.Box():
                with gr.Row().style(equal_height=True):
-                    image_in = gr.Image(show_label=False, type="filepath")
+                    image_in = gr.Image(show_label=True, type="filepath",label="Input Image")
                     input_text = gr.Textbox(show_label=False,label="Text")
                     gender = gr.Radio(["Female","Male"],value="Female",label="Gender")
-                    video_out = gr.Video(show_label=False)
+                    video_out = gr.Video(show_label=True,label="Output")
                 with gr.Row().style(equal_height=True):
                     btn = gr.Button("Generate")
 
@@ -74,5 +191,4 @@ def run():
     block.launch(server_name="0.0.0.0", server_port=7860)
 
 if __name__ == "__main__":
-    run()
-
+    run()
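
For anyone wanting to exercise the new path locally, here is a minimal sketch of the generation flow this commit wires up (text → gTTS → WAV, image → dlib crop → GFPGAN → one-shot-talking-face). It assumes the /content checkouts of one-shot-talking-face and GFPGAN that app.py shells out to; the image path and greeting text are hypothetical, and in the running app the Generate button callback is the actual entry point.

# Hypothetical local smoke test for the new flow (requires the /content
# checkouts of one-shot-talking-face and GFPGAN referenced in app.py).
from app import one_shot

video_path = one_shot(
    image_in="/content/face.jpg",                  # hypothetical input portrait
    input_text="Hello from the talking face demo", # synthesized with gTTS
    gender="Female",                               # the hunk above shows the gTTS/Female branch
)
print(video_path)  # /content/train/image_audio.mp4, produced by calculate()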
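The new merge_frames() and audio_video() helpers follow a standard imageio + ffmpeg-python pattern: write the enhanced frames out as a 25 fps MP4, then mux the synthesized WAV back in. A generic sketch of that pattern, with hypothetical /tmp paths standing in for the hard-coded /content locations:

# Generic frame-merge + audio-mux sketch (same libraries as the diff above;
# paths are hypothetical stand-ins for /content/video_results and /content/audio.wav).
import os
import imageio
import ffmpeg

frame_dir = "/tmp/frames"  # hypothetical folder of enhanced frames
frames = sorted(f for f in os.listdir(frame_dir) if f.endswith((".jpg", ".png")))
images = [imageio.imread(os.path.join(frame_dir, f)) for f in frames]
imageio.mimsave("/tmp/video_output.mp4", images, fps=25.0)  # silent video from frames

video = ffmpeg.input("/tmp/video_output.mp4")
audio = ffmpeg.input("/tmp/audio.wav")                       # hypothetical speech track
ffmpeg.concat(video, audio, v=1, a=1).output("/tmp/final_output.mp4").run()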