ymzhang319 committed
Commit e562afd
1 parent: 3af5e96

update app.py

Files changed (1):
    app.py  +41 -23
app.py CHANGED
@@ -130,7 +130,7 @@ class FoleyController:
         prompt_textbox,
         negative_prompt_textbox,
         ip_adapter_scale,
-        # temporal_scale,
+        temporal_scale,
         sampler_dropdown,
         sample_step_slider,
         cfg_scale_slider,
@@ -154,7 +154,7 @@ class FoleyController:
         if seed_textbox != "":
             torch.manual_seed(int(seed_textbox))
             generator.manual_seed(int(seed_textbox))
-        max_frame_nums = 15
+        max_frame_nums = 150
         frames, duration = read_frames_with_moviepy(input_video, max_frame_nums=max_frame_nums)
         if duration >= 10:
             duration = 10
@@ -169,7 +169,9 @@ class FoleyController:
         time_condition = time_condition + [-1] * (1024 - len(time_condition))
         # w -> b c h w
         time_condition = torch.FloatTensor(time_condition).unsqueeze(0).unsqueeze(0).unsqueeze(0).repeat(1, 1, 256, 1)
-
+
+        # Note that clip need fewer frames
+        frames = frames[::10]
         images = self.image_processor(images=frames, return_tensors="pt").to(device)
         image_embeddings = self.image_encoder(**images).image_embeds
         image_embeddings = torch.mean(image_embeddings, dim=0, keepdim=True).unsqueeze(0).unsqueeze(0)
@@ -253,18 +255,20 @@ with gr.Blocks(css=css) as demo:
                 negative_prompt_textbox = gr.Textbox(value=N_PROMPT, label="Negative prompt", lines=1)

                 with gr.Row():
-                    sampler_dropdown = gr.Dropdown(
-                        label="Sampling method",
-                        choices=list(scheduler_dict.keys()),
-                        value=list(scheduler_dict.keys())[0],
-                    )
-                    sample_step_slider = gr.Slider(
-                        label="Sampling steps", value=25, minimum=10, maximum=100, step=1
-                    )
-
-                    cfg_scale_slider = gr.Slider(label="CFG Scale", value=7.5, minimum=0, maximum=20)
-                    ip_adapter_scale = gr.Slider(label="Visual Content Scale", value=1.0, minimum=0, maximum=1)
-                    # temporal_scale = gr.Slider(label="Temporal Align Scale", value=0., minimum=0., maximum=1.0)
+                    ip_adapter_scale = gr.Slider(label="Visual Content Scale", value=1.0, minimum=0, maximum=1)
+                    temporal_scale = gr.Slider(label="Temporal Align Scale", value=0.2, minimum=0., maximum=1.0)
+
+                with gr.Accordion("Sampling Settings", open=False):
+                    with gr.Row():
+                        sampler_dropdown = gr.Dropdown(
+                            label="Sampling method",
+                            choices=list(scheduler_dict.keys()),
+                            value=list(scheduler_dict.keys())[0],
+                        )
+                        sample_step_slider = gr.Slider(
+                            label="Sampling steps", value=25, minimum=10, maximum=100, step=1
+                        )
+                        cfg_scale_slider = gr.Slider(label="CFG Scale", value=7.5, minimum=0, maximum=20)

                 with gr.Row():
                     seed_textbox = gr.Textbox(label="Seed", value=42)
@@ -273,7 +277,12 @@ with gr.Blocks(css=css) as demo:

                 generate_button = gr.Button(value="Generate", variant="primary")

-            result_video = gr.Video(label="Generated Audio", interactive=False)
+            with gr.Column():
+                result_video = gr.Video(label="Generated Audio", interactive=False)
+                gr.Markdown('**Tips**: <br> \
+                    1. With strong temporal visual cues in input video, you can scale up the **Temporal Align Scale**. <br>\
+                    2. **Visual content scale** is the level of semantic alignment with visual content. \
+                    ')

     generate_button.click(
         fn=controller.foley,
@@ -282,7 +291,7 @@ with gr.Blocks(css=css) as demo:
             prompt_textbox,
             negative_prompt_textbox,
             ip_adapter_scale,
-            # temporal_scale,
+            temporal_scale,
             sampler_dropdown,
             sample_step_slider,
             cfg_scale_slider,
@@ -292,13 +301,22 @@ with gr.Blocks(css=css) as demo:
     )

     gr.Examples(
-        examples= [
-            ['examples/videos/51701454.mp4', 'seagulls', '', 1.0, 'DDIM', 25, 7.5, 10014024412012338098],
-            ['examples/videos/42.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 42],
-            ['examples/videos/1.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 93493458],
-            ['examples/videos/2.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 16520432],
+        # examples= [
+        #     ['examples/videos/51701454.mp4', 'seagulls', '', 1.0, 'DDIM', 25, 7.5, 10014024412012338098],
+        #     ['examples/videos/42.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 42],
+        #     ['examples/videos/1.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 93493458],
+        #     ['examples/videos/2.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 16520432],
+        # ],
+        examples=[
+            ['examples/input/case1.mp4', '', '', 1.0, 0.2, 'DDIM', 25, 7.5, 33817921],
+            ['examples/input/case3.mp4', '', '', 1.0, 0.2,'DDIM', 25, 7.5, 94667578],
+            ['examples/input/case5.mp4', '', '', 0.75, 0.2,'DDIM', 25, 7.5, 92890876],
+            ['examples/input/case6.mp4', '', '', 1.0, 0.2, 'DDIM', 25, 7.5, 77015909],
         ],
-        inputs=[init_img,prompt_textbox,negative_prompt_textbox,ip_adapter_scale,sampler_dropdown,sample_step_slider,cfg_scale_slider,seed_textbox],
+        inputs=[init_img,prompt_textbox,negative_prompt_textbox,ip_adapter_scale,temporal_scale,sampler_dropdown,sample_step_slider,cfg_scale_slider,seed_textbox],
+        cache_examples=True,
+        outputs=[result_video],
+        fn=controller.foley,
     )

     demo.queue(10)
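
For readers skimming the diff: the functional change in FoleyController.foley is that up to 150 frames are now read for the time/onset condition (max_frame_nums goes from 15 to 150), while only every 10th frame is forwarded to the CLIP image encoder for the semantic embedding. Below is a minimal sketch of that pattern, not the app's exact code: read_frames is a hypothetical stand-in for the app's read_frames_with_moviepy, and the CLIP checkpoint name is only a placeholder.

# Sketch only: frame reading plus every-10th-frame CLIP encoding, mirroring this commit.
# `read_frames` is a hypothetical stand-in for read_frames_with_moviepy, and the CLIP
# checkpoint below is a placeholder, not necessarily the one this Space loads.
import numpy as np
import torch
from moviepy.editor import VideoFileClip
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection


def read_frames(video_path: str, max_frame_nums: int = 150):
    """Uniformly sample up to `max_frame_nums` RGB frames and return them with the duration."""
    clip = VideoFileClip(video_path)
    times = np.linspace(0, clip.duration, num=max_frame_nums, endpoint=False)
    frames = [clip.get_frame(t) for t in times]  # list of H x W x 3 uint8 arrays
    duration = clip.duration
    clip.close()
    return frames, duration


device = "cuda" if torch.cuda.is_available() else "cpu"
processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14").to(device)

frames, duration = read_frames("examples/input/case1.mp4", max_frame_nums=150)

# The dense 150-frame sequence feeds the temporal condition; CLIP only needs a sparse view,
# so keep every 10th frame (at most 15 images) before encoding.
clip_frames = frames[::10]
inputs = processor(images=clip_frames, return_tensors="pt").to(device)
with torch.no_grad():
    image_embeds = encoder(**inputs).image_embeds                       # (num_clip_frames, proj_dim)
image_embeds = image_embeds.mean(dim=0, keepdim=True)[None, None]       # (1, 1, 1, proj_dim)
print(image_embeds.shape, f"duration={duration:.1f}s")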
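
The gr.Examples change also turns on example caching, which is why the call now needs fn= and outputs= in addition to inputs=: Gradio runs the function once per example at startup and serves the stored result. A stripped-down sketch of that wiring, with a placeholder generate function and example path instead of the app's foley pipeline:

# Sketch only: cached gr.Examples wiring; `generate` and the example path are placeholders.
import gradio as gr


def generate(video_path, prompt, negative_prompt, ip_scale, temporal_scale, sampler, steps, cfg, seed):
    # A real fn would synthesize audio for the clip; here we just echo the input video.
    return video_path


with gr.Blocks() as demo:
    init_img = gr.Video(label="Input video")
    prompt_textbox = gr.Textbox(label="Prompt")
    negative_prompt_textbox = gr.Textbox(label="Negative prompt")
    ip_adapter_scale = gr.Slider(label="Visual Content Scale", value=1.0, minimum=0, maximum=1)
    temporal_scale = gr.Slider(label="Temporal Align Scale", value=0.2, minimum=0.0, maximum=1.0)
    sampler_dropdown = gr.Dropdown(label="Sampling method", choices=["DDIM"], value="DDIM")
    sample_step_slider = gr.Slider(label="Sampling steps", value=25, minimum=10, maximum=100, step=1)
    cfg_scale_slider = gr.Slider(label="CFG Scale", value=7.5, minimum=0, maximum=20)
    seed_textbox = gr.Textbox(label="Seed", value="42")
    result_video = gr.Video(label="Generated Audio", interactive=False)

    gr.Examples(
        examples=[["examples/input/case1.mp4", "", "", 1.0, 0.2, "DDIM", 25, 7.5, 33817921]],
        inputs=[init_img, prompt_textbox, negative_prompt_textbox, ip_adapter_scale, temporal_scale,
                sampler_dropdown, sample_step_slider, cfg_scale_slider, seed_textbox],
        outputs=[result_video],
        fn=generate,
        cache_examples=True,  # with caching on, fn/outputs are required and the example files must exist at startup
    )

demo.launch()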