Files changed (1)
  1. app.py +231 -55
app.py CHANGED
@@ -1,10 +1,10 @@
  import os
  import torch
  import random
- import spaces
  import numpy as np
  import gradio as gr
- import soundfile as sf
  from accelerate import Accelerator
  from transformers import T5Tokenizer, T5EncoderModel
  from diffusers import DDIMScheduler
@@ -54,9 +54,8 @@ MAX_SEED = np.iinfo(np.int32).max
  config_name = 'ckpts/ezaudio-xl.yml'
  ckpt_path = 'ckpts/s3/ezaudio_s3_xl.pt'
  vae_path = 'ckpts/vae/1m.pt'
- save_path = 'output/'
- os.makedirs(save_path, exist_ok=True)
-
  device = 'cuda' if torch.cuda.is_available() else 'cpu'

  autoencoder, unet, tokenizer, text_encoder, noise_scheduler, params = load_models(config_name, ckpt_path, vae_path,
@@ -70,10 +69,17 @@ def generate_audio(text, length,
  neg_text = None
  length = length * params['autoencoder']['latent_sr']

  if randomize_seed:
  random_seed = random.randint(0, MAX_SEED)

- pred = inference(autoencoder, unet, None, None,
  tokenizer, text_encoder,
  params, noise_scheduler,
  text, neg_text,
@@ -89,13 +95,100 @@ def generate_audio(text, length,
  return params['autoencoder']['sr'], pred


  # Examples (if needed for the demo)
  examples = [
- "the sound of rain falling softly",
  "a dog barking in the distance",
  "light guitar music is playing",
  ]

  # CSS styling (optional)
  css = """
  #col-container {
@@ -109,53 +202,136 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
  with gr.Column(elem_id="col-container"):
  gr.Markdown("""
  # EzAudio: High-quality Text-to-Audio Generator
- Generate audio from text using a diffusion transformer. Adjust advanced settings for more control.
  """)

- # Basic Input: Text prompt
- with gr.Row():
- text_input = gr.Textbox(
- label="Text Prompt",
- show_label=True,
- max_lines=2,
- placeholder="Enter your prompt",
- container=True,
- value="a dog barking in the distance",
- scale=4
- )
- # Run button
- run_button = gr.Button("Generate", scale=1)
-
- # Output Component
- result = gr.Audio(label="Result", type="numpy")
-
- # Advanced settings in an Accordion
- with gr.Accordion("Advanced Settings", open=False):
- # Audio Length
- length_input = gr.Slider(minimum=1, maximum=10, step=1, value=10, label="Audio Length (in seconds)")
- guidance_scale = gr.Slider(minimum=1.0, maximum=10, step=0.1, value=5.0, label="Guidance Scale")
- guidance_rescale = gr.Slider(minimum=0.0, maximum=1, step=0.05, value=0.75, label="Guidance Rescale")
- ddim_steps = gr.Slider(minimum=25, maximum=200, step=5, value=50, label="DDIM Steps")
- eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=1.0, label="Eta")
- seed = gr.Slider(minimum=0, maximum=100, step=1, value=0, label="Seed")
- randomize_seed = gr.Checkbox(label="Randomize Seed (Disable Seed)", value=True)
-
- # Examples block
- gr.Examples(
- examples=examples,
- inputs=[text_input]
- )
-
- # Define the trigger and input-output linking
- run_button.click(
- fn=generate_audio,
- inputs=[text_input, length_input, guidance_scale, guidance_rescale, ddim_steps, eta, seed, randomize_seed],
- outputs=[result]
- )
- text_input.submit(fn=generate_audio,
- inputs=[text_input, length_input, guidance_scale, guidance_rescale, ddim_steps, eta, seed, randomize_seed],
- outputs=[result]
- )
-
- # Launch the Gradio demo
- demo.launch()
  import os
  import torch
  import random
  import numpy as np
  import gradio as gr
+ import librosa
+ import spaces
  from accelerate import Accelerator
  from transformers import T5Tokenizer, T5EncoderModel
  from diffusers import DDIMScheduler
 
  config_name = 'ckpts/ezaudio-xl.yml'
  ckpt_path = 'ckpts/s3/ezaudio_s3_xl.pt'
  vae_path = 'ckpts/vae/1m.pt'
+ # save_path = 'output/'
+ # os.makedirs(save_path, exist_ok=True)
  device = 'cuda' if torch.cuda.is_available() else 'cpu'

  autoencoder, unet, tokenizer, text_encoder, noise_scheduler, params = load_models(config_name, ckpt_path, vae_path,
 
  neg_text = None
  length = length * params['autoencoder']['latent_sr']

+ gt, gt_mask = None, None
+
+ if text == '':
+ guidance_scale = None
+ print('empty input')
+
  if randomize_seed:
  random_seed = random.randint(0, MAX_SEED)

+ pred = inference(autoencoder, unet,
+ gt, gt_mask,
  tokenizer, text_encoder,
  params, noise_scheduler,
  text, neg_text,
 
  return params['autoencoder']['sr'], pred


+ @spaces.GPU
+ def editing_audio(text, boundary,
+ gt_file, mask_start, mask_length,
+ guidance_scale, guidance_rescale, ddim_steps, eta,
+ random_seed, randomize_seed):
+ neg_text = None
+ max_length = 10
+
+ if text == '':
+ guidance_scale = None
+ print('empty input')
+
+ mask_end = mask_start + mask_length
+
+ # Load and preprocess ground truth audio
+ gt, sr = librosa.load(gt_file, sr=params['autoencoder']['sr'])
+ gt = gt / (np.max(np.abs(gt)) + 1e-9)
+
+ audio_length = len(gt) / sr
+ mask_start = min(mask_start, audio_length)
+ if mask_end > audio_length:
+ # outpainting mode
+ padding = round((mask_end - audio_length)*params['autoencoder']['sr'])
+ gt = np.pad(gt, (0, padding), 'constant')
+ audio_length = len(gt) / sr
+
+ output_audio = gt.copy()
+
+ gt = torch.tensor(gt).unsqueeze(0).unsqueeze(1).to(device)
+ boundary = min((max_length - (mask_end - mask_start))/2, (mask_end - mask_start)/2, boundary)
+ # print(boundary)
+
+ # Calculate start and end indices
+ start_idx = max(mask_start - boundary, 0)
+ end_idx = min(mask_end + boundary, audio_length)
+ # print(start_idx)
+ # print(end_idx)
+
+ mask_start -= start_idx
+ mask_end -= start_idx
+
+ gt = gt[:, :, round(start_idx*params['autoencoder']['sr']):round(end_idx*params['autoencoder']['sr'])]
+
+ # Encode the audio to latent space
+ gt_latent = autoencoder(audio=gt)
+ B, D, L = gt_latent.shape
+ length = L
+
+ gt_mask = torch.zeros(B, D, L).to(device)
+ latent_sr = params['autoencoder']['latent_sr']
+ gt_mask[:, :, round(mask_start * latent_sr): round(mask_end * latent_sr)] = 1
+ gt_mask = gt_mask.bool()
+
+ if randomize_seed:
+ random_seed = random.randint(0, MAX_SEED)
+
+ # Perform inference to get the edited latent representation
+ pred = inference(autoencoder, unet,
+ gt_latent, gt_mask,
+ tokenizer, text_encoder,
+ params, noise_scheduler,
+ text, neg_text,
+ length,
+ guidance_scale, guidance_rescale,
+ ddim_steps, eta, random_seed,
+ device)
+
+ pred = pred.cpu().numpy().squeeze(0).squeeze(0)
+
+ chunk_length = end_idx - start_idx
+ pred = pred[:round(chunk_length*params['autoencoder']['sr'])]
+
+ output_audio[round(start_idx*params['autoencoder']['sr']):round(end_idx*params['autoencoder']['sr'])] = pred
+
+ pred = output_audio
+
+ return params['autoencoder']['sr'], pred
+
+
  # Examples (if needed for the demo)
  examples = [
  "a dog barking in the distance",
+ "the sound of rain falling softly",
  "light guitar music is playing",
  ]

+ # Examples for the editing demo
+ examples_edit = [
+ ["a dog barking in the background", 6, 3],
+ ["kids playing and laughing nearby", 5, 4],
+ ["rock music playing on the street", 8, 6]
+ ]
+
+
  # CSS styling (optional)
  css = """
  #col-container {
 
  with gr.Column(elem_id="col-container"):
  gr.Markdown("""
  # EzAudio: High-quality Text-to-Audio Generator
+ Generate and edit audio from text using a diffusion transformer. Adjust advanced settings for more control.
  """)

+ # Tabs for Generate and Edit
+ with gr.Tab("Audio Generation"):
+ # Basic Input: Text prompt
+ with gr.Row():
+ text_input = gr.Textbox(
+ label="Text Prompt",
+ show_label=True,
+ max_lines=2,
+ placeholder="Enter your prompt",
+ container=True,
+ value="a dog barking in the distance",
+ scale=4
+ )
+ # Run button
+ run_button = gr.Button("Generate", scale=1)
+
+ # Output Component
+ result = gr.Audio(label="Generate", type="numpy")
+
+ # Advanced settings in an Accordion
+ with gr.Accordion("Advanced Settings", open=False):
+ # Audio Length
+ audio_length = gr.Slider(minimum=1, maximum=10, step=1, value=10, label="Audio Length (in seconds)")
+ guidance_scale = gr.Slider(minimum=1.0, maximum=10, step=0.1, value=5.0, label="Guidance Scale")
+ guidance_rescale = gr.Slider(minimum=0.0, maximum=1, step=0.05, value=0.75, label="Guidance Rescale")
+ ddim_steps = gr.Slider(minimum=25, maximum=200, step=5, value=50, label="DDIM Steps")
+ eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=1.0, label="Eta")
+ seed = gr.Slider(minimum=0, maximum=100, step=1, value=0, label="Seed")
+ randomize_seed = gr.Checkbox(label="Randomize Seed (Disable Seed)", value=True)
+
+ # Examples block
+ gr.Examples(
+ examples=examples,
+ inputs=[text_input]
+ )
+
+ # Define the trigger and input-output linking for generation
+ run_button.click(
+ fn=generate_audio,
+ inputs=[text_input, audio_length, guidance_scale, guidance_rescale, ddim_steps, eta, seed, randomize_seed],
+ outputs=[result]
+ )
+ text_input.submit(fn=generate_audio,
+ inputs=[text_input, audio_length, guidance_scale, guidance_rescale, ddim_steps, eta, seed, randomize_seed],
+ outputs=[result]
+ )
+
+ with gr.Tab("Audio Editing and Inpainting"):
+ # Input: Upload audio file
+ with gr.Row():
+ gt_file_input = gr.Audio(label="Upload Audio to Edit", type="filepath", value="edit_example.wav")
+
+ # Text prompt for editing
+ text_edit_input = gr.Textbox(
+ label="Edit Prompt",
+ show_label=True,
+ max_lines=2,
+ placeholder="Describe the edit you want",
+ container=True,
+ value="a dog barking in the background",
+ scale=4
+ )
+
+ # Mask settings
+ mask_start = gr.Number(label="Edit Start (seconds)", value=6.0)
+ mask_length = gr.Slider(minimum=0.5, maximum=10, step=0.5, value=3, label="Edit Length (seconds)")
+
+ edit_explanation = gr.Markdown(value="**Edit Start**: Time (in seconds) when the edit begins. \n\n**Edit Length**: Duration (in seconds) of the segment to be edited. \n\n**Outpainting**: If the sum of the start time and edit length exceeds the audio length, the Outpainting Mode will be activated.")
+
+ # Run button for editing
+ edit_button = gr.Button("Generate", scale=1)
+
+ # Output Component for edited audio
+ edited_result = gr.Audio(label="Edited Audio", type="numpy")
+
+ # Advanced settings in an Accordion
+ with gr.Accordion("Advanced Settings", open=False):
+ # Edit boundary: extra context (in seconds) included around the edited region
+ edit_boundary = gr.Slider(minimum=0.5, maximum=4, step=0.5, value=2, label="Edit Boundary (in seconds)")
+ edit_guidance_scale = gr.Slider(minimum=1.0, maximum=10, step=0.5, value=5.0, label="Guidance Scale")
+ edit_guidance_rescale = gr.Slider(minimum=0.0, maximum=1, step=0.05, value=0.75, label="Guidance Rescale")
+ edit_ddim_steps = gr.Slider(minimum=25, maximum=200, step=5, value=50, label="DDIM Steps")
+ edit_eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=1.0, label="Eta")
+ edit_seed = gr.Slider(minimum=0, maximum=100, step=1, value=0, label="Seed")
+ edit_randomize_seed = gr.Checkbox(label="Randomize Seed (Disable Seed)", value=True)
+
+ # Examples block
+ gr.Examples(
+ examples=examples_edit,
+ inputs=[text_edit_input, mask_start, mask_length]
+ )
+
+ # Define the trigger and input-output linking for editing
+ edit_button.click(
+ fn=editing_audio,
+ inputs=[
+ text_edit_input,
+ edit_boundary,
+ gt_file_input,
+ mask_start,
+ mask_length,
+ edit_guidance_scale,
+ edit_guidance_rescale,
+ edit_ddim_steps,
+ edit_eta,
+ edit_seed,
+ edit_randomize_seed
+ ],
+ outputs=[edited_result]
+ )
+ text_edit_input.submit(
+ fn=editing_audio,
+ inputs=[
+ text_edit_input,
+ edit_boundary,
+ gt_file_input,
+ mask_start,
+ mask_length,
+ edit_guidance_scale,
+ edit_guidance_rescale,
+ edit_ddim_steps,
+ edit_eta,
+ edit_seed,
+ edit_randomize_seed
+ ],
+ outputs=[edited_result]
+ )
+
+ # Launch the Gradio demo
+ demo.launch()
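For readers tracing the new `editing_audio` path, here is a small self-contained sketch of its seconds-to-frames bookkeeping: the edit window is widened by `boundary` on each side but clamped so the re-generated chunk stays within the model's 10-second budget, and the widened window is then mapped to waveform samples and latent frames for `gt_mask`. The sample rate and latent rate below are illustrative placeholders, not values read from `ckpts/ezaudio-xl.yml`.

```python
# Minimal sketch of the editing_audio index math (illustrative rates, not the real config values).
import numpy as np

sr = 24000         # assumed waveform sample rate
latent_sr = 50     # assumed latent frames per second
max_length = 10    # generation budget in seconds, as in the diff

def edit_window(mask_start, mask_length, audio_length, boundary):
    """Mirror editing_audio: widen the edit region by `boundary`, clamped to max_length."""
    mask_end = mask_start + mask_length
    boundary = min((max_length - (mask_end - mask_start)) / 2,
                   (mask_end - mask_start) / 2,
                   boundary)
    start_idx = max(mask_start - boundary, 0)
    end_idx = min(mask_end + boundary, audio_length)
    # mask bounds relative to the extracted chunk
    return start_idx, end_idx, mask_start - start_idx, mask_end - start_idx

# Example: edit 3 s starting at 6 s of a 10 s clip, requesting a 2 s boundary.
start_idx, end_idx, local_start, local_end = edit_window(6.0, 3.0, 10.0, 2.0)
# boundary is clamped to 1.5 s, so the re-generated chunk runs from 4.5 s to 10.0 s

# Waveform-sample indices of the chunk and latent-frame indices of the mask,
# matching the round(x * sr) / round(x * latent_sr) conversions in the diff.
wav_lo, wav_hi = round(start_idx * sr), round(end_idx * sr)
lat_lo, lat_hi = round(local_start * latent_sr), round(local_end * latent_sr)

chunk_frames = round((end_idx - start_idx) * latent_sr)
gt_mask = np.zeros(chunk_frames, dtype=bool)
gt_mask[lat_lo:lat_hi] = True   # True frames are re-generated; False frames stay tied to the input audio
print(wav_lo, wav_hi, lat_lo, lat_hi, int(gt_mask.sum()))
```

The outpainting branch in the diff is the same math applied after zero-padding the waveform so that `audio_length` covers `mask_end`.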