Drew committed on
Commit
c879843
1 Parent(s): 929aad9
Files changed (2)
  1. app.py +10 -44
  2. requirements.txt +5 -1
app.py CHANGED
@@ -5,6 +5,7 @@ import gradio as gr
 import spaces
 import os
 import uuid
+from pydub import AudioSegment
 
 # Importing the model-related functions
 from stable_audio_tools import get_pretrained_model
@@ -79,8 +80,15 @@ def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
     torchaudio.save(unique_filename, output, sample_rate)
     print(f"Audio saved: {unique_filename}")
 
+    # Convert WAV to MP3 using pydub without ffmpeg
+    audio = AudioSegment.from_wav(unique_filename)
+    full_path_mp3 = unique_filename.replace('wav', 'mp3')
+    audio.export(full_path_mp3, format="mp3")
+
+    print(f"Audio converted and saved to MP3: {full_path_mp3}")
+
     # Return the path to the generated audio file
-    return unique_filename
+    return audio
 
 # Setting up the Gradio Interface
 interface = gr.Interface(
@@ -94,49 +102,7 @@ interface = gr.Interface(
     outputs=gr.Audio(type="filepath", label="Generated Audio"),
     title="Stable Audio Generator",
     description="Generate variable-length stereo audio at 44.1kHz from text prompts using Stable Audio Open 1.0.",
-    examples=[
-        [
-            "Create a serene soundscape of a quiet beach at sunset.",  # Text prompt
-
-            45,   # Duration in Seconds
-            100,  # Number of Diffusion Steps
-            10,   # CFG Scale
-        ],
-        [
-            "Generate an energetic and bustling city street scene with distant traffic and close conversations.",  # Text prompt
-
-            30,   # Duration in Seconds
-            120,  # Number of Diffusion Steps
-            5,    # CFG Scale
-        ],
-        [
-            "Simulate a forest ambiance with birds chirping and wind rustling through the leaves.",  # Text prompt
-            60,   # Duration in Seconds
-            140,  # Number of Diffusion Steps
-            7.5,  # CFG Scale
-        ],
-        [
-            "Recreate a gentle rainfall with distant thunder.",  # Text prompt
-
-            35,   # Duration in Seconds
-            110,  # Number of Diffusion Steps
-            8,    # CFG Scale
-
-        ],
-        [
-            "Imagine a jazz cafe environment with soft music and ambient chatter.",  # Text prompt
-            25,   # Duration in Seconds
-            90,   # Number of Diffusion Steps
-            6,    # CFG Scale
-
-        ],
-        ["Rock beat played in a treated studio, session drumming on an acoustic kit.",
-            30,   # Duration in Seconds
-            100,  # Number of Diffusion Steps
-            7,    # CFG Scale
-
-        ]
-    ])
+)
 
 
 # Pre-load the model to avoid multiprocessing issues
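Note on the new conversion code: gr.Audio(type="filepath") expects generate_audio to return a path string, but the commit returns the pydub AudioSegment object (`return audio`), and, contrary to the inline comment, pydub's MP3 export shells out to an ffmpeg/libav binary, which must be available in the Space. A minimal sketch of the conversion step with those two points addressed; the wav_to_mp3 helper and the os.path.splitext rename are my own suggestions, not code from this repo:

import os
from pydub import AudioSegment  # MP3 export requires ffmpeg/libav on the PATH

def wav_to_mp3(wav_path: str) -> str:
    # Hypothetical helper, not part of the commit: convert WAV to MP3
    # and hand back the new file's path.
    audio = AudioSegment.from_wav(wav_path)
    # splitext only touches the extension; str.replace('wav', 'mp3') would
    # also rewrite a 'wav' substring appearing elsewhere in the filename.
    mp3_path = os.path.splitext(wav_path)[0] + ".mp3"
    audio.export(mp3_path, format="mp3")
    return mp3_path  # a filepath string, matching gr.Audio(type="filepath")

Returning the MP3 path from generate_audio, rather than the AudioSegment, keeps the interface's output contract unchanged.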
requirements.txt CHANGED
@@ -9,4 +9,8 @@ torch
 torchaudio
 stable-audio-tools
 openai
-pydub
+pydub
+git+https://github.com/huggingface/diffusers.git
+transformers
+accelerate
+sentencepiece
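A packaging note, offered as a suggestion rather than a description of this commit: the unpinned git+https://github.com/huggingface/diffusers.git line installs whatever is on the default branch at build time, so Space rebuilds are not reproducible. Pinning to a release tag or commit hash is the usual fix; the ref below is a placeholder, not a pin taken from this repo:

# requirements.txt sketch: pin the git dependency to a known ref
git+https://github.com/huggingface/diffusers.git@<tag-or-commit>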