Spaces:

drengskapur
/

openai-text-to-speech

Running

App Files Files

jonathanagustin committed about 1 month ago

Commit

98b32f1

•

1 Parent(s): 0034b02

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +139 -66

app.py CHANGED Viewed

@@ -3,16 +3,26 @@ import tempfile
 import openai
 import requests
 import os
-def tts(input_text: str, model: str, voice: str, api_key: str) -> str:
     """
     Convert input text to speech using OpenAI's Text-to-Speech API.
     Parameters:
         input_text (str): The text to be converted to speech.
         model (str): The model to use for synthesis (e.g., 'tts-1', 'tts-1-hd').
-        voice (str): The voice profile to use (e.g., 'alloy', 'echo', 'fable', etc.).
         api_key (str): OpenAI API key.
     Returns:
         str: File path to the generated audio file.
@@ -28,34 +38,61 @@ def tts(input_text: str, model: str, voice: str, api_key: str) -> str:
     if not input_text.strip():
         raise gr.Error("Input text cannot be empty.")
-    openai.api_key = api_key
     try:
-        response = openai.Audio.create(text=input_text, voice=voice, model=model)
-    except openai.OpenAIError as e:
-        # Catch-all for OpenAI exceptions
-        raise gr.Error(f"An OpenAI error occurred: {e}")
-    except Exception as e:
-        # Catch any other exceptions
-        raise gr.Error(f"An unexpected error occurred: {e}")
-    if not hasattr(response, "audio"):
-        raise gr.Error(
-            "Invalid response from OpenAI API. The response does not contain audio content."
         )
-    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file:
-        temp_file.write(response.audio)
         temp_file_path = temp_file.name
     return temp_file_path
 def main():
     """
     Main function to create and launch the Gradio interface.
     """
     MODEL_OPTIONS = ["tts-1", "tts-1-hd"]
     VOICE_OPTIONS = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
     # Predefine voice previews URLs
     VOICE_PREVIEW_URLS = {
@@ -81,58 +118,67 @@ def main():
         VOICE_PREVIEW_FILES[voice] = local_file_path
     # Set static paths for Gradio to serve
-    # This needs to be done before creating the Gradio app
-    gr.set_static_paths([PREVIEW_DIR])
     with gr.Blocks(title="OpenAI - Text to Speech") as demo:
         with gr.Row():
             with gr.Column(scale=1):
-                gr.Markdown("### Voice Previews")
-                # Create an audio component to play the samples
-                preview_audio = gr.Audio(
-                    interactive=False,
-                    label="Preview Audio",
-                    value=None,
-                    visible=True,
-                    autoplay=True,
-                )
-                # A function to update the preview_audio component
-                def play_voice_sample(voice):
-                    return gr.update(value=VOICE_PREVIEW_FILES[voice])
-                # Create buttons for each voice inside a grid
-                for voice in VOICE_OPTIONS:
-                    # Create a button for each voice
-                    voice_button = gr.Button(
-                        value=f"{voice.capitalize()}",
-                        variant="secondary",
-                        size="sm",
                     )
-                    # Attach the click handler
-                    voice_button.click(
-                        fn=lambda v=voice: play_voice_sample(v),
-                        outputs=preview_audio,
                     )
-            with gr.Column(scale=1):
-                api_key_input = gr.Textbox(
-                    label="OpenAI API Key",
-                    info="https://platform.openai.com/account/api-keys",
-                    type="password",
-                    placeholder="Enter your OpenAI API Key",
-                )
-                model_dropdown = gr.Dropdown(
-                    choices=MODEL_OPTIONS,
-                    label="Model",
-                    value="tts-1",
-                )
-                voice_dropdown = gr.Dropdown(
-                    choices=VOICE_OPTIONS,
-                    label="Voice Options",
-                    value="echo",
-                )
             with gr.Column(scale=2):
                 input_textbox = gr.Textbox(
@@ -140,6 +186,21 @@ def main():
                     lines=10,
                     placeholder="Type your text here...",
                 )
                 submit_button = gr.Button(
                     "Convert Text to Speech",
                     variant="primary",
@@ -148,19 +209,31 @@ def main():
                 output_audio = gr.Audio(label="Output Audio")
         # Define the event handler for the submit button with error handling
-        def on_submit(input_text, model, voice, api_key):
-            audio_file = tts(input_text, model, voice, api_key)
             return audio_file
         # Trigger the conversion when the submit button is clicked
         submit_button.click(
             fn=on_submit,
-            inputs=[input_textbox, model_dropdown, voice_dropdown, api_key_input],
             outputs=output_audio,
         )
     # Launch the Gradio app with error display enabled
     demo.launch(show_error=True)
 if __name__ == "__main__":
     main()

 import openai
 import requests
 import os
+from functools import partial
+def tts(
+    input_text: str,
+    model: str,
+    voice: str,
+    api_key: str,
+    response_format: str = "mp3",
+    speed: float = 1.0,
+) -> str:
     """
     Convert input text to speech using OpenAI's Text-to-Speech API.
     Parameters:
         input_text (str): The text to be converted to speech.
         model (str): The model to use for synthesis (e.g., 'tts-1', 'tts-1-hd').
+        voice (str): The voice to use when generating the audio.
         api_key (str): OpenAI API key.
+        response_format (str): Format of the output audio. Defaults to 'mp3'.
+        speed (float): Speed of the generated audio. Defaults to 1.0.
     Returns:
         str: File path to the generated audio file.
     if not input_text.strip():
         raise gr.Error("Input text cannot be empty.")
+    if len(input_text) > 4096:
+        raise gr.Error("Input text exceeds the maximum length of 4096 characters.")
+    if speed < 0.25 or speed > 4.0:
+        raise gr.Error("Speed must be between 0.25 and 4.0.")
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+    }
+    data = {
+        "model": model,
+        "input": input_text,
+        "voice": voice,
+        "response_format": response_format,
+        "speed": speed,
+    }
     try:
+        response = requests.post(
+            "https://api.openai.com/v1/audio/speech",
+            headers=headers,
+            json=data,
         )
+        response.raise_for_status()
+    except requests.exceptions.HTTPError as http_err:
+        raise gr.Error(f"HTTP error occurred: {http_err} - {response.text}")
+    except Exception as err:
+        raise gr.Error(f"An error occurred: {err}")
+    # The content will be the audio file content
+    audio_content = response.content
+    file_extension = response_format.lower()
+    # PCM is raw data, so it does not have a standard file extension
+    if file_extension == "pcm":
+        file_extension = "raw"
+    with tempfile.NamedTemporaryFile(
+        suffix=f".{file_extension}", delete=False
+    ) as temp_file:
+        temp_file.write(audio_content)
         temp_file_path = temp_file.name
     return temp_file_path
 def main():
     """
     Main function to create and launch the Gradio interface.
     """
     MODEL_OPTIONS = ["tts-1", "tts-1-hd"]
     VOICE_OPTIONS = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
+    RESPONSE_FORMAT_OPTIONS = ["mp3", "opus", "aac", "flac", "wav", "pcm"]
     # Predefine voice previews URLs
     VOICE_PREVIEW_URLS = {
         VOICE_PREVIEW_FILES[voice] = local_file_path
     # Set static paths for Gradio to serve
+    gr.static(PREVIEW_DIR)
     with gr.Blocks(title="OpenAI - Text to Speech") as demo:
+        gr.Markdown("# OpenAI Text-to-Speech Demo")
         with gr.Row():
             with gr.Column(scale=1):
+                with gr.Group():
+                    preview_audio = gr.Audio(
+                        interactive=False,
+                        label="Preview Audio",
+                        value=None,
+                        visible=True,
                     )
+                    # Function to play the selected voice sample
+                    def play_voice_sample(voice):
+                        return gr.update(value=VOICE_PREVIEW_FILES[voice])
+                    # Create buttons for each voice
+                    for voice in VOICE_OPTIONS:
+                        voice_button = gr.Button(
+                            value=f"{voice.capitalize()}",
+                            variant="secondary",
+                            size="sm",
+                        )
+                        voice_button.click(
+                            fn=partial(play_voice_sample, voice=voice),
+                            outputs=preview_audio,
+                        )
+                with gr.Column(scale=1):
+                    api_key_input = gr.Textbox(
+                        label="OpenAI API Key",
+                        info="https://platform.openai.com/account/api-keys",
+                        type="password",
+                        placeholder="Enter your OpenAI API Key",
+                    )
+                    model_dropdown = gr.Dropdown(
+                        choices=MODEL_OPTIONS,
+                        label="Model",
+                        value="tts-1",
+                        info="Select tts-1 for speed or tts-1-hd for quality.",
+                    )
+                    voice_dropdown = gr.Dropdown(
+                        choices=VOICE_OPTIONS,
+                        label="Voice Options",
+                        value="echo",
+                        info="The voice to use when generating the audio.",
+                    )
+                    response_format_dropdown = gr.Dropdown(
+                        choices=RESPONSE_FORMAT_OPTIONS,
+                        label="Response Format",
+                        value="mp3",
+                    )
+                    speed_slider = gr.Slider(
+                        minimum=0.25,
+                        maximum=4.0,
+                        step=0.05,
+                        label="Voice Speed",
+                        value=1.0,
                     )
             with gr.Column(scale=2):
                 input_textbox = gr.Textbox(
                     lines=10,
                     placeholder="Type your text here...",
                 )
+                # Add a character counter below the input textbox
+                char_count_text = gr.Markdown("0 / 4096")
+                # Function to update the character count
+                def update_char_count(input_text):
+                    char_count = len(input_text)
+                    return f"**{char_count} / 4096**"
+                # Update the character count whenever the textbox value changes
+                input_textbox.change(
+                    fn=update_char_count,
+                    inputs=input_textbox,
+                    outputs=char_count_text,
+                )
                 submit_button = gr.Button(
                     "Convert Text to Speech",
                     variant="primary",
                 output_audio = gr.Audio(label="Output Audio")
         # Define the event handler for the submit button with error handling
+        def on_submit(
+            input_text, model, voice, api_key, response_format, speed
+        ):
+            audio_file = tts(
+                input_text, model, voice, api_key, response_format, speed
+            )
             return audio_file
         # Trigger the conversion when the submit button is clicked
         submit_button.click(
             fn=on_submit,
+            inputs=[
+                input_textbox,
+                model_dropdown,
+                voice_dropdown,
+                api_key_input,
+                response_format_dropdown,
+                speed_slider,
+            ],
             outputs=output_audio,
         )
     # Launch the Gradio app with error display enabled
     demo.launch(show_error=True)
 if __name__ == "__main__":
     main()