"""Gradio demo that turns text into speech with WhisperSpeech and serves a WAV file."""

import os
import tempfile
import wave

# `spaces` stays ahead of every other third-party import, as in the original
# ordering (it was the very first import of the file).
import spaces
import gradio as gr
import numpy as np
import soundfile as sf
import torch
import torch.nn.functional as F
from whisperspeech.languages import LANGUAGES
from whisperspeech.pipeline import Pipeline
from whisperspeech.utils import resampler

title = """# 🙋🏻‍♂️ Welcome to🌟Tonic's🌬️💬📝WhisperSpeech You can use this ZeroGPU Space to test out the current model [🌬️💬📝collabora/whisperspeech](https://huggingface.co/collabora/whisperspeech). 🌬️💬📝collabora/whisperspeech is An Open Source text-to-speech system built by inverting Whisper. Previously known as spear-tts-pytorch. It's like Stable Diffusion but for speech – both powerful and easily customizable. You can also use 🌬️💬📝WhisperSpeech by cloning this space. 🧬🔬🔍 Simply click here: Duplicate Space Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Polytonic](https://github.com/tonic-ai) & contribute to 🌟 [Poly](https://github.com/tonic-ai/poly) 🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗 """

# Sample rate declared for the pipeline output when it is handed to the
# resampler.  NOTE(review): confirm this matches the vocoder's real output
# rate; if the model already emits 24 kHz the resample step shifts the pitch.
SOURCE_SAMPLE_RATE = 22050
# Sample rate of the WAV file returned to the UI.
OUTPUT_SAMPLE_RATE = 24000


def _split_items(value):
    """Return the non-blank, stripped entries of *value* as a list.

    The UI delivers the same logical input in different shapes depending on
    the widget: ``None`` (hidden dropdown without a selection), a list
    (checkbox group) or a comma-separated string (text box).  All of them
    are normalised here so the caller never calls ``.split`` on a non-string.
    """
    if value is None:
        return []
    if isinstance(value, str):
        value = value.split(",")
    return [item.strip() for item in value if item and item.strip()]


def _to_mono_int16(audio_np):
    """Convert a float waveform to mono 16-bit PCM samples.

    The waveform is peak-normalised only when it leaves the [-1, 1] range,
    reduced to its first channel when it has more than one dimension, and
    scaled to the int16 range.
    """
    if audio_np.max() > 1.0 or audio_np.min() < -1.0:
        audio_np = audio_np / np.max(np.abs(audio_np))
    if audio_np.ndim > 1:
        # Treat the longer axis as time.  Indexing ``[:, 0]`` unconditionally
        # would keep a single sample for a channels-first ``(1, T)`` array.
        channels_first = audio_np.shape[0] < audio_np.shape[1]
        audio_np = audio_np[0] if channels_first else audio_np[:, 0]
    return np.int16(audio_np * 32767)


@spaces.GPU
def whisper_speech_demo(text, lang, speaker_audio, mix_lang, mix_text):
    """Synthesize speech and return the path of the generated WAV file.

    Args:
        text: Text to speak (standard mode); may be empty in mixed mode.
        lang: Language code for ``text``; may be ``None`` or a
            comma-separated string.
        speaker_audio: Optional path to a reference speaker recording, or
            ``None`` to let the pipeline pick its default.
        mix_lang: Languages for mixed mode, as a list or a comma-separated
            string.  Mixed mode is used only when this and ``mix_text`` are
            both non-empty.
        mix_text: Comma-separated text segments for mixed mode.

    Returns:
        Path of a temporary mono 16-bit WAV file at ``OUTPUT_SAMPLE_RATE``.
        The file is created with ``delete=False`` and is not removed here.

    Raises:
        gr.Error: If mixed mode is requested but no usable language or text
            segment remains after blank entries are dropped.
    """
    pipe = Pipeline()
    speaker_url = speaker_audio

    if mix_lang and mix_text:
        lead_texts = [text] if text and text.strip() else []
        mixed_langs = _split_items(lang) + _split_items(mix_lang)
        mixed_texts = lead_texts + _split_items(mix_text)
        if not mixed_langs or not mixed_texts:
            raise gr.Error(
                "Select at least one language and enter at least one text segment."
            )
        stoks = pipe.t2s.generate(mixed_texts, lang=mixed_langs)
        # NOTE(review): the tokens are passed where the standard branch passes
        # text - confirm that Pipeline.generate accepts them.
        audio_data = pipe.generate(stoks, speaker_url, lang=mixed_langs[0])
    else:
        audio_data = pipe.generate(text, speaker_url, lang)

    resample_audio = resampler(newsr=OUTPUT_SAMPLE_RATE)
    resampled = next(
        resample_audio(
            [{'sample_rate': SOURCE_SAMPLE_RATE, 'samples': audio_data.cpu()}]
        )
    )
    audio_data_resampled = resampled['samples_24k']
    pcm_samples = _to_mono_int16(audio_data_resampled.numpy())

    # Only reserve the file name here; the handle is closed before the wave
    # module reopens the path, so the same file is never open twice.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
        tmp_file_name = tmp_file.name

    with wave.open(tmp_file_name, 'w') as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)  # 2 bytes per sample = 16-bit PCM
        wav_file.setframerate(OUTPUT_SAMPLE_RATE)
        wav_file.writeframes(pcm_samples.tobytes())

    return tmp_file_name


with gr.Blocks() as demo:
    gr.Markdown(title)
    with gr.Tabs():
        with gr.TabItem("🌬️💬📝Standard TTS"):
            with gr.Row():
                text_input_standard = gr.Textbox(label="Enter text")
                lang_input_standard = gr.Dropdown(
                    choices=list(LANGUAGES.keys()), label="Language"
                )
                speaker_input_standard = gr.Audio(
                    label="Upload or Record Speaker Audio (optional)",
                    sources=["upload", "microphone"],
                    type="filepath",
                )
                # Hidden placeholders fill the mixed-mode parameters of the
                # shared callback.
                placeholder_mix_lang = gr.Textbox(visible=False)
                placeholder_mix_text = gr.Textbox(visible=False)
                generate_button_standard = gr.Button("Generate Speech")
                output_audio_standard = gr.Audio(label="🌬️💬📝WhisperSpeech")
            generate_button_standard.click(
                whisper_speech_demo,
                inputs=[
                    text_input_standard,
                    lang_input_standard,
                    speaker_input_standard,
                    placeholder_mix_lang,
                    placeholder_mix_text,
                ],
                outputs=output_audio_standard,
            )
        with gr.TabItem("🌬️💬📝Mixed Language TTS"):
            with gr.Row():
                # Hidden placeholders fill the standard-mode parameters of the
                # shared callback.
                placeholder_text_input = gr.Textbox(visible=False)
                placeholder_lang_input = gr.Dropdown(choices=[], visible=False)
                placeholder_speaker_input = gr.Audio(visible=False)
                mix_lang_input_mixed = gr.CheckboxGroup(
                    choices=list(LANGUAGES.keys()), label="Select Languages"
                )
                mix_text_input_mixed = gr.Textbox(
                    label="Enter mixed language text",
                    placeholder="e.g., Hello, Cześć",
                )
                generate_button_mixed = gr.Button("Generate Mixed Speech")
                output_audio_mixed = gr.Audio(label="Mixed🌬️💬📝WhisperSpeech")
            generate_button_mixed.click(
                whisper_speech_demo,
                inputs=[
                    placeholder_text_input,
                    placeholder_lang_input,
                    placeholder_speaker_input,
                    mix_lang_input_mixed,
                    mix_text_input_mixed,
                ],
                outputs=output_audio_mixed,
            )

demo.launch()