"""Streamlit app that turns user-entered text into speech with SpeechT5.

The page shows a text area and a "Generate Speech" button. On click, the
text is tokenized, SpeechT5 produces a waveform through its HiFi-GAN
vocoder, and the result is played back in the browser.
"""

import io

import soundfile as sf
import streamlit as st
import torch
from datasets import load_dataset
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)

TTS_CHECKPOINT = "microsoft/speecht5_tts"
VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"
# X-vector speaker embeddings used in the SpeechT5 model-card example.
SPEAKER_DATASET = "Matthijs/cmu-arctic-xvectors"
SPEAKER_INDEX = 7306
SAMPLE_RATE = 16000
DEFAULT_TEXT = "Hello, my dog is cooler than you!"


@st.cache_resource
def _load_tts_components():
    """Load the processor, TTS model, vocoder and speaker embedding once.

    Streamlit re-executes the whole script on every widget interaction;
    caching keeps the heavyweight model objects alive across reruns instead
    of rebuilding them each time.

    Returns:
        A ``(processor, model, vocoder, speaker_embeddings)`` tuple, where
        ``speaker_embeddings`` is a tensor with a leading batch dimension
        of 1.
    """
    tts_processor = SpeechT5Processor.from_pretrained(TTS_CHECKPOINT)
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(TTS_CHECKPOINT)
    # The HiFi-GAN checkpoint is the vocoder published alongside SpeechT5.
    tts_vocoder = SpeechT5HifiGan.from_pretrained(VOCODER_CHECKPOINT)

    # SpeechT5 is speaker-conditioned: generation needs an x-vector.
    xvectors = load_dataset(SPEAKER_DATASET, split="validation")
    embedding = torch.tensor(xvectors[SPEAKER_INDEX]["xvector"]).unsqueeze(0)
    return tts_processor, tts_model, tts_vocoder, embedding


# Module-level names kept so the rest of the script reads as before.
processor, model, vocoder, speaker_embeddings = _load_tts_components()

# Initialize session state
if 'text' not in st.session_state:
    st.session_state['text'] = DEFAULT_TEXT


def update_text():
    """Render the text area and store its current value in session state."""
    st.session_state['text'] = st.text_area("Text", st.session_state['text'])


st.title("Text-to-Speech with SpeechT5")
st.write("Enter the text you want to convert to speech:")

# Use session state to store text
update_text()

if st.button("Generate Speech"):
    text = st.session_state['text']
    if not text.strip():
        # Nothing to synthesize; avoid feeding an empty prompt to the model.
        st.warning("Please enter some text to convert to speech.")
    else:
        st.write("Generating speech...")

        # Process the input text
        inputs = processor(text=text, return_tensors="pt")

        # Passing the vocoder makes generate_speech return a waveform
        # directly instead of a mel spectrogram.
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings,
            vocoder=vocoder,
        )

        # Encode to WAV in memory so concurrent sessions do not overwrite
        # a shared file on disk.
        wav_buffer = io.BytesIO()
        sf.write(
            wav_buffer,
            speech.cpu().numpy(),
            samplerate=SAMPLE_RATE,
            format="WAV",
        )

        # Embed an audio player for the generated speech
        st.audio(wav_buffer.getvalue(), format="audio/wav")
        st.write("Speech generation complete. You can listen to the generated speech above.")