text-to-speach / app.py
Shabbir-Anjum's picture
Update app.py
e7a21bd verified
raw
history blame contribute delete
No virus
1.52 kB
import io

import soundfile as sf
import streamlit as st
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
# Load the processor, acoustic model and vocoder once per server process.
@st.cache_resource
def _load_tts_components():
    """Load and return ``(processor, model, vocoder)`` for SpeechT5.

    Streamlit re-executes the whole script on every interaction, so the
    checkpoints are wrapped in ``st.cache_resource`` to avoid re-instantiating
    them on each rerun.

    The vocoder is the HiFi-GAN checkpoint that the SpeechT5 documentation
    pairs with ``microsoft/speecht5_tts``; it turns the spectrogram returned
    by ``generate_speech`` into a waveform.
    """
    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return tts_processor, tts_model, tts_vocoder


# Module-level names kept so the rest of the script can keep using them.
processor, model, vocoder = _load_tts_components()
# Seed the session with a sample sentence the first time the script runs;
# an existing 'text' entry is left untouched.
st.session_state.setdefault('text', "Hello, my dog is cooler than you!")
# Render the text box that holds the sentence to synthesize.
def update_text():
    """Render the input text area, bound to ``st.session_state['text']``.

    The widget is tied to session state through ``key="text"`` instead of
    receiving the stored text as its default value and having the result
    written back. With that older pattern the default changes after every
    edit, Streamlit treats the text area as a new widget on the next rerun,
    and every second edit typed by the user is discarded.

    If ``st.session_state['text']`` is already set when this runs, the widget
    starts out showing that value; afterwards Streamlit keeps the entry in
    sync with whatever the user types.
    """
    st.text_area("Text", key="text")
# Page heading and a short prompt shown above the input box.
st.title("Text-to-Speech with SpeechT5")
st.write("Enter the text you want to convert to speech:")
# Render the text input; the entered text is kept in st.session_state['text']
# so it is available when the "Generate Speech" button is handled.
update_text()
@st.cache_resource
def _load_speaker_embeddings():
    """Return a default speaker x-vector as a tensor with a leading batch axis.

    ``generate_speech`` needs a speaker embedding to choose a voice; it does
    not work with ``None``. The entry used here (index 7306 of the
    ``Matthijs/cmu-arctic-xvectors`` validation split) is the one used in the
    SpeechT5 usage example. The result is cached so the dataset is loaded
    only once per server process.
    """
    xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    return torch.tensor(xvectors[7306]["xvector"]).unsqueeze(0)


if st.button("Generate Speech"):
    text = st.session_state['text']
    if not text.strip():
        # Nothing to synthesize; tell the user instead of running the model.
        st.warning("Please enter some text to convert to speech.")
    else:
        with st.spinner("Generating speech..."):
            # Tokenize the input text.
            inputs = processor(text=text, return_tensors="pt")
            # Generate the spectrogram for the chosen speaker.
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=_load_speaker_embeddings(),
            )
            # Use the vocoder to convert the spectrogram to a waveform.
            with torch.no_grad():
                audio = vocoder(speech)
            # Encode as WAV in memory: a fixed file name on disk would be
            # shared (and overwritten) by every concurrent session.
            wav_buffer = io.BytesIO()
            sf.write(wav_buffer, audio.cpu().numpy(), samplerate=16000, format="WAV")
        # Play back the generated speech in the page.
        st.audio(wav_buffer.getvalue(), format="audio/wav")
        st.write("Speech generation complete. You can listen to the generated speech above.")