Shabbir-Anjum committed on
Commit
e7a21bd
1 Parent(s): 8908b04

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -49
app.py CHANGED
@@ -1,52 +1,46 @@
1
  import streamlit as st
2
- from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
3
- from datasets import load_dataset
4
  import torch
5
  import soundfile as sf
6
- import os
7
-
8
- # Function to generate speech using the pipeline method
9
- def generate_speech_pipeline(text, speaker_embedding):
10
- synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts")
11
- speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
12
- return speech["audio"], speech["sampling_rate"]
13
-
14
- # Function to generate speech using the processor + generate method
15
- def generate_speech_processor(text, speaker_embedding):
16
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
17
- model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
18
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
19
-
20
- inputs = processor(text=text, return_tensors="pt")
21
- speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
22
- return speech.numpy(), 16000
23
-
24
- def main():
25
- st.title("Text-to-Speech with SpeechT5")
26
-
27
- st.write("Enter the text you want to convert to speech:")
28
-
29
- text = st.text_area("Text", "Hello, my dog is cooler than you!")
30
-
31
- if st.button("Generate Speech"):
32
- st.write("Generating speech...")
33
-
34
- # Load xvector containing speaker's voice characteristics from a dataset
35
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
36
- speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
37
-
38
- # Choose the method to generate speech
39
- method = st.selectbox("Choose the method for generating speech", ["Pipeline", "Processor + Generate"])
40
-
41
- if method == "Pipeline":
42
- audio, samplerate = generate_speech_pipeline(text, speaker_embedding)
43
- else:
44
- audio, samplerate = generate_speech_processor(text, speaker_embedding)
45
-
46
- # Save and play the generated speech
47
- output_path = "speech.wav"
48
- sf.write(output_path, audio, samplerate=samplerate)
49
- st.audio(output_path)
50
-
51
- if __name__ == "__main__":
52
- main()
 
1
  import streamlit as st
2
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
 
3
  import torch
4
  import soundfile as sf
5
+ from datasets import load_dataset
6
+
7
+ # Initialize the processor and model
8
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
9
+ model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
10
+
11
+ # Load the vocoder
12
+ vocoder = torch.hub.load("s3prl/s3prl", "mb_melgan")
13
+
14
+ # Initialize session state
15
+ if 'text' not in st.session_state:
16
+ st.session_state['text'] = "Hello, my dog is cooler than you!"
17
+
18
+ # Function to update session state
19
+ def update_text():
20
+ st.session_state['text'] = st.text_area("Text", st.session_state['text'])
21
+
22
+ st.title("Text-to-Speech with SpeechT5")
23
+ st.write("Enter the text you want to convert to speech:")
24
+
25
+ # Use session state to store text
26
+ update_text()
27
+
28
+ if st.button("Generate Speech"):
29
+ st.write("Generating speech...")
30
+
31
+ # Process the input text
32
+ inputs = processor(text=st.session_state['text'], return_tensors="pt")
33
+
34
+ # Generate speech
35
+ speech = model.generate_speech(inputs["input_ids"], speaker_embeddings=None)
36
+
37
+ # Use the vocoder to convert the generated speech to audio
38
+ with torch.no_grad():
39
+ audio = vocoder(speech)
40
+
41
+ # Save the audio to a file
42
+ sf.write("output.wav", audio.cpu().numpy(), samplerate=16000)
43
+
44
+ # Embed an audio player for the generated speech
45
+ st.audio("output.wav", format="audio/wav")
46
+ st.write("Speech generation complete. You can listen to the generated speech above.")