mhemanthkmr143 committed on
Commit
dbc5a52
1 Parent(s): d22e7f9

Code Added

Browse files
Files changed (2) hide show
  1. app.py +122 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torchaudio
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from speechbrain.inference.speaker import EncoderClassifier
6
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
7
+ import noisereduce as nr
8
+ import librosa
9
+
10
# Load the speaker-embedding classifier once, at import time, so every request
# reuses the same instance. `savedir` is the local directory handed to
# SpeechBrain for the pretrained model files.
classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir="pretrained_models/spkrec-xvect-voxceleb",
)
12
+
13
+
14
def f2embed(wav_file, classifier, size_embed):
    """Compute a normalized speaker embedding for an audio file.

    The file is loaded and down-mixed by ``stereo_to_mono``, resampled by
    ``resample_to_16000`` when its sample rate is not 16000 Hz, encoded with
    ``classifier.encode_batch`` and L2-normalized along dimension 2.

    Args:
        wav_file: Path of the audio file to embed.
        classifier: Object exposing ``encode_batch(signal)``.
        size_embed: Expected length of the resulting embedding vector.

    Returns:
        The embedding as a NumPy array, or ``None`` when loading or
        resampling the audio failed.

    Raises:
        ValueError: If the sample rate is still not 16000 Hz after
            resampling, or if the embedding's first dimension differs from
            ``size_embed``.
    """
    signal, fs = stereo_to_mono(wav_file)
    if signal is None:
        return None

    if fs != 16000:
        signal, fs = resample_to_16000(signal, fs)
        if signal is None:
            return None

    # Explicit checks instead of `assert`: assertions are removed when Python
    # runs with -O, which would let bad data reach the TTS model silently.
    if fs != 16000:
        raise ValueError(f"Expected a 16000 Hz signal, got {fs} Hz")

    with torch.no_grad():
        embeddings = classifier.encode_batch(signal)
        embeddings = F.normalize(embeddings, dim=2)
    embeddings = embeddings.squeeze().cpu().numpy()

    if embeddings.shape[0] != size_embed:
        raise ValueError(
            f"Expected an embedding of size {size_embed}, got {embeddings.shape[0]}"
        )
    return embeddings
31
+
32
def stereo_to_mono(wav_file):
    """Load an audio file and down-mix it to a single channel.

    Args:
        wav_file: Path of the audio file, passed to ``torchaudio.load``.

    Returns:
        A ``(signal, fs)`` tuple where ``signal`` is the single-channel
        tensor (channel dimension kept) and ``fs`` is the sample rate
        returned by ``torchaudio.load``. Returns ``(None, None)`` if loading
        or down-mixing raises.
    """
    try:
        signal, fs = torchaudio.load(wav_file)
        if signal.shape[0] > 1:
            # Average across the channel dimension for ANY channel count,
            # not just stereo; otherwise 3+ channel audio would slip through
            # and be concatenated channel-after-channel when flattened later.
            signal_mono = signal.mean(dim=0, keepdim=True)
        else:
            signal_mono = signal  # Already mono
        print(f"Converted to mono: {signal_mono.shape}")
        return signal_mono, fs
    except Exception as e:
        # Best-effort loader: callers treat (None, None) as "could not read".
        print(f"Error in stereo_to_mono: {e}")
        return None, None
46
+
47
def resample_to_16000(signal, original_sr):
    """Resample an audio tensor to 16000 Hz using librosa.

    Args:
        signal: Tensor of audio samples; it is flattened to one dimension
            before resampling.
        original_sr: Sample rate of ``signal`` in Hz.

    Returns:
        ``(resampled, 16000)`` where ``resampled`` is a tensor with a leading
        dimension of size 1, or ``(None, None)`` if any step raises.
    """
    try:
        flat_samples = signal.numpy().flatten()
        resampled = torch.from_numpy(
            librosa.resample(flat_samples, orig_sr=original_sr, target_sr=16000)
        ).unsqueeze(0)
        print(f"Resampled to 16000 Hz: {resampled.shape}")
        return resampled, 16000
    except Exception as err:
        # Failure is reported to the caller as a (None, None) pair.
        print(f"Error in resample_to_16000: {err}")
        return None, None
57
+
58
def reduce_noise(speech, noise_reduction_amount=0.5):
    """Denoise speech with ``noisereduce``, falling back to the input on error.

    Args:
        speech: Audio samples, passed to ``nr.reduce_noise`` as ``y`` with a
            fixed sample rate of 16000 Hz.
        noise_reduction_amount: Accepted for interface compatibility; the
            body does not use it.
            NOTE(review): probably meant to control reduction strength —
            confirm the intended default before wiring it in.

    Returns:
        The denoised audio, or ``speech`` unchanged if denoising raises.
    """
    try:
        cleaned = nr.reduce_noise(y=speech, sr=16000)
    except Exception as err:
        # Denoising is best-effort: hand back the untouched audio.
        print(f"Error in reduce_noise: {err}")
        return speech
    return cleaned
65
+
66
+
67
+
68
# Cache for the SpeechT5 processor, model and vocoder so the checkpoints are
# loaded once per process instead of on every request.
_TTS_COMPONENTS = {}


def _get_tts_components():
    """Return the cached ``(processor, model, vocoder)``, loading on first use."""
    if not _TTS_COMPONENTS:
        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        # Publish all three together so a failed load never leaves a
        # half-filled cache behind.
        _TTS_COMPONENTS.update(processor=processor, model=model, vocoder=vocoder)
    return (
        _TTS_COMPONENTS["processor"],
        _TTS_COMPONENTS["model"],
        _TTS_COMPONENTS["vocoder"],
    )


def process_audio(wav_file, text):
    """Synthesize ``text`` as speech conditioned on the speaker in ``wav_file``.

    A 512-dimensional speaker embedding is extracted with ``f2embed``, the
    text is synthesized with SpeechT5 and its HiFi-GAN vocoder, and the
    result is passed through ``reduce_noise``.

    Args:
        wav_file: Path of the reference speaker audio file.
        text: Text to synthesize.

    Returns:
        ``(audio, 16000)`` on success, where ``audio`` is the denoised
        waveform. On failure returns ``(None, message)`` with a short
        description of what went wrong; exceptions are logged, not raised.
    """
    # Reject empty input before doing any model work.
    if not text or not text.strip():
        return None, "Error: no text provided"

    try:
        # Extract speaker embeddings
        speaker_embeddings = f2embed(wav_file, classifier, 512)
        if speaker_embeddings is None:
            return None, "Error in speaker embedding extraction"

        # The TTS model takes the speaker embedding with a leading batch dimension.
        embeddings = torch.tensor(speaker_embeddings).unsqueeze(0)

        # Convert text to speech using the speaker embeddings
        processor, model, vocoder = _get_tts_components()
        inputs = processor(text=text, return_tensors="pt")
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings=embeddings,
            vocoder=vocoder,
        )
        print(f"Generated speech, shape: {speech.shape}")

        # Convert to a NumPy array so the denoiser and the Gradio audio
        # output (type="numpy") receive an array, not a torch tensor.
        speech = speech.detach().cpu().numpy()

        # Reduce noise
        speech_denoised = reduce_noise(speech)
        print(f"Reduced noise, signal shape: {speech_denoised.shape}")
        return speech_denoised, 16000
    except Exception as e:
        # Top-level boundary for the request: log and report, do not crash.
        print(f"Error in process_audio: {e}")
        return None, "Error in audio processing"
98
+
99
+ # Gradio interface
100
def gradio_interface(wav_file, text):
    """Gradio callback: turn the uploaded audio and text into synthesized speech.

    Args:
        wav_file: File path of the uploaded speaker audio.
        text: Text entered by the user.

    Returns:
        ``(sample_rate, audio)`` as produced by ``process_audio`` (in swapped
        order), the tuple form used for the numpy audio output.

    Raises:
        gr.Error: If processing fails. Returning a plain string to an audio
            output does not show the user a readable error, so the failure
            is raised for Gradio to display instead.
    """
    try:
        processed_audio, rate = process_audio(wav_file, text)
    except Exception as e:
        print(f"Error in gradio_interface: {e}")
        raise gr.Error("Error occurred during processing") from e

    if processed_audio is None:
        # On failure the second element carries the error message rather
        # than a sample rate; surface it when it is a string.
        message = rate if isinstance(rate, str) else "Error occurred during processing"
        raise gr.Error(message)
    return (rate, processed_audio)
109
+
110
# Assemble the web UI: a speaker audio upload (delivered as a file path) and a
# two-line text box as inputs, and a numpy-backed audio player as the output.
gr_interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Textbox(lines=2, placeholder="Enter text here..."),
    ],
    outputs=gr.Audio(type="numpy"),
    title="Text-to-Speech with Speaker Embeddings",
    description="Upload a speaker audio file and enter text to convert the text to speech using the speaker's voice.",
)

# Start serving the interface.
gr_interface.launch()
120
+
121
+
122
+ # process_audio("/content/Network Chunck.mp3","Hello this network chunk")
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ torchaudio
3
+ transformers
4
+ noisereduce
5
+ librosa
6
+ speechbrain