import gradio as gr
import torchaudio
import torch
import torch.nn.functional as F
from speechbrain.inference.speaker import EncoderClassifier
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import noisereduce as nr
import librosa

# Load the speaker-embedding model once at import time
classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir="pretrained_models/spkrec-xvect-voxceleb",
)


def f2embed(wav_file, classifier, size_embed):
    """Extract an L2-normalized x-vector speaker embedding from an audio file."""
    signal, fs = stereo_to_mono(wav_file)
    if signal is None:
        return None
    if fs != 16000:
        signal, fs = resample_to_16000(signal, fs)
        if signal is None:
            return None
    assert fs == 16000, fs
    with torch.no_grad():
        embeddings = classifier.encode_batch(signal)
        embeddings = F.normalize(embeddings, dim=2)
        embeddings = embeddings.squeeze().cpu().numpy()
    assert embeddings.shape[0] == size_embed, embeddings.shape[0]
    return embeddings


def stereo_to_mono(wav_file):
    """Load an audio file and downmix multi-channel audio to mono."""
    try:
        signal, fs = torchaudio.load(wav_file)
        signal_np = signal.numpy()
        if signal_np.shape[0] > 1:  # Multi-channel: average channels to mono
            signal_mono = librosa.to_mono(signal_np)
            signal_mono = torch.from_numpy(signal_mono).unsqueeze(0)
        else:
            signal_mono = signal  # Already mono
        print(f"Converted to mono: {signal_mono.shape}")
        return signal_mono, fs
    except Exception as e:
        print(f"Error in stereo_to_mono: {e}")
        return None, None


def resample_to_16000(signal, original_sr):
    """Resample a mono signal to the 16 kHz rate the x-vector model expects."""
    try:
        signal_np = signal.numpy().flatten()
        signal_resampled = librosa.resample(signal_np, orig_sr=original_sr, target_sr=16000)
        signal_resampled = torch.from_numpy(signal_resampled).unsqueeze(0)
        print(f"Resampled to 16000 Hz: {signal_resampled.shape}")
        return signal_resampled, 16000
    except Exception as e:
        print(f"Error in resample_to_16000: {e}")
        return None, None


def reduce_noise(speech, sr=16000):
    """Apply spectral-gating noise reduction; return the input unchanged on failure."""
    try:
        # noisereduce works on NumPy arrays, so convert if we got a torch tensor
        speech_np = speech.cpu().numpy() if torch.is_tensor(speech) else speech
        return nr.reduce_noise(y=speech_np, sr=sr)
    except Exception as e:
        print(f"Error in reduce_noise: {e}")
        return speech


def process_audio(wav_file, text):
    try:
        # Extract a 512-dimensional speaker embedding from the reference clip
        speaker_embeddings = f2embed(wav_file, classifier, 512)
        if speaker_embeddings is None:
            return None, "Error in speaker embedding extraction"
        embeddings = torch.tensor(speaker_embeddings).unsqueeze(0)

        # Convert text to speech conditioned on the speaker embedding
        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        inputs = processor(text=text, return_tensors="pt")
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings=embeddings, vocoder=vocoder)
        print(f"Generated speech, shape: {speech.shape}")

        # Reduce noise in the synthesized audio
        speech_denoised = reduce_noise(speech)
        print(f"Reduced noise, signal shape: {speech_denoised.shape}")
        return speech_denoised, 16000
    except Exception as e:
        print(f"Error in process_audio: {e}")
        return None, "Error in audio processing"


# Gradio wrapper: gr.Audio with type="numpy" expects a (sample_rate, array) tuple
def gradio_interface(wav_file, text):
    try:
        processed_audio, rate = process_audio(wav_file, text)
        if processed_audio is None:
            return "Error occurred during processing"
        return (rate, processed_audio)
    except Exception as e:
        print(f"Error in gradio_interface: {e}")
        return "Error occurred during processing"


# Create Gradio interface
gr_interface = gr.Interface(
    fn=gradio_interface,
    inputs=[gr.Audio(type="filepath"), gr.Textbox(lines=2, placeholder="Enter text here...")],
    outputs=gr.Audio(type="numpy"),
    title="Text-to-Speech with Speaker Embeddings",
    description="Upload a speaker audio file and enter text to convert the text to speech using the speaker's voice.",
)

gr_interface.launch()

# process_audio("/content/Network Chunck.mp3", "Hello this network chunk")
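# A minimal headless usage sketch (no Gradio UI), assuming the extra soundfile
# dependency is installed and "speaker.wav" / "output.wav" are hypothetical paths:
#
# import soundfile as sf
# audio, sr = process_audio("speaker.wav", "Hello from a cloned voice.")
# if audio is not None:
#     sf.write("output.wav", audio, sr)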