speech-to-speech-translation

Running

File size: 3,168 Bytes

0a5f7db
 
 
5099504
37cb8cc
 
 
 
 
d912609
 
37cb8cc
5099504
0a5f7db
 
 
37cb8cc
0a5f7db
5099504
 
 
 
 
37cb8cc
 
 
 
0a5f7db
 
 
 
37cb8cc
 
 
 
 
 
 
 
0a5f7db
37cb8cc
 
 
 
 
 
 
 
 
 
0a5f7db
37cb8cc
 
0a5f7db
 
 
 
 
 
37cb8cc
 
0a5f7db
 
 
37cb8cc
0a5f7db
 
 
 
37cb8cc
 
0a5f7db
 
 
37cb8cc
0a5f7db
 
 
 
 
37cb8cc

import gradio as gr
import numpy as np
import torch
from transformers import AutoProcessor, pipeline, BarkModel, GenerationConfig

# Hugging Face Hub checkpoint ids for the two stages of the cascade.
ASR_MODEL_NAME = "bofenghuang/whisper-large-v2-cv11-german"
TTS_MODEL_NAME = "suno/bark-small"

BATCH_SIZE = 8

# Bark voice presets, keyed by the lower-cased voice name.
voices = {
    "male": "v2/de_speaker_0",
    "female": "v2/de_speaker_3",
}

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# load speech translation checkpoint
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=10,
    device=device,
)

# update the generation config
# NOTE(review): generation_config is loaded here but nothing visible in this
# file assigns it to asr_pipe.model, and MULTILINGUAL is never read. Confirm
# whether `asr_pipe.model.generation_config = generation_config` was intended.
MULTILINGUAL = True  # set True for multilingual models, False for English-only
generation_config = GenerationConfig.from_pretrained("openai/whisper-large-v2")


# load text-to-speech checkpoint
# Use the shared constant so the processor and the model always come from the
# same checkpoint and changing TTS_MODEL_NAME actually takes effect.
processor = AutoProcessor.from_pretrained(TTS_MODEL_NAME)
model = BarkModel.from_pretrained(TTS_MODEL_NAME).to(device)
# Sample rate reported by the TTS model; returned alongside generated audio.
sampling_rate = model.generation_config.sample_rate

def translate(audio):
    """Run the ASR pipeline on *audio* with the "translate" task.

    Args:
        audio: Input forwarded unchanged to ``asr_pipe`` (the UI passes a
            file path).

    Returns:
        The ``"text"`` field of the pipeline output.
    """
    # NOTE(review): Whisper's "translate" task normally targets English, while
    # the demo advertises German output - confirm this is the intended task.
    generation_options = {"task": "translate"}
    result = asr_pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs=generation_options,
    )
    return result["text"]
    
def synthesise(text, voice_preset):
    """Generate speech for *text* with the TTS model.

    Args:
        text: Text to be spoken.
        voice_preset: Voice preset identifier handed to the processor.

    Returns:
        The first item of the batch produced by ``model.generate``.
    """
    encoded = processor(text=text, voice_preset=voice_preset, return_tensors="pt")
    # Move the processor output to the device the model was loaded on.
    encoded = encoded.to(device)
    generated = model.generate(**encoded)
    return generated[0]
    
def speech_to_speech_translation(audio, voice):
    """Translate input speech to text, then synthesise that text as speech.

    Args:
        audio: Audio input forwarded to ``translate`` (the UI passes a file
            path).
        voice: Voice choice from the UI. ``"Female"`` selects the female
            preset; any other value selects the male preset.

    Returns:
        A ``(sampling_rate, samples)`` tuple where ``samples`` is an
        ``np.int16`` array.
    """
    translated_text = translate(audio)
    print(translated_text)

    voice_preset = voices["female"] if voice == "Female" else voices["male"]
    waveform = synthesise(translated_text, voice_preset).cpu().numpy()

    # Clamp to [-1, 1] before scaling: casting an out-of-range float to int16
    # wraps around instead of saturating, which turns peaks into loud clicks.
    waveform = np.clip(waveform, -1.0, 1.0)
    synthesised_speech = (waveform * 32767).astype(np.int16)
    return sampling_rate, synthesised_speech
    
# Heading and Markdown description shown at the top of each demo tab.
title = "Cascaded STST - Any language to German speech"
# The Bark link label names the checkpoint the link actually points to
# (suno/bark-small).
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in German. Demo uses fine-tuned version of openai/whisper-large-v2 model (https://huggingface.co/bofenghuang/whisper-large-v2-cv11-german) for speech translation, and Suno's
[Bark-small](https://huggingface.co/suno/bark-small) model for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
# Top-level container that hosts the two tabs defined below.
demo = gr.Blocks()

# Tab 1: record from the microphone. The recording is passed to the handler
# as a file path, together with the selected voice.
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        # gr.Radio replaces the deprecated gr.inputs.Radio; `value` is the
        # current name of the old `default` argument.
        gr.Radio(["Male", "Female"], label="Voice", value="Male"),
    ],
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
    allow_flagging="never",
)

# Tab 2: same pipeline, but the audio comes from an uploaded file.
file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=[
        gr.Audio(source="upload", type="filepath"),
        gr.Radio(["Male", "Female"], label="Voice", value="Male"),
    ],
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

# Queue requests with the configured concurrency and queue-size limits.
demo.queue(concurrency_count=2, max_size=10)
demo.launch()