File size: 2,414 Bytes
0a5f7db
 
 
4990310
37cb8cc
0a5f7db
 
4990310
b503d5a
0a5f7db
71c7a94
37cb8cc
4990310
 
0a5f7db
b503d5a
0a5f7db
4990310
0a5f7db
37cb8cc
 
4990310
 
 
 
 
37cb8cc
4990310
 
 
 
0a5f7db
4990310
 
 
 
37cb8cc
 
0a5f7db
4990310
 
0a5f7db
 
 
 
 
 
4990310
0a5f7db
 
 
 
 
 
 
4990310
0a5f7db
4990310
0a5f7db
 
 
 
 
 
 
37cb8cc
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import gradio as gr
import numpy as np
import torch
from transformers import AutoProcessor, pipeline, VitsModel, VitsTokenizer


# Load the speech translation checkpoint.
# `device` must exist before the pipeline is built: use the first CUDA GPU
# when torch reports one, otherwise run on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

ASR_MODEL_NAME = "openai/whisper-base"
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=30,
    device=device,
)


# Load the text-to-speech checkpoint; the tokenizer and the model are both
# read from the same repository id.
TTS_CHECKPOINT = "Matthijs/mms-tts-deu"
tokenizer = VitsTokenizer.from_pretrained(TTS_CHECKPOINT)
model = VitsModel.from_pretrained(TTS_CHECKPOINT)


def translate(audio):
    """Run the ASR pipeline on ``audio`` and return the text it produced.

    Generation is limited to 256 new tokens and configured with
    task="transcribe" and language="de".
    NOTE(review): presumably this makes Whisper emit German text whatever the
    spoken language is -- confirm against the Whisper documentation.

    Args:
        audio: Input forwarded unchanged to ``asr_pipe``.

    Returns:
        The value stored under the "text" key of the pipeline result.
    """
    generation_options = {"task": "transcribe", "language": "de"}
    result = asr_pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs=generation_options,
    )
    return result["text"]
    
def synthesise(text, voice_preset=None):
    """Turn ``text`` into a speech waveform with the text-to-speech model.

    Args:
        text: The text to be spoken.
        voice_preset: Unused by this implementation. Kept, with a default of
            None, so callers may pass it or leave it out.

    Returns:
        The first waveform of the model output, moved to the CPU.
    """
    # Tokenize the text that was actually passed in.
    inputs = tokenizer(text, return_tensors="pt")
    # Put the token ids on whatever device the model lives on.
    input_ids = inputs["input_ids"].to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(input_ids)

    # NOTE(review): released transformers versions expose the VITS output as
    # `waveform`; `audio` is kept as written -- confirm against the installed
    # version.
    speech = outputs.audio[0]
    return speech.cpu()
    
def speech_to_speech_translation(audio):
    """Translate spoken ``audio`` to text and speak that text back.

    Args:
        audio: Input forwarded to ``translate``.

    Returns:
        A ``(sampling_rate, samples)`` tuple where ``sampling_rate`` is 16000
        and ``samples`` is a NumPy int16 array.
        NOTE(review): 16000 presumably matches the TTS model's output
        sampling rate -- confirm against the model configuration.
    """
    translated_text = translate(audio)
    # Pass voice_preset explicitly so the call is valid whether or not
    # synthesise declares a default for it.
    synthesised_speech = synthesise(translated_text, None)

    # Largest int16 value, used to scale the float waveform to 16-bit PCM.
    max_range = np.iinfo(np.int16).max
    # Assumes the waveform is nominally in [-1.0, 1.0]; clip so that any
    # overshoot saturates instead of wrapping around in the int16 cast.
    waveform = np.clip(synthesised_speech.numpy(), -1.0, 1.0)
    synthesised_speech = (waveform * max_range).astype(np.int16)
    return 16000, synthesised_speech
    
    
# Heading and Markdown description shown on the demo page.
# MMS (Massively Multilingual Speech) is Meta's model, so the text-to-speech
# credit names Meta rather than Microsoft.
title = "Cascaded STST - Any language to German speech"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in German. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Meta's
[MMS TTS](https://huggingface.co/Matthijs/mms-tts-deu) model for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
# Top-level container that hosts the tabbed layout.
demo = gr.Blocks()

# Tab 1: audio from the microphone, handed to the callback as a file path.
mic_translate = gr.Interface(
    speech_to_speech_translation,
    gr.Audio(source="microphone", type="filepath"),
    gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

# Tab 2: an uploaded audio file, with one bundled example clip.
file_translate = gr.Interface(
    speech_to_speech_translation,
    gr.Audio(source="upload", type="filepath"),
    gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    title=title,
    description=description,
)

# Show both interfaces as tabs inside the Blocks container.
with demo:
    gr.TabbedInterface(
        interface_list=[mic_translate, file_translate],
        tab_names=["Microphone", "Audio File"],
    )

# Configure the request queue, then start the app.
demo.queue(concurrency_count=2, max_size=10)
demo.launch()