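# Cascaded speech-to-speech translation (STST) demo:
# source speech in any language -> French text (Whisper) -> French speech (MMS-TTS VITS).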
import torch
from transformers import pipeline, VitsModel, VitsTokenizer
import numpy as np
import gradio as gr

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load Whisper-small for the speech translation stage
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=device,
)

# Load the MMS-TTS French VITS checkpoint and its tokenizer
# (facebook/mms-tts-fra is the official upload of the earlier Matthijs/mms-tts-fra port)
model = VitsModel.from_pretrained("facebook/mms-tts-fra")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-fra")
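
# The MMS-TTS VITS checkpoints generate 16 kHz audio; rather than trusting the
# hard-coded rate used further down, it can be read from the model config:
# print(model.config.sampling_rate)  # 16000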


# Translate the source speech into French text. Whisper's "translate" task
# only targets English, so we instead pair task="transcribe" with
# language="fr" to coax Whisper into emitting French output directly.
def translate(audio):
    outputs = pipe(audio, max_new_tokens=256,
                   generate_kwargs={"task": "transcribe", "language": "fr"})
    return outputs["text"]
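
# Standalone usage sketch (uses the example file bundled with this demo):
# translate("./example.wav")  # should return the French text for the clip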


# Generate the output waveform from French text with the VITS model
def synthesise(text):
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"]

    with torch.no_grad():
        outputs = model(input_ids)

    # VitsModel returns a VitsModelOutput; the audio lives in `waveform`
    return outputs.waveform[0]
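
# Standalone synthesis sketch:
# waveform = synthesise("Bonjour le monde")  # 1-D float tensor at 16 kHz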


# Chain the two stages: speech -> French text -> French speech.
# Gradio's numpy audio output expects (sample_rate, int16 array), so the
# float waveform in [-1, 1] is rescaled to the int16 range.
def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    return 16000, synthesised_speech
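
# Quick offline sanity check (uncomment to try with the bundled example file):
# sr, audio = speech_to_speech_translation("./example.wav")
# print(sr, audio.shape, audio.dtype)  # expect 16000, (n_samples,), int16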


# Title and description shown in the Gradio UI
title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in French. The demo uses OpenAI's [Whisper Small](https://huggingface.co/openai/whisper-small) model for speech translation, and Facebook's
[MMS TTS](https://huggingface.co/facebook/mms-tts) model, ported to Transformers by [Matthijs](https://huggingface.co/Matthijs), for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""

demo = gr.Blocks()
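
# Note: `source=` below is the Gradio 3.x Audio argument; Gradio 4.x renamed
# it to `sources=["microphone"]` / `sources=["upload"]`.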

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    title=title,
    description=description,
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch()