Marco-Cheung committed
Commit
4990310
1 Parent(s): b503d5a

Update app.py

Files changed (1)
  1. app.py +23 -42
app.py CHANGED
@@ -1,82 +1,63 @@
  import gradio as gr
  import numpy as np
  import torch
- from transformers import AutoProcessor, pipeline, BarkModel, GenerationConfig

- ASR_MODEL_NAME = "bofenghuang/whisper-large-v2-cv11-german"
- TTS_MODEL_NAME = "suno/bark-small"
- BATCH_SIZE = 8
- voices = {
-     "male": "v2/de_speaker_0",
-     "female": "v2/de_speaker_3"
- }
-
- device = "cuda:0" if torch.cuda.is_available() else "cpu"

  # load speech translation checkpoint
  asr_pipe = pipeline("automatic-speech-recognition", model=ASR_MODEL_NAME, chunk_length_s=30, device=device)

- MULTILINGUAL = True  # set True for multilingual models, False for English-only
-
- if MULTILINGUAL:
-     generation_config = GenerationConfig.from_pretrained("openai/whisper-large-v2")

  # load text-to-speech checkpoint
- processor = AutoProcessor.from_pretrained("suno/bark-small")
- model = BarkModel.from_pretrained("suno/bark-small").to(device)
- sampling_rate = model.generation_config.sample_rate

- # set the forced ids
- model.config.forced_decoder_ids = None
- model.config.suppress_tokens = []

  def translate(audio):
-     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
      return outputs["text"]

  def synthesise(text, voice_preset):
-     inputs = processor(text=text, return_tensors="pt", voice_preset=voice_preset)
-     speech = model.generate(**inputs.to(device))
-     return speech[0]

- def speech_to_speech_translation(audio, voice):
-     voice_preset = None
      translated_text = translate(audio)
-     print(translated_text)
-     if voice == "Female":
-         voice_preset = voices["female"]
-     else:
-         voice_preset = voices["male"]
-     synthesised_speech = synthesise(translated_text, voice_preset)
-     synthesised_speech = (synthesised_speech.cpu().numpy() * 32767).astype(np.int16)
-     return sampling_rate, synthesised_speech

  title = "Cascaded STST - Any language to German speech"
  description = """
- Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in German. Demo uses a fine-tuned version of the openai/whisper-large-v2 model ([bofenghuang/whisper-large-v2-cv11-german](https://huggingface.co/bofenghuang/whisper-large-v2-cv11-german)) for speech translation, and Suno's
- [Bark-small](https://huggingface.co/suno/bark-small) model for text-to-speech:
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
  """
  demo = gr.Blocks()

  mic_translate = gr.Interface(
      fn=speech_to_speech_translation,
-     inputs=[gr.Audio(source="microphone", type="filepath"),
-             gr.inputs.Radio(["Male", "Female"], label="Voice", default="Male")],
      outputs=gr.Audio(label="Generated Speech", type="numpy"),
      title=title,
      description=description,
-     allow_flagging="never"
  )

  file_translate = gr.Interface(
      fn=speech_to_speech_translation,
-     inputs=[gr.Audio(source="upload", type="filepath"),
-             gr.inputs.Radio(["Male", "Female"], label="Voice", default="Male")],
      outputs=gr.Audio(label="Generated Speech", type="numpy"),
      title=title,
      description=description,
-     allow_flagging="never"
  )

  with demo:
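Both sides of the diff convert the model's floating-point waveform to 16-bit PCM before returning it to Gradio's numpy audio output: the removed Bark code above scales by the literal 32767, and the new code below does the same via the int16 maximum. A minimal standalone sketch of that conversion (the sample values are illustrative):

import numpy as np

# Gradio's numpy audio output expects a (sampling_rate, int16_array) tuple.
# Model waveforms are floats in [-1.0, 1.0], so scale by int16 full scale before casting.
float_speech = np.array([0.0, 0.5, -1.0], dtype=np.float32)  # stand-in for model output
pcm16 = (float_speech * np.iinfo(np.int16).max).astype(np.int16)
print(pcm16)  # [     0  16383 -32767]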
 
  import gradio as gr
  import numpy as np
  import torch
+ from transformers import pipeline, VitsModel, VitsTokenizer

+ device = "cuda:0" if torch.cuda.is_available() else "cpu"  # still needed by the ASR pipeline below

  # load speech translation checkpoint
+ ASR_MODEL_NAME = "openai/whisper-base"
  asr_pipe = pipeline("automatic-speech-recognition", model=ASR_MODEL_NAME, chunk_length_s=30, device=device)

  # load text-to-speech checkpoint
+ model = VitsModel.from_pretrained("Matthijs/mms-tts-deu")
+ tokenizer = VitsTokenizer.from_pretrained("Matthijs/mms-tts-deu")

  def translate(audio):
+     # forcing language "de" with task "transcribe" makes Whisper emit German text whatever the source language
+     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "de"})
      return outputs["text"]

+ def synthesise(text):
+     inputs = tokenizer(text, return_tensors="pt")
+     input_ids = inputs["input_ids"]
+
+     with torch.no_grad():
+         outputs = model(input_ids)
+
+     speech = outputs.waveform[0]  # VitsModelOutput exposes the audio as `waveform`
+     return speech.cpu()

+ def speech_to_speech_translation(audio):
      translated_text = translate(audio)
+     synthesised_speech = synthesise(translated_text)
+     max_range = int(np.iinfo(np.int16).max)  # 32767, full scale for 16-bit PCM
+     synthesised_speech = (synthesised_speech.numpy() * max_range).astype(np.int16)
+     return 16000, synthesised_speech

  title = "Cascaded STST - Any language to German speech"
  description = """
+ Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in German. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Meta's
+ [MMS TTS](https://huggingface.co/Matthijs/mms-tts-deu) model for text-to-speech:
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
  """
  demo = gr.Blocks()

  mic_translate = gr.Interface(
      fn=speech_to_speech_translation,
+     inputs=gr.Audio(source="microphone", type="filepath"),
      outputs=gr.Audio(label="Generated Speech", type="numpy"),
      title=title,
      description=description,
  )

  file_translate = gr.Interface(
      fn=speech_to_speech_translation,
+     inputs=gr.Audio(source="upload", type="filepath"),
      outputs=gr.Audio(label="Generated Speech", type="numpy"),
+     examples=[["./example.wav"]],
      title=title,
      description=description,
  )

  with demo:
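To sanity-check the updated pipeline outside Gradio, a minimal smoke test can help. This is a sketch, not part of the commit: it assumes the functions above are already defined in the session (e.g. with the demo launch skipped), that scipy is installed, and it reuses the repo's ./example.wav from the examples list.

from scipy.io import wavfile

# Run the cascaded pipeline on the bundled example and save the German speech.
sample_rate, pcm_audio = speech_to_speech_translation("./example.wav")  # filepath input, as in both interfaces
wavfile.write("translated_de.wav", sample_rate, pcm_audio)  # 16 kHz, int16 PCM
print(f"wrote {len(pcm_audio) / sample_rate:.1f}s of German audio")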