Oysiyl committed on
Commit
090156b
1 Parent(s): 833dd4d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -0
app.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Text
3
+ import gradio as gr
4
+ import soundfile as sf
5
+ from transformers import pipeline
6
+ import numpy as np
7
+ import torch
8
+ import re
9
+ from speechbrain.pretrained import EncoderClassifier
10
+
11
+
12
def create_speaker_embedding(speaker_model, waveform: np.ndarray) -> torch.Tensor:
    """Compute a normalized speaker embedding for a reference waveform.

    Args:
        speaker_model: Object exposing ``encode_batch(tensor)``; in this file
            it is a SpeechBrain ``EncoderClassifier``.
        waveform: Audio samples of the reference speaker.

    Returns:
        The embedding as a ``torch.Tensor`` with a leading axis of size 1,
        cast to the module-level ``dtype`` and placed on the module-level
        ``device``.
    """
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        # L2-normalize along axis 2 of the encoder output.
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        # Drop singleton axes, then cast and move in a single step. This
        # replaces a tensor -> numpy -> tensor round trip that needed a
        # device-specific branch only because ``.numpy()`` requires a CPU
        # tensor.
        speaker_embeddings = speaker_embeddings.squeeze().to(device=device, dtype=dtype)
    # Re-add the leading axis that ``squeeze()`` removed.
    return speaker_embeddings.unsqueeze(0)
22
+
23
+
24
# Single-pass translation table: characters mapped to ``None`` are deleted,
# typographic single quotes are mapped to a plain apostrophe.
_SPECIAL_CHARACTER_TABLE = str.maketrans(
    {
        "=": None,
        "´": None,
        "–": None,
        "“": None,
        "”": None,
        "…": None,
        "‘": "'",
        "’": "'",
    }
)


def remove_special_characters_s(text: Text) -> Text:
    """Strip unwanted punctuation, normalize apostrophes and lowercase *text*.

    The characters ``= ´ – “ ” …`` are removed, the curly single quotes
    ``‘`` and ``’`` become ``'``, and the result is lowercased.

    Note that the acute accent ``´`` is deleted rather than converted to an
    apostrophe: it has always been stripped before any later substitution
    could see it, and that behavior is preserved here.

    Args:
        text: Raw input text.

    Returns:
        The cleaned, lowercased text.
    """
    return text.translate(_SPECIAL_CHARACTER_TABLE).lower()
33
+
34
+
35
# Stage 1: single-character substitutions (accent stripping, '&' -> 'en',
# 'j' -> 'y'). None of the outputs is itself a key, so one translate pass is
# equivalent to applying the substitutions one after another.
_DUTCH_CHAR_TABLE = str.maketrans(
    {
        "à": "a",
        "ç": "c",
        "è": "e",
        "ë": "e",
        "í": "i",
        "ï": "i",
        "ö": "o",
        "ü": "u",
        "&": "en",
        "á": "a",
        "ä": "a",
        "î": "i",
        "ó": "o",
        "ú": "u",
        "û": "u",
        "ă": "a",
        "ć": "c",
        "đ": "d",
        "š": "s",
        "ţ": "t",
        "j": "y",
    }
)

# Stage 2: multi-character rewrites. ORDER MATTERS: every rule runs over the
# output of the previous ones, so longer clusters ('eeuw', 'ieuw', 'oei', ...)
# must precede the shorter clusters they contain.
_DUTCH_CLUSTER_RULES = (
    ("ci", "si"),
    ("ce", "se"),
    ("ca", "ka"),
    ("co", "ko"),
    ("cu", "ku"),
    (" sch", " sg"),
    ("sch ", "s "),
    ("ch", "g"),
    ("eeuw", "eaw"),
    ("ee", "ea"),
    ("aai", "ay"),
    ("oei", "ooy"),
    ("ooi", "oay"),
    ("ieuw", "eew"),
    ("ie", "ee"),
    ("oo", "oa"),
    ("oe", "oo"),
    # 'ei' goes through a '\i\' placeholder before becoming 'i'; both steps
    # are kept so the output stays identical even for text that already
    # contains backslashes.
    ("ei", "\\i\\"),
    # NOTE(review): an ('ij', 'i') rule used to sit here, but it could never
    # match because stage 1 has already turned every 'j' into 'y' (so 'ij'
    # comes out as 'iy'). The effective behavior is kept as is; presumably
    # the TTS model was trained on text normalized this way -- confirm before
    # changing it.
    ("\\i\\", "i"),
)


def dutch_to_english(text: Text) -> Text:
    """Respell Dutch text so it reads roughly phonetically for an English model.

    Accented letters are reduced to plain ASCII letters, ``&`` becomes
    ``en``, ``j`` becomes ``y``, and a fixed, ordered list of Dutch letter
    clusters is rewritten (for example ``oe`` -> ``oo``, ``ch`` -> ``g``).
    Matching is case-sensitive and only lowercase patterns are defined.

    Args:
        text: Input text.

    Returns:
        The respelled text.
    """
    text = text.translate(_DUTCH_CHAR_TABLE)
    for src, dst in _DUTCH_CLUSTER_RULES:
        text = text.replace(src, dst)
    return text
85
+
86
+
87
# Runtime configuration: prefer CUDA when it is available. On GPU a reduced
# precision is used -- bfloat16 when the device's major compute capability is
# exactly 8, float16 otherwise; on CPU everything stays in float32.
if torch.cuda.is_available():
    device = torch.device('cuda')
    dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] == 8 else torch.float16
else:
    device = torch.device('cpu')
    dtype = torch.float32

# Speaker encoder used to derive the voice embedding; its files are stored
# under /tmp/<model name>.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    savedir=os.path.join("/tmp", spk_model_name),
    run_opts={"device": device},
)

# Reference recording whose voice the synthesized speech should imitate.
# NOTE(review): ``samplerate`` is read but never checked -- presumably the
# speaker encoder expects a specific sample rate and mono audio; confirm.
waveform, samplerate = sf.read("files/speaker.wav")
speaker_embeddings = create_speaker_embedding(speaker_model, waveform)

# Text-to-speech pipeline (fine-tuned SpeechT5 checkpoint for Dutch).
# NOTE(review): the pipeline is created without an explicit device or dtype,
# while ``speaker_embeddings`` uses ``device``/``dtype`` from above -- verify
# the two are compatible when running on GPU.
transcriber = pipeline(task="text-to-speech", model="Oysiyl/speecht5_tts_common_voice_nl")
106
+
107
def transcribe(text: Text) -> tuple[int, np.ndarray]:
    """Synthesize speech for *text* with the module-level TTS pipeline.

    The text is first normalized with ``remove_special_characters_s`` and
    ``dutch_to_english``, then passed to ``transcriber`` together with the
    module-level ``speaker_embeddings``.

    Args:
        text: Text to be spoken.

    Returns:
        A ``(sampling_rate, audio)`` pair taken from the pipeline output,
        sampling rate first.
    """
    text = remove_special_characters_s(text)
    text = dutch_to_english(text)
    out = transcriber(text, forward_params={"speaker_embeddings": speaker_embeddings})
    return out["sampling_rate"], out["audio"]
113
+
114
+
115
# Gradio UI: a single text box in, synthesized audio out. One example
# sentence is offered, and example caching is enabled via cache_examples.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Textbox(),
    outputs="audio",
    title="Text to Speech for Dutch language demo",
    description="Click on the example below or type text!",
    examples=[["hallo allemaal, ik praat nederlands. groetjes aan iedereen"]],
    cache_examples=True,
)

# Start the web app as soon as the module is executed.
demo.launch()