alakxender committed
Commit b04ca6e
0 Parent(s)

initial commit

Files changed (7)
  1. .gitattributes +35 -0
  2. README.md +12 -0
  3. app.py +240 -0
  4. requirements.txt +4 -0
  5. samples/audio1.mp3 +0 -0
  6. samples/audio2.wav +0 -0
  7. samples/audio3.wav +0 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: Dhivehi Mms Zeroshot
+ emoji: 📈
+ colorFrom: yellow
+ colorTo: red
+ sdk: gradio
+ sdk_version: 4.42.0
+ app_file: app.py
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,240 @@
+ import spaces
+ import gradio as gr
+ import librosa
+ import torch
+
+ from transformers import Wav2Vec2ForCTC, AutoProcessor
+ from huggingface_hub import hf_hub_download
+ from torchaudio.models.decoder import ctc_decoder
+ # https://github.com/facebookresearch/fairseq/tree/main/examples/mms/zero_shot
+
+ ASR_SAMPLING_RATE = 16_000
+
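+ # Beam-search decoding defaults (word-insertion penalty with and without an
+ # LM, and the LM weight); transcribe() falls back to these whenever the
+ # "use default" checkboxes in the UI are ticked.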
+ WORD_SCORE_DEFAULT_IF_LM = -0.18
+ WORD_SCORE_DEFAULT_IF_NOLM = -3.5
+ LM_SCORE_DEFAULT = 1.48
+
+ MODEL_ID = "mms-meta/mms-zeroshot-300m"
+
+ processor = AutoProcessor.from_pretrained(MODEL_ID)
+ model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
+
+ token_file = hf_hub_download(
+     repo_id=MODEL_ID,
+     filename="tokens.txt",
+ )
+
+ lm5gram = hf_hub_download(
+     repo_id="alakxender/w2v-bert-2.0-dhivehi-syn",
+     filename="language_model/5gram.bin",
+ )
+
+ lexicon_file = hf_hub_download(
+     repo_type="dataset",
+     repo_id="alakxender/dv-domain-lexicons",
+     filename="dv.domain.news.small.v1.lexicon",
+ )
+
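+ # transcribe() runs the zero-shot acoustic model over the input audio, then
+ # decodes the CTC logits with a lexicon-constrained beam search, optionally
+ # rescored by the Dhivehi 5-gram LM downloaded above.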
+ @spaces.GPU
+ def transcribe(
+     audio_data,
+     wscore=None,
+     lmscore=None,
+     wscore_usedefault=True,
+     lmscore_usedefault=True,
+     uselm=True,
+     reference=None,
+ ):
+     if not audio_data:
+         yield "ERROR: Empty audio data"
+         return
+
+     # audio
+     if isinstance(audio_data, tuple):
+         # microphone: int16 PCM from Gradio, normalize to [-1, 1] floats
+         sr, audio_samples = audio_data
+         audio_samples = (audio_samples / 32768.0).astype(float)
+
+         if sr != ASR_SAMPLING_RATE:
+             audio_samples = librosa.resample(
+                 audio_samples, orig_sr=sr, target_sr=ASR_SAMPLING_RATE
+             )
+     else:
+         # file upload
+         assert isinstance(audio_data, str)
+         audio_samples = librosa.load(audio_data, sr=ASR_SAMPLING_RATE, mono=True)[0]
+
+     inputs = processor(
+         audio_samples, sampling_rate=ASR_SAMPLING_RATE, return_tensors="pt"
+     )
+
+     # set device
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+     inputs = inputs.to(device)
+
+     with torch.no_grad():
+         outputs = model(**inputs).logits
+
+     # params
+     lm_path = lm5gram if uselm else None
+     if lm_path is not None and not lm_path.strip():
+         lm_path = None
+
+     if wscore_usedefault:
+         wscore = (
+             WORD_SCORE_DEFAULT_IF_LM
+             if lm_path is not None
+             else WORD_SCORE_DEFAULT_IF_NOLM
+         )
+     if lmscore_usedefault:
+         lmscore = LM_SCORE_DEFAULT if lm_path is not None else 0
+
+     beam_search_decoder = ctc_decoder(
+         lexicon=lexicon_file,
+         tokens=token_file,
+         lm=lm_path,
+         nbest=1,
+         beam_size=500,
+         beam_size_token=50,
+         lm_weight=lmscore,
+         word_score=wscore,
+         sil_score=0,
+         blank_token="<s>",
+     )
+
+     # decode on CPU and join the best hypothesis' words into a sentence
+     beam_search_result = beam_search_decoder(outputs.to("cpu"))
+     transcription = " ".join(beam_search_result[0][0].words).strip()
+
+     yield transcription
+
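+ # CSS for the transcript box: Thaana script needs a larger font size and
+ # looser line height to render legibly.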
+ styles = """
+ .thaana textarea {
+     font-size: 18px !important;
+     font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma' !important;
+     line-height: 1.8 !important;
+ }
+ .textbox2 textarea {
+     display: none;
+ }
+ """
+
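+ # Gradio UI: audio input (microphone or file, 1-60 s), advanced beam-search
+ # settings, and a right-to-left Thaana-styled transcript output.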
+ with gr.Blocks(css=styles) as demo:
+     gr.Markdown("# <center>Transcribe Dhivehi Audio with MMS-ZEROSHOT</center>")
+     with gr.Row():
+         with gr.Column():
+             audio = gr.Audio(
+                 label="Audio Input\n(use microphone or upload a file)",
+                 min_length=1,
+                 max_length=60,
+             )
+
+             with gr.Accordion("Advanced Settings", open=False):
+                 gr.Markdown(
+                     "The following parameters are used for beam-search decoding. Use the default values if you are not sure."
+                 )
+                 with gr.Row():
+                     with gr.Column():
+                         wscore_usedefault = gr.Checkbox(
+                             label="Use Default Word Insertion Score", value=True
+                         )
+                         wscore = gr.Slider(
+                             minimum=-10.0,
+                             maximum=10.0,
+                             value=WORD_SCORE_DEFAULT_IF_LM,
+                             step=0.1,
+                             interactive=False,
+                             label="Word Insertion Score",
+                         )
+
+                     with gr.Column():
+                         lmscore_usedefault = gr.Checkbox(
+                             label="Use Default Language Model Score", value=True
+                         )
+                         lmscore = gr.Slider(
+                             minimum=-10.0,
+                             maximum=10.0,
+                             value=LM_SCORE_DEFAULT,
+                             step=0.1,
+                             interactive=False,
+                             label="Language Model Score",
+                         )
+
+                     with gr.Column():
+                         uselm = gr.Checkbox(
+                             label="Use LM",
+                             value=True,
+                         )
+
+             btn = gr.Button("Submit", elem_id="submit")
+
+             # Fires on any change to the three checkboxes: re-enable a slider
+             # when its "use default" box is unticked, and reset the defaults
+             # to match whether the LM is in use.
+             @gr.on(
+                 inputs=[wscore_usedefault, lmscore_usedefault, uselm],
+                 outputs=[wscore, lmscore],
+             )
+             def update_slider(ws, ls, lm):
+                 ws_slider = gr.Slider(
+                     minimum=-10.0,
+                     maximum=10.0,
+                     value=WORD_SCORE_DEFAULT_IF_LM if lm else WORD_SCORE_DEFAULT_IF_NOLM,
+                     step=0.1,
+                     interactive=not ws,
+                     label="Word Insertion Score",
+                 )
+                 ls_slider = gr.Slider(
+                     minimum=-10.0,
+                     maximum=10.0,
+                     value=LM_SCORE_DEFAULT if lm else 0,
+                     step=0.1,
+                     interactive=not ls,
+                     label="Language Model Score",
+                 )
+                 return ws_slider, ls_slider
+
+         with gr.Column():
+             text = gr.Textbox(label="Transcript", rtl=True, elem_classes="thaana")
+
+     reference = gr.Textbox(label="Reference Transcript", visible=False)
+
+     btn.click(
+         transcribe,
+         inputs=[
+             audio,
+             wscore,
+             lmscore,
+             wscore_usedefault,
+             lmscore_usedefault,
+             uselm,
+             reference,
+         ],
+         outputs=[text],
+     )
+
+     # Examples
+     gr.Examples(
+         examples=[
+             [
+                 "samples/audio1.mp3",
+                 "އަޅުގަނޑުވެސް ދާކަށް ބޭނުމެއްނުވި",
+             ],
+             [
+                 "samples/audio2.wav",
+                 "ރަނގަޅަށްވިއްޔާ އެވާނީ މުސްކުޅި ކުރެހުމަކަށް",
+             ],
+             [
+                 "samples/audio3.wav",
+                 "އެއީ ޞަހްޔޫނީންގެ ޒަމާންވީ ރޭވުމެއްގެ ދަށުން މެދުނުކެނޑި ކުރިއަށްވާ ޕްރޮގްރާމެއް",
+             ],
+         ],
+         inputs=[audio, reference],
+         label="Dhivehi Audio Samples",
+     )
+
+ demo.launch(show_api=False)
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ transformers
+ flashlight-text
+ librosa
+ torchaudio
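+ # gradio and spaces are preinstalled by the Spaces runtime; torch is pulled
+ # in as a dependency of torchaudio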
samples/audio1.mp3 ADDED
Binary file (21.9 kB)
samples/audio2.wav ADDED
Binary file (194 kB)
samples/audio3.wav ADDED
Binary file (267 kB)