import spaces
import gradio as gr
import librosa
import torch

from transformers import Wav2Vec2ForCTC, AutoProcessor
from huggingface_hub import hf_hub_download
from torchaudio.models.decoder import ctc_decoder
# Decoding setup follows the MMS zero-shot example:
# https://github.com/facebookresearch/fairseq/tree/main/examples/mms/zero_shot

ASR_SAMPLING_RATE = 16_000

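# Beam-search decoding defaults: word-insertion score (with / without an LM) and LM weight.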
WORD_SCORE_DEFAULT_IF_LM = -0.18
WORD_SCORE_DEFAULT_IF_NOLM = -3.5
LM_SCORE_DEFAULT = 1.48

MODEL_ID = "mms-meta/mms-zeroshot-300m"

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

token_file = hf_hub_download(
    repo_id=MODEL_ID,
    filename="tokens.txt",
)

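# Dhivehi 5-gram language model binary (KenLM format, as expected by ctc_decoder's `lm` argument),
# used for LM fusion during beam-search decoding.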
lm5gram = hf_hub_download(
    repo_id="alakxender/w2v-bert-2.0-dhivehi-syn",
    filename="language_model/5gram.bin",
)

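# Word lexicons (word -> token spelling) built from different Dhivehi text domains;
# only the first one (small news-domain, v1) is downloaded and used below.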
lex_files = [
    "dv.domain.news.small.v1.lexicon",
    "dv.domain.news.small.v2.lexicon",
    "dv.domain.news.large.v1.lexicon",
    "dv.domain.stories.small.v1.lexicon",
]

lexicon_file = hf_hub_download(
    repo_type="dataset",
    repo_id="alakxender/dv-domain-lexicons",
    filename=lex_files[0],
)

@spaces.GPU
def transcribe(
    audio_data,
    wscore=None,
    lmscore=None,
    wscore_usedefault=True,
    lmscore_usedefault=True,
    uselm=True,
    reference=None,
):

    if not audio_data:
        yield "ERROR: Empty audio data"
        return

    # audio
    if isinstance(audio_data, tuple):
        # microphone: (sample_rate, samples) tuple; scale 16-bit PCM samples to floats in [-1, 1]
        sr, audio_samples = audio_data
        audio_samples = (audio_samples / 32768.0).astype(float)

        if sr != ASR_SAMPLING_RATE:
            audio_samples = librosa.resample(
                audio_samples, orig_sr=sr, target_sr=ASR_SAMPLING_RATE
            )
    else:
        # file upload: audio_data is a path; librosa resamples to 16 kHz mono
        assert isinstance(audio_data, str)
        audio_samples = librosa.load(audio_data, sr=ASR_SAMPLING_RATE, mono=True)[0]

    inputs = processor(
        audio_samples, sampling_rate=ASR_SAMPLING_RATE, return_tensors="pt"
    )

    # set device
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    model.to(device)
    inputs = inputs.to(device)

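    # Forward pass: frame-level CTC logits from the zero-shot acoustic model.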
    with torch.no_grad():
        outputs = model(**inputs).logits
        
    # decoder parameters: use the downloaded 5-gram LM only when requested
    lm_path = lm5gram if uselm else None

    # treat an empty path string as "no LM"
    if lm_path is not None and not lm_path.strip():
        lm_path = None

    if wscore_usedefault:
        wscore = (
            WORD_SCORE_DEFAULT_IF_LM
            if lm_path is not None
            else WORD_SCORE_DEFAULT_IF_NOLM
        )
    if lmscore_usedefault:
        lmscore = LM_SCORE_DEFAULT if lm_path is not None else 0

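    # Lexicon-constrained beam-search CTC decoder (torchaudio's flashlight-text bindings).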
    beam_search_decoder = ctc_decoder(
        lexicon=lexicon_file,
        tokens=token_file,
        lm=lm_path,
        nbest=1,
        beam_size=500,
        beam_size_token=50,
        lm_weight=lmscore,
        word_score=wscore,
        sil_score=0,
        blank_token="<s>",
    )

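    # Decoding runs on CPU; keep the best hypothesis and join its words into the transcript.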
    beam_search_result = beam_search_decoder(outputs.to("cpu"))
    transcription = " ".join(beam_search_result[0][0].words).strip()

    yield transcription

styles = """
.thaana textarea {
    font-size: 18px !important;
    font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma' !important;
    line-height: 1.8 !important;
}
.textbox2 textarea {
    display: none;
}
"""

with gr.Blocks(css=styles) as demo:
    gr.Markdown("# <center> Transcribe Dhivehi Audio with MMS-ZEROSHOT</center>")
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(label="Audio Input\n(use microphone or upload a file)", min_length=1, max_length=60)

            with gr.Accordion("Advanced Settings", open=False):
                gr.Markdown(
                    "The following parameters are used for beam-search decoding. Use the default values if you are not sure."
                )
                with gr.Row(): 
                    with gr.Column():
                        wscore_usedefault = gr.Checkbox(
                            label="Use Default Word Insertion Score", value=True
                        )
                        wscore = gr.Slider(
                            minimum=-10.0,
                            maximum=10.0,
                            value=WORD_SCORE_DEFAULT_IF_LM,
                            step=0.1,
                            interactive=False,
                            label="Word Insertion Score",
                        )

                    with gr.Column():
                        lmscore_usedefault = gr.Checkbox(
                            label="Use Default Language Model Score", value=True
                        )
                        lmscore = gr.Slider(
                            minimum=-10.0,
                            maximum=10.0,
                            value=LM_SCORE_DEFAULT,
                            step=0.1,
                            interactive=False,
                            label="Language Model Score",
                        )
                    with gr.Column():
                        uselm = gr.Checkbox(
                            label="Use LM",
                            value=True,
                        )
            btn = gr.Button("Submit", elem_id="submit")

            @gr.on(
                inputs=[wscore_usedefault, lmscore_usedefault, uselm],
                outputs=[wscore, lmscore],
            )
            def update_slider(ws, ls, lm):
                # Word insertion score: default depends on whether the LM is enabled;
                # the slider is editable only when "Use Default" is unchecked.
                ws_slider = gr.Slider(
                    minimum=-10.0,
                    maximum=10.0,
                    value=WORD_SCORE_DEFAULT_IF_LM if lm else WORD_SCORE_DEFAULT_IF_NOLM,
                    step=0.1,
                    interactive=not ws,
                    label="Word Insertion Score",
                )
                # Language model score: zero when the LM is disabled.
                ls_slider = gr.Slider(
                    minimum=-10.0,
                    maximum=10.0,
                    value=LM_SCORE_DEFAULT if lm else 0,
                    step=0.1,
                    interactive=not ls,
                    label="Language Model Score",
                )
                return ws_slider, ls_slider

        with gr.Column():
            text = gr.Textbox(label="Transcript", rtl=True, elem_classes="thaana")

    reference = gr.Textbox(label="Reference Transcript", visible=False)

    btn.click(
        transcribe,
        inputs=[
            audio,
            wscore,
            lmscore,
            wscore_usedefault,
            lmscore_usedefault,
            uselm,
            reference,
        ],
        outputs=[text],
    )

    # Examples
    gr.Examples(
        examples=[
            [
                "samples/audio1.mp3",
                "އަޅުގަނޑުވެސް ދާކަށް ބޭނުމެއްނުވި"
            ],
            [
                "samples/audio2.wav",
                "ރަނގަޅަށްވިއްޔާ އެވާނީ މުސްކުޅި ކުރެހުމަކަށް"
            ],

            [
                "samples/audio3.wav",
                "އެއީ ޞަހްޔޫނީންގެ ޒަމާންވީ ރޭވުމެއްގެ ދަށުން މެދުނުކެނޑި ކުރިއަށްވާ ޕްރޮގްރާމެއް"
            ],
        ],
        inputs=[audio, reference],
        label="Dhivehi Audio Samples",
    )
  
demo.launch(show_api=False)