File size: 6,377 Bytes
7f1afcd
daad6da
 
 
 
 
 
7f1afcd
a4107b1
297e244
462b91a
297e244
 
f0d393c
297e244
 
 
 
a4107b1
297e244
a4107b1
297e244
a4107b1
 
 
 
 
 
0f191f9
 
 
 
 
 
 
1be831a
0f191f9
 
 
 
daad6da
0f191f9
 
 
 
 
 
 
1be831a
0f191f9
 
 
 
6f27821
 
f138a14
 
6f27821
a4107b1
daad6da
 
6f27821
daad6da
 
6f27821
0f191f9
 
 
 
6f27821
0f191f9
 
 
 
 
 
 
daad6da
6f27821
daad6da
0f191f9
 
 
 
 
daad6da
297e244
 
daad6da
 
 
78e8beb
d15da79
 
a4107b1
 
 
 
 
 
 
 
 
 
6f27821
78e8beb
a4107b1
daad6da
 
 
 
 
 
 
78e8beb
 
 
 
 
 
 
 
 
 
 
 
6f27821
78e8beb
 
daad6da
d15da79
daad6da
 
 
 
 
78e8beb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
daad6da
d15da79
daad6da
a4107b1
7f1afcd
a4107b1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import gradio as gr
from zeroshot import (
    process,
    WORD_SCORE_DEFAULT_IF_LM,
    WORD_SCORE_DEFAULT_IF_NOLM,
    LM_SCORE_DEFAULT,
)

# Build the demo page. Components created inside this context are attached to
# `demo`; styling comes from the local "style.css" file.
with gr.Blocks(css="style.css") as demo:
    # Page header with a link to the paper describing the model.
    gr.Markdown(
        "<p align='center' style='font-size: 20px;'>MMS Zero-shot ASR Demo. See our arXiv <a href='http://arxiv.org/abs/2407.17852'>paper</a> for model details.</p>"
    )
    # Short usage instructions shown under the header.
    gr.HTML(
        """<center>The demo works on input audio in any language, as long as you provide a list of words or sentences for that language and an optional n-gram language model (even a simple 1-gram model will work!) to help with accuracy.<br>We recommend having a minimum of 10000 sentences in the text file to achieve good performance.</center>"""
    )
    # Main row of the page; this first column holds every user input.
    with gr.Row():
        with gr.Column():
            # Audio to transcribe.
            audio = gr.Audio(label="Audio Input\n(use microphone or upload a file)")

            # Text data and the optional language model are uploaded side by side.
            with gr.Row():
                words_file = gr.File(label="Text Data")
                lm_file = gr.File(label="Language Model\n(optional)")

            # Decoding parameters, collapsed by default.
            with gr.Accordion("Advanced Settings", open=False):
                gr.Markdown(
                    "The following parameters are used for beam-search decoding. Use the default values if you are not sure."
                )
                with gr.Row():
                    with gr.Column():
                        # "Use default" starts checked, so the slider below
                        # starts out read-only (interactive=False).
                        wscore_usedefault = gr.Checkbox(
                            label="Use Default Word Insertion Score", value=True
                        )
                        wscore = gr.Slider(
                            minimum=-10.0,
                            maximum=10.0,
                            value=WORD_SCORE_DEFAULT_IF_LM,
                            step=0.1,
                            interactive=False,
                            label="Word Insertion Score",
                        )

                    with gr.Column():
                        # Same checkbox + read-only slider pairing for the LM score.
                        lmscore_usedefault = gr.Checkbox(
                            label="Use Default Language Model Score", value=True
                        )
                        lmscore = gr.Slider(
                            minimum=-10.0,
                            maximum=10.0,
                            value=LM_SCORE_DEFAULT,
                            step=0.1,
                            interactive=False,
                            label="Language Model Score",
                        )
                    with gr.Column():
                        # Enabled by default.
                        autolm = gr.Checkbox(
                            label="Automatically create Unigram LM from text data",
                            value=True,
                        )
            # Submit button; its click handler is registered further down,
            # once the output components exist.
            btn = gr.Button("Submit", elem_id="submit")

            @gr.on(
                inputs=[wscore_usedefault, lmscore_usedefault, lm_file, autolm],
                outputs=[wscore, lmscore],
            )
            def update_slider(ws, ls, lm, alm):
                """Reset both score sliders to the defaults for the current LM setup.

                No explicit ``triggers`` are passed to ``gr.on``; per the Gradio
                documentation the handler then runs on app load and whenever
                one of the listed inputs changes.

                Args:
                    ws: State of the "Use Default Word Insertion Score" checkbox.
                    ls: State of the "Use Default Language Model Score" checkbox.
                    lm: Value of the language-model file input, ``None`` when
                        nothing is uploaded.
                    alm: State of the "Automatically create Unigram LM" checkbox.

                Returns:
                    A ``(word_score_slider, lm_score_slider)`` pair, in the same
                    order as ``outputs``. Each slider is editable only while its
                    "use default" checkbox is unchecked.
                """
                # A language model is in use when one was uploaded or when a
                # unigram LM is created automatically from the text data.
                has_lm = lm is not None or alm

                # The word-insertion slider takes the word-score defaults. The
                # previous code assigned it the LM-score default by mistake.
                ws_slider = gr.Slider(
                    minimum=-10.0,
                    maximum=10.0,
                    value=(
                        WORD_SCORE_DEFAULT_IF_LM
                        if has_lm
                        else WORD_SCORE_DEFAULT_IF_NOLM
                    ),
                    step=0.1,
                    interactive=not ws,
                    label="Word Insertion Score",
                )
                # The LM-score slider takes the LM default, or 0 without an LM.
                ls_slider = gr.Slider(
                    minimum=-10.0,
                    maximum=10.0,
                    value=LM_SCORE_DEFAULT if has_lm else 0,
                    step=0.1,
                    interactive=not ls,
                    label="Language Model Score",
                )
                return ws_slider, ls_slider

        # Second column of the main row: the outputs filled by the Submit handler.
        with gr.Column():
            text = gr.Textbox(label="Transcript")
            # Logs are collapsed by default.
            with gr.Accordion("Logs", open=False):
                logs = gr.Textbox(show_label=False)

    # HACK: invisible textbox. It is listed as an input of the example
    # galleries and of the Submit handler, so an example's reference
    # transcript can be handed to `process` without being shown in the UI.
    reference = gr.Textbox(label="Reference Transcript", visible=False)

    # Clicking Submit calls `process` with the inputs in the order listed
    # here and writes its two results to the transcript and logs boxes.
    btn.click(
        process,
        inputs=[
            audio,
            words_file,
            lm_file,
            wscore,
            lmscore,
            wscore_usedefault,
            lmscore_usedefault,
            autolm,
            reference,
        ],
        outputs=[text, logs],
    )

    # Examples
    # All English rows use the same recording, so they share one reference
    # transcript (its leading space is kept as in the original data).
    _english_reference = " This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import"
    _english_rows = (
        ("upload/english/english.mp3", "upload/english/c4_10k_sentences.txt"),
        ("upload/english/english.mp3", "upload/english/c4_5k_sentences.txt"),
        ("upload/english/english.mp3", "upload/english/gutenberg_27045.txt"),
    )
    gr.Examples(
        label="English",
        inputs=[audio, words_file, reference],
        examples=[
            [clip, sentences, _english_reference]
            for clip, sentences in _english_rows
        ],
    )

    # Each Ligurian row has its own recording and reference transcript.
    _ligurian_rows = (
        (
            "upload/ligurian/ligurian_1.mp3",
            "upload/ligurian/zenamt_10k_sentences.txt",
            "I mæ colleghi m’an domandou d’aggiuttâli à fâ unna preuva co-o zeneise pe vedde s’o fonçioña.",
        ),
        (
            "upload/ligurian/ligurian_2.mp3",
            "upload/ligurian/zenamt_10k_sentences.txt",
            "Staseia vaggo à çenâ con mæ moggê e doî amixi che de chì à quarche settemaña faian stramuo feua stato.",
        ),
        (
            "upload/ligurian/ligurian_3.mp3",
            "upload/ligurian/zenamt_5k_sentences.txt",
            "Pe inandiâ o pesto ghe veu o baxaicò, i pigneu, l’euio, o formaggio, l’aggio e a sâ.",
        ),
    )
    gr.Examples(
        label="Ligurian",
        inputs=[audio, words_file, reference],
        examples=[list(row) for row in _ligurian_rows],
    )

# Launch the Gradio app for the interface assembled above.
demo.launch()