File size: 5,688 Bytes
7f1afcd
daad6da
 
 
 
 
 
7f1afcd
a4107b1
297e244
 
 
 
a4107b1
297e244
 
 
 
a4107b1
297e244
a4107b1
297e244
a4107b1
 
 
 
 
 
0f191f9
 
 
 
 
 
 
 
 
 
 
 
daad6da
0f191f9
 
 
 
 
 
 
 
 
 
 
 
a4107b1
daad6da
 
 
 
 
 
0f191f9
 
 
 
 
 
 
 
 
 
 
 
daad6da
 
 
0f191f9
 
 
 
 
daad6da
297e244
 
daad6da
 
 
d15da79
 
 
a4107b1
 
 
 
 
 
 
 
 
 
d15da79
a4107b1
daad6da
 
 
 
 
 
 
d15da79
 
 
daad6da
d15da79
daad6da
 
 
 
 
d15da79
 
 
daad6da
d15da79
daad6da
a4107b1
7f1afcd
a4107b1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import gradio as gr
from zeroshot import (
    process,
    WORD_SCORE_DEFAULT_IF_LM,
    WORD_SCORE_DEFAULT_IF_NOLM,
    LM_SCORE_DEFAULT,
)

with gr.Blocks(css="style.css") as demo:
    # Page header: a centered title line linking to the paper, followed by a
    # short usage note about the text data and the optional language model.
    # Fixes two misspellings in the user-facing text ("arXiV" -> "arXiv",
    # "acheive" -> "achieve"); the markup itself is unchanged.
    gr.Markdown(
        "<p align='center' style='font-size: 20px;'>MMS Zero-shot ASR Demo. See our arXiv <a href='https://arxiv.org/'>paper</a> for model details.</p>"
    )
    gr.HTML(
        """<center>The demo works on input audio in any language, as long as you provide a list of words or sentences for that language and an optional n-gram language model (even a simple 1-gram model will work!) to help with accuracy.<br>We recommend having a minimum of 5000 distinct words in the textfile to achieve a good performance.</center>"""
    )
    with gr.Row():
        with gr.Column():
            # --- Inputs -------------------------------------------------
            # Audio to transcribe.
            audio = gr.Audio(label="Audio Input\n(use microphone or upload a file)")

            # Side by side: the text data file and the optional language model file.
            with gr.Row():
                words_file = gr.File(label="Text Data")
                lm_file = gr.File(label="Language Model\n(optional)")

            # Decoding parameters, collapsed until the user opens the panel.
            with gr.Accordion(label="Advanced Settings", open=False):
                gr.Markdown(
                    value="The following parameters are used for beam-search decoding. Use the default values if you are not sure."
                )
                with gr.Row():
                    # Word insertion score: a "use default" checkbox (ticked
                    # initially) above a slider that starts out locked.
                    with gr.Column():
                        wscore_usedefault = gr.Checkbox(
                            value=True,
                            label="Use Default Word Insertion Score",
                        )
                        wscore = gr.Slider(
                            label="Word Insertion Score",
                            value=WORD_SCORE_DEFAULT_IF_NOLM,
                            minimum=-10.0,
                            maximum=10.0,
                            step=0.1,
                            interactive=False,
                        )

                    # Language model score: same checkbox + locked slider pair.
                    with gr.Column():
                        lmscore_usedefault = gr.Checkbox(
                            value=True,
                            label="Use Default Language Model Score",
                        )
                        lmscore = gr.Slider(
                            label="Language Model Score",
                            value=0,
                            minimum=-10.0,
                            maximum=10.0,
                            step=0.1,
                            interactive=False,
                        )

            # Its click handler is attached further down, once the output
            # components exist.
            btn = gr.Button(value="Submit", elem_id="submit")

            @gr.on(
                inputs=[wscore_usedefault, lmscore_usedefault, lm_file],
                outputs=[wscore, lmscore],
            )
            def update_slider(ws, ls, lm):
                """Rebuild the two score sliders from the checkbox and LM-file state.

                No ``triggers`` are passed to ``gr.on``; per the Gradio docs the
                handler then runs when any of the listed inputs changes.

                Args:
                    ws: State of the "Use Default Word Insertion Score" checkbox.
                    ls: State of the "Use Default Language Model Score" checkbox.
                    lm: Value of the language-model file input; ``None`` is
                        treated as "no language model provided".

                Returns:
                    ``(word_score_slider, lm_score_slider)``, in the same order
                    as ``outputs=[wscore, lmscore]``.

                Note:
                    Both sliders are always returned with a default value, so a
                    manually chosen value is overwritten each time this runs.
                """
                has_lm = lm is not None

                # Fix: the defaults used to be swapped between the two sliders
                # (the word-insertion slider received the LM score default and
                # vice versa). Each slider now gets the default matching its
                # label and its initial definition in the layout.
                ws_slider = gr.Slider(
                    minimum=-10.0,
                    maximum=10.0,
                    value=WORD_SCORE_DEFAULT_IF_LM
                    if has_lm
                    else WORD_SCORE_DEFAULT_IF_NOLM,
                    step=0.1,
                    # Editable only when the "use default" box is unticked.
                    interactive=not ws,
                    label="Word Insertion Score",
                )
                ls_slider = gr.Slider(
                    minimum=-10.0,
                    maximum=10.0,
                    # Without a language model the LM score falls back to 0.
                    value=LM_SCORE_DEFAULT if has_lm else 0,
                    step=0.1,
                    interactive=not ls,
                    label="Language Model Score",
                )
                return ws_slider, ls_slider

        # --- Outputs ----------------------------------------------------
        with gr.Column():
            # Receives the first value returned by the submit handler.
            text = gr.Textbox(label="Transcript")
            # Receives the second returned value; shown in a panel that is
            # collapsed by default and carries no label of its own.
            with gr.Accordion(label="Logs", open=False):
                logs = gr.Textbox(show_label=False)

    # HACK: invisible textbox. It is never shown to the user; it exists so the
    # example tables can carry a reference transcript and pass it to `process`
    # along with the visible inputs.
    reference = gr.Textbox(visible=False, label="Reference Transcript")

    # Run `process` when Submit is clicked. The order of `inputs` is the
    # positional argument order `process` is called with; its two return
    # values go to the transcript box and the logs box.
    btn.click(
        fn=process,
        inputs=[
            audio,
            words_file,
            lm_file,
            wscore,
            lmscore,
            wscore_usedefault,
            lmscore_usedefault,
            reference,
        ],
        outputs=[text, logs],
    )

    # Examples
    # Each row fills (audio, words_file, reference), in that order.

    # English: every row uses the same recording and reference transcript and
    # differs only in the text data file.
    # (A c4_25k_sentences.txt variant existed but is currently disabled.)
    _english_reference = " This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import"
    gr.Examples(
        label="English",
        inputs=[audio, words_file, reference],
        examples=[
            ["upload/english/english.mp3", words_path, _english_reference]
            for words_path in (
                "upload/english/c4_10k_sentences.txt",
                "upload/english/c4_5k_sentences.txt",
                "upload/english/cv8_top10k_words.txt",
            )
        ],
    )

    # Ligurian: three different recordings, each with its own reference.
    gr.Examples(
        label="Ligurian",
        inputs=[audio, words_file, reference],
        examples=[
            [
                "upload/ligurian/ligurian_1.mp3",
                "upload/ligurian/zenamt_10k_sentences.txt",
                "I mæ colleghi m’an domandou d’aggiuttâli à fâ unna preuva co-o zeneise pe vedde s’o fonçioña.",
            ],
            [
                "upload/ligurian/ligurian_2.mp3",
                "upload/ligurian/zenamt_10k_sentences.txt",
                "Staseia vaggo à çenâ con mæ moggê e doî amixi che de chì à quarche settemaña faian stramuo feua stato.",
            ],
            [
                "upload/ligurian/ligurian_3.mp3",
                "upload/ligurian/zenamt_5k_sentences.txt",
                "Pe inandiâ o pesto ghe veu o baxaicò, i pigneu, l’euio, o formaggio, l’aggio e a sâ.",
            ],
        ],
    )

# Start serving the Blocks app assembled above. This runs at module level,
# i.e. whenever this file is executed or imported.
demo.launch()