"""Gradio front end for the MMS zero-shot ASR demo.

Builds a two-column ``gr.Blocks`` page: the left column collects the audio,
the text data file, an optional language-model file and the beam-search
decoding settings; the right column shows the transcript returned by
``zeroshot.process``. The app is launched as soon as the module is executed.
"""

import gradio as gr

from zeroshot import process, ZS_EXAMPLES

# Page title, rendered through gr.Markdown. Written as adjacent literals so
# the leading/trailing blank lines of the text stay explicit and the literal
# is valid Python (a plain "..." string cannot span physical lines).
TITLE_MARKDOWN = (
    "\n\n"
    "MMS Zero-shot ASR Demo. See our arXiv paper for model details."
    "\n\n"
)

# Introductory blurb, rendered through gr.HTML.
INTRO_HTML = """
The demo works on input audio in any language, as long as you provide a list of words or sentences for that language and an optional n-gram language model (even a simple 1-gram model will work!) to help with accuracy.
We recommend having a minimum of 5000 distinct words in the textfile to achieve a good performance.
"""

# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# source. Component order is exact; confirm the container each one lives in
# (in particular that the Submit button sits outside the accordion).
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(TITLE_MARKDOWN)
    gr.HTML(INTRO_HTML)

    with gr.Row():
        # Left column: every user input plus the submit button.
        with gr.Column():
            audio = gr.Audio(label="Audio Input\n(use microphone or upload a file)")

            with gr.Row():
                words_file = gr.File(label="Text Data")
                lm_file = gr.File(label="Language Model\n(optional)")

            # Collapsed by default (open=False).
            with gr.Accordion("Advanced Settings", open=False):
                gr.Markdown(
                    "The following parameters are used for beam-search decoding. Use the default values if you are not sure."
                )
                with gr.Row():
                    wscore = gr.Slider(
                        minimum=-10.0,
                        maximum=10.0,
                        value=0,
                        step=0.1,
                        interactive=True,
                        label="Word Insertion Score",
                    )
                    lmscore = gr.Slider(
                        minimum=-10.0,
                        maximum=10.0,
                        value=0,
                        step=0.1,
                        interactive=True,
                        label="Language Model Score",
                    )
                with gr.Row():
                    # Both checkboxes start ticked. Presumably `process`
                    # ignores the matching slider while its box is ticked --
                    # TODO confirm against zeroshot.process.
                    wscore_usedefault = gr.Checkbox(
                        label="Use Default Word Insertion Score", value=True
                    )
                    lmscore_usedefault = gr.Checkbox(
                        label="Use Default Language Model Score", value=True
                    )

            btn = gr.Button("Submit", elem_id="submit")

        # Right column: the result of `process`.
        with gr.Column():
            text = gr.Textbox(label="Transcript")

    # On click, `process` receives the current values of the seven input
    # components, in the order listed; its return value fills the textbox.
    btn.click(
        process,
        inputs=[
            audio,
            words_file,
            lm_file,
            wscore,
            lmscore,
            wscore_usedefault,
            lmscore_usedefault,
        ],
        outputs=text,
    )

    # Each example supplies values for the audio and text-data inputs only.
    examples = gr.Examples(examples=ZS_EXAMPLES, inputs=[audio, words_file])

# Launched unconditionally, exactly as the original script did; no
# `__main__` guard is added so existing ways of starting the app keep working.
demo.launch()