"""Minimal Gradio demo: record speech from the microphone and transcribe it.

The audio is transcribed with a Hugging Face automatic-speech-recognition
pipeline backed by the ``facebook/wav2vec2-base-960h`` checkpoint.
"""

import gradio as gr
from librosa import load, resample
from transformers import pipeline

# Sample rate (Hz) the audio is converted to before it is handed to the model.
TARGET_SAMPLE_RATE = 16000

asr_model = 'facebook/wav2vec2-base-960h'
asr = pipeline(
    'automatic-speech-recognition',
    model=asr_model,
    feature_extractor=asr_model,
)


def transcribe(filepath):
    """Transcribe the audio file at *filepath* and return the recognized text.

    Args:
        filepath: Path to an audio file (Gradio supplies the path of the
            microphone recording because the input uses ``type='filepath'``).

    Returns:
        The ``'text'`` field of the ASR pipeline's result.
    """
    # sr=None keeps the file's native sample rate. Without it librosa first
    # resamples to its own default rate, so the audio would be resampled
    # twice before reaching the model.
    speech, sampling_rate = load(filepath, sr=None)
    if sampling_rate != TARGET_SAMPLE_RATE:
        # Rates are passed by keyword: recent librosa releases reject them
        # as positional arguments, and older releases accept the keywords.
        speech = resample(
            speech,
            orig_sr=sampling_rate,
            target_sr=TARGET_SAMPLE_RATE,
        )
    return asr(speech)['text']


# NOTE(review): the `gr.inputs` / `gr.outputs` namespaces belong to the legacy
# Gradio API; confirm the pinned Gradio version still provides them.
mic = gr.inputs.Audio(
    source='microphone',
    type='filepath',
    label='Speech input',
    optional=False,
)
transcript = gr.outputs.Textbox(type='auto', label='Transcription')

iface = gr.Interface(
    theme='huggingface',
    description='Testing transcription',
    fn=transcribe,
    inputs=[mic],
    outputs=[transcript],
)

iface.launch()