"""Gradio demo: transcribe microphone speech with a wav2vec2 CTC + n-gram-LM
model, then restore punctuation and casing with rpunct."""

import gradio as gr
from librosa import load
from rpunct import RestorePuncts
from transformers import Wav2Vec2ProcessorWithLM, pipeline

# wav2vec2 checkpoint that ships with an n-gram LM for beam-search decoding;
# the processor bundles the tokenizer, feature extractor, and LM decoder.
asr_model = 'patrickvonplaten/wav2vec2-base-100h-with-lm'
processor = Wav2Vec2ProcessorWithLM.from_pretrained(asr_model)
asr = pipeline(
    'automatic-speech-recognition',
    model=asr_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    decoder=processor.decoder,
)
rpunct = RestorePuncts()


def transcribe(filepath):
    """Transcribe the audio file at *filepath* and return punctuated text.

    wav2vec2 expects 16 kHz mono input. Loading with ``sr=16000`` resamples
    directly to the target rate in one pass — the previous code loaded at
    librosa's default 22 050 Hz and then resampled a second time to 16 kHz,
    which was both slower and lossier (and used a positional ``resample``
    call that is a TypeError on librosa >= 0.10).

    Parameters
    ----------
    filepath : str
        Path to the recorded audio file supplied by the Gradio Audio input.

    Returns
    -------
    str
        The transcription with punctuation and casing restored.
    """
    speech, _ = load(filepath, sr=16000)
    text = asr(speech)['text']
    # rpunct restores punctuation/capitalization; it expects lowercase input.
    return rpunct.punctuate(text.lower())


# UI components: microphone input (saved to a temp file path) and a text box.
mic = gr.inputs.Audio(source='microphone', type='filepath',
                      label='Speech input', optional=False)
transcript = gr.outputs.Textbox(type='auto', label='Transcription')

iface = gr.Interface(
    theme='huggingface',
    description='Testing transcription',
    fn=transcribe,
    inputs=[mic],
    outputs=[transcript],
)
iface.launch()