import gradio as gr
from transformers import pipeline
from librosa import load, resample


# Identifier of the pretrained checkpoint; the same value is passed below
# both as the model and as its feature extractor.
asr_model = "facebook/wav2vec2-base-960h"

# Build the speech-recognition pipeline once at module level so that
# transcribe() reuses the same object on every call.
asr = pipeline(
	task="automatic-speech-recognition",
	model=asr_model,
	feature_extractor=asr_model,
)

def transcribe(filepath):
	"""Transcribe the audio file at *filepath* and return the recognised text.

	Args:
		filepath: Path to an audio file readable by ``librosa.load``.

	Returns:
		The ``'text'`` entry of the result produced by the module-level
		``asr`` pipeline.
	"""
	# Ask librosa for 16 kHz audio directly. The previous code loaded at
	# librosa's default rate and then resampled a second time, passing the
	# rates to ``resample`` positionally; newer librosa releases accept
	# them only as keywords, so that call failed. A single load at the
	# target rate avoids both the double resampling and the failing call.
	target_rate = 16000
	speech, _ = load(filepath, sr=target_rate)
	return asr(speech)['text']

# Input component: audio recorded from the microphone, configured with
# type='filepath'.
mic = gr.inputs.Audio(
	label='Speech input',
	source='microphone',
	type='filepath',
	optional=False,
)

# Output component: a text box that shows the value returned by transcribe().
transcript = gr.outputs.Textbox(label='Transcription', type='auto')

# Wire the single input and single output to transcribe() and start the app.
iface = gr.Interface(
	fn=transcribe,
	inputs=[mic],
	outputs=[transcript],
	description='Testing transcription',
	theme='huggingface',
)
iface.launch()