"""Gradio demo: transcribe microphone speech with a wav2vec2 CTC + n-gram-LM
model, then restore punctuation and casing with rpunct."""

import gradio as gr
from librosa import load
from rpunct import RestorePuncts
from transformers import Wav2Vec2ProcessorWithLM, pipeline

# wav2vec2 checkpoint that ships with an n-gram LM for beam-search decoding;
# the processor bundles the tokenizer, feature extractor, and LM decoder.
asr_model = 'patrickvonplaten/wav2vec2-base-100h-with-lm'
processor = Wav2Vec2ProcessorWithLM.from_pretrained(asr_model)
asr = pipeline(
    'automatic-speech-recognition',
    model=asr_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    decoder=processor.decoder,
)
rpunct = RestorePuncts()


def transcribe(filepath):
    """Transcribe the audio file at *filepath* and return punctuated text.

    wav2vec2 expects 16 kHz mono input. Loading with ``sr=16000`` resamples
    directly to the target rate in one pass — the previous code loaded at
    librosa's default 22 050 Hz and then resampled a second time to 16 kHz,
    which was both slower and lossier (and used a positional ``resample``
    call that is a TypeError on librosa >= 0.10).

    Parameters
    ----------
    filepath : str
        Path to the recorded audio file supplied by the Gradio Audio input.

    Returns
    -------
    str
        The transcription with punctuation and casing restored.
    """
    speech, _ = load(filepath, sr=16000)
    text = asr(speech)['text']
    # rpunct restores punctuation/capitalization; it expects lowercase input.
    return rpunct.punctuate(text.lower())


# UI components: microphone input (saved to a temp file path) and a text box.
mic = gr.inputs.Audio(source='microphone', type='filepath',
                      label='Speech input', optional=False)
transcript = gr.outputs.Textbox(type='auto', label='Transcription')

iface = gr.Interface(
    theme='huggingface',
    description='Testing transcription',
    fn=transcribe,
    inputs=[mic],
    outputs=[transcript],
)
iface.launch()