"""Gradio demo: an echo chatbot whose text box can be filled by speech.

Microphone recordings are resampled to 16 kHz mono, written to a temporary
WAV file and transcribed with the NVIDIA Parakeet CTC model via NeMo.  The
transcript is placed into the text box; submitting the text box echoes the
message back into the chat window.
"""

# ---------------------------------------------------------------------------
# Earlier experiments, kept for reference only (none of this is executed).
# ---------------------------------------------------------------------------
# from transformers import pipeline
# import gradio as gr
# import nemo.collections.asr as nemo_asr
# import gradio
# model = pipeline("automatic-speech-recognition")
# model = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
# model = pipeline("automatic-speech-recognition", model="nvidia/parakeet-ctc-0.6b")
#
# asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(
#     model_name="nvidia/parakeet-ctc-1.1b")
#
# def transcribe_audio(mic=None, file=None):
#     if mic is not None:
#         audio = mic
#     elif file is not None:
#         audio = file
#     else:
#         return "You must either provide a mic recording or a file"
#     # transcription = model(audio)["text"]
#     transcription = asr_model(audio)
#     return transcription
#
# gr.Interface(
#     fn=transcribe_audio,
#     inputs=[
#         gr.Audio(sources="microphone", type="filepath"),
#         gr.Audio(sources="upload", type="filepath"),
#     ],
#     outputs="text",
# ).launch(share=True)
#
# gr.load("models/nvidia/parakeet-ctc-1.1b").launch()
# gr.load("models/openai/whisper-medium.en").launch()
# gr.load("models/nvidia/stt_en_conformer_ctc_small").launch(share=True)

import importlib
import os
import subprocess
import sys
import tempfile

# NeMo is not pre-installed in the hosting environment, so it is installed on
# first start.  The install runs through the current interpreter with an
# argument list (no shell) and is skipped when NeMo is already importable.
# It must happen before the remaining third-party imports, some of which may
# only be available as NeMo dependencies.
try:
    import nemo.collections.asr as nemo_asr
except ImportError:
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "nemo_toolkit[all]"],
        check=True,
    )
    importlib.invalidate_caches()
    import nemo.collections.asr as nemo_asr

from transformers import pipeline
import numpy as np
import gradio as gr
import librosa
from scipy.io.wavfile import write

# Sample rate (Hz) the recording is resampled to before transcription.
TARGET_SAMPLE_RATE = 16000


def respond(message, chat_history):
    """Echo ``message`` back into the chat.

    Args:
        message: Text submitted by the user.
        chat_history: List of ``(user, bot)`` tuples shown in the chatbot;
            ``None`` is treated as an empty history.

    Returns:
        A tuple ``("", chat_history)``: the empty string clears the text box
        and the history now ends with the new ``(message, message)`` pair.
    """
    if chat_history is None:
        chat_history = []
    bot_message = message
    chat_history.append((message, bot_message))
    return "", chat_history


def _to_mono_float32(samples):
    """Return ``samples`` as a 1-D float32 array scaled to roughly [-1, 1]."""
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.signedinteger):
        # Integer PCM (e.g. int16) must be scaled, otherwise the float WAV
        # written later contains values far outside the valid range.
        full_scale = -float(np.iinfo(samples.dtype).min)
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # Multi-channel input is shaped (samples, channels); average the
        # channels so resampling runs along the time axis.
        samples = samples.mean(axis=1)
    return samples


def transcribe(audio):
    """Transcribe a microphone recording to text.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by the Gradio
            microphone component, or ``None`` when nothing was recorded.

    Returns:
        The transcript of the recording, or ``""`` when there is no audio.
    """
    if audio is None:
        return ""
    sr, y = audio
    mono = _to_mono_float32(y)
    if mono.size == 0:
        return ""

    resampled_audio = librosa.resample(
        y=mono, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE
    )

    # Use a unique temporary file per call so concurrent sessions cannot
    # overwrite each other's recording, and remove it afterwards.
    handle = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    audio_path = handle.name
    handle.close()
    try:
        write(audio_path, TARGET_SAMPLE_RATE, resampled_audio)
        result = asr_model.transcribe([audio_path])
    finally:
        os.remove(audio_path)

    if not result:
        return ""
    first = result[0]
    # Depending on the NeMo release the entries are plain strings or
    # hypothesis objects carrying the transcript in a ``text`` attribute.
    return getattr(first, "text", first)


asr_model = nemo_asr.models.EncDecCTCModel.from_pretrained(
    model_name="nvidia/parakeet-ctc-0.6b"
)

with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(
            """
            # VR test
            """)
        va = gr.Chatbot(container=False)

    with gr.Row():  # text input
        text_input = gr.Textbox(
            placeholder="Ask me anything...", container=False, scale=1
        )
        submit_btn = gr.Button("Submit", scale=0)

    with gr.Row():  # audio input
        recording = gr.Microphone(show_download_button=False, container=False)

    with gr.Row():  # button toolbar
        clear = gr.ClearButton([text_input, va])

    text_input.submit(respond, [text_input, va], [text_input, va], queue=False)
    submit_btn.click(respond, [text_input, va], [text_input, va], queue=False)
    # To submit the transcript automatically after recording, chain instead:
    # recording.stop_recording(transcribe, [recording], [text_input]).then(
    #     respond, [text_input, va], [text_input, va], queue=False)
    recording.stop_recording(transcribe, [recording], [text_input])

if __name__ == "__main__":
    demo.launch(share=True)