Spaces:

jiuuee
/

my-alexa

Sleeping

App Files Community

jiuuee committed on May 2

Commit

a8286dc

•

1 Parent(s): 3f5328d

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -49

app.py CHANGED Viewed

@@ -1,62 +1,40 @@
-from nemo.collections.asr.models import EncDecMultiTaskModel
-# Load the Canary-1B model
-canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
-# Define the input manifest file for ASR
-input_manifest = {
-    "audio_filepath": "/path/to/audio.wav",
-    "duration": 1000,  # duration of the audio, can be set to `None` if using NeMo main branch
-    "taskname": "asr",
-    "source_lang": "en", # language of the audio input, set `source_lang`==`target_lang` for ASR, choices=['en','de','es','fr']
-    "target_lang": "en", # language of the text output, choices=['en','de','es','fr']
-    "pnc": "yes",  # whether to have PnC output, choices=['yes', 'no']
-    "answer": "na",
-}
-# Transcribe audio using the Canary-1B model
-predicted_text = canary_model.transcribe(
-    input_manifest,
-    batch_size=16  # batch size to run the inference with
-)
-print("Predicted Text:", predicted_text)
-'''import gradio as gr
 from nemo.collections.asr.models import ASRModel
 import librosa
 # Load the NeMo ASR model
 model = ASRModel.from_pretrained("nvidia/canary-1b")
 model.eval()
-def preprocess_audio(audio):
-    # Convert audio data to mono channel and resample to 16kHz if necessary
-    audio_mono = librosa.to_mono(audio.T)
-    audio_resampled = librosa.resample(audio_mono, orig_sr=gradio.inputs.Audio.DEFAULT_SAMPLE_RATE, target_sr=16000)
-    return audio_resampled
-def transcribe(audio):
-    if audio is None:
-        raise gr.InterfaceError("Please provide some input audio: either upload an audio file or use the microphone")
-    print("Received audio:", audio)
     # Preprocess audio
-    audio_input = preprocess_audio(audio)
-    print("Preprocessed audio:", audio_input)
     # Perform speech recognition
-    transcription = model.transcribe([audio_input])
-    print("Transcription:", transcription)
     return transcription[0]
-audio_input = gr.components.Audio()
-iface = gr.Interface(transcribe, audio_input, "text", title="ASR with NeMo Canary Model")
-iface.launch()'''

+import gradio as gr
 from nemo.collections.asr.models import ASRModel
 import librosa
+import tempfile
 # Load the NeMo ASR model
 model = ASRModel.from_pretrained("nvidia/canary-1b")
 model.eval()
+# Function to preprocess the audio
+def preprocess_audio(audio, sample_rate):
+    """Write raw audio samples to a temporary WAV file and return its path.
+
+    Args:
+        audio: Array-like of audio samples; singleton dimensions are
+            squeezed away before writing.
+        sample_rate: Sampling rate in Hz, stored in the WAV header.
+
+    Returns:
+        Path of the WAV file. The file is created with ``delete=False``,
+        so the caller is responsible for removing it.
+    """
+    # Local imports keep this function independent of the module's import
+    # block; NumPy and SciPy are already required by librosa.
+    import numpy as np
+    from scipy.io import wavfile
+
+    # atleast_1d guards against a single sample collapsing to a 0-d array.
+    samples = np.atleast_1d(np.asarray(audio).squeeze())
+    # 64-bit float WAV files are poorly supported by readers.
+    if samples.dtype == np.float64:
+        samples = samples.astype(np.float32)
+    # Reserve a unique path, then close the handle before writing: a file
+    # that is still open cannot be reopened by name on Windows.
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
+        temp_audio_path = temp_audio_file.name
+    # librosa.output.write_wav was removed in librosa 0.8.0; use SciPy.
+    wavfile.write(temp_audio_path, int(sample_rate), samples)
+    return temp_audio_path
+# Function to transcribe audio
+def transcribe_audio(audio):
+    """Transcribe one recording coming from the Gradio audio input.
+
+    Args:
+        audio: Either a ``(sample_rate, samples)`` pair, a bare array of
+            samples (treated as 16 kHz), or ``None`` when nothing was
+            recorded.
+
+    Returns:
+        The first element of the result of ``model.transcribe``, or a
+        short hint string when no audio was provided.
+    """
+    import os
+
+    if audio is None:
+        return "No audio received. Please record something and try again."
+    # Keep the recording's real sample rate so the WAV header is truthful;
+    # stamping 16000 on 44.1/48 kHz microphone audio would distort it.
+    if isinstance(audio, (tuple, list)) and len(audio) == 2:
+        sample_rate, samples = audio
+    else:
+        sample_rate, samples = 16000, audio
     # Preprocess audio
+    audio_path = preprocess_audio(samples, sample_rate)
     # Perform speech recognition
+    try:
+        transcription = model.transcribe([audio_path])
+    finally:
+        # The temporary WAV is created with delete=False, so remove it
+        # here or every request leaks a file.
+        if os.path.exists(audio_path):
+            os.remove(audio_path)
     return transcription[0]
+# Gradio UI: a microphone recording goes in, the transcription text comes out.
+# NOTE(review): gr.inputs / gr.outputs look like legacy Gradio namespaces;
+# confirm they exist in the Gradio version this Space pins.
+audio_input = gr.inputs.Audio(source="microphone", label="Record Audio")
+output_text = gr.outputs.Textbox(label="Transcription")
+iface = gr.Interface(
+    fn=transcribe_audio,
+    inputs=audio_input,
+    outputs=output_text,
+    title="Automatic Speech Recognition using Canary 1b",
+    description="Click 'Record Audio' to start recording.",
+)
+# Start the web app.
+iface.launch()