jiuuee committed on
Commit
a8286dc
1 Parent(s): 3f5328d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -49
app.py CHANGED
@@ -1,62 +1,40 @@
1
- from nemo.collections.asr.models import EncDecMultiTaskModel
2
-
3
- # Load the Canary-1B model
4
- canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
5
-
6
- # Define the input manifest file for ASR
7
- input_manifest = {
8
- "audio_filepath": "/path/to/audio.wav",
9
- "duration": 1000, # duration of the audio, can be set to `None` if using NeMo main branch
10
- "taskname": "asr",
11
- "source_lang": "en", # language of the audio input, set `source_lang`==`target_lang` for ASR, choices=['en','de','es','fr']
12
- "target_lang": "en", # language of the text output, choices=['en','de','es','fr']
13
- "pnc": "yes", # whether to have PnC output, choices=['yes', 'no']
14
- "answer": "na",
15
- }
16
-
17
- # Transcribe audio using the Canary-1B model
18
- predicted_text = canary_model.transcribe(
19
- input_manifest,
20
- batch_size=16 # batch size to run the inference with
21
- )
22
-
23
- print("Predicted Text:", predicted_text)
24
-
25
-
26
-
27
- '''import gradio as gr
28
  from nemo.collections.asr.models import ASRModel
29
  import librosa
 
30
 
31
  # Load the NeMo ASR model
32
  model = ASRModel.from_pretrained("nvidia/canary-1b")
33
  model.eval()
34
 
35
- def preprocess_audio(audio):
36
- # Convert audio data to mono channel and resample to 16kHz if necessary
37
- audio_mono = librosa.to_mono(audio.T)
38
- audio_resampled = librosa.resample(audio_mono, orig_sr=gradio.inputs.Audio.DEFAULT_SAMPLE_RATE, target_sr=16000)
39
- return audio_resampled
40
-
41
- def transcribe(audio):
42
- if audio is None:
43
- raise gr.InterfaceError("Please provide some input audio: either upload an audio file or use the microphone")
44
-
45
- print("Received audio:", audio)
46
 
 
 
47
  # Preprocess audio
48
- audio_input = preprocess_audio(audio)
49
-
50
- print("Preprocessed audio:", audio_input)
51
-
52
  # Perform speech recognition
53
- transcription = model.transcribe([audio_input])
54
-
55
- print("Transcription:", transcription)
56
-
57
  return transcription[0]
58
 
59
- audio_input = gr.components.Audio()
 
 
 
 
 
 
 
 
 
 
60
 
61
- iface = gr.Interface(transcribe, audio_input, "text", title="ASR with NeMo Canary Model")
62
- iface.launch()'''
 
1
+ import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from nemo.collections.asr.models import ASRModel
3
  import librosa
4
+ import tempfile
5
 
6
  # Load the NeMo ASR model
7
  model = ASRModel.from_pretrained("nvidia/canary-1b")
8
  model.eval()
9
 
10
+ # Function to preprocess the audio
11
+ def preprocess_audio(audio, sample_rate):
12
+ # Save audio to a temporary file
13
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
14
+ temp_audio_path = temp_audio_file.name
15
+ librosa.output.write_wav(temp_audio_path, audio.squeeze(), sample_rate)
16
+ return temp_audio_path
 
 
 
 
17
 
18
+ # Function to transcribe audio
19
+ def transcribe_audio(audio):
20
  # Preprocess audio
21
+ audio_path = preprocess_audio(audio, 16000)
22
+
 
 
23
  # Perform speech recognition
24
+ transcription = model.transcribe([audio_path])
25
+
 
 
26
  return transcription[0]
27
 
28
+ # Interface
29
+ audio_input = gr.inputs.Audio(source="microphone", label="Record Audio")
30
+ output_text = gr.outputs.Textbox(label="Transcription")
31
+
32
+ iface = gr.Interface(
33
+ transcribe_audio,
34
+ audio_input,
35
+ output_text,
36
+ title="Automatic Speech Recognition using Canary 1b",
37
+ description="Click 'Record Audio' to start recording.",
38
+ )
39
 
40
+ iface.launch()