Spaces:

Steveeeeeeen
/

ASR-comparaison

Sleeping

Steveeeeeeen committed on Aug 13

Commit

66cbb93

•

1 Parent(s): f0f7172

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -31,7 +31,7 @@ def compute_wer_table(audio, text):
     for model in model_name:
         pipe = pipeline("automatic-speech-recognition", model=model_name[model])
         transcription = pipe(audio_input)['text']
-        transcription = "".join([char for char in transcription if char.isalpha() or char.isspace()])
         trans.append(transcription)
         wer = wer_metric.compute(predictions=[transcription.upper()], references=[text.upper()])
         wer_scores.append(wer)
@@ -62,9 +62,9 @@ with gr.Blocks() as demo:
                 "Lower WER scores indicate better performance."
                 "\n\n| Model                   | WER                     |\n"
                 "|--------------------------|--------------------------|\n"
-                "| [whisper-tiny](https://huggingface.co/openai/whisper-tiny.en)         | 0.06052      |\n"
-                "| [wav2vec2-large-960h](https://huggingface.co/facebook/wav2vec2-large-960h)  | 0.02201     |\n"
-                "| [distill-whisper-small](https://huggingface.co/distil-whisper/distil-small.en)| 0.03959      |\n"
                 "\n\n### Data Source\n"
                 "The data used in this demo is a subset of the [LibriSpeech](https://huggingface.co/datasets/openslr/librispeech_asr) dataset which contains the first 100 audio samples and their corresponding reference texts in the validation set."
             ),

     for model in model_name:
         pipe = pipeline("automatic-speech-recognition", model=model_name[model])
         transcription = pipe(audio_input)['text']
+        transcription = transcription.replace(",", "").replace(".", "").replace("!", "").replace("?", "")
         trans.append(transcription)
         wer = wer_metric.compute(predictions=[transcription.upper()], references=[text.upper()])
         wer_scores.append(wer)
                 "Lower WER scores indicate better performance."
                 "\n\n| Model                   | WER                     |\n"
                 "|--------------------------|--------------------------|\n"
+                "| [whisper-tiny](https://huggingface.co/openai/whisper-tiny.en)         | 0.05511      |\n"
+                "| [wav2vec2-large-960h](https://huggingface.co/facebook/wav2vec2-large-960h)  | 0.01617     |\n"
+                "| [distill-whisper-small](https://huggingface.co/distil-whisper/distil-small.en)| 0.03686      |\n"
                 "\n\n### Data Source\n"
                 "The data used in this demo is a subset of the [LibriSpeech](https://huggingface.co/datasets/openslr/librispeech_asr) dataset which contains the first 100 audio samples and their corresponding reference texts in the validation set."
             ),