Steven Zheng committed
Commit ed6583e
1 Parent(s): 461492b

Add application file

Files changed (1)
  1. app.py +73 -0
app.py ADDED
@@ -0,0 +1,73 @@
+ from datasets import load_dataset, Dataset
+ from transformers import pipeline
+ import evaluate
+ import numpy as np
+ import gradio as gr
+ import json
+ from pathlib import Path
+
+
+ # Load the Word Error Rate (WER) metric
+ wer_metric = evaluate.load("wer")
+
+ model_name = {
+     "whisper-tiny": "openai/whisper-tiny.en",
+     "wav2vec2-large-960h": "facebook/wav2vec2-large-960h",
+     "distill-whisper-small": "distil-whisper/distil-small.en",
+ }
+
+ # Load the reference transcripts for the bundled example clips
+ with open("models/ds_data.json", "r") as f:
+     table_data = json.load(f)
+
+ def compute_wer_table(audio, text):
+     # gr.Audio yields (sample_rate, int16 data); scale to float32 in [-1, 1]
+     audio_input = audio[1]
+     audio_input = audio_input.astype(np.float32)
+     audio_input = audio_input / 32767
+
+     trans = []
+     wer_scores = []
+     for model in model_name:
+         pipe = pipeline("automatic-speech-recognition", model=model_name[model])
+         transcription = pipe({"sampling_rate": audio[0], "raw": audio_input})['text']
+         transcription = transcription.replace(",", "").replace(".", "")
+         trans.append(transcription)
+         wer = wer_metric.compute(predictions=[transcription.upper()], references=[text.upper()])
+         wer_scores.append(wer)
+
+     result = [[model, t, s] for model, t, s in zip(model_name.keys(), trans, wer_scores)]
+
+     return result
+ with gr.Blocks() as demo:
+     with gr.Tab("Docs"):
+         gr.Markdown((Path(__file__).parent / "demo.md").read_text())
+     with gr.Tab("Demo"):
+         gr.Interface(
+             fn=compute_wer_table,
+             inputs=[
+                 gr.Audio(label="Input Audio"),
+                 gr.Textbox(label="Reference Text")
+             ],
+             outputs=gr.Dataframe(headers=["Model", "Transcription", "WER"], label="WER Results"),
+             examples=[[f"assets/output_audio_{i}.wav", table_data[i]['reference']] for i in range(100)],
+             title="ASR Model Evaluation",
+             description=(
+                 "This application allows you to evaluate the performance of various Automatic Speech Recognition (ASR) models on "
+                 "a given audio sample. Simply provide an audio file and the corresponding reference text, and the app will compute "
+                 "the Word Error Rate (WER) for each model. The results will be presented in a table that includes the model name, "
+                 "the transcribed text, and the calculated WER. "
+                 "\n\n### Table of Results\n"
+                 "The table below shows the transcriptions generated by different ASR models, along with their corresponding WER scores. "
+                 "Lower WER scores indicate better performance."
+                 "\n\n| Model | WER |\n"
+                 "|--------------------------|--------------------------|\n"
+                 "| [whisper-tiny](https://huggingface.co/openai/whisper-tiny.en) | 0.06175 |\n"
+                 "| [wav2vec2-large-960h](https://huggingface.co/facebook/wav2vec2-large-960h) | 0.01617 |\n"
+                 "| [distill-whisper-small](https://huggingface.co/distil-whisper/distil-small.en) | 0.04350 |\n"
+                 "\n\n### Data Source\n"
+                 "The data used in this demo is a subset of the [LibriSpeech](https://huggingface.co/datasets/openslr/librispeech_asr) dataset, containing the first 100 audio samples and their corresponding reference texts from the validation set."
+             ),
+         )
+
+ demo.launch(share=True)
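For reference, the evaluation step that app.py runs per model can be exercised outside Gradio. The sketch below is minimal and hypothetical: `sample.wav` and the reference string are placeholders (not assets shipped with this Space), and it assumes `evaluate`, `transformers`, and `scipy` are installed. It mirrors how `compute_wer_table` normalizes the audio and scores one transcription:

```python
# Minimal standalone sketch of the per-model evaluation step in app.py.
# "sample.wav" and the reference text below are placeholders.
import numpy as np
import evaluate
from scipy.io import wavfile
from transformers import pipeline

wer_metric = evaluate.load("wer")

# Read a mono 16-bit PCM WAV and scale it to float32 in [-1, 1],
# matching what compute_wer_table does with the gr.Audio tuple.
sample_rate, data = wavfile.read("sample.wav")
audio = data.astype(np.float32) / 32767

pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")
# Supplying the true sampling rate lets the pipeline resample when the
# audio is not already at the 16 kHz the model expects.
prediction = pipe({"sampling_rate": sample_rate, "raw": audio})["text"]
prediction = prediction.replace(",", "").replace(".", "")

reference = "YOUR REFERENCE TRANSCRIPT HERE"  # placeholder
wer = wer_metric.compute(predictions=[prediction.upper()],
                         references=[reference.upper()])
print(f"WER: {wer:.4f}")
```

WER is the number of word-level substitutions, insertions, and deletions divided by the number of reference words: one substituted plus one dropped word against a nine-word reference gives 2/9 ≈ 0.22. Stripping punctuation and upper-casing both strings, as the app does, keeps formatting differences from inflating the score.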