from transformers import pipeline
import evaluate
import numpy as np
import gradio as gr
import json
from pathlib import Path

# Load the word-error-rate metric from the `evaluate` library
wer_metric = evaluate.load("wer")

# Display name -> Hugging Face model ID for each ASR model under evaluation.
# The wav2vec2 entry points at the large-960h checkpoint so it matches its
# display name and the results table in the description below.
model_name = {
    "whisper-tiny": "openai/whisper-tiny.en",
    "wav2vec2-large-960h": "facebook/wav2vec2-large-960h",
    "distill-whisper-small": "distil-whisper/distil-small.en",
}

# Cache pipelines so each model is loaded at most once per process
_pipelines = {}


def get_pipeline(model_id):
    if model_id not in _pipelines:
        _pipelines[model_id] = pipeline("automatic-speech-recognition", model=model_id)
    return _pipelines[model_id]


# Load the reference transcripts for the bundled example audio files
with open("ds_data.json", "r") as f:
    table_data = json.load(f)


def compute_wer_table(audio, text):
    """Transcribe `audio` with every model and score each result against `text`.

    `audio` is the (sample_rate, data) tuple produced by gr.Audio.
    Returns one [model, transcription, wer] row per model.
    """
    sample_rate, audio_input = audio
    # Downmix stereo to mono and rescale int16 PCM to float32 in [-1, 1]
    audio_input = audio_input.astype(np.float32)
    if audio_input.ndim > 1:
        audio_input = audio_input.mean(axis=1)
    audio_input = audio_input / 32767

    trans = []
    wer_scores = []
    for model in model_name:
        pipe = get_pipeline(model_name[model])
        # Pass the sampling rate so the pipeline can resample to 16 kHz if needed
        transcription = pipe({"raw": audio_input, "sampling_rate": sample_rate})["text"]
        # Strip punctuation and uppercase both sides so the WER comparison is not
        # skewed by formatting differences between models
        transcription = transcription.replace(",", "").replace(".", "").replace("!", "").replace("?", "")
        trans.append(transcription)
        wer = wer_metric.compute(predictions=[transcription.upper()], references=[text.upper()])
        wer_scores.append(wer)

    return [[name, t, s] for name, t, s in zip(model_name.keys(), trans, wer_scores)]


with gr.Blocks() as demo:
    with gr.Tab("Docs"):
        gr.Markdown((Path(__file__).parent / "demo.md").read_text())
    with gr.Tab("Demo"):
        gr.Interface(
            fn=compute_wer_table,
            inputs=[
                gr.Audio(label="Input Audio"),
                gr.Textbox(label="Reference Text"),
            ],
            outputs=gr.Dataframe(headers=["Model", "Transcription", "WER"], label="WER Results"),
            examples=[[f"assets/output_audio_{i}.wav", table_data[i]["reference"]] for i in range(100)],
            title="ASR Model Evaluation",
            description=(
                "This application allows you to evaluate the performance of various Automatic Speech Recognition (ASR) models on "
                "a given audio sample. Simply provide an audio file and the corresponding reference text, and the app will compute "
                "the Word Error Rate (WER) for each model. The results will be presented in a table that includes the model name, "
                "the transcribed text, and the calculated WER. "
                "\n\n### Table of Results\n"
                "The table below summarizes the WER each model achieved on the bundled example set. "
                "Lower WER scores indicate better performance."
                "\n\n| Model | WER |\n"
                "|--------------------------|--------------------------|\n"
                "| [whisper-tiny](https://huggingface.co/openai/whisper-tiny.en) | 0.05511 |\n"
                "| [wav2vec2-large-960h](https://huggingface.co/facebook/wav2vec2-large-960h) | 0.01617 |\n"
                "| [distill-whisper-small](https://huggingface.co/distil-whisper/distil-small.en) | 0.03686 |\n"
                "\n\n### Data Source\n"
                "The data used in this demo is a subset of the [LibriSpeech](https://huggingface.co/datasets/openslr/librispeech_asr) dataset: "
                "the first 100 audio samples and their corresponding reference texts from the validation split."
            ),
        )

demo.launch()
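
# --- WER sanity check (illustrative sketch; not executed by the app) ---
# `evaluate`'s WER is (substitutions + deletions + insertions) divided by the
# number of reference words, so one substituted word in a four-word reference
# should yield 0.25. A quick way to confirm the metric behaves as expected:
#
#   assert wer_metric.compute(
#       predictions=["THE CAT SAT DOWN"],
#       references=["THE CAT SAT UP"],
#   ) == 0.25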