# Evaluate ASR models

This is a breakdown of the steps to evaluate ASR models on a small subset of the LibriSpeech dataset, based on the script in the `evaluate_asr.py` file.

## 0. Import the necessary libraries

```python
from datasets import load_dataset
from transformers import pipeline
import evaluate
import numpy as np
from tqdm import tqdm
import json
```

## 1. Pick a speech dataset (English) from the Hugging Face Hub and create a small subset of this dataset (100 rows) by streaming the data

We will use the `librispeech_asr` dataset from the Hugging Face Hub, with the `clean` configuration and the `validation` split. Streaming lets us read just the first 100 examples without downloading the full dataset.

```python
# Load the data as a stream and keep only the first 100 examples
ds = load_dataset("openslr/librispeech_asr", "clean", split="validation", streaming=True)
ds = ds.take(100)
```

## 2. Pick three transformers-compatible speech recognition models

We will evaluate the following models:

- `openai/whisper-tiny.en`
- `facebook/wav2vec2-base-960h`
- `distil-whisper/distil-small.en`

```python
model_names = {
    "whisper-tiny": "openai/whisper-tiny.en",
    "wav2vec2-base-960h": "facebook/wav2vec2-base-960h",
    "distil-whisper-small": "distil-whisper/distil-small.en",
}
```

## 3. Evaluate the models on the dataset

```python
def evaluate_model(ds, pipe, wer_metric, num_samples=100):
    wer_scores = []
    wer_results = []
    for idx, sample in enumerate(tqdm(ds, desc="Evaluating", total=num_samples)):
        audio_sample = sample["audio"]
        # Pass the sampling rate explicitly so the pipeline can resample if needed
        # (LibriSpeech is 16 kHz, which already matches these models)
        transcription = pipe({
            "array": audio_sample["array"],
            "sampling_rate": audio_sample["sampling_rate"],
        })["text"]
        # Strip common punctuation before scoring
        transcription = transcription.replace(",", "").replace(".", "").replace("!", "").replace("?", "")
        # LibriSpeech references are uppercase, so compare uppercased text
        wer = wer_metric.compute(predictions=[transcription.upper()],
                                 references=[sample["text"].upper()])
        wer_scores.append(wer)
        wer_results.append({
            "index": idx,
            "transcription": transcription.upper(),
            "reference": sample["text"].upper(),
            "wer": wer
        })
    return wer_scores, wer_results

# Load the WER metric
wer_metric = evaluate.load("wer")

results = {}
model_wer_results = {}

# Evaluate each model on the same 100 examples
for model, checkpoint in model_names.items():
    pipe = pipeline("automatic-speech-recognition", model=checkpoint)
    wer_scores, wer_results = evaluate_model(ds, pipe, wer_metric)
    # Cast to a plain float for clean printing and JSON serialization
    results[model] = float(np.mean(wer_scores))
    model_wer_results[model] = wer_results

for model in results:
    print(f"Model: {model}, WER: {results[model]:.3f}")
```

Note that this averages the per-utterance WER, which weights every utterance equally; a corpus-level WER, pooled over all words, can differ.
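If a single corpus-level score is wanted instead of the per-utterance mean, the metric can be fed all predictions and references at once. A minimal sketch reusing the names above; the `whisper-tiny` key comes from the `model_names` dictionary in step 2:

```python
# Corpus-level WER pools errors over all words, so long utterances weigh more
# than in the per-utterance mean computed above.
preds = [r["transcription"] for r in model_wer_results["whisper-tiny"]]
refs = [r["reference"] for r in model_wer_results["whisper-tiny"]]
corpus_wer = wer_metric.compute(predictions=preds, references=refs)
print(f"Corpus WER (whisper-tiny): {corpus_wer:.3f}")
```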
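It can also be useful to eyeball the utterances each model struggles with most. A short sketch that sorts the per-sample results collected above by WER:

```python
# Print the three highest-WER utterances for each model
for model, samples in model_wer_results.items():
    worst = sorted(samples, key=lambda r: r["wer"], reverse=True)[:3]
    print(f"\n{model} - hardest utterances:")
    for r in worst:
        print(f"  WER {r['wer']:.2f}")
        print(f"    hyp: {r['transcription']}")
        print(f"    ref: {r['reference']}")
```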
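To keep the results around for later analysis, both the per-model means and the per-sample details can be written to disk. A minimal sketch; the `wer_results.json` filename is an assumption, not taken from `evaluate_asr.py`:

```python
# Persist aggregate and per-sample results (filename is illustrative)
with open("wer_results.json", "w") as f:
    json.dump({"mean_wer": results, "per_sample": model_wer_results}, f, indent=2)
```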
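A final note on normalization: the chained `replace` calls in step 3 only strip four punctuation marks. A regex keeps the scored text closer to letters and spaces only; this `normalize_text` helper is a hypothetical alternative, not part of the original script:

```python
import re

def normalize_text(text: str) -> str:
    # Keep letters, digits, apostrophes, and spaces; collapse repeated whitespace
    text = re.sub(r"[^A-Za-z0-9' ]+", " ", text)
    return re.sub(r"\s+", " ", text).strip().upper()

# e.g. normalize_text("Mr. Quilter's class, I believe?") -> "MR QUILTER'S CLASS I BELIEVE"
```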