"""Compare the word error rate (WER) of several ASR models on a small sample."""

import re

import evaluate
import numpy as np
from datasets import load_dataset
from tqdm import tqdm
from transformers import pipeline

# Number of validation utterances streamed for each model.
NUM_SAMPLES = 100

ds = load_dataset("openslr/librispeech_asr", "clean", split="validation", streaming=True)
ds = ds.take(NUM_SAMPLES)

# Display label -> Hugging Face Hub checkpoint id.
model_name = {
    "whisper-tiny": "openai/whisper-tiny.en",
    # NOTE(review): the label says "large" but the checkpoint below is the
    # "base" model, so results are reported under a misleading name. Confirm
    # which one was intended and align the label or the checkpoint.
    "wav2vec2-large-960h": "facebook/wav2vec2-base-960h",
    "distill-whisper-small": "distil-whisper/distil-small.en",
}

# Anything that is not a word character, an apostrophe or a space. The
# underscore is listed separately because ``\w`` would otherwise keep it.
_NON_WORD_CHARS = re.compile(r"[^\w' ]|_")


def _normalize_text(text):
    """Return *text* upper-cased with punctuation removed and spaces collapsed.

    Letters, digits and apostrophes are kept. Every other character (commas,
    hyphens, quotes, newlines, ...) is replaced by a space, so "twenty-one"
    becomes "TWENTY ONE" instead of staying one hyphenated token. Leading,
    trailing and repeated whitespace is dropped.
    """
    # Fold the typographic apostrophe into the ASCII one so that contractions
    # stay a single token. Assumes references use the ASCII form -- TODO confirm.
    unified = text.upper().replace("\u2019", "'")
    return " ".join(_NON_WORD_CHARS.sub(" ", unified).split())


def evaluate_model(ds, pipe, wer_metric, total=None):
    """Transcribe every sample in *ds* and score it against its reference.

    Args:
        ds: Iterable of samples. Each sample must provide
            ``sample["audio"]["array"]`` and ``sample["text"]``.
        pipe: Callable that takes the audio array and returns a mapping with
            a ``"text"`` entry (the transcription).
        wer_metric: Object whose ``compute(predictions=..., references=...)``
            returns the score for one utterance.
        total: Optional expected number of samples, used only to size the
            progress bar. When omitted, tqdm falls back to ``len(ds)`` if the
            iterable supports it, or shows a plain counter otherwise.

    Returns:
        A ``(wer_scores, wer_results)`` tuple: the per-sample scores, and one
        dict per sample with the keys ``"index"``, ``"transcription"``,
        ``"reference"`` and ``"wer"``.
    """
    wer_scores = []
    wer_results = []
    # Do not compute the length with len(list(ds)): on a streaming dataset
    # that consumes (downloads and decodes) every sample just for the bar.
    for idx, sample in enumerate(tqdm(ds, desc="Evaluating", total=total)):
        audio_sample = sample["audio"]
        # Normalise both sides identically so formatting differences
        # (case, punctuation, spacing) are not counted as word errors.
        transcription = _normalize_text(pipe(audio_sample["array"])["text"])
        reference = _normalize_text(sample["text"])
        wer = wer_metric.compute(predictions=[transcription], references=[reference])
        wer_scores.append(wer)
        wer_results.append({
            "index": idx,
            "transcription": transcription,
            "reference": reference,
            "wer": wer,
        })
    return wer_scores, wer_results


# Load WER metric
wer_metric = evaluate.load("wer")

results = {}
model_wer_results = {}

# Evaluate each model on the same samples.
for model, checkpoint in model_name.items():
    pipe = pipeline("automatic-speech-recognition", model=checkpoint)
    wer_scores, wer_results = evaluate_model(ds, pipe, wer_metric, total=NUM_SAMPLES)
    # This is the unweighted mean of the per-utterance scores, so short
    # utterances weigh as much as long ones. An empty run yields NaN.
    results[model] = np.mean(wer_scores) if wer_scores else float("nan")
    model_wer_results[model] = wer_results

for model, score in results.items():
    print(f"Model: {model}, WER: {score}")