Spaces:

speechbox
/

whisper-restore-punctuation

Running

Create app.py

7fb9ab5 almost 2 years ago

711 Bytes

	# Demo script: restore punctuation/casing on a normalized LibriSpeech
	# transcript by scoring it against the audio with a Whisper checkpoint.
	import torch
	from datasets import load_dataset
	from speechbox import PunctuationRestorer

	# Stream the split so only the samples actually consumed are downloaded.
	streamed_dataset = load_dataset("librispeech_asr", "clean", split="validation", streaming=True)

	# get first sample
	sample = next(iter(streamed_dataset))

	# print out normalized transcript
	print(sample["text"])
	# => "HE WAS IN A FEVERED STATE OF MIND OWING TO THE BLIGHT HIS WIFE'S ACTION THREATENED TO CAST UPON HIS ENTIRE FUTURE"

	# load the restoring class
	restorer = PunctuationRestorer.from_pretrained("openai/whisper-tiny.en")

	# Use the GPU when one is present, otherwise fall back to CPU so the
	# script still runs on hosts without CUDA instead of raising.
	device = "cuda" if torch.cuda.is_available() else "cpu"
	restorer.to(device)

	audio = sample["audio"]
	# log_probs is returned alongside the text; it is not used further here.
	restored_text, log_probs = restorer(
		audio["array"],
		sample["text"],
		sampling_rate=audio["sampling_rate"],
		num_beams=1,
	)

	print("Restored text:\n", restored_text)