# Lint as: python3
"""Simple, minimal ASR dataset template."""

import csv
import os

import datasets
from datasets.tasks import AutomaticSpeechRecognition

_CITATION = ""

_DESCRIPTION = """\
This is a private dataset
"""

_URL = "https://localhost"
_DL_URL = "http://localhost:8000/data_simple.tgz"


class SimpleTplConfig(datasets.BuilderConfig):
    """BuilderConfig for the simple speech dataset template."""

    def __init__(self, name, **kwargs):
        """Create a config with a fixed version and description.

        Args:
            name: `string`, name of the configuration.
            **kwargs: keyword arguments forwarded to super.
        """
        # NOTE(review): `num_of_voice` is not read anywhere in this file;
        # confirm whether external code depends on it.
        self.num_of_voice = 100
        super().__init__(
            name=name,
            version=datasets.Version("1.1.0", ""),
            description="Simple Dataset.",
            **kwargs,
        )


class SimpleTpl(datasets.GeneratorBasedBuilder):
    """Simple Speech dataset."""

    VERSION = datasets.Version("1.1.0")
    DEFAULT_WRITER_BATCH_SIZE = 1000

    # NOTE: the builder is configured with a plain `datasets.BuilderConfig`;
    # `SimpleTplConfig` above is not wired in here.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="main", version=VERSION, description="The simple dataset"
        )
    ]

    def _info(self):
        """Return the dataset metadata: features, homepage, citation, ASR task."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "audio": datasets.Audio(sampling_rate=16000),
                    "path": datasets.Value("string"),
                    "sentence": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage=_URL,
            citation=_CITATION,
            task_templates=[
                AutomaticSpeechRecognition(
                    audio_file_path_column="path",
                    transcription_column="sentence",
                )
            ],
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive, then describe the three splits.

        The archive is expected to unpack into a `data_simple` folder holding
        an `audio` directory plus `train.csv`, `valid.csv` and `test.csv`.

        Args:
            dl_manager: download manager used to fetch and extract `_DL_URL`.

        Returns:
            A list with the TRAIN, VALIDATION and TEST `SplitGenerator`s, each
            passing `wav_path` and `csv_path` to `_generate_examples`.
        """
        root_path = dl_manager.download_and_extract(_DL_URL)
        root_path = os.path.join(root_path, "data_simple")
        wav_path = os.path.join(root_path, "audio")

        split_files = (
            (datasets.Split.TRAIN, "train.csv"),
            (datasets.Split.VALIDATION, "valid.csv"),
            (datasets.Split.TEST, "test.csv"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "wav_path": wav_path,
                    "csv_path": os.path.join(root_path, csv_name),
                },
            )
            for split_name, csv_name in split_files
        ]

    def _generate_examples(self, wav_path, csv_path):
        """Yield `(key, example)` pairs read from one split's CSV file.

        The first CSV row is treated as a header and skipped. Every other
        non-empty row must hold exactly two columns: the audio file name and
        its transcription.

        Args:
            wav_path: directory that contains the audio files.
            csv_path: path of the CSV file listing the split's examples.

        Yields:
            Tuples of the resolved audio path (used as the example key) and a
            dict with the `path`, `audio` and `sentence` fields.

        Raises:
            ValueError: if a non-empty row does not have exactly two columns.
        """
        # newline="" lets the csv module handle line endings itself, as its
        # documentation recommends.
        with open(csv_path, encoding="utf-8", newline="") as csv_file:
            csv_reader = csv.reader(
                csv_file, delimiter=",", quotechar=None, skipinitialspace=True
            )
            next(csv_reader, None)  # Skip the header row.
            for row in csv_reader:
                if not row:
                    # Blank lines (e.g. a trailing newline) carry no example.
                    continue
                if len(row) != 2:
                    raise ValueError(
                        f"{csv_path}, line {csv_reader.line_num}: expected 2 "
                        f"columns (audio file, sentence), got {len(row)}"
                    )
                file_name, sentence = row
                # Resolve the file against the audio directory; os.path.join
                # leaves `file_name` untouched when it is already absolute.
                audio_path = os.path.join(wav_path, file_name)
                example = {
                    "path": audio_path,
                    "audio": audio_path,
                    "sentence": sentence,
                }
                yield audio_path, example