nbroad HF staff committed on
Commit 67c3f28
1 Parent(s): 8f7b0ec

add option for wikipedia

Files changed (1)
  1. utils.py +25 -0
utils.py CHANGED
@@ -112,7 +112,32 @@ def load_hf_dataset(ds_name: str, ds_config: str = None, ds_split: str = "train"
 
     return ds
 
+def download_wikipedia(ds_name, ds_config):
+    ds = load_dataset(ds_name, ds_config, streaming=True, split="train")
 
+    def gen():
+        for example in ds:
+            yield {"text": example["text"]}
+
+    ds2 = Dataset.from_generator(gen)
+
+    chunk_size = 200_000
+
+    filenames = []
+
+    Path("wiki_chunks").mkdir(exist_ok=True)
+
+    for chunk_num, start_idx in enumerate(range(0, len(ds2), chunk_size)):
+        end_idx = min(start_idx + chunk_size, len(ds2))
+
+        temp = ds2.select(range(start_idx, end_idx))
+
+        temp.to_parquet(f"wiki_chunks/chunk_{chunk_num}")
+        filenames.append(f"wiki_chunks/chunk_{chunk_num}")
+
+    return load_dataset("parquet", data_files=filenames, split="train")
+
+
 def get_model_and_tokenizer(model_name: str, optimization_level: str, progress):
     """
     Load the model and tokenizer from the HuggingFace Hub.
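
The new download_wikipedia helper streams the requested Wikipedia dump, materializes it with Dataset.from_generator, writes it out in 200,000-row parquet chunks under wiki_chunks/, and reloads the chunks as a single on-disk dataset. A minimal usage sketch follows; it assumes utils.py already imports load_dataset, Dataset, and Path (this hunk does not add them), and the dataset name and config shown are illustrative only, not taken from the commit:

# Hypothetical usage sketch (not part of the commit).
from utils import download_wikipedia

# Streams the dump, materializes it via a generator, writes 200k-row parquet
# chunks into wiki_chunks/, then reloads them as one memory-mapped Dataset.
wiki = download_wikipedia("wikimedia/wikipedia", "20231101.en")
print(wiki)                   # Dataset with a single "text" column
print(wiki[0]["text"][:200])  # first 200 characters of the first article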