Files changed (3)
  1. curated.py +84 -50
  2. main.py +4 -4
  3. web.py +1 -0
curated.py CHANGED
@@ -33,7 +33,9 @@ curated_sources_intro = Div(
    P(
        "Curated sources comprise high-quality datasets that contain domain-specificity.",
        B(
-           " TxT360 was strongly influenced by The Pile", D_cite(bibtex_key="thepile"), " regarding both inclusion of the dataset and filtering techniques."
+           " TxT360 was strongly influenced by The Pile",
+           D_cite(bibtex_key="thepile"),
+           " regarding both inclusion of the dataset and filtering techniques.",
        ),
        " These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ",
    ),
@@ -129,16 +131,16 @@ wikipedia_filter = pd.DataFrame(
            "0.00%",
        ],
        "Percent Removed After Local Dedup": [
-           "",
+           "0.31%",
        ],
        "Total Percentage Remaining": [
-           "",
+           "97.84%",
        ],
    }
)

table_html_wikipedia = wikipedia_filter.to_html(index=False, border=0)
- table_div_wikipedia = Div(NotStr(table_html_wikipedia), style="margin-left: auto; width: 80%; align: center;")
+ table_div_wikipedia = Div(NotStr(table_html_wikipedia))

freelaw_filter = pd.DataFrame(
    {
@@ -167,7 +169,7 @@ freelaw_filter = pd.DataFrame(
)

table_html_freelaw = freelaw_filter.to_html(index=False, border=0)
- table_div_freelaw = Div(NotStr(table_html_freelaw), style="margin-left: auto; width: 80%; align: center;")
+ table_div_freelaw = Div(NotStr(table_html_freelaw))

dmm_filter = pd.DataFrame(
    {
@@ -187,16 +189,16 @@ dmm_filter = pd.DataFrame(
            "0.00%",
        ],
        "Percent Removed After Local Dedup": [
-           "",
+           "0.00%",
        ],
        "Total Percentage Remaining": [
-           "%",
+           "100.00%",
        ],
    }
)

table_html_dmm = dmm_filter.to_html(index=False, border=0)
- table_div_dmm = Div(NotStr(table_html_dmm), style="margin-left: auto; width: 80%; align: center;")
+ table_div_dmm = Div(NotStr(table_html_dmm))


uspto_filter = pd.DataFrame(
@@ -217,16 +219,16 @@ uspto_filter = pd.DataFrame(
            "0.01%",
        ],
        "Percent Removed After Local Dedup": [
-           "",
+           "22.94%",
        ],
        "Total Percentage Remaining": [
-           "%",
+           "75.60%",
        ],
    }
)

table_html_uspto = uspto_filter.to_html(index=False, border=0)
- table_div_uspto = Div(NotStr(table_html_uspto), style="margin-left: auto; width: 80%; align: center;")
+ table_div_uspto = Div(NotStr(table_html_uspto))

pg19_filter = pd.DataFrame(
    {
@@ -246,16 +248,16 @@ pg19_filter = pd.DataFrame(
            "0.17%",
        ],
        "Percent Removed After Local Dedup": [
-           "",
+           "0.80%",
        ],
        "Total Percentage Remaining": [
-           "%",
+           "98.78%",
        ],
    }
)

table_html_pg19 = pg19_filter.to_html(index=False, border=0)
- table_div_pg19 = Div(NotStr(table_html_pg19), style="margin-left: auto; width: 80%; align: center;")
+ table_div_pg19 = Div(NotStr(table_html_pg19))


hn_filter = pd.DataFrame(
@@ -276,16 +278,16 @@ hn_filter = pd.DataFrame(
            "0.34%",
        ],
        "Percent Removed After Local Dedup": [
-           "",
+           "61.84%",
        ],
        "Total Percentage Remaining": [
-           "%",
+           "37.03%",
        ],
    }
)

table_html_hn = hn_filter.to_html(index=False, border=0)
- table_div_hn = Div(NotStr(table_html_hn), style="margin-left: auto; width: 80%; align: center;")
+ table_div_hn = Div(NotStr(table_html_hn))


uirc_filter = pd.DataFrame(
@@ -306,16 +308,16 @@ uirc_filter = pd.DataFrame(
            "1.12%",
        ],
        "Percent Removed After Local Dedup": [
-           "",
+           "0.66%",
        ],
        "Total Percentage Remaining": [
-           "%",
+           "60.72%",
        ],
    }
)

table_html_uirc = uirc_filter.to_html(index=False, border=0)
- table_div_uirc = Div(NotStr(table_html_uirc), style="margin-left: auto; width: 80%; align: center;")
+ table_div_uirc = Div(NotStr(table_html_uirc))

up_filter = pd.DataFrame(
    {
@@ -335,16 +337,16 @@ up_filter = pd.DataFrame(
            "0.00%",
        ],
        "Percent Removed After Local Dedup": [
-           "",
+           "1.00%",
        ],
        "Total Percentage Remaining": [
-           "%",
+           "99.00%",
        ],
    }
)

table_html_up = up_filter.to_html(index=False, border=0)
- table_div_up = Div(NotStr(table_html_up), style="margin-left: auto; width: 80%; align: center;")
+ table_div_up = Div(NotStr(table_html_up))

se_filter = pd.DataFrame(
    {
@@ -364,16 +366,16 @@ se_filter = pd.DataFrame(
            "0.00%",
        ],
        "Percent Removed After Local Dedup": [
-           "",
+           "0.00%",
        ],
        "Total Percentage Remaining": [
-           "%",
+           "100.00%",
        ],
    }
)

table_html_se = se_filter.to_html(index=False, border=0)
- table_div_se = Div(NotStr(table_html_se), style="margin-left: auto; width: 80%; align: center;")
+ table_div_se = Div(NotStr(table_html_se))

arx_filter = pd.DataFrame(
    {
@@ -393,16 +395,16 @@ arx_filter = pd.DataFrame(
            "0.07%",
        ],
        "Percent Removed After Local Dedup": [
-           "",
+           "0.00%",
        ],
        "Total Percentage Remaining": [
-           "%",
+           "92.20%",
        ],
    }
)

table_html_arx = arx_filter.to_html(index=False, border=0)
- table_div_arx = Div(NotStr(table_html_arx), style="margin-left: auto; width: 80%; align: center;")
+ table_div_arx = Div(NotStr(table_html_arx))

s2o_filter = pd.DataFrame(
    {
@@ -422,16 +424,16 @@ s2o_filter = pd.DataFrame(
            "0.00%",
        ],
        "Percent Removed After Local Dedup": [
-           "",
+           "0.00%",
        ],
        "Total Percentage Remaining": [
-           "%",
+           "100.00%",
        ],
    }
)

table_html_s2o = s2o_filter.to_html(index=False, border=0)
- table_div_s2o = Div(NotStr(table_html_s2o), style="margin-left: auto; width: 80%; align: center;")
+ table_div_s2o = Div(NotStr(table_html_s2o))

med_filter = pd.DataFrame(
    {
@@ -451,16 +453,16 @@ med_filter = pd.DataFrame(
            "0.02%",
        ],
        "Percent Removed After Local Dedup": [
-           "",
+           "0.00%",
        ],
        "Total Percentage Remaining": [
-           "%",
+           "91.14%",
        ],
    }
)

table_html_med = med_filter.to_html(index=False, border=0)
- table_div_med = Div(NotStr(table_html_med), style="margin-left: auto; width: 80%; align: center;")
+ table_div_med = Div(NotStr(table_html_med))

phil_filter = pd.DataFrame(
    {
@@ -480,16 +482,16 @@ phil_filter = pd.DataFrame(
            "0.12%",
        ],
        "Percent Removed After Local Dedup": [
-           "",
+           "0.00%",
        ],
        "Total Percentage Remaining": [
-           "%",
+           "79.22%",
        ],
    }
)

table_html_phil = phil_filter.to_html(index=False, border=0)
- table_div_phil = Div(NotStr(table_html_phil), style="margin-left: auto; width: 80%; align: center;")
+ table_div_phil = Div(NotStr(table_html_phil))
## end individual tables showing filterin


@@ -681,24 +683,51 @@ filtering_process = Div(
        P(
            B("Download and Extraction: "),
            "All the data was downloaded in original latex format from ArXiv official S3 repo: ",
-           A("s3://arxic/src", href="s3://arxic/src"),
-           ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format",
+           A("s3://arxiv/src", href="s3://arxiv/src"),
+           ". We aim to encode the downloaded data in UTF-8 format, and when necessary, utilize the chardet library to infer the appropriate encoding. After that, we use ",
+           A("Pandoc", href="https://pandoc.org/"),
+           " to extract information from the latex files into markdown format. The command we use is",
            D_code(
-               "pandoc -s {tex} -o out/{out_name}.md --wrap=none",
-               language="python",
+               "pandoc <raw_tex_path> -s -o <output_markdown_path> -f latex+raw_tex -t markdown_mmd [--lua-filter <lua_filter_path>]",
+               language="bash",
            ),
-           ". All markdowns were combined to create jsonl files.",
+           ". Finally, all markdowns were combined to create jsonl files.",
        ),
        P(B("Unique Data Preparation Challenges: ")),
+       P(
+           "When converting LaTeX files into Markdown using Pandoc, it is crucial to account for different data formats to minimize information loss while also filtering out noisy content in LaTeX. Below, we outline our considerations and methods for handling various data types during this conversion process:"
+       ),
        Ul(
            Li(
-               "Due to large amounts of meaningful data being contained in table formats, special consideration was taken to extract the data and proper metadata.",
+               B("Tables: "),
+               "The process for handling tables follows three main approaches. First, tables compatible with Pandoc’s built-in formats are directly converted into standard Markdown tables. Notably, LaTeX’s '\\multicolumn' and '\\multirow' commands can be successfully translated into valid Markdown tables. Second, tables unsupported by Pandoc’s native functionality, such as deluxetable or other complex LaTeX types, are preserved in their original LaTeX format to maintain the integrity of complex structures. Third, only a few remaining tables have been converted to HTML web tables.",
+               style="margin-bottom: -3px",
+           ),
+           Li(
+               B("Mathematical Expressions: "),
+               "Inline mathematical expressions are rendered in Markdown. More complex equations remain unchanged, e.g., presented as '\\begin{aligned}' blocks, to ensure accuracy and readability.",
+               style="margin-bottom: -3px",
+           ),
+           Li(
+               B("Figures: "),
+               "All figures are removed during the conversion process. Placeholder figures might not contribute to the paper’s data quality and, as such, have been omitted to streamline the output.",
+               style="margin-bottom: -3px",
+           ),
+           Li(
+               B("Section Headers: "),
+               "Section headers are converted into markdown format, using leading '#' symbols to represent the heading levels.",
+               style="margin-bottom: -3px",
+           ),
+           Li(
+               B("References: "),
+               "References are removed. Although they may be informative, references often introduce formatting inconsistencies or add little value compared to the core content of the paper.",
                style="margin-bottom: -3px",
            ),
        ),
        P(
            B(" Filters Applied: "),
-           "multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset", D_cite(bibtex_key="peS2o"),
+           "multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset",
+           D_cite(bibtex_key="peS2o"),
        ),
        Ul(
            Li(
@@ -851,13 +880,16 @@ filtering_process = Div(
                href="ttps://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/",
            ),
            ". PubMed Central (PMC) files are downloaded in an xml.tar format. The tar files are opened and converted to markdown format using pandoc",
-           D_code("pandoc -f jats {nxml} -o {pmcid}.md", language="bash"),
-           ". The markdown files are combined to create jsonl files. PubMed Abstract (PMA) files were downloaded in xml. The BeautifulSoup library was used to extract the abstract, title, and PMID. All files were stored in jsonl format.",
+           D_code(
+               "pandoc <raw_xml_path> -s -o <output_markdown_path> -f jats -t markdown_mmd [--lua-filter <lua_filter_path>]",
+               language="bash",
+           ),
+           ". The markdown files are combined to create jsonl files. PubMed Abstract (PMA) files were downloaded in xml. The BeautifulSoup library was used to extract the abstract, title, and PMID. All files were stored in jsonl format.",
        ),
        P(B("Unique Data Preparation Challenges: ")),
        Ul(
            Li(
-               "Due to large amounts of meaningful data being contained in table formats, speical consideration was taken to extract the data and proper metadata.",
+               "We tried similar attempts on PMC as we did on ArXiv. The resulted markdown might have slight difference due to the different structure of the XML files.",
                style="margin-bottom: -3px",
            ),
        ),
@@ -1584,7 +1616,8 @@ def curated():
    table_html = data_preparation_steps.to_html(index=False, border=0)
    table_div = Div(NotStr(table_html), style="margin: 40px;")

-   text = P("""This initial stage serves as the foundation for the entire
+   text = P(
+       """This initial stage serves as the foundation for the entire
    process. Here, we focus on acquiring and extracting the raw data, which can
    come from various sources such as crawling websites, using HTTP/FTP dumps,
    or working with archive dumps. For instance, to download and prepare a
@@ -1594,7 +1627,8 @@ def curated():
    preparation process: It is worth noting that some pipelines might require
    invoking additional functions or scripts to handle specific data sources or
    formats. These helper scripts can be located within specific directories
-   or modules dedicated to the dataset.""")
+   or modules dedicated to the dataset."""
+   )

    return Div(
        Section(
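Note: the updated ArXiv and PubMed Central extraction text in this file describes the same flow in both places: run Pandoc on each source file with an explicit reader (latex+raw_tex for ArXiv, jats for PMC) and the markdown_mmd writer, optionally apply a Lua filter, then combine the resulting Markdown files into JSONL. A minimal Python sketch of that flow follows; the directory names, record fields, and filter path are hypothetical placeholders, not the project's actual pipeline script.

import json
import subprocess
from pathlib import Path

# Hypothetical locations; the real pipeline's paths and Lua filter are not part of this diff.
TEX_DIR = Path("arxiv_src")             # downloaded LaTeX sources
OUT_DIR = Path("out")                   # converted Markdown plus the combined JSONL
LUA_FILTER = Path("filters/clean.lua")  # optional, mirrors the [--lua-filter <lua_filter_path>] flag

OUT_DIR.mkdir(parents=True, exist_ok=True)
records = []

for tex_path in sorted(TEX_DIR.glob("*.tex")):
    md_path = OUT_DIR / f"{tex_path.stem}.md"
    cmd = [
        "pandoc", str(tex_path), "-s", "-o", str(md_path),
        "-f", "latex+raw_tex", "-t", "markdown_mmd",
        # For PubMed Central XML the reader would be "-f jats" instead.
    ]
    if LUA_FILTER.exists():
        cmd += ["--lua-filter", str(LUA_FILTER)]
    # Skip documents Pandoc cannot parse instead of aborting the whole run.
    if subprocess.run(cmd).returncode != 0:
        continue
    records.append({"id": tex_path.stem, "text": md_path.read_text(encoding="utf-8")})

# Combine all converted markdowns into a single JSONL file.
with open(OUT_DIR / "arxiv.jsonl", "w", encoding="utf-8") as f:
    for rec in records:
        f.write(json.dumps(rec) + "\n")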
main.py CHANGED
@@ -175,7 +175,7 @@ def main():
        Div(
            A(
                "TxT360",
-               href="#section1",
+               href="#section11",
            )
        ),
        Div(
@@ -359,8 +359,8 @@ def main():
            ),
        ),
        intro(),
-       curated.curated(),
        web.web_data(),
+       curated.curated(),
        common.common_steps(),
        results.results(),
    ),
@@ -757,7 +757,7 @@ dataset_sources = pd.DataFrame(
            "StackExchange",
        ],
        "Raw Data Size": [
-           "11 TB",
+           "9.2 TB",
            "712 GB",
            "210 GB",
            "23 GB",
@@ -770,7 +770,7 @@ dataset_sources = pd.DataFrame(
            "45 GB",
        ],
        "Token Count": [
-           "5.71T",
+           "4.83T",
            "154.96B",
            "4.75B",
            "7.34B",
web.py CHANGED
@@ -376,6 +376,7 @@ def web_data():
    return Div(
        Section(
            Div(
+               H1("Web Data Processing"),
                H2("Common Crawl Snapshot Processing"),
                H3("What This Section Contains"),
                P("This section provides a complete discussion on the filtering applied to the 99 Common Crawl snapshots that comprise the web data section of TxT360. The section is split into the following topic areas: "),