BlankCheng committed on
Commit
0863523
2 Parent(s): 076f237 2dd50ea

Merge conflicts

Browse files
Files changed (3) hide show
  1. curated.py +37 -63
  2. main.py +2 -2
  3. web.py +1 -0
curated.py CHANGED
@@ -131,18 +131,16 @@ wikipedia_filter = pd.DataFrame(
131
  "0.00%",
132
  ],
133
  "Percent Removed After Local Dedup": [
134
- "",
135
  ],
136
  "Total Percentage Remaining": [
137
- "",
138
  ],
139
  }
140
  )
141
 
142
  table_html_wikipedia = wikipedia_filter.to_html(index=False, border=0)
143
- table_div_wikipedia = Div(
144
- NotStr(table_html_wikipedia), style="margin-left: auto; width: 80%; align: center;"
145
- )
146
 
147
  freelaw_filter = pd.DataFrame(
148
  {
@@ -171,9 +169,7 @@ freelaw_filter = pd.DataFrame(
171
  )
172
 
173
  table_html_freelaw = freelaw_filter.to_html(index=False, border=0)
174
- table_div_freelaw = Div(
175
- NotStr(table_html_freelaw), style="margin-left: auto; width: 80%; align: center;"
176
- )
177
 
178
  dmm_filter = pd.DataFrame(
179
  {
@@ -193,18 +189,16 @@ dmm_filter = pd.DataFrame(
193
  "0.00%",
194
  ],
195
  "Percent Removed After Local Dedup": [
196
- "",
197
  ],
198
  "Total Percentage Remaining": [
199
- "%",
200
  ],
201
  }
202
  )
203
 
204
  table_html_dmm = dmm_filter.to_html(index=False, border=0)
205
- table_div_dmm = Div(
206
- NotStr(table_html_dmm), style="margin-left: auto; width: 80%; align: center;"
207
- )
208
 
209
 
210
  uspto_filter = pd.DataFrame(
@@ -225,18 +219,16 @@ uspto_filter = pd.DataFrame(
225
  "0.01%",
226
  ],
227
  "Percent Removed After Local Dedup": [
228
- "",
229
  ],
230
  "Total Percentage Remaining": [
231
- "%",
232
  ],
233
  }
234
  )
235
 
236
  table_html_uspto = uspto_filter.to_html(index=False, border=0)
237
- table_div_uspto = Div(
238
- NotStr(table_html_uspto), style="margin-left: auto; width: 80%; align: center;"
239
- )
240
 
241
  pg19_filter = pd.DataFrame(
242
  {
@@ -256,18 +248,16 @@ pg19_filter = pd.DataFrame(
256
  "0.17%",
257
  ],
258
  "Percent Removed After Local Dedup": [
259
- "",
260
  ],
261
  "Total Percentage Remaining": [
262
- "%",
263
  ],
264
  }
265
  )
266
 
267
  table_html_pg19 = pg19_filter.to_html(index=False, border=0)
268
- table_div_pg19 = Div(
269
- NotStr(table_html_pg19), style="margin-left: auto; width: 80%; align: center;"
270
- )
271
 
272
 
273
  hn_filter = pd.DataFrame(
@@ -288,18 +278,16 @@ hn_filter = pd.DataFrame(
288
  "0.34%",
289
  ],
290
  "Percent Removed After Local Dedup": [
291
- "",
292
  ],
293
  "Total Percentage Remaining": [
294
- "%",
295
  ],
296
  }
297
  )
298
 
299
  table_html_hn = hn_filter.to_html(index=False, border=0)
300
- table_div_hn = Div(
301
- NotStr(table_html_hn), style="margin-left: auto; width: 80%; align: center;"
302
- )
303
 
304
 
305
  uirc_filter = pd.DataFrame(
@@ -320,18 +308,16 @@ uirc_filter = pd.DataFrame(
320
  "1.12%",
321
  ],
322
  "Percent Removed After Local Dedup": [
323
- "",
324
  ],
325
  "Total Percentage Remaining": [
326
- "%",
327
  ],
328
  }
329
  )
330
 
331
  table_html_uirc = uirc_filter.to_html(index=False, border=0)
332
- table_div_uirc = Div(
333
- NotStr(table_html_uirc), style="margin-left: auto; width: 80%; align: center;"
334
- )
335
 
336
  up_filter = pd.DataFrame(
337
  {
@@ -351,18 +337,16 @@ up_filter = pd.DataFrame(
351
  "0.00%",
352
  ],
353
  "Percent Removed After Local Dedup": [
354
- "",
355
  ],
356
  "Total Percentage Remaining": [
357
- "%",
358
  ],
359
  }
360
  )
361
 
362
  table_html_up = up_filter.to_html(index=False, border=0)
363
- table_div_up = Div(
364
- NotStr(table_html_up), style="margin-left: auto; width: 80%; align: center;"
365
- )
366
 
367
  se_filter = pd.DataFrame(
368
  {
@@ -382,18 +366,16 @@ se_filter = pd.DataFrame(
382
  "0.00%",
383
  ],
384
  "Percent Removed After Local Dedup": [
385
- "",
386
  ],
387
  "Total Percentage Remaining": [
388
- "%",
389
  ],
390
  }
391
  )
392
 
393
  table_html_se = se_filter.to_html(index=False, border=0)
394
- table_div_se = Div(
395
- NotStr(table_html_se), style="margin-left: auto; width: 80%; align: center;"
396
- )
397
 
398
  arx_filter = pd.DataFrame(
399
  {
@@ -413,18 +395,16 @@ arx_filter = pd.DataFrame(
413
  "0.07%",
414
  ],
415
  "Percent Removed After Local Dedup": [
416
- "",
417
  ],
418
  "Total Percentage Remaining": [
419
- "%",
420
  ],
421
  }
422
  )
423
 
424
  table_html_arx = arx_filter.to_html(index=False, border=0)
425
- table_div_arx = Div(
426
- NotStr(table_html_arx), style="margin-left: auto; width: 80%; align: center;"
427
- )
428
 
429
  s2o_filter = pd.DataFrame(
430
  {
@@ -444,18 +424,16 @@ s2o_filter = pd.DataFrame(
444
  "0.00%",
445
  ],
446
  "Percent Removed After Local Dedup": [
447
- "",
448
  ],
449
  "Total Percentage Remaining": [
450
- "%",
451
  ],
452
  }
453
  )
454
 
455
  table_html_s2o = s2o_filter.to_html(index=False, border=0)
456
- table_div_s2o = Div(
457
- NotStr(table_html_s2o), style="margin-left: auto; width: 80%; align: center;"
458
- )
459
 
460
  med_filter = pd.DataFrame(
461
  {
@@ -475,18 +453,16 @@ med_filter = pd.DataFrame(
475
  "0.02%",
476
  ],
477
  "Percent Removed After Local Dedup": [
478
- "",
479
  ],
480
  "Total Percentage Remaining": [
481
- "%",
482
  ],
483
  }
484
  )
485
 
486
  table_html_med = med_filter.to_html(index=False, border=0)
487
- table_div_med = Div(
488
- NotStr(table_html_med), style="margin-left: auto; width: 80%; align: center;"
489
- )
490
 
491
  phil_filter = pd.DataFrame(
492
  {
@@ -506,18 +482,16 @@ phil_filter = pd.DataFrame(
506
  "0.12%",
507
  ],
508
  "Percent Removed After Local Dedup": [
509
- "",
510
  ],
511
  "Total Percentage Remaining": [
512
- "%",
513
  ],
514
  }
515
  )
516
 
517
  table_html_phil = phil_filter.to_html(index=False, border=0)
518
- table_div_phil = Div(
519
- NotStr(table_html_phil), style="margin-left: auto; width: 80%; align: center;"
520
- )
521
  ## end individual tables showing filterin
522
 
523
 
 
131
  "0.00%",
132
  ],
133
  "Percent Removed After Local Dedup": [
134
+ "0.31%",
135
  ],
136
  "Total Percentage Remaining": [
137
+ "97.84%",
138
  ],
139
  }
140
  )
141
 
142
  table_html_wikipedia = wikipedia_filter.to_html(index=False, border=0)
143
+ table_div_wikipedia = Div(NotStr(table_html_wikipedia))
 
 
144
 
145
  freelaw_filter = pd.DataFrame(
146
  {
 
169
  )
170
 
171
  table_html_freelaw = freelaw_filter.to_html(index=False, border=0)
172
+ table_div_freelaw = Div(NotStr(table_html_freelaw))
 
 
173
 
174
  dmm_filter = pd.DataFrame(
175
  {
 
189
  "0.00%",
190
  ],
191
  "Percent Removed After Local Dedup": [
192
+ "0.00%",
193
  ],
194
  "Total Percentage Remaining": [
195
+ "100.00%",
196
  ],
197
  }
198
  )
199
 
200
  table_html_dmm = dmm_filter.to_html(index=False, border=0)
201
+ table_div_dmm = Div(NotStr(table_html_dmm))
 
 
202
 
203
 
204
  uspto_filter = pd.DataFrame(
 
219
  "0.01%",
220
  ],
221
  "Percent Removed After Local Dedup": [
222
+ "22.94%",
223
  ],
224
  "Total Percentage Remaining": [
225
+ "75.60%",
226
  ],
227
  }
228
  )
229
 
230
  table_html_uspto = uspto_filter.to_html(index=False, border=0)
231
+ table_div_uspto = Div(NotStr(table_html_uspto))
 
 
232
 
233
  pg19_filter = pd.DataFrame(
234
  {
 
248
  "0.17%",
249
  ],
250
  "Percent Removed After Local Dedup": [
251
+ "0.80%",
252
  ],
253
  "Total Percentage Remaining": [
254
+ "98.78%",
255
  ],
256
  }
257
  )
258
 
259
  table_html_pg19 = pg19_filter.to_html(index=False, border=0)
260
+ table_div_pg19 = Div(NotStr(table_html_pg19))
 
 
261
 
262
 
263
  hn_filter = pd.DataFrame(
 
278
  "0.34%",
279
  ],
280
  "Percent Removed After Local Dedup": [
281
+ "61.84%",
282
  ],
283
  "Total Percentage Remaining": [
284
+ "37.03%",
285
  ],
286
  }
287
  )
288
 
289
  table_html_hn = hn_filter.to_html(index=False, border=0)
290
+ table_div_hn = Div(NotStr(table_html_hn))
 
 
291
 
292
 
293
  uirc_filter = pd.DataFrame(
 
308
  "1.12%",
309
  ],
310
  "Percent Removed After Local Dedup": [
311
+ "0.66%",
312
  ],
313
  "Total Percentage Remaining": [
314
+ "60.72%",
315
  ],
316
  }
317
  )
318
 
319
  table_html_uirc = uirc_filter.to_html(index=False, border=0)
320
+ table_div_uirc = Div(NotStr(table_html_uirc))
 
 
321
 
322
  up_filter = pd.DataFrame(
323
  {
 
337
  "0.00%",
338
  ],
339
  "Percent Removed After Local Dedup": [
340
+ "1.00%",
341
  ],
342
  "Total Percentage Remaining": [
343
+ "99.00%",
344
  ],
345
  }
346
  )
347
 
348
  table_html_up = up_filter.to_html(index=False, border=0)
349
+ table_div_up = Div(NotStr(table_html_up))
 
 
350
 
351
  se_filter = pd.DataFrame(
352
  {
 
366
  "0.00%",
367
  ],
368
  "Percent Removed After Local Dedup": [
369
+ "0.00%",
370
  ],
371
  "Total Percentage Remaining": [
372
+ "100.00%",
373
  ],
374
  }
375
  )
376
 
377
  table_html_se = se_filter.to_html(index=False, border=0)
378
+ table_div_se = Div(NotStr(table_html_se))
 
 
379
 
380
  arx_filter = pd.DataFrame(
381
  {
 
395
  "0.07%",
396
  ],
397
  "Percent Removed After Local Dedup": [
398
+ "0.00%",
399
  ],
400
  "Total Percentage Remaining": [
401
+ "92.20%",
402
  ],
403
  }
404
  )
405
 
406
  table_html_arx = arx_filter.to_html(index=False, border=0)
407
+ table_div_arx = Div(NotStr(table_html_arx))
 
 
408
 
409
  s2o_filter = pd.DataFrame(
410
  {
 
424
  "0.00%",
425
  ],
426
  "Percent Removed After Local Dedup": [
427
+ "0.00%",
428
  ],
429
  "Total Percentage Remaining": [
430
+ "100.00%",
431
  ],
432
  }
433
  )
434
 
435
  table_html_s2o = s2o_filter.to_html(index=False, border=0)
436
+ table_div_s2o = Div(NotStr(table_html_s2o))
 
 
437
 
438
  med_filter = pd.DataFrame(
439
  {
 
453
  "0.02%",
454
  ],
455
  "Percent Removed After Local Dedup": [
456
+ "0.00%",
457
  ],
458
  "Total Percentage Remaining": [
459
+ "91.14%",
460
  ],
461
  }
462
  )
463
 
464
  table_html_med = med_filter.to_html(index=False, border=0)
465
+ table_div_med = Div(NotStr(table_html_med))
 
 
466
 
467
  phil_filter = pd.DataFrame(
468
  {
 
482
  "0.12%",
483
  ],
484
  "Percent Removed After Local Dedup": [
485
+ "0.00%",
486
  ],
487
  "Total Percentage Remaining": [
488
+ "79.22%",
489
  ],
490
  }
491
  )
492
 
493
  table_html_phil = phil_filter.to_html(index=False, border=0)
494
+ table_div_phil = Div(NotStr(table_html_phil))
 
 
495
  ## end individual tables showing filterin
496
 
497
 
main.py CHANGED
@@ -757,7 +757,7 @@ dataset_sources = pd.DataFrame(
757
  "StackExchange",
758
  ],
759
  "Raw Data Size": [
760
- "11 TB",
761
  "712 GB",
762
  "210 GB",
763
  "23 GB",
@@ -770,7 +770,7 @@ dataset_sources = pd.DataFrame(
770
  "45 GB",
771
  ],
772
  "Token Count": [
773
- "5.71T",
774
  "154.96B",
775
  "4.75B",
776
  "7.34B",
 
757
  "StackExchange",
758
  ],
759
  "Raw Data Size": [
760
+ "9.2 TB",
761
  "712 GB",
762
  "210 GB",
763
  "23 GB",
 
770
  "45 GB",
771
  ],
772
  "Token Count": [
773
+ "4.83T",
774
  "154.96B",
775
  "4.75B",
776
  "7.34B",
web.py CHANGED
@@ -376,6 +376,7 @@ def web_data():
376
  return Div(
377
  Section(
378
  Div(
 
379
  H2("Common Crawl Snapshot Processing"),
380
  H3("What This Section Contains"),
381
  P("This section provides a complete discussion on the filtering applied to the 99 Common Crawl snapshots that comprise the web data section of TxT360. The section is split into the following topic areas: "),
 
376
  return Div(
377
  Section(
378
  Div(
379
+ H1("Web Data Processing"),
380
  H2("Common Crawl Snapshot Processing"),
381
  H3("What This Section Contains"),
382
  P("This section provides a complete discussion on the filtering applied to the 99 Common Crawl snapshots that comprise the web data section of TxT360. The section is split into the following topic areas: "),