victormiller committed on
Commit
d82c7d9
1 Parent(s): 394b3ae

Update common.py

Browse files
Files changed (1) hide show
  1. common.py +2 -1
common.py CHANGED
@@ -37,6 +37,7 @@ dask.bag.from_sequence(doc_file_paths)
37
  .map_partitions(make_doc_pairs)
38
  .compute()
39
  """
 
40
 
41
  global_div = Div(
42
  Section(
@@ -97,7 +98,7 @@ global_div = Div(
97
  H3("Removing PII"),
98
  P("We have removed two types of PII from the dataset: email address and IP address. Regular expressions are used to identify and replace these PII with a generic placeholder. Below is an example of how we removed email addresses from the dataset:"),
99
  P("We have used the following regular expressions to identify and replace PII:"),
100
- Ul(Li("Email: NEED TO UPDATE"),Li("IP Address: NEED TO UPDATE")),
101
  ),
102
  Section(
103
  H2("Normalization Form C (NFC)"),
 
37
  .map_partitions(make_doc_pairs)
38
  .compute()
39
  """
40
+ email_code = "r&quot[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:(?:[A-Za-z0-9](?:[" r"A-Za-z0-9-]*[A-Za-z0-9])?\.)+[A-Za-z0-9](?:[A-Za-z0-9-]*[A-Za-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[&quot r&quot01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[A-Za-z0-9-]*[A-Za-z0-9]:)])"
41
 
42
  global_div = Div(
43
  Section(
 
98
  H3("Removing PII"),
99
  P("We have removed two types of PII from the dataset: email address and IP address. Regular expressions are used to identify and replace these PII with a generic placeholder. Below is an example of how we removed email addresses from the dataset:"),
100
  P("We have used the following regular expressions to identify and replace PII:"),
101
+ Ul(Li("Email:" + Pre(Code(email_code))),Li("IP Address: NEED TO UPDATE")),
102
  ),
103
  Section(
104
  H2("Normalization Form C (NFC)"),