"""Character sets and per-language settings for text normalization.

Most constants below are fragments of a regular-expression character class
(single characters, ``\\uXXXX`` escapes or ``a-b`` ranges) that are
concatenated into larger sets.  ``norm_config`` maps a language code to its
settings; the ``"*"`` entry is the default every other entry is derived from.
"""
import os
import re

colon = ":"
comma = ","
exclamation_mark = "!"
period = re.escape(".")
question_mark = re.escape("?")
semicolon = ";"

left_curly_bracket = "{"
right_curly_bracket = "}"
quotation_mark = '"'

basic_punc = (
    period
    + question_mark
    + comma
    + colon
    + exclamation_mark
    + left_curly_bracket
    + right_curly_bracket
)

# General punc unicode block (0x2000-0x206F)
zero_width_space = r"\u200B"
zero_width_nonjoiner = r"\u200C"
left_to_right_mark = r"\u200E"
right_to_left_mark = r"\u200F"
left_to_right_embedding = r"\u202A"
pop_directional_formatting = r"\u202C"

# Here are some commonly ill-typed versions of apostrophe
right_single_quotation_mark = r"\u2019"
left_single_quotation_mark = r"\u2018"

# Language specific definitions
# Spanish
inverted_exclamation_mark = r"\u00A1"
inverted_question_mark = r"\u00BF"

# Hindi
hindi_danda = "\u0964"

# Egyptian Arabic
# arabic_percent = r"\u066A"
arabic_comma = r"\u060C"
arabic_question_mark = r"\u061F"
arabic_semicolon = r"\u061B"
arabic_diacritics = r"\u064B-\u0652"
arabic_subscript_alef_and_inverted_damma = r"\u0656-\u0657"

# Chinese
full_stop = r"\u3002"
full_comma = r"\uFF0C"
full_exclamation_mark = r"\uFF01"
full_question_mark = r"\uFF1F"
full_semicolon = r"\uFF1B"
full_colon = r"\uFF1A"
full_parentheses = r"\uFF08\uFF09"
quotation_mark_horizontal = r"\u300C-\u300F"
# Vertical corner brackets live at U+FE41-U+FE44.  The range used previously,
# U+FF41-U+FF44, is FULLWIDTH LATIN SMALL LETTER A-D, i.e. letters rather than
# punctuation.
quotation_mark_vertical = r"\uFE41-\uFE44"
title_marks = r"\u3008-\u300B"
wavy_low_line = r"\uFE4F"
ellipsis = r"\u22EF"
enumeration_comma = r"\u3001"
hyphenation_point = r"\u2027"
forward_slash = r"\uFF0F"
wavy_dash = r"\uFF5E"
box_drawings_light_horizontal = r"\u2500"
fullwidth_low_line = r"\uFF3F"
chinese_punc = (
    full_stop
    + full_comma
    + full_exclamation_mark
    + full_question_mark
    + full_semicolon
    + full_colon
    + full_parentheses
    + quotation_mark_horizontal
    + quotation_mark_vertical
    + title_marks
    + wavy_low_line
    + ellipsis
    + enumeration_comma
    + hyphenation_point
    + forward_slash
    + wavy_dash
    + box_drawings_light_horizontal
    + fullwidth_low_line
)

# Armenian
armenian_apostrophe = r"\u055A"
emphasis_mark = r"\u055B"
# NOTE: this rebinds the module-level ``exclamation_mark`` defined at the top
# of the file.  ``basic_punc`` was already built from the ASCII "!" above, so
# it is unaffected; the name is kept as-is for backward compatibility.
exclamation_mark = r"\u055C"
armenian_comma = r"\u055D"
armenian_question_mark = r"\u055E"
abbreviation_mark = r"\u055F"
armenian_full_stop = r"\u0589"
armenian_punc = (
    armenian_apostrophe
    + emphasis_mark
    + exclamation_mark
    + armenian_comma
    + armenian_question_mark
    + abbreviation_mark
    + armenian_full_stop
)

# Keys of the shared mapping below.
# NOTE(review): these three look like they may once have been the written
# forms of HTML entities that were decoded at some point -- confirm against
# the upstream file.
lesser_than_symbol = r"<"
greater_than_symbol = r">"

lesser_than_sign = r"\u003c"
greater_than_sign = r"\u003e"

# NOTE(review): spelled as an explicit escape so the NO-BREAK SPACE is visible
# in editors.  Confirm the original literal was U+00A0 and not an ordinary
# space; an ordinary space mapped to "" would presumably delete every space.
nbsp_written_form = "\u00a0"

# Quotation marks
left_double_quotes = r"\u201c"
right_double_quotes = r"\u201d"
left_double_angle = r"\u00ab"
right_double_angle = r"\u00bb"
left_single_angle = r"\u2039"
right_single_angle = r"\u203a"
low_double_quotes = r"\u201e"
low_single_quotes = r"\u201a"
high_double_quotes = r"\u201f"
high_single_quotes = r"\u201b"

all_punct_quotes = (
    left_double_quotes
    + right_double_quotes
    + left_double_angle
    + right_double_angle
    + left_single_angle
    + right_single_angle
    + low_double_quotes
    + low_single_quotes
    + high_double_quotes
    + high_single_quotes
    + right_single_quotation_mark
    + left_single_quotation_mark
)
mapping_quotes = (
    "["
    + high_single_quotes
    + right_single_quotation_mark
    + left_single_quotation_mark
    + "]"
)

# Digits
english_digits = r"\u0030-\u0039"
bengali_digits = r"\u09e6-\u09ef"
khmer_digits = r"\u17e0-\u17e9"
devanagari_digits = r"\u0966-\u096f"
oriya_digits = r"\u0b66-\u0b6f"
extended_arabic_indic_digits = r"\u06f0-\u06f9"
kayah_li_digits = r"\ua900-\ua909"
fullwidth_digits = r"\uff10-\uff19"
malayam_digits = r"\u0d66-\u0d6f"
myanmar_digits = r"\u1040-\u1049"
roman_numeral = r"\u2170-\u2179"
nominal_digit_shapes = r"\u206f"

# Load punctuations from MMS-lab data
# os.path.join keeps the path relative when dirname() is empty (an f-string
# with "/" would point at the filesystem root), and the explicit encoding
# stops the locale default from mis-decoding the non-ASCII entries.
with open(
    os.path.join(os.path.dirname(__file__), "punctuations.lst"),
    "r",
    encoding="utf-8",
) as punc_f:
    punc_list = punc_f.readlines()

# The first tab-separated field of each line is the punctuation to be removed.
# The trailing newline is stripped first so that a line without a tab (or a
# blank line) cannot add an escaped newline to the pattern.
punct_pattern = "".join(
    re.escape(punc.rstrip("\n").split("\t")[0]) for punc in punc_list
)

shared_digits = (
    english_digits
    + bengali_digits
    + khmer_digits
    + devanagari_digits
    + oriya_digits
    + extended_arabic_indic_digits
    + kayah_li_digits
    + fullwidth_digits
    + malayam_digits
    + myanmar_digits
    + roman_numeral
    + nominal_digit_shapes
)

shared_punc_list = (
    basic_punc
    + all_punct_quotes
    + greater_than_sign
    + lesser_than_sign
    + inverted_question_mark
    + full_stop
    + semicolon
    + armenian_punc
    + inverted_exclamation_mark
    + arabic_comma
    + enumeration_comma
    + hindi_danda
    + quotation_mark
    + arabic_semicolon
    + arabic_question_mark
    + chinese_punc
    + punct_pattern
)

shared_mappping = {
    lesser_than_symbol: "",
    greater_than_symbol: "",
    nbsp_written_form: "",
    # r"(\S+)" + mapping_quotes + r"(\S+)": r"\1'\2",  # slow to run
}

shared_deletion_list = (
    left_to_right_mark
    + zero_width_nonjoiner
    + arabic_subscript_alef_and_inverted_damma
    + zero_width_space
    + arabic_diacritics
    + pop_directional_formatting
    + right_to_left_mark
    + left_to_right_embedding
)

norm_config = {
    "*": {
        "lower_case": True,
        "punc_set": shared_punc_list,
        "del_set": shared_deletion_list,
        "mapping": shared_mappping,
        "digit_set": shared_digits,
        "unicode_norm": "NFKC",
        "rm_diacritics": False,
    }
}


def _clone_config(config):
    """Return a copy of *config* that owns its own ``"mapping"`` dict.

    ``dict.copy()`` is shallow, so a plain copy would share the mapping dict
    with its source and a per-language mapping entry would show up in every
    other language as well.  The remaining values are immutable (str / bool),
    so copying the mapping is enough to make the clone independent.
    """
    cloned = dict(config)
    cloned["mapping"] = dict(config["mapping"])
    return cloned


# =============== Mongolian ===============#

norm_config["mon"] = _clone_config(norm_config["*"])
# add soft hyphen to the deletion set to match with fleurs
norm_config["mon"]["del_set"] += r"\u00AD"

norm_config["khk"] = _clone_config(norm_config["mon"])

# =============== Hebrew ===============#

norm_config["heb"] = _clone_config(norm_config["*"])
# add "HEBREW POINT" symbols to match with fleurs
norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"

# =============== Thai ===============#

norm_config["tha"] = _clone_config(norm_config["*"])
# add "Zero width joiner" symbols to match with fleurs
norm_config["tha"]["punc_set"] += r"\u200D"

# =============== Arabic ===============#
norm_config["ara"] = _clone_config(norm_config["*"])

# Map ALEF WASLA (U+0671) to plain ALEF (U+0627).  The clone above owns its
# mapping dict, so this entry stays local to Arabic instead of being added to
# the shared default mapping.
norm_config["ara"]["mapping"]["ٱ"] = "ا"
norm_config["arb"] = _clone_config(norm_config["ara"])

# =============== Javanese ===============#
norm_config["jav"] = _clone_config(norm_config["*"])
norm_config["jav"]["rm_diacritics"] = True