import json
import re
import unicodedata

from utils.norm_config import norm_config


def text_normalize(
    text,
    iso_code="xxx",
    lower_case=True,
    remove_numbers=False,
    remove_brackets=False,
    rm_extra_spaces=False,
):
    """Normalize ``text`` using the per-language rules in ``norm_config``.

    The steps run in this order: Unicode normalization, optional
    lower-casing, removal of bracketed text, regex mappings, punctuation
    replacement, character deletion, optional removal of digit-only words,
    optional diacritic removal and optional whitespace collapsing.

    Args:
        text : The string to be normalized
        iso_code : Key used to look up the language entry in ``norm_config``;
            unknown codes fall back to the ``"*"`` entry
        lower_case : Boolean flag; the text is lower-cased only when both this
            flag and the config's ``lower_case`` value are truthy
        remove_numbers : Boolean flag to specify if words containing only
            digits should be removed
        remove_brackets : Boolean flag to specify if every parenthesised span
            should be removed (spans containing a digit are always removed)
        rm_extra_spaces : Boolean flag to specify if runs of whitespace should
            be collapsed to a single space and the result stripped

    Returns:
        normalized_text : the string after all normalization
    """

    defaults = norm_config["*"]
    # Merge into a fresh dict so that filling in missing fields never mutates
    # the shared module-level ``norm_config`` entries.
    config = {**defaults, **norm_config.get(iso_code, defaults)}

    text = unicodedata.normalize(config["unicode_norm"], text)

    # Convert to lower case
    if config["lower_case"] and lower_case:
        text = text.lower()

    # Brackets
    # Always remove text inside brackets with numbers in them.
    # Usually corresponds to "(Sam 23:17)"
    text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
    if remove_brackets:
        text = re.sub(r"\([^\)]*\)", " ", text)

    # Apply mappings (keys are passed to re.sub, so they are regex patterns)
    for old, new in config["mapping"].items():
        text = re.sub(old, new, text)

    # Replace punctuation with space
    punct_pattern = r"[" + config["punc_set"] + r"]"
    normalized_text = re.sub(punct_pattern, " ", text)

    # Remove characters in delete list
    delete_pattern = r"[" + config["del_set"] + r"]"
    normalized_text = re.sub(delete_pattern, "", normalized_text)

    # Remove words containing only digits.
    # We check for 3 cases: a) text starts with a number, b) a number is
    # present somewhere in the middle of the text, c) the text ends with a
    # number. For each case we use a lookaround regex to check that the digit
    # run is preceded and followed by whitespace; only then is it replaced
    # with a space. The lookarounds let overlapping matches be replaced.
    # Note: a string that is nothing but digits (no surrounding whitespace)
    # matches none of the three cases and is left as is.
    if remove_numbers:
        digits_pattern = r"[" + config["digit_set"] + r"]+"

        # Raw strings keep ``\s`` as a regex escape rather than an (invalid)
        # Python string escape.
        complete_digit_pattern = (
            r"^"
            + digits_pattern
            + r"(?=\s)|(?<=\s)"
            + digits_pattern
            + r"(?=\s)|(?<=\s)"
            + digits_pattern
            + r"$"
        )

        normalized_text = re.sub(complete_digit_pattern, " ", normalized_text)

    # ``rm_diacritics`` is optional: inherit it from "*" via the merge above
    # and treat it as disabled when no entry defines it.
    if config.get("rm_diacritics", False):
        # Imported lazily so unidecode is only required when actually used.
        from unidecode import unidecode

        normalized_text = unidecode(normalized_text)

    if rm_extra_spaces:
        normalized_text = re.sub(r"\s+", " ", normalized_text).strip()

    return normalized_text