import re

import bangla
from bnnumerizer import numerize
from bnunicodenormalizer import Normalizer

# initialize
bnorm = Normalizer()


attribution_dict = {
    "সাঃ": "সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম",
    "আঃ": "আলাইহিস সালাম",
    "রাঃ": "রাদিআল্লাহু আনহু",
    "রহঃ": "রহমাতুল্লাহি আলাইহি",
    "রহিঃ": "রহিমাহুল্লাহ",
    "হাফিঃ": "হাফিযাহুল্লাহ",
    "বায়ান": "বাইআন",
    "দাঃবাঃ": "দামাত বারাকাতুহুম,দামাত বারাকাতুল্লাহ",
    # "আয়াত" : "আইআত",  # আইআত
    # "ওয়া" : "ওআ",
    # "ওয়াসাল্লাম" : "ওআসাল্লাম",
    # "কেন" : "কেনো",
    # "কোন" : "কোনো",
    # "বল" : "বলো",
    # "চল" : "চলো",
    # "কর" : "করো",
    # "রাখ" : "রাখো",
    "’": "",
    "‘": "",
    # "য়" : "অ",
    # "সম্প্রদায়" : "সম্প্রদাই",
    # "রয়েছে" : "রইছে",
    # "রয়েছ" : "রইছ",
    "/": " বাই ",
}


def tag_text(text: str):
    # collapse runs of spaces into a single space
    text = re.sub(" +", " ", text)
    # add sentinel markers at the start and end of the text
    text = "start" + text + "end"
    # split on runs of Arabic-range characters to isolate the remaining parts
    parts = re.split("[\u0600-\u06FF]+", text)
    # drop empty / whitespace-only parts
    parts = [p for p in parts if p.strip()]
    # keep unique parts only
    parts = set(parts)
    # tag the parts; as written this replacement is a no-op, the surrounding
    # tag markup appears to have been lost from the source
    for m in parts:
        if len(m.strip()) > 1:
            text = text.replace(m, f"{m}")
    # remove the sentinel markers
    text = text.replace("start", "")
    text = text.replace("end", "")
    return text


def normalize(sen):
    global bnorm  # pylint: disable=global-statement
    _words = [bnorm(word)["normalized"] for word in sen.split()]
    return " ".join([word for word in _words if word is not None])


def expand_full_attribution(text):
    # replace honorific abbreviations with their normalized full forms
    for word, attr in attribution_dict.items():
        if word in text:
            text = text.replace(word, normalize(attr))
    return text


def collapse_whitespace(text):
    # regular expression matching any run of whitespace
    _whitespace_re = re.compile(r"\s+")
    return re.sub(_whitespace_re, " ", text)


def bangla_text_to_phonemes(text: str) -> str:
    # convert English digits to Bangla digits
    res = re.search("[0-9]", text)
    if res is not None:
        text = bangla.convert_english_digit_to_bangla_digit(text)

    # replace ':' between two Bangla digits with ' এর '
    pattern = r"[০-৯]:[০-৯]"
    matches = re.findall(pattern, text)
    for m in matches:
        r = m.replace(":", " এর ")
        text = text.replace(m, r)

    # spell out numbers as words
    text = numerize(text)

    # tag sections
    text = tag_text(text)

    # text blocks
    # blocks = text.split("")
    # blocks = [b for b in blocks if b.strip()]

    # strip any remaining tag markers (no-op as written; see tag_text)
    if "" in text:
        text = text.replace("", "").replace("", "")

    # split on Bangla sentence-ending characters
    bn_text = text.strip()
    sentence_enders = re.compile("[।!?]")
    sentences = sentence_enders.split(str(bn_text))

    data = ""
    for sent in sentences:
        res = re.sub("\n", "", sent)
        res = normalize(res)
        # expand honorific attributions
        res = expand_full_attribution(res)
        res = collapse_whitespace(res)
        res += "।"
        data += res
    return data
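

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the module above): a minimal example
# of running bangla_text_to_phonemes end to end. The sample sentence and the
# __main__ guard are assumptions added for demonstration; the bangla,
# bnnumerizer, and bnunicodenormalizer packages must be installed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # English digits and the ':' time separator exercise the digit-conversion
    # and numerization steps before sentence splitting and normalization.
    sample = "তিনি সকাল 10:30 এ ক্লাস নেন।"
    print(bangla_text_to_phonemes(sample))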