Sukalingua

File size: 3,740 Bytes

9b2107c

import re

import bangla
from bnnumerizer import numerize
from bnunicodenormalizer import Normalizer

# initialize
bnorm = Normalizer()


attribution_dict = {
    "সাঃ": "সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম",
    "আঃ": "আলাইহিস সালাম",
    "রাঃ": "রাদিআল্লাহু আনহু",
    "রহঃ": "রহমাতুল্লাহি আলাইহি",
    "রহিঃ": "রহিমাহুল্লাহ",
    "হাফিঃ": "হাফিযাহুল্লাহ",
    "বায়ান": "বাইআন",
    "দাঃবাঃ": "দামাত বারাকাতুহুম,দামাত বারাকাতুল্লাহ",
    # "আয়াত" : "আইআত",#আইআত
    # "ওয়া" : "ওআ",
    # "ওয়াসাল্লাম"  : "ওআসাল্লাম",
    # "কেন"  : "কেনো",
    # "কোন" : "কোনো",
    # "বল"   : "বলো",
    # "চল"   : "চলো",
    # "কর"   : "করো",
    # "রাখ"   : "রাখো",
    "’": "",
    "‘": "",
    # "য়"     : "অ",
    # "সম্প্রদায়" : "সম্প্রদাই",
    # "রয়েছে"   : "রইছে",
    # "রয়েছ"    : "রইছ",
    "/": " বাই ",
}


def tag_text(text: str):
    # remove multiple spaces
    text = re.sub(" +", " ", text)
    # create start and end
    text = "start" + text + "end"
    # tag text
    parts = re.split("[\u0600-\u06FF]+", text)
    # remove non chars
    parts = [p for p in parts if p.strip()]
    # unique parts
    parts = set(parts)
    # tag the text
    for m in parts:
        if len(m.strip()) > 1:
            text = text.replace(m, f"{m}")
    # clean-tags
    text = text.replace("start", "")
    text = text.replace("end", "")
    return text


def normalize(sen):
    global bnorm  # pylint: disable=global-statement
    _words = [bnorm(word)["normalized"] for word in sen.split()]
    return " ".join([word for word in _words if word is not None])


def expand_full_attribution(text):
    for word, attr in attribution_dict.items():
        if word in text:
            text = text.replace(word, normalize(attr))
    return text


def collapse_whitespace(text):
    # Regular expression matching whitespace:
    _whitespace_re = re.compile(r"\s+")
    return re.sub(_whitespace_re, " ", text)


def bangla_text_to_phonemes(text: str) -> str:
    # english numbers to bangla conversion
    res = re.search("[0-9]", text)
    if res is not None:
        text = bangla.convert_english_digit_to_bangla_digit(text)

    # replace ':' in between two bangla numbers with ' এর '
    pattern = r"[০, ১, ২, ৩, ৪, ৫, ৬, ৭, ৮, ৯]:[০, ১, ২, ৩, ৪, ৫, ৬, ৭, ৮, ৯]"
    matches = re.findall(pattern, text)
    for m in matches:
        r = m.replace(":", " এর ")
        text = text.replace(m, r)

    # numerize text
    text = numerize(text)

    # tag sections
    text = tag_text(text)

    # text blocks
    # blocks = text.split("")
    # blocks = [b for b in blocks if b.strip()]

    # create tuple of (lang,text)
    if "" in text:
        text = text.replace("", "").replace("", "")
    # Split based on sentence ending Characters
    bn_text = text.strip()

    sentenceEnders = re.compile("[।!?]")
    sentences = sentenceEnders.split(str(bn_text))

    data = ""
    for sent in sentences:
        res = re.sub("\n", "", sent)
        res = normalize(res)
        # expand attributes
        res = expand_full_attribution(res)

        res = collapse_whitespace(res)
        res += "।"
        data += res
    return data