File size: 3,740 Bytes
9b2107c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import re

import bangla
from bnnumerizer import numerize
from bnunicodenormalizer import Normalizer

# Shared Normalizer instance, created once at import time; normalize()
# calls it on every whitespace-separated word it processes.
bnorm = Normalizer()


# Substring -> replacement table consumed by expand_full_attribution(), which
# replaces every occurrence of a key found in the text with the normalized
# form of its value.  Most active entries map an abbreviated honorific to its
# spelled-out form; the two curly quote characters map to the empty string
# (i.e. they are deleted) and "/" maps to the word " বাই " padded with spaces.
# Entries that are commented out are disabled and have no effect at runtime.
attribution_dict = {
    "সাঃ": "সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম",
    "আঃ": "আলাইহিস সালাম",
    "রাঃ": "রাদিআল্লাহু আনহু",
    "রহঃ": "রহমাতুল্লাহি আলাইহি",
    "রহিঃ": "রহিমাহুল্লাহ",
    "হাফিঃ": "হাফিযাহুল্লাহ",
    "বায়ান": "বাইআন",
    "দাঃবাঃ": "দামাত বারাকাতুহুম,দামাত বারাকাতুল্লাহ",
    # "আয়াত" : "আইআত",#আইআত
    # "ওয়া" : "ওআ",
    # "ওয়াসাল্লাম"  : "ওআসাল্লাম",
    # "কেন"  : "কেনো",
    # "কোন" : "কোনো",
    # "বল"   : "বলো",
    # "চল"   : "চলো",
    # "কর"   : "করো",
    # "রাখ"   : "রাখো",
    "’": "",
    "‘": "",
    # "য়"     : "অ",
    # "সম্প্রদায়" : "সম্প্রদাই",
    # "রয়েছে"   : "রইছে",
    # "রয়েছ"    : "রইছ",
    "/": " বাই ",
}


def tag_text(text: str):
    """Collapse runs of spaces in *text* and return the result.

    The previous implementation wrapped the text in literal ``"start"`` and
    ``"end"`` sentinels, split it on Arabic-script runs (U+0600-U+06FF),
    replaced each remaining part with itself, and finally deleted every
    ``"start"`` and ``"end"`` substring.  The self-replacement never changed
    the text, while the sentinel clean-up also erased those letter sequences
    from the caller's own input (``"weekend"`` became ``"week"``).  Collapsing
    spaces was the only step with a legitimate effect, so it is the only one
    kept.

    Args:
        text: Input text.

    Returns:
        ``text`` with every run of two or more space characters reduced to a
        single space.  Other whitespace (tabs, newlines) is left untouched.
    """
    return re.sub(" +", " ", text)


def normalize(sen):
    """Normalize each whitespace-separated word of *sen* with ``bnorm``.

    Every word is passed to the module-level ``bnorm`` callable and the
    ``"normalized"`` entry of its result is kept unless it is ``None``.

    Args:
        sen: Sentence to normalize.

    Returns:
        The surviving normalized words joined by single spaces; an empty
        string when *sen* contains no words.
    """
    kept = []
    for token in sen.split():
        normalized = bnorm(token)["normalized"]
        # Words the normalizer reports as None are dropped.
        if normalized is not None:
            kept.append(normalized)
    return " ".join(kept)


def expand_full_attribution(text):
    """Expand every ``attribution_dict`` key found in *text*.

    Keys are tried in the dictionary's insertion order.  Each key present in
    the text has all of its occurrences replaced by the normalized form of the
    mapped value, so later keys see the result of earlier replacements.

    Args:
        text: Text to expand.

    Returns:
        The text after all applicable replacements.
    """
    for short_form in attribution_dict:
        if short_form not in text:
            continue
        expansion = normalize(attribution_dict[short_form])
        text = text.replace(short_form, expansion)
    return text


def collapse_whitespace(text):
    """Replace each run of whitespace in *text* with a single space.

    Leading and trailing whitespace is collapsed as well, not stripped.
    """
    return re.sub(r"\s+", " ", text)


def bangla_text_to_phonemes(text: str) -> str:
    """Clean up Bangla *text* sentence by sentence.

    Despite the name, the value returned is normalized text rather than a
    phoneme sequence.  The steps, in order:

    1. If the text contains an ASCII digit, convert it with
       ``bangla.convert_english_digit_to_bangla_digit``.
    2. Replace a ``:`` that sits directly between two Bangla digits with
       ``" এর "``.
    3. Pass the text through ``numerize`` and then ``tag_text``.
    4. Split on the sentence terminators ``।``, ``!`` and ``?``; for every
       segment remove newlines, apply ``normalize``,
       ``expand_full_attribution`` and ``collapse_whitespace``, and append
       ``"।"``.

    Args:
        text: Raw input text.

    Returns:
        The processed segments concatenated, each terminated by ``"।"``.

    NOTE: the split keeps the empty segment that follows a trailing
    terminator, and that segment also receives a ``"।"``, so input that
    already ends in a terminator comes back ending in ``"।।"``.  This
    long-standing output format is intentionally left unchanged.
    """
    # english numbers to bangla conversion
    if re.search("[0-9]", text) is not None:
        text = bangla.convert_english_digit_to_bangla_digit(text)

    # Replace ':' between two Bangla digits with ' এর '.  Lookarounds keep the
    # neighbouring digits out of the match, so chained values such as
    # "১:২:৩" have every colon rewritten, and a colon next to a space or a
    # comma is left alone.
    text = re.sub(r"(?<=[০-৯]):(?=[০-৯])", " এর ", text)

    # numerize text
    text = numerize(text)

    # run the tag_text pass
    text = tag_text(text)

    # split on sentence-ending characters
    sentences = re.split("[।!?]", text.strip())

    processed = []
    for sent in sentences:
        res = normalize(sent.replace("\n", ""))
        # expand attributes
        res = expand_full_attribution(res)
        res = collapse_whitespace(res)
        processed.append(res + "।")
    return "".join(processed)