Sukalingua

App Files Files Community

Sukalingua / TTS /tts /utils /text /bangla /phonemizer.py

Shadhil

voice-clone with single audio sample input

9b2107c 11 months ago

raw

history blame

3.74 kB

	import re

	import bangla
	from bnnumerizer import numerize
	from bnunicodenormalizer import Normalizer

	# initialize
	bnorm = Normalizer()


	attribution_dict = {
	"সাঃ": "সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম",
	"আঃ": "আলাইহিস সালাম",
	"রাঃ": "রাদিআল্লাহু আনহু",
	"রহঃ": "রহমাতুল্লাহি আলাইহি",
	"রহিঃ": "রহিমাহুল্লাহ",
	"হাফিঃ": "হাফিযাহুল্লাহ",
	"বায়ান": "বাইআন",
	"দাঃবাঃ": "দামাত বারাকাতুহুম,দামাত বারাকাতুল্লাহ",
	# "আয়াত" : "আইআত",#আইআত
	# "ওয়া" : "ওআ",
	# "ওয়াসাল্লাম" : "ওআসাল্লাম",
	# "কেন" : "কেনো",
	# "কোন" : "কোনো",
	# "বল" : "বলো",
	# "চল" : "চলো",
	# "কর" : "করো",
	# "রাখ" : "রাখো",
	"’": "",
	"‘": "",
	# "য়" : "অ",
	# "সম্প্রদায়" : "সম্প্রদাই",
	# "রয়েছে" : "রইছে",
	# "রয়েছ" : "রইছ",
	"/": " বাই ",
	}


	def tag_text(text: str):
	# remove multiple spaces
	text = re.sub(" +", " ", text)
	# create start and end
	text = "start" + text + "end"
	# tag text
	parts = re.split("[\u0600-\u06FF]+", text)
	# remove non chars
	parts = [p for p in parts if p.strip()]
	# unique parts
	parts = set(parts)
	# tag the text
	for m in parts:
	if len(m.strip()) > 1:
	text = text.replace(m, f"{m}")
	# clean-tags
	text = text.replace("start", "")
	text = text.replace("end", "")
	return text


	def normalize(sen):
	global bnorm # pylint: disable=global-statement
	_words = [bnorm(word)["normalized"] for word in sen.split()]
	return " ".join([word for word in _words if word is not None])


	def expand_full_attribution(text):
	for word, attr in attribution_dict.items():
	if word in text:
	text = text.replace(word, normalize(attr))
	return text


	def collapse_whitespace(text):
	# Regular expression matching whitespace:
	_whitespace_re = re.compile(r"\s+")
	return re.sub(_whitespace_re, " ", text)


	def bangla_text_to_phonemes(text: str) -> str:
	# english numbers to bangla conversion
	res = re.search("[0-9]", text)
	if res is not None:
	text = bangla.convert_english_digit_to_bangla_digit(text)

	# replace ':' in between two bangla numbers with ' এর '
	pattern = r"[০, ১, ২, ৩, ৪, ৫, ৬, ৭, ৮, ৯]:[০, ১, ২, ৩, ৪, ৫, ৬, ৭, ৮, ৯]"
	matches = re.findall(pattern, text)
	for m in matches:
	r = m.replace(":", " এর ")
	text = text.replace(m, r)

	# numerize text
	text = numerize(text)

	# tag sections
	text = tag_text(text)

	# text blocks
	# blocks = text.split("")
	# blocks = [b for b in blocks if b.strip()]

	# create tuple of (lang,text)
	if "" in text:
	text = text.replace("", "").replace("", "")
	# Split based on sentence ending Characters
	bn_text = text.strip()

	sentenceEnders = re.compile("[।!?]")
	sentences = sentenceEnders.split(str(bn_text))

	data = ""
	for sent in sentences:
	res = re.sub("\n", "", sent)
	res = normalize(res)
	# expand attributes
	res = expand_full_attribution(res)

	res = collapse_whitespace(res)
	res += "।"
	data += res
	return data