RayeRen's picture
init
d1b91e7
raw
history blame
No virus
2.55 kB
import re
import unicodedata
from g2p_en import G2p
from g2p_en.expand import normalize_numbers
from nltk import pos_tag
from nltk.tokenize import TweetTokenizer
from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors
from utils.text.text_encoder import PUNCS, is_sil_phoneme
class EnG2p(G2p):
    """G2p variant that tokenizes with NLTK's TweetTokenizer.

    Calling an instance turns a text string into a flat list of
    pronunciation symbols, with a single " " entry between consecutive
    tokens. ``homograph2features``, ``cmu`` and ``predict`` are not defined
    here; they are presumably provided by the ``G2p`` base class.
    """

    # One tokenizer shared by every instance; always accessed via the class.
    word_tokenize = TweetTokenizer().tokenize

    def __call__(self, text):
        """Convert ``text`` into a list of symbols separated by " " entries.

        Args:
            text: The string to convert. Only tokens containing a lower-case
                ASCII letter are looked up; any other token is emitted as-is.

        Returns:
            A list holding, for each token, its pronunciation entries (or the
            token itself), with " " between tokens and no trailing separator.
        """
        # Tokenize, then POS-tag: yields (token, tag) pairs.
        tagged_words = pos_tag(EnG2p.word_tokenize(text))

        phonemes = []
        for token, tag in tagged_words:
            if re.search("[a-z]", token) is None:
                # No lower-case letter (e.g. punctuation): pass through.
                token_pron = [token]
            elif token in self.homograph2features:
                # Homograph: choose the reading according to the POS tag.
                first_pron, second_pron, tag_prefix = self.homograph2features[token]
                token_pron = first_pron if tag.startswith(tag_prefix) else second_pron
            elif token in self.cmu:
                # Dictionary hit: use the first listed pronunciation.
                token_pron = self.cmu[token][0]
            else:
                # Out-of-vocabulary: fall back to the predictor.
                token_pron = self.predict(token)

            phonemes += token_pron
            phonemes.append(" ")

        # Drop the separator appended after the last token.
        return phonemes[:-1]
@register_txt_processors('en')
class TxtProcessor(BaseTxtProcessor):
    """English text processor, registered under the key ``'en'``.

    ``preprocess_text`` normalizes a raw string; ``process`` additionally
    runs grapheme-to-phoneme conversion and groups the result per word.
    """

    # Shared G2P converter, instantiated once when the class is defined.
    g2p = EnG2p()

    @staticmethod
    def preprocess_text(text):
        """Normalize raw text into lower-case words and spaced punctuation.

        Args:
            text: The raw input string.

        Returns:
            A string containing only ``a-z``, spaces and the characters in
            ``PUNCS``, where every punctuation mark is surrounded by single
            spaces. The result may start or end with a space.
        """
        text = normalize_numbers(text)
        text = ''.join(char for char in unicodedata.normalize('NFD', text)
                       if unicodedata.category(char) != 'Mn')  # Strip accents
        text = text.lower()
        text = re.sub("[\'\"()]+", "", text)  # drop quotes and parentheses
        text = re.sub("[-]+", " ", text)  # hyphens become word breaks
        text = re.sub(f"[^ a-z{PUNCS}]", "", text)  # drop everything else
        text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text)  # "a , b" -> "a,b"
        # A run of punctuation collapses to its last character: !! -> !
        text = re.sub(f"([{PUNCS}])+", r"\1", text)
        # The space after each abbreviation was stripped two steps above, so
        # the replacement carries its own trailing space; otherwise
        # "i.e. the" would come out as "that isthe". Surplus spaces are
        # collapsed by the final substitution.
        text = text.replace("i.e.", "that is ")
        text = text.replace("etc.", "etc ")
        text = re.sub(f"([{PUNCS}])", r" \1 ", text)  # pad punctuation
        text = re.sub(r"\s+", r" ", text)
        return text

    @classmethod
    def process(cls, txt, preprocess_args):
        """Preprocess ``txt``, convert it to phonemes and group them by word.

        Args:
            txt: The raw input string.
            preprocess_args: Passed through unchanged to ``cls.postprocess``
                (not defined in this class; presumably inherited from
                ``BaseTxtProcessor``).

        Returns:
            A tuple ``(txt_struct, txt)``: ``txt_struct`` is whatever
            ``cls.postprocess`` returns for the list of ``[word, phonemes]``
            pairs built here, and ``txt`` is the preprocessed, stripped text.
        """
        txt = cls.preprocess_text(txt).strip()
        phs = cls.g2p(txt)
        # One [word, phoneme_list] slot per space-separated word.
        txt_struct = [[w, []] for w in txt.split(" ")]
        i_word = 0
        for p in phs:
            if p == ' ':
                # A " " entry from the G2P output marks the next word.
                i_word += 1
            else:
                txt_struct[i_word][1].append(p)
        txt_struct = cls.postprocess(txt_struct, preprocess_args)
        return txt_struct, txt