from typing import List

import jieba
import pypinyin

from .pinyinToPhonemes import PINYIN_DICT

# Tone digits that may terminate a TONE3-style pinyin token; the pinyin call
# below passes neutral_tone_with_five=True, hence the "5".
_TONE_DIGITS = "12345"


def _chinese_character_to_pinyin(text: str) -> List[str]:
    """Convert ``text`` to a flat list of tone-numbered pinyin tokens.

    ``pypinyin.pinyin`` returns one inner list per token; with
    ``heteronym=False`` the inner lists are flattened into a single list here.
    NOTE(review): non-Chinese runs (spaces, punctuation, digits, Latin text)
    are presumably passed through unchanged by pypinyin's default error
    handling -- confirm against the pypinyin documentation.
    """
    pinyins = pypinyin.pinyin(
        text,
        style=pypinyin.Style.TONE3,
        heteronym=False,
        neutral_tone_with_five=True,
    )
    return [item for sublist in pinyins for item in sublist]


def _chinese_pinyin_to_phoneme(pinyin: str) -> str:
    """Map one tone-numbered pinyin token to its phoneme string plus tone.

    The last character of ``pinyin`` is taken as the tone and everything
    before it as the base syllable. The syllable is looked up in
    ``PINYIN_DICT`` and the first entry of the result is used; an unknown
    syllable contributes an empty phoneme, so only the tone is returned.
    """
    segment = pinyin[:-1]
    tone = pinyin[-1]
    phoneme = PINYIN_DICT.get(segment, [""])[0]
    return phoneme + tone


def _is_pinyin(token: str) -> bool:
    """Return True if ``token`` is a known syllable followed by a tone digit.

    Checking the base syllable against ``PINYIN_DICT`` (instead of looking
    only at the trailing digit) prevents non-Chinese tokens that merely end
    in 1-5, such as "2023" or "mp3", from being reduced to their last digit.
    The length check also guards against empty tokens.
    """
    return (
        len(token) > 1
        and token[-1] in _TONE_DIGITS
        and token[:-1] in PINYIN_DICT
    )


def chinese_text_to_phonemes(text: str, seperator: str = "|") -> str:
    """Convert Chinese ``text`` into a ``seperator``-joined phoneme string.

    The text is segmented into words with jieba, the words are re-joined with
    single spaces, and the result is converted to tone-numbered pinyin. Each
    recognised pinyin token is replaced by the characters of its phoneme
    string plus tone; every other token (punctuation, spaces, foreign text)
    is emitted character by character.

    Args:
        text: Input text to convert.
        seperator: String inserted between every output symbol. (The
            spelling is kept for backward compatibility with callers.)

    Returns:
        All output symbols joined by ``seperator``; an empty string when no
        symbols are produced.
    """
    words = jieba.cut(text, HMM=False)
    spaced_text = " ".join(words)
    pinyined_text: List[str] = _chinese_character_to_pinyin(spaced_text)

    results: List[str] = []
    for token in pinyined_text:
        if _is_pinyin(token):
            results.extend(_chinese_pinyin_to_phoneme(token))
        else:
            # Punctuation or any other non-pinyin text: keep it verbatim,
            # one symbol per character.
            results.extend(token)
    return seperator.join(results)