Spaces:

AkitoP
/

whisper-japanese-phone-demo

File size: 2,767 Bytes

0888de7

def parse_pitch_accent(s: str) -> str:
    """Convert an accent-annotated string into a string of '0'/'1' marks.

    The characters '^', '#' and '$' are dropped first.  The accent markers
    '↑' and '↓' produce no output themselves; every other character
    (including '_' and '?') contributes exactly one mark, so the result has
    the same length as the input with '^', '#', '$', '↑' and '↓' removed.

    Marking rules:
      * Characters after '↑' are marked '1', characters after '↓' are
        marked '0' (presumably 1 = high pitch, 0 = low pitch -- confirm
        against the model's label convention).
      * Characters before the first accent marker get the opposite of what
        that marker switches to: '0' if it is '↑', '1' if it is '↓'.
      * When the same marker occurs twice with no opposite marker in
        between, the character emitted just before the second marker is
        overwritten: '0' before a repeated '↑', '1' before a repeated '↓'.
      * If the string contains no accent marker at all, every character is
        marked '0'.

    Args:
        s: Annotated string, e.g. '^ト↓シコニ#ワ↑タシワ$'.

    Returns:
        A string made only of '0' and '1', one per non-marker character.
    """
    # Remove '^', '#', and '$', keep '_', '?'
    s = s.replace('^', '').replace('#', '').replace('$', '')

    # Resolve the starting mark once, up front.  Doing this lazily left the
    # mark as None for strings without any accent (or with a leading '_' /
    # '?'), which made the final join raise TypeError.
    first_accent = next((ch for ch in s if ch in ('↑', '↓')), None)
    current_mark = '1' if first_accent == '↓' else '0'

    marks = []          # One '0'/'1' entry per emitted character
    last_accent = None  # Most recent accent marker seen: '↑', '↓' or None

    for char in s:
        if char == '↑' or char == '↓':
            # Repeated identical marker: rewrite the mark of the character
            # emitted just before it (nothing to rewrite if none exists yet).
            if last_accent == char and marks:
                marks[-1] = '0' if char == '↑' else '1'
            # The marker decides the mark of everything that follows it.
            current_mark = '1' if char == '↑' else '0'
            last_accent = char
        else:
            # Regular characters as well as '_' and '?' take the current mark.
            marks.append(current_mark)

    return ''.join(marks)
def katakana_normalize(s):
    """Return *s* with the annotation symbols '^', '#', '↑', '↓', '$' removed.

    All other characters (including '_' and '?') are kept unchanged and in
    their original order.
    """
    # A single translate pass deletes every listed symbol.
    strip_table = str.maketrans('', '', '^#↑↓$')
    return s.translate(strip_table)
# Example usage
# input_str = '^ト↓シコニ#ワ↑タシワ_ホ↓ボ#マ↓イニチ_オ↑ニ↓イソンニ#ナ↑クダシオ#サ↑レテマスシ$'
# output = parse_pitch_accent(input_str)
# output_str = katakana_normalize(input_str)
# print(output_str)
# assert len(output) == len(output_str)
# for i in range(len(output)):
#     print(f"{output_str[i]}: {output[i]}")