File size: 2,767 Bytes
0888de7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def parse_pitch_accent(s):
    """Convert an accent-annotated string into a string of '0'/'1' marks.

    The markers '^', '#' and '$' are dropped. '↑' and '↓' are accent
    markers: they produce no mark themselves but switch the mark assigned
    to the characters that follow ('1' after '↑', '0' after '↓'). Every
    other character, including '_' and '?', produces exactly one mark.

    Characters that precede the first accent marker receive the opposite
    of what that marker switches to: '0' before a '↑', '1' before a '↓'.
    If the string contains no accent marker at all, every character is
    marked '0' (fallback so the function returns a string instead of
    failing on accent-less input).

    When an accent marker is the same as the previous accent marker, the
    mark of the character immediately before the repeated marker is
    overwritten: '0' for a repeated '↑', '1' for a repeated '↓'.

    Args:
        s: Annotated input string.

    Returns:
        A string of '0'/'1' characters, one per input character that is
        not one of '^', '#', '$', '↑', '↓'.
    """
    # Remove '^', '#', and '$', keep '_', '?'
    s = s.replace('^', '').replace('#', '').replace('$', '')

    accents = ('↑', '↓')

    # The mark used before any accent is seen depends only on the first
    # accent marker in the string, so resolve it once up front.
    first_accent = next((ch for ch in s if ch in accents), None)
    current_mark = '1' if first_accent == '↓' else '0'

    marks = []          # One '0'/'1' entry per non-accent character
    last_accent = None  # Most recent accent marker seen, if any

    for char in s:
        if char in accents:
            # Repeated accent of the same kind: rewrite the mark of the
            # character emitted just before this marker.
            if char == last_accent and marks:
                marks[-1] = '0' if char == '↑' else '1'
            current_mark = '1' if char == '↑' else '0'
            last_accent = char
        else:
            # Regular characters, '_' and '?' all take the current mark.
            marks.append(current_mark)

    return ''.join(marks)
def katakana_normalize(s):
    """Return *s* with the markers '^', '#', '↑', '↓' and '$' removed.

    All other characters (including '_' and '?') are kept in their
    original order.
    """
    # A deletion-only translation table strips all five markers in one pass.
    strip_markers = str.maketrans("", "", "^#↑↓$")
    return s.translate(strip_markers)
# Example usage
# input_str = '^ト↓シコニ#ワ↑タシワ_ホ↓ボ#マ↓イニチ_オ↑ニ↓むソンニ#ナ↑クヤシオ#サ↑レテマスシ$'
# output = parse_pitch_accent(input_str)
# output_str = katakana_normalize(input_str)
# print(output_str)
# assert len(output) == len(output_str)
# for i in range(len(output)):
#     print(f"{output_str[i]}: {output[i]}")