_RISE = "↑"
_FALL = "↓"

# Deletion tables for str.translate: each maps the listed characters to None.
_BOUNDARY_TABLE = str.maketrans("", "", "^#$")
_NORMALIZE_TABLE = str.maketrans("", "", "^#$" + _RISE + _FALL)


def parse_pitch_accent(s: str) -> str:
    """Convert an accent-annotated string into a string of binary marks.

    The markers '^', '#' and '$' are dropped.  The accent markers '↑' and
    '↓' produce no output themselves; every other character (including
    '_' and '?') yields exactly one mark, '0' or '1', so the result has the
    same length as ``katakana_normalize(s)``.

    Rules:
      * After '↑' the running mark is '1'; after '↓' it is '0'
        (presumably high / low pitch -- the code only deals in '0'/'1').
      * Characters before the first accent marker get the opposite of what
        that marker switches to: '0' if the first marker is '↑', '1' if it
        is '↓'.
      * When the same accent marker occurs twice in a row (ignoring the
        characters between them), the character immediately before the
        second marker is overwritten: '0' before a repeated '↑', '1'
        before a repeated '↓'.
      * If the string contains no accent marker at all, every character is
        marked '0'.  NOTE(review): the default of '0' is a choice made here
        so that such input no longer fails; confirm it matches the intended
        convention.

    Args:
        s: Annotated input string.

    Returns:
        A string of '0'/'1' characters, one per non-marker character.
    """
    s = s.translate(_BOUNDARY_TABLE)

    # Resolve the starting mark once, from the first accent marker in the
    # string, instead of looking ahead from each leading character.
    current_mark = "0"
    for ch in s:
        if ch == _RISE:
            current_mark = "0"
            break
        if ch == _FALL:
            current_mark = "1"
            break

    marks = []
    last_accent = None
    for ch in s:
        if ch == _RISE or ch == _FALL:
            # A repeated accent rewrites the mark of the character just
            # before it (if any character has been emitted yet).
            if ch == last_accent and marks:
                marks[-1] = "0" if ch == _RISE else "1"
            current_mark = "1" if ch == _RISE else "0"
            last_accent = ch
        else:
            # Regular characters, '_' and '?' all take the running mark.
            marks.append(current_mark)

    return "".join(marks)


def katakana_normalize(s: str) -> str:
    """Return ``s`` with the markers '^', '#', '$', '↑' and '↓' removed.

    '_' and '?' are kept, so the result lines up character-for-character
    with the output of ``parse_pitch_accent(s)``.
    """
    return s.translate(_NORMALIZE_TABLE)


# Example usage
# input_str = '^ト↓シコニ#ワ↑タシワ_ホ↓ボ#マ↓イニチ_オ↑ニ↓イソンニ#ナ↑クダシオ#サ↑レテマスシ$'
# output = parse_pitch_accent(input_str)
# output_str = katakana_normalize(input_str)
# print(output_str)
# assert len(output) == len(output_str)
# for i in range(len(output)):
#     print(f"{output_str[i]}: {output[i]}")