arkanbima committed on
Commit
a41dc2f
1 Parent(s): 8e0df12

Upload bahasallamatokenizer.py

Files changed (1)
  1. bahasallamatokenizer.py +359 -0
bahasallamatokenizer.py ADDED
@@ -0,0 +1,359 @@
+ # coding=utf-8
+ # Copyright 2020 The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import os
+ from shutil import copyfile
+ from typing import Optional, Tuple, Union, List
+ import re
+ import codecs
+
+ from tokenizers import processors
+
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+ from transformers.utils import is_sentencepiece_available, logging
+ from transformers.utils.versions import require_version
+
+
+ require_version("tokenizers>=0.13.3")
+
+ if is_sentencepiece_available():
+     from transformers.models.llama.tokenization_llama import LlamaTokenizer
+ else:
+     LlamaTokenizer = None
+
+ logger = logging.get_logger(__name__)
+ VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}
+
+ PRETRAINED_VOCAB_FILES_MAP = {
+     "vocab_file": {
+         "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
+     },
+     "tokenizer_file": {
+         "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
+     },
+ }
+ B_INST, E_INST = "[INST]", "[/INST]"
+ B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+
+ # fmt: off
+ DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
+ answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure \
+ that your responses are socially unbiased and positive in nature.
+
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
+ correct. If you don't know the answer to a question, please don't share false information."""
+ # fmt: on
+
+
+ class LlamaTokenizerFast(PreTrainedTokenizerFast):
+     """
+     Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+     This uses notably ByteFallback and no normalization.
+
+     ```python
+     >>> from transformers import LlamaTokenizerFast
+
+     >>> tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
+     >>> tokenizer.encode("Hello this is a test")
+     [1, 15043, 445, 338, 263, 1243]
+     ```
+
+     If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
+     call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
+     values of the first token and final token of an encoded sequence will not be correct). For more details, check out
+     the [post-processors](https://huggingface.co/docs/tokenizers/api/post-processors) documentation.
+
+
+     This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+     refer to this superclass for more information regarding those methods.
+
+     Args:
+         vocab_file (`str`, *optional*):
+             [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
+             contains the vocabulary necessary to instantiate a tokenizer.
+         tokenizer_file (`str`, *optional*):
+             [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
+             contains everything needed to load the tokenizer.
+         clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+             Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like
+             extra spaces.
+         unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
+             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+             token instead.
+         bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
+             The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+         eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
+             The end of sequence token.
+         add_bos_token (`bool`, *optional*, defaults to `True`):
+             Whether or not to add a `bos_token` at the start of sequences.
+         add_eos_token (`bool`, *optional*, defaults to `False`):
+             Whether or not to add an `eos_token` at the end of sequences.
+         use_default_system_prompt (`bool`, *optional*, defaults to `False`):
+             Whether or not the default system prompt for Llama should be used.
+     """
+
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     slow_tokenizer_class = LlamaTokenizer
+     padding_side = "left"
+     model_input_names = ["input_ids", "attention_mask"]
+
+     def __init__(
+         self,
+         vocab_file=None,
+         tokenizer_file=None,
+         clean_up_tokenization_spaces=False,
+         unk_token="<unk>",
+         bos_token="<s>",
+         eos_token="</s>",
+         add_bos_token=True,
+         add_eos_token=False,
+         use_default_system_prompt=False,
+         **kwargs,
+     ):
+         super().__init__(
+             vocab_file=vocab_file,
+             tokenizer_file=tokenizer_file,
+             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+             unk_token=unk_token,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             add_bos_token=add_bos_token,
+             add_eos_token=add_eos_token,
+             use_default_system_prompt=use_default_system_prompt,
+             **kwargs,
+         )
+         self._add_bos_token = add_bos_token
+         self._add_eos_token = add_eos_token
+         self.update_post_processor()
+         self.use_default_system_prompt = use_default_system_prompt
+         self.vocab_file = vocab_file
+
+     @property
+     def can_save_slow_tokenizer(self) -> bool:
+         return os.path.isfile(self.vocab_file) if self.vocab_file else False
+
+     def update_post_processor(self):
+         """
+         Updates the underlying post processor with the current `bos_token` and `eos_token`.
+         """
+         bos = self.bos_token
+         bos_token_id = self.bos_token_id
+         if bos is None and self.add_bos_token:
+             raise ValueError("add_bos_token = True but bos_token = None")
+
+         eos = self.eos_token
+         eos_token_id = self.eos_token_id
+         if eos is None and self.add_eos_token:
+             raise ValueError("add_eos_token = True but eos_token = None")
+
+         single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
+         pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
+
+         special_tokens = []
+         if self.add_bos_token:
+             special_tokens.append((bos, bos_token_id))
+         if self.add_eos_token:
+             special_tokens.append((eos, eos_token_id))
+         self._tokenizer.post_processor = processors.TemplateProcessing(
+             single=single, pair=pair, special_tokens=special_tokens
+         )
+
+     @property
+     def add_eos_token(self):
+         return self._add_eos_token
+
+     @property
+     def add_bos_token(self):
+         return self._add_bos_token
+
+     @add_eos_token.setter
+     def add_eos_token(self, value):
+         self._add_eos_token = value
+         self.update_post_processor()
+
+     @add_bos_token.setter
+     def add_bos_token(self, value):
+         self._add_bos_token = value
+         self.update_post_processor()
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         if not self.can_save_slow_tokenizer:
+             raise ValueError(
+                 "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
+                 "tokenizer."
+             )
+
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+             return
+         out_vocab_file = os.path.join(
+             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+
+         return (out_vocab_file,)
+
+     @property
+     # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template
+     def default_chat_template(self):
+         """
+         LLaMA uses [INST] and [/INST] to indicate user messages, and <<SYS>> and <</SYS>> to indicate system messages.
+         Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict
+         user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering
+         rather than needing special tokens. The system message is partly 'embedded' in the first user message, which
+         results in an unusual token ordering when it is present. This template should definitely be changed if you wish
+         to fine-tune a model with more flexible role ordering!
+
+         The output should look something like:
+
+         <bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos><bos>[INST] Prompt [/INST] Answer <eos>
+         <bos>[INST] Prompt [/INST]
+
+         The reference for this chat template is [this code
+         snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
+         in the original repository.
+         """
+         logger.warning_once(
+             "\nNo chat template is defined for this tokenizer - using the default template "
+             f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+             "your model, please set `tokenizer.chat_template` to an appropriate template. "
+             "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+         )
+         template = (
+             "{% if messages[0]['role'] == 'system' %}"
+             "{% set loop_messages = messages[1:] %}"  # Extract system message if it's present
+             "{% set system_message = messages[0]['content'] %}"
+             "{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}"
+             "{% set loop_messages = messages %}"  # Or use the default system message if the flag is set
+             "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}"
+             "{% else %}"
+             "{% set loop_messages = messages %}"
+             "{% set system_message = false %}"
+             "{% endif %}"
+             "{% for message in loop_messages %}"  # Loop over all non-system messages
+             "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
+             "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
+             "{% endif %}"
+             "{% if loop.index0 == 0 and system_message != false %}"  # Embed system message in first message
+             "{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}"
+             "{% else %}"
+             "{% set content = message['content'] %}"
+             "{% endif %}"
+             "{% if message['role'] == 'user' %}"  # After all of that, handle messages/roles in a fairly normal way
+             "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}"
+             "{% elif message['role'] == 'system' %}"
+             "{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}"
+             "{% elif message['role'] == 'assistant' %}"
+             "{{ ' ' + content.strip() + ' ' + eos_token }}"
+             "{% endif %}"
+             "{% endfor %}"
+         )
+         template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false")
+         default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'")
+         template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message)
+
+         return template
+
+     # TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers
+     # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
+     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+         bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+         eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+         output = bos_token_id + token_ids_0 + eos_token_id
+
+         if token_ids_1 is not None:
+             output = output + bos_token_id + token_ids_1 + eos_token_id
+
+         return output
+
+     def decode_hex_in_sentence(self, sentence):
+         # Define a regular expression to match hexadecimal representations
+         hex_pattern = re.compile(r'<0x([0-9A-Fa-f]+)>')
+
+         # Find all matches in the sentence
+         matches = re.finditer(hex_pattern, sentence)
+
+         # Iterate over matches and replace them with their decoded values
+         for match in matches:
+             hex_string = match.group(1)
+             bytes_data = bytes.fromhex(hex_string)
+             try:
+                 decoded_string = bytes_data.decode('utf-8')
+             except UnicodeDecodeError:
+                 continue
+             sentence = sentence.replace(match.group(0), decoded_string, 1)
+
+         return sentence
+
+     def convert_emojis(self, input_string):
+         # Find all hexadecimal escape sequences in the input string
+         hex_sequences = re.findall(r'<0x([A-Fa-f0-9]+)>', input_string)
+
+         input_string = bytes(input_string, 'utf-8')
+
+         # Replace each escape sequence with its decoded equivalent
+         for hex_seq in hex_sequences:
+             bytes_value = bytes.fromhex(hex_seq)
+             input_string = input_string.replace(bytes(f"<0x{hex_seq}>", 'utf-8'), bytes_value)
+
+         decoded_str = codecs.decode(input_string, 'utf-8')
+
+         return decoded_str
+
+     def _decode(
+         self,
+         token_ids: Union[int, List[int]],
+         skip_special_tokens: bool = False,
+         clean_up_tokenization_spaces: Optional[bool] = None,
+         **kwargs,
+     ) -> str:
+
+         self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
+
+         if isinstance(token_ids, int):
+             token_ids = [token_ids]
+
+         # custom logic since there's some spacing issue with AddedToken
+         tokens = self.convert_ids_to_tokens(token_ids)
+         text = ""
+         i = 0
+         for token_id, token in zip(token_ids, tokens):
+             if skip_special_tokens and token_id in self.all_special_ids:
+                 continue
+
+             if token_id >= 32000 and i != 0:  # check for AddedToken (ids >= 32000) that is not the first token
+                 text += " " + token
+             else:
+                 text += token
+             i += 1
+         text = re.sub("▁", " ", text)  # SentencePiece marks word boundaries with "▁"
+         text = self.decode_hex_in_sentence(text)
+         text = self.convert_emojis(text)
+         text = text.strip()
+
+         clean_up_tokenization_spaces = (
+             clean_up_tokenization_spaces
+             if clean_up_tokenization_spaces is not None
+             else self.clean_up_tokenization_spaces
+         )
+         if clean_up_tokenization_spaces:
+             clean_text = self.clean_up_tokenization(text)
+             return clean_text
+         else:
+             return text
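
The main departure from the stock `LlamaTokenizerFast` is the custom `_decode` path, which re-inserts spaces for the SentencePiece "▁" marker and then uses `decode_hex_in_sentence` and `convert_emojis` to fold byte-fallback pieces such as `<0xF0><0x9F><0x98><0x80>` back into the characters they encode. A minimal usage sketch, assuming the file above is importable as `bahasallamatokenizer` and that a compatible `tokenizer.json`/`tokenizer.model` is available (the `hf-internal-testing/llama-tokenizer` repo referenced in the file is used here purely as an illustration):

```python
from bahasallamatokenizer import LlamaTokenizerFast

# Load a Llama-style fast tokenizer; any repo or local directory with a
# compatible tokenizer.json / tokenizer.model should work.
tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")

# Round-trip a sentence containing a character that SentencePiece encodes with
# byte fallback. The custom _decode maps "▁" back to spaces, then
# decode_hex_in_sentence / convert_emojis fold <0x..> pieces into UTF-8 text.
ids = tokenizer.encode("Halo dunia 😀")
print(tokenizer.decode(ids, skip_special_tokens=True))

# The helpers can also be called on raw strings that still contain
# byte-fallback markers:
print(tokenizer.convert_emojis("caf<0xC3><0xA9>"))  # expected: "café"
```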
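As the class docstring notes, `add_bos_token` and `add_eos_token` are properties whose setters re-run `update_post_processor()`, so the `TemplateProcessing` post-processor stays in sync with the current special tokens. A small sketch of the effect, under the same assumptions as above:

```python
# With the defaults (add_bos_token=True, add_eos_token=False) an encoded
# sequence starts with <s> but has no trailing </s>.
ids = tokenizer.encode("Hello this is a test")
print(ids[0] == tokenizer.bos_token_id, ids[-1] == tokenizer.eos_token_id)

# Flipping the flag goes through the property setter, which rebuilds the
# post-processor template, so </s> is appended on subsequent encodes.
tokenizer.add_eos_token = True
ids = tokenizer.encode("Hello this is a test")
print(ids[-1] == tokenizer.eos_token_id)  # expected: True
```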
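`default_chat_template` reproduces the reference Llama-2 chat format ([INST]/[/INST] markers, with the system prompt folded into the first user turn). With a transformers version that still falls back to `default_chat_template` when no `chat_template` is set, it can be exercised roughly as follows (a sketch; the message contents are placeholders):

```python
messages = [
    {"role": "system", "content": "You are a terse assistant."},
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi there."},
    {"role": "user", "content": "How are you?"},
]

# tokenize=False returns the formatted prompt string instead of token ids; the
# system message is embedded between <<SYS>> ... <</SYS>> inside the first
# [INST] block, as described in the method's docstring.
print(tokenizer.apply_chat_template(messages, tokenize=False))
```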