zackli4ai committed
Commit 19d93c1
1 Parent(s): c119fb7

Upload tokenizer

added_tokens.json CHANGED
@@ -1,4 +1,6 @@
 {
+  "<nexa_end>": 32012,
+  "<nexa_split>": 32011,
   "<|assistant|>": 32001,
   "<|endoftext|>": 32000,
   "<|end|>": 32007,
special_tokens_map.json CHANGED
@@ -1,6 +1,7 @@
 {
   "additional_special_tokens": [
     "<nexa_split>",
+    "<nexa_end>"
   ],
   "bos_token": {
     "content": "<s>",
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f473592bb5dc5cc078d862adc30eef76f773ace97f4c3a921cfe7cb018d8493f
-size 1844840
+oid sha256:6796f846dff17e049ed0f60a0aa09a38b18b1d6b62d3adad2ae93a3b81ad81f1
+size 1845214
tokenizer_config.json CHANGED
@@ -113,16 +113,36 @@
       "rstrip": true,
       "single_word": false,
       "special": true
+    },
+    "32011": {
+      "content": "<nexa_split>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32012": {
+      "content": "<nexa_end>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
+  "additional_special_tokens": [
+    "<nexa_split>",
+    "<nexa_end>"
+  ],
   "bos_token": "<s>",
   "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
   "legacy": false,
-  "model_max_length": 2048,
+  "model_max_length": 131072,
   "pad_token": "<|endoftext|>",
-  "padding_side": "right",
+  "padding_side": "left",
   "sp_model_kwargs": {},
   "tokenizer_class": "LlamaTokenizer",
   "unk_token": "<unk>",