law-llm committed on
Commit
2ab9931
1 Parent(s): d95e7ba

Upload tokenizer

added_tokens.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "<|endofpiece|>": 50007,
+   "<|endoftext|>": 50000,
+   "<|startofpiece|>": 50006,
+   "[CLS]": 50002,
+   "[MASK]": 50003,
+   "[SEP]": 50001,
+   "[UNUSED1]": 50004,
+   "[UNUSED2]": 50005,
+   "[gMASK]": 50009,
+   "[sMASK]": 50008
+ }
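
added_tokens.json extends the base vocabulary with GLM's control tokens; the IDs (50000-50009) appear to sit directly past the underlying SentencePiece vocabulary. A minimal sketch of verifying the mappings with transformers, assuming this repo has been downloaded to the working directory (the "." path is a stand-in); trust_remote_code=True is needed because the tokenizer class is custom (see tokenizer_config.json below):

from transformers import AutoTokenizer

# Load from a local checkout of this repo ("." is an assumed path).
tok = AutoTokenizer.from_pretrained(".", trust_remote_code=True)

# Each entry in added_tokens.json should resolve to its listed ID.
assert tok.convert_tokens_to_ids("[gMASK]") == 50009
assert tok.convert_tokens_to_ids("<|endoftext|>") == 50000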
cog-pretrain.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6ea6f4164152bc58d23e24e48f7bf4187aad72a32e97ec4b3acc832fe183cbc2
+ size 1021864
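
cog-pretrain.model (presumably the SentencePiece model used by GLMChineseTokenizer) is stored via Git LFS, so the commit records only this pointer; the actual file is fetched on checkout and must match the oid and size above. A quick integrity check, assuming the file has already been pulled locally:

import hashlib
import os

path = "cog-pretrain.model"
digest = hashlib.sha256(open(path, "rb").read()).hexdigest()

# Both expected values come straight from the LFS pointer in this commit.
assert os.path.getsize(path) == 1021864
assert digest == "6ea6f4164152bc58d23e24e48f7bf4187aad72a32e97ec4b3acc832fe183cbc2"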
special_tokens_map.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "additional_special_tokens": [
+     "<|startofpiece|>",
+     "<|endofpiece|>",
+     "[gMASK]",
+     "[sMASK]"
+   ],
+   "cls_token": "[CLS]",
+   "eos_token": "<|endoftext|>",
+   "mask_token": "[MASK]",
+   "pad_token": "<|endoftext|>",
+   "unk_token": "[UNK]"
+ }
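
These entries surface as attributes on the loaded tokenizer. Note that pad_token is aliased to eos_token ("<|endoftext|>"), a common choice for models without a dedicated padding token. A short sketch, under the same assumed local load as above:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".", trust_remote_code=True)

# Attributes mirror special_tokens_map.json.
print(tok.cls_token)                   # "[CLS]"
print(tok.pad_token == tok.eos_token)  # True, both are "<|endoftext|>"
print(tok.additional_special_tokens)   # includes "[gMASK]" and "[sMASK]"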
tokenizer_config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "add_prefix_space": false,
+   "additional_special_tokens": [
+     "<|startofpiece|>",
+     "<|endofpiece|>",
+     "[gMASK]",
+     "[sMASK]"
+   ],
+   "auto_map": {
+     "AutoTokenizer": [
+       "THUDM/glm-10b-chinese--tokenization_glm.GLMChineseTokenizer",
+       null
+     ]
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "eos_token": "<|endoftext|>",
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<|endoftext|>",
+   "tokenizer_class": "GLMChineseTokenizer",
+   "unk_token": "[UNK]",
+   "use_fast": false
+ }
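
Two fields here drive how the tokenizer is loaded. auto_map tells AutoTokenizer to fetch the custom GLMChineseTokenizer implementation from the THUDM/glm-10b-chinese repo (the null second slot means there is no fast-tokenizer counterpart, consistent with "use_fast": false), so loading requires trust_remote_code=True. The huge model_max_length is the transformers sentinel for "no limit recorded" (int(1e30)); callers who rely on truncation should set a real limit themselves. A hedged sketch:

from transformers import AutoTokenizer

# trust_remote_code is required: auto_map points at tokenization code
# hosted in THUDM/glm-10b-chinese, not a class built into transformers.
tok = AutoTokenizer.from_pretrained(".", trust_remote_code=True)

# model_max_length defaults to the 1e30 sentinel; cap it explicitly if
# you need truncation (1024 here is an assumed value, not from this repo).
tok.model_max_length = 1024
print(type(tok).__name__)  # GLMChineseTokenizer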