xu-song committed on
Commit
3030d21
1 Parent(s): 293bad6

add more tokenizer

Browse files
vocab/__init__.py CHANGED
@@ -102,6 +102,7 @@ all_tokenizers = [
102
  # "goat",
103
 
104
  # tiktoken 系列
 
105
  "qwen_7b_chat",
106
  "qwen_72b_chat",
107
 
 
102
  # "goat",
103
 
104
  # tiktoken 系列
105
+ "qwen_1_8b_chat",
106
  "qwen_7b_chat",
107
  "qwen_72b_chat",
108
 
vocab/qwen_1_8b_chat/__init__.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 依赖 torch tiktoken
3
+ Requires transformers 4.31.0 or later.
4
+
5
+ https://huggingface.co/tangger/Qwen-7B-Chat (backup copy; the official Qwen model was temporarily taken down)
6
+
7
+ https://github.com/QwenLM/Qwen/blob/main/tokenization_note_zh.md
8
+ """
9
+
10
+ import os
11
+ from transformers import AutoTokenizer
12
+
13
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-1_8B-Chat", trust_remote_code=True)
14
+
15
+ tokenizer.comments = ""
16
+
17
+
18
def test():
    """Print every token id of a fixed sample sentence next to its token text.

    Encodes the sample string with the module-level ``tokenizer`` and prints
    one ``token_id : token`` line per id. Intended as a manual smoke test; it
    returns nothing.
    """
    encoding = tokenizer.encode("测试华为手机10086 8个空格")
    for token_id in encoding:
        token = tokenizer.convert_ids_to_tokens([token_id])[0]
        # The token is expected to be raw bytes. A single token can hold only
        # part of a multi-byte UTF-8 character, so a strict decode may raise
        # UnicodeDecodeError; decode leniently and show U+FFFD instead.
        # NOTE(review): tokens that already arrive as ``str`` (presumably
        # special tokens -- confirm against the tokenizer) are printed as-is.
        if isinstance(token, bytes):
            token = token.decode("utf-8", errors="replace")
        print(token_id, ":", token)


if __name__ == "__main__":
    test()