xzuyn committed on
Commit
24c02e9
1 Parent(s): 8ef7216

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -29
app.py CHANGED
@@ -2,40 +2,69 @@ from transformers import AutoTokenizer
2
  import gradio as gr
3
 
4
 
5
- # Define tokenizer models
6
- MODELS = {
7
- "LLaMa-1/LLaMa-2": "TheBloke/Llama-2-7B-fp16",
8
- "LLaMa-3": "unsloth/llama-3-8b",
9
- "Mistral": "mistral-community/Mistral-7B-v0.2",
10
- "GPT-2/GPT-J": "openai-community/gpt2",
11
- "GPT-NeoX": "EleutherAI/gpt-neox-20b",
12
- "Falcon": "tiiuae/falcon-7b",
13
- "Phi-1/Phi-2": "microsoft/phi-2",
14
- "Phi-3": "microsoft/Phi-3-mini-4k-instruct",
15
- "T5": "google/flan-t5-xxl",
16
- "Gemma": "alpindale/gemma-2b",
17
- "Command-R": "CohereForAI/c4ai-command-r-plus",
18
- "Qwen/Qwen1.5": "Qwen/Qwen1.5-7B",
19
- "CodeQwen": "Qwen/CodeQwen1.5-7B",
20
- "RWKV-v4": "RWKV/rwkv-4-14b-pile",
21
- "RWKV-v5/RWKV-v6": "RWKV/v5-EagleX-v2-7B-HF",
22
- "DeepSeek-LLM": "deepseek-ai/deepseek-llm-7b-base",
23
- "DeepSeek-V2": "deepseek-ai/DeepSeek-V2"
24
- }
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
- def tokenize(input_text):
28
- results = {}
29
- for model_name, model_tokenizer in MODELS.items():
30
- tokenizer = AutoTokenizer.from_pretrained(model_tokenizer, trust_remote_code=True)
31
- tokens = len(tokenizer(input_text, add_special_tokens=True)["input_ids"])
32
- results[model_name] = tokens
33
-
34
  # Sort the results in descending order based on token length
35
  sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
 
36
  return "\n".join([f"{model}: {tokens}" for model, tokens in sorted_results])
37
 
38
 
39
  if __name__ == "__main__":
40
- iface = gr.Interface(fn=tokenize, inputs=gr.Textbox(label="Input Text", lines=len(MODELS)), outputs="text")
41
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import gradio as gr
3
 
4
 
5
def tokenize(input_text):
    """Count the tokens *input_text* produces under every loaded tokenizer.

    Returns a newline-separated string of ``<model>: <count>`` lines,
    ordered by token count, largest first. Relies on the module-level
    ``*_tokenizer`` globals created in the ``__main__`` guard before the
    gradio interface is launched.
    """
    # Display label -> tokenizer global backing it. Built per call, exactly
    # like the original's per-call counting; the globals themselves are
    # loaded once at startup.
    tokenizer_table = [
        ("LLaMa-1/LLaMa-2", llama_tokenizer),
        ("LLaMa-3", llama3_tokenizer),
        ("Mistral", mistral_tokenizer),
        ("GPT-2/GPT-J", gpt2_tokenizer),
        ("GPT-NeoX", gpt_neox_tokenizer),
        ("Falcon", falcon_tokenizer),
        ("Phi-1/Phi-2", phi2_tokenizer),
        ("Phi-3", phi3_tokenizer),
        ("T5", t5_tokenizer),
        ("Gemma", gemma_tokenizer),
        ("Command-R", command_r_tokenizer),
        ("Qwen/Qwen1.5", qwen_tokenizer),
        ("CodeQwen", codeqwen_tokenizer),
        ("RWKV-v4", rwkv4_tokenizer),
        ("RWKV-v5/RWKV-v6", rwkv5_tokenizer),
        ("DeepSeek-LLM", deepseekllm_tokenizer),
        ("DeepSeek-V2", deepseekv2_tokenizer),
    ]

    # Token count (special tokens included) for each model, insertion order
    # preserved so that sorted()'s stability breaks ties the same way as the
    # original hand-written dict.
    results = {
        label: len(tok(input_text, add_special_tokens=True)["input_ids"])
        for label, tok in tokenizer_table
    }

    # Sort the results in descending order based on token length
    sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)

    return "\n".join(f"{model}: {tokens}" for model, tokens in sorted_results)
48
 
49
 
50
if __name__ == "__main__":
    # Load every tokenizer once at startup (instead of per request) so each
    # tokenize() call only has to run the tokenizers, not download them.
    # These module-level names are the globals that tokenize() reads.
    llama_tokenizer = AutoTokenizer.from_pretrained("TheBloke/Llama-2-7B-fp16")
    llama3_tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b")
    mistral_tokenizer = AutoTokenizer.from_pretrained("mistral-community/Mistral-7B-v0.2")
    gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
    gpt_neox_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
    falcon_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")
    phi2_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
    phi3_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
    t5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")
    gemma_tokenizer = AutoTokenizer.from_pretrained("alpindale/gemma-2b")
    command_r_tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-plus")
    qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B")
    codeqwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/CodeQwen1.5-7B")
    # trust_remote_code=True is needed for repos that ship custom tokenizer
    # code (RWKV, DeepSeek). NOTE(review): this executes code downloaded from
    # the Hub — acceptable here only because the repos are pinned by name.
    rwkv4_tokenizer = AutoTokenizer.from_pretrained("RWKV/rwkv-4-14b-pile", trust_remote_code=True)
    rwkv5_tokenizer = AutoTokenizer.from_pretrained("RWKV/v5-EagleX-v2-7B-HF", trust_remote_code=True)
    deepseekllm_tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-llm-7b-base", trust_remote_code=True)
    deepseekv2_tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-V2", trust_remote_code=True)

    # lines=17 sizes the textbox to match the number of models reported
    # (one output line per tokenizer above).
    iface = gr.Interface(fn=tokenize, inputs=gr.Textbox(label="Input Text", lines=17), outputs="text")
    iface.launch()