MohamedRashad committed
Commit d2e2dfe
1 Parent(s): 0c0efc4

refactor: improve tokenization for Arabic text

Files changed (2):
  1. app.py +121 -59
  2. arabic_tokenizers_leaderboard.jsonl +14 -0
app.py CHANGED
@@ -7,39 +7,74 @@ import random
  from pathlib import Path

  initial_list_of_models = [
      "Xenova/gpt-4o",
      "NousResearch/Meta-Llama-3-8B",
      "CohereForAI/c4ai-command-r-v01",
      "CohereForAI/c4ai-command-r-plus",
      "core42/jais-13b",
  ]

- dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
-
  dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
-
  if dataframe_path.exists():
      df = pd.read_json(dataframe_path, lines=True)
  else:
-     df = pd.DataFrame(columns=["📛 Models", "➕ Total Number of Tokens", "📘 Vocab Size", "Tokenizer Class"])

- for model_name in tqdm(initial_list_of_models):
-     if model_name in df["📛 Models"].values:
-         continue
      tokenizer = AutoTokenizer.from_pretrained(
          model_name, use_fast=True, trust_remote_code=True
      )
      vocab_size = tokenizer.vocab_size
-     number_of_tokens = sum([len(x) for x in tokenizer(dataset).input_ids])
-     df = df._append(
-         {
-             "📛 Models": model_name,
-             "📘 Vocab Size": vocab_size,
-             "➕ Total Number of Tokens": number_of_tokens,
-             "Tokenizer Class": tokenizer.__class__.__name__,
-         },
-         ignore_index=True,
-     )

  # Sort the dataframe by the number of tokens
  df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
@@ -47,59 +82,57 @@ df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
  # Save the dataframe to a csv file
  df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)

  def submit(model_name):
      global df
      if model_name in df["📛 Models"].values:
-         return gr.Dataframe(df), gr.BarPlot(df), gr.Dropdown(choices=df["📛 Models"].tolist())
-     tokenizer = AutoTokenizer.from_pretrained(
-         model_name, use_fast=True, trust_remote_code=True
-     )
-     vocab_size = tokenizer.vocab_size
-     number_of_tokens = sum([len(x) for x in tokenizer(dataset).input_ids])
-     df = df._append(
-         {
-             "📛 Models": model_name,
-             "➕ Total Number of Tokens": number_of_tokens,
-             "📘 Vocab Size": vocab_size,
-             "Tokenizer Class": tokenizer.__class__.__name__,
-         },
-         ignore_index=True,
-     )
      df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
      df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
-     return gr.Dataframe(df), gr.BarPlot(df), gr.Dropdown(choices=df["📛 Models"].tolist())

  def generate_distinct_colors(n):
      """Generate n visually distinct colors in hexadecimal format."""
      if n > 256**3:
          raise ValueError("Cannot generate more than 16,777,216 unique colors.")
-
      # To ensure colors are distinct, calculate an appropriate distance between colors
      # The cube root of 256**3 (total colors) divided by n gives a crude initial spacing estimate
-     spacing = int((256 * 256 * 256)**(1/3) / n**(1/3))
      max_val = 256 - spacing
-
      # Set to keep track of used colors
      used_colors = set()
-
      # List to store the result colors
      result = []
-
      attempts = 0
      while len(result) < n:
          # Generate a color with a random start and controlled spacing
          r = random.randint(0, max_val)
          g = random.randint(0, max_val)
          b = random.randint(0, max_val)
-
          # Scale up by spacing to ensure minimum distance between colors
          r = min(255, r * spacing)
          g = min(255, g * spacing)
          b = min(255, b * spacing)
-
          # Format the color in hexadecimal
          color = f"#{r:02X}{g:02X}{b:02X}"
-
          # Ensure this color hasn't been used
          if color not in used_colors:
              used_colors.add(color)
@@ -111,29 +144,31 @@ def generate_distinct_colors(n):
          spacing = max(1, spacing - 1)
          max_val = 256 - spacing
          attempts = 0
-
      return result

  def decode_bpe_tokens(tokens):
      fixed_tokens = []
      for token in tokens:
          # Check if the token starts with the special BPE space character 'Ġ'
-         if token.startswith('Ġ'):
              # Process the rest of the token
              try:
                  # Decode the rest of the token from UTF-8 bytes understood as Latin-1 characters
-                 fixed_token = ' ' + token[1:].encode('utf-8').decode('utf-8')
              except UnicodeDecodeError:
                  fixed_token = token  # Use the original token if decoding fails
          else:
              try:
                  # Directly encode and decode without misinterpretation steps
-                 fixed_token = token.encode('utf-8').decode('utf-8')
              except UnicodeDecodeError:
                  fixed_token = token  # Use the original token if decoding fails
          fixed_tokens.append(fixed_token)
      return fixed_tokens

  def tokenize_text(text, chosen_model, better_tokenization=False):
      tokenizer = AutoTokenizer.from_pretrained(chosen_model)
      tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
@@ -144,11 +179,13 @@ def tokenize_text(text, chosen_model, better_tokenization=False):
          for token in tokenized_text:
              correct_tokenized_text = ""
              for char in text:
-                 correct_tokenized_text += char
-                 current_token = decode_bpe_tokens(tokenizer.tokenize(correct_tokenized_text))
                  if current_token[0] == token:
                      final_tokenized_text.append(correct_tokenized_text)
-                     text = text[len(correct_tokenized_text):]
                      break
      else:
          final_tokenized_text = tokenized_text
@@ -158,19 +195,30 @@ def tokenize_text(text, chosen_model, better_tokenization=False):
      color_map = {}
      for idx, token in enumerate(final_tokenized_text):
          output.append((token, str(idx)))
-         color_map[str(idx+1)] = random_colors[idx % len(random_colors)]

      return gr.HighlightedText(output, color_map)

  def refresh():
      global df
      df = pd.read_json(dataframe_path, lines=True)
-     return gr.Dataframe(df), gr.BarPlot(df), gr.Dropdown(choices=df["📛 Models"].tolist())

- leaderboard_description = """The `Total Number of Tokens` in this leaderboard is based on the total number of tokens summed on the Arabic section of [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset.
- This dataset was chosen because it represents Arabic Fusha text in a small and consentrated manner.

- A tokenizer that scores high in this leaderboard will be efficient in parsing Arabic in its different dialects and forms.
  """

  with gr.Blocks() as demo:
@@ -188,7 +236,7 @@ with gr.Blocks() as demo:
          y_title=" ",
          width=1000,
          height=400,
-         tooltip=["📘 Vocab Size", "➕ Total Number of Tokens"],
          vertical=False,
          x_label_angle=30,
      )
@@ -196,10 +244,18 @@ with gr.Blocks() as demo:
          label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
      )
      with gr.Row():
-         submit_new_model_btn = gr.Button(value="Submit New Model", variant="primary", scale=3)
          refresh_btn = gr.Button(value="Refresh", variant="secondary", scale=1)
      with gr.Tab(label="Try tokenizers"):
-         text = gr.Textbox(label="Enter a text", lines=5, value="السلام عليكم ورحمة الله", rtl=True, text_align="right")
          dropdown = gr.Dropdown(
              label="Select a model",
              choices=df["📛 Models"].tolist(),
@@ -207,12 +263,18 @@ with gr.Blocks() as demo:
          )
          with gr.Row():
              submit_text_btn = gr.Button(value="Submit", variant="primary", scale=3)
-             checkbox = gr.Checkbox(label="Better tokenization for Arabic Text", value=False, scale=1)
          tokenized_textbox = gr.HighlightedText(label="Tokenized text")

-     submit_new_model_btn.click(submit, model_name, outputs=[dataframe, barplot, dropdown])
      refresh_btn.click(refresh, outputs=[dataframe, barplot, dropdown])
-     submit_text_btn.click(tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox])


  demo.launch()
 
  from pathlib import Path

  initial_list_of_models = [
+     "asafaya/bert-base-arabic",
      "Xenova/gpt-4o",
+     "FreedomIntelligence/AceGPT-v1.5-13B-Chat",
+     "FreedomIntelligence/AceGPT-13B",
+     "Qwen/Qwen1.5-7B-Chat",
+     "Qwen/Qwen1.5-110B-Chat",
+     "microsoft/Phi-3-mini-128k-instruct",
+     "unsloth/gemma-2b-bnb-4bit",
      "NousResearch/Meta-Llama-3-8B",
      "CohereForAI/c4ai-command-r-v01",
      "CohereForAI/c4ai-command-r-plus",
      "core42/jais-13b",
+     "core42/jais-30b-chat-v3",
  ]

  dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
  if dataframe_path.exists():
      df = pd.read_json(dataframe_path, lines=True)
  else:
+     df = pd.DataFrame(
+         columns=[
+             "👳 Tokenize Tashkeel",
+             "📛 Models",
+             "🪺 Fertility Score",
+             "➕ Total Number of Tokens",
+             "📘 Vocab Size",
+             "Tokenizer Class",
+         ]
+     )

+ # Datasets used for calculating the number of tokens
+ arabic_dataset1 = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
+ arabic_dataset2 = load_dataset("HeshamHaroon/arabic-quotes", split="train")["quote"]
+ arabic_dataset3 = load_dataset("SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", split="train")["text"]
+ all_data = arabic_dataset1 + arabic_dataset2 + arabic_dataset3
+ print(f"Total number of samples: {len(all_data)}")
+ all_text = " ".join(all_data)
+ all_words = all_text.split()
+
+ def benchmark_tokenizer(model_name) -> float:
+     # Initialize the tokenizer
      tokenizer = AutoTokenizer.from_pretrained(
          model_name, use_fast=True, trust_remote_code=True
      )
      vocab_size = tokenizer.vocab_size
+     total_number_of_tokens = len(tokenizer.tokenize(all_text))
+
+     # Check if the tokenizer maintains the tashkeel
+     dummy_text = "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللَّهِ وَبَرَكَاتُهُ"
+     tokenized_text = tokenizer.decode(tokenizer.encode(dummy_text), skip_special_tokens=True)
+     tashkeel_maintainer = "✅" if tokenized_text == dummy_text else "❌"
+
+     return {
+         "👳 Tokenize Tashkeel": tashkeel_maintainer,
+         "📛 Models": model_name,
+         "🪺 Fertility Score": round(total_number_of_tokens / len(all_words), 3),
+         "📘 Vocab Size": vocab_size,
+         "➕ Total Number of Tokens": total_number_of_tokens,
+         "Tokenizer Class": tokenizer.__class__.__name__,
+     }
+
+
+ for model_name in tqdm(initial_list_of_models):
+     if model_name in df["📛 Models"].values:
+         continue
+
+     benchmark_data = benchmark_tokenizer(model_name)
+     df = df._append(benchmark_data, ignore_index=True)

  # Sort the dataframe by the number of tokens
  df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
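The loop above appends each result row with the private `DataFrame._append` helper. On recent pandas releases the public way to do the same thing is `pd.concat`; a minimal sketch of that alternative (an editorial illustration, not part of this commit) would be:

```python
import pandas as pd

# row is the dict returned by benchmark_tokenizer(model_name) above
row = benchmark_tokenizer(model_name)
df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
```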
 
  # Save the dataframe to a csv file
  df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)

+
  def submit(model_name):
      global df
      if model_name in df["📛 Models"].values:
+         return (
+             gr.Dataframe(df),
+             gr.BarPlot(df),
+             gr.Dropdown(choices=df["📛 Models"].tolist()),
+         )
+     benchmark_data = benchmark_tokenizer(model_name)
+     df = df._append(benchmark_data, ignore_index=True)
      df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
      df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
+     return (
+         gr.Dataframe(df),
+         gr.BarPlot(df),
+         gr.Dropdown(choices=df["📛 Models"].tolist()),
+     )
+

  def generate_distinct_colors(n):
      """Generate n visually distinct colors in hexadecimal format."""
      if n > 256**3:
          raise ValueError("Cannot generate more than 16,777,216 unique colors.")
+
      # To ensure colors are distinct, calculate an appropriate distance between colors
      # The cube root of 256**3 (total colors) divided by n gives a crude initial spacing estimate
+     spacing = int((256 * 256 * 256) ** (1 / 3) / n ** (1 / 3))
      max_val = 256 - spacing
+
      # Set to keep track of used colors
      used_colors = set()
+
      # List to store the result colors
      result = []
+
      attempts = 0
      while len(result) < n:
          # Generate a color with a random start and controlled spacing
          r = random.randint(0, max_val)
          g = random.randint(0, max_val)
          b = random.randint(0, max_val)
+
          # Scale up by spacing to ensure minimum distance between colors
          r = min(255, r * spacing)
          g = min(255, g * spacing)
          b = min(255, b * spacing)
+
          # Format the color in hexadecimal
          color = f"#{r:02X}{g:02X}{b:02X}"
+
          # Ensure this color hasn't been used
          if color not in used_colors:
              used_colors.add(color)
 
          spacing = max(1, spacing - 1)
          max_val = 256 - spacing
          attempts = 0
+
      return result

+
  def decode_bpe_tokens(tokens):
      fixed_tokens = []
      for token in tokens:
          # Check if the token starts with the special BPE space character 'Ġ'
+         if token.startswith("Ġ"):
              # Process the rest of the token
              try:
                  # Decode the rest of the token from UTF-8 bytes understood as Latin-1 characters
+                 fixed_token = " " + token[1:].encode("utf-8").decode("utf-8")
              except UnicodeDecodeError:
                  fixed_token = token  # Use the original token if decoding fails
          else:
              try:
                  # Directly encode and decode without misinterpretation steps
+                 fixed_token = token.encode("utf-8").decode("utf-8")
              except UnicodeDecodeError:
                  fixed_token = token  # Use the original token if decoding fails
          fixed_tokens.append(fixed_token)
      return fixed_tokens

+
  def tokenize_text(text, chosen_model, better_tokenization=False):
      tokenizer = AutoTokenizer.from_pretrained(chosen_model)
      tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
 
          for token in tokenized_text:
              correct_tokenized_text = ""
              for char in text:
+                 correct_tokenized_text += char
+                 current_token = decode_bpe_tokens(
+                     tokenizer.tokenize(correct_tokenized_text)
+                 )
                  if current_token[0] == token:
                      final_tokenized_text.append(correct_tokenized_text)
+                     text = text[len(correct_tokenized_text) :]
                      break
      else:
          final_tokenized_text = tokenized_text
 
      color_map = {}
      for idx, token in enumerate(final_tokenized_text):
          output.append((token, str(idx)))
+         color_map[str(idx + 1)] = random_colors[idx % len(random_colors)]

      return gr.HighlightedText(output, color_map)

+
  def refresh():
      global df
      df = pd.read_json(dataframe_path, lines=True)
+     return (
+         gr.Dataframe(df),
+         gr.BarPlot(df),
+         gr.Dropdown(choices=df["📛 Models"].tolist()),
+     )

+ leaderboard_description = """The `Total Number of Tokens` in this leaderboard is based on the total number of tokens obtained from the Arabic section of the [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset (this dataset was chosen because it represents Arabic Fusha text in a small and concentrated manner).
+
+ **A tokenizer that scores high in this leaderboard will be efficient in parsing Arabic in its different dialects and forms.**
+
+ ## Updates
+ 1. New datasets were added for the evaluation (e.g. [arabic-quotes](https://huggingface.co/datasets/HeshamHaroon/arabic-quotes), [Moroccan_Arabic_Wikipedia_20230101_nobots](https://huggingface.co/datasets/SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots)).
+ 1. `Fertility Score` is calculated by dividing the total number of tokens by the total number of words in the dataset (another way to interpret `Total Number of Tokens`).
+ 1. `Tokenize Tashkeel` indicates whether the tokenizer preserves the tashkeel (Arabic diacritics) when tokenizing (`✅` for yes, `❌` for no).
+ 1. `Vocab Size` is the total number of tokens in the tokenizer's vocabulary (e.g. `10000` tokens).
+ 1. `Tokenizer Class` is the class of the tokenizer (e.g. `BertTokenizer` or `GPT2Tokenizer`).
  """

  with gr.Blocks() as demo:
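To make the metrics in the description above concrete: the fertility score is simply tokens per whitespace-delimited word, and the tashkeel check is an encode/decode round trip. A minimal sketch (the model id is only an example; any Hugging Face tokenizer works):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")  # example model

text = "السلام عليكم ورحمة الله"
# Fertility: tokens per word; lower means the tokenizer splits Arabic less aggressively
fertility = len(tokenizer.tokenize(text)) / len(text.split())

# Tashkeel check: does an encode/decode round trip keep the diacritics?
dummy = "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللَّهِ وَبَرَكَاتُهُ"
keeps_tashkeel = tokenizer.decode(tokenizer.encode(dummy), skip_special_tokens=True) == dummy
print(round(fertility, 3), keeps_tashkeel)
```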
 
          y_title=" ",
          width=1000,
          height=400,
+         tooltip=["📘 Vocab Size", "🪺 Fertility Score"],
          vertical=False,
          x_label_angle=30,
      )
 
          label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
      )
      with gr.Row():
+         submit_new_model_btn = gr.Button(
+             value="Submit New Model", variant="primary", scale=3
+         )
          refresh_btn = gr.Button(value="Refresh", variant="secondary", scale=1)
      with gr.Tab(label="Try tokenizers"):
+         text = gr.Textbox(
+             label="Enter a text",
+             lines=5,
+             value="السلام عليكم ورحمة الله",
+             rtl=True,
+             text_align="right",
+         )
          dropdown = gr.Dropdown(
              label="Select a model",
              choices=df["📛 Models"].tolist(),
 
          )
          with gr.Row():
              submit_text_btn = gr.Button(value="Submit", variant="primary", scale=3)
+             checkbox = gr.Checkbox(
+                 label="Better tokenization for Arabic Text", value=False, scale=1
+             )
          tokenized_textbox = gr.HighlightedText(label="Tokenized text")

+     submit_new_model_btn.click(
+         submit, model_name, outputs=[dataframe, barplot, dropdown]
+     )
      refresh_btn.click(refresh, outputs=[dataframe, barplot, dropdown])
+     submit_text_btn.click(
+         tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox]
+     )


  demo.launch()
arabic_tokenizers_leaderboard.jsonl ADDED
@@ -0,0 +1,14 @@
+ {"👳 Tokenize Tashkeel":"❌","📛 Models":"asafaya\/bert-base-arabic","🪺 Fertility Score":1.614,"➕ Total Number of Tokens":1242530,"📘 Vocab Size":32000,"Tokenizer Class":"BertTokenizerFast"}
+ {"👳 Tokenize Tashkeel":"✅","📛 Models":"core42\/jais-13b","🪺 Fertility Score":1.668,"➕ Total Number of Tokens":1284508,"📘 Vocab Size":84992,"Tokenizer Class":"PreTrainedTokenizerFast"}
+ {"👳 Tokenize Tashkeel":"✅","📛 Models":"core42\/jais-30b-chat-v3","🪺 Fertility Score":1.668,"➕ Total Number of Tokens":1284508,"📘 Vocab Size":84992,"Tokenizer Class":"PreTrainedTokenizerFast"}
+ {"👳 Tokenize Tashkeel":"✅","📛 Models":"FreedomIntelligence\/AceGPT-v1.5-13B-Chat","🪺 Fertility Score":1.888,"➕ Total Number of Tokens":1453838,"📘 Vocab Size":44800,"Tokenizer Class":"LlamaTokenizerFast"}
+ {"👳 Tokenize Tashkeel":"✅","📛 Models":"Xenova\/gpt-4o","🪺 Fertility Score":2.115,"➕ Total Number of Tokens":1628374,"📘 Vocab Size":200000,"Tokenizer Class":"GPT2TokenizerFast"}
+ {"👳 Tokenize Tashkeel":"❌","📛 Models":"CohereForAI\/c4ai-command-r-v01","🪺 Fertility Score":2.154,"➕ Total Number of Tokens":1658463,"📘 Vocab Size":255000,"Tokenizer Class":"CohereTokenizerFast"}
+ {"👳 Tokenize Tashkeel":"❌","📛 Models":"CohereForAI\/c4ai-command-r-plus","🪺 Fertility Score":2.154,"➕ Total Number of Tokens":1658463,"📘 Vocab Size":255000,"Tokenizer Class":"CohereTokenizerFast"}
+ {"👳 Tokenize Tashkeel":"✅","📛 Models":"unsloth\/gemma-2b-bnb-4bit","🪺 Fertility Score":2.199,"➕ Total Number of Tokens":1692826,"📘 Vocab Size":256000,"Tokenizer Class":"GemmaTokenizerFast"}
+ {"👳 Tokenize Tashkeel":"✅","📛 Models":"NousResearch\/Meta-Llama-3-8B","🪺 Fertility Score":2.374,"➕ Total Number of Tokens":1827816,"📘 Vocab Size":128000,"Tokenizer Class":"PreTrainedTokenizerFast"}
+ {"👳 Tokenize Tashkeel":"❌","📛 Models":"Qwen\/Qwen1.5-7B-Chat","🪺 Fertility Score":2.444,"➕ Total Number of Tokens":1881958,"📘 Vocab Size":151643,"Tokenizer Class":"Qwen2TokenizerFast"}
+ {"👳 Tokenize Tashkeel":"❌","📛 Models":"Qwen\/Qwen1.5-110B-Chat","🪺 Fertility Score":2.444,"➕ Total Number of Tokens":1881958,"📘 Vocab Size":151643,"Tokenizer Class":"Qwen2TokenizerFast"}
+ {"👳 Tokenize Tashkeel":"✅","📛 Models":"FreedomIntelligence\/AceGPT-13B","🪺 Fertility Score":5.46,"➕ Total Number of Tokens":4203685,"📘 Vocab Size":32000,"Tokenizer Class":"LlamaTokenizerFast"}
+ {"👳 Tokenize Tashkeel":"✅","📛 Models":"microsoft\/Phi-3-mini-128k-instruct","🪺 Fertility Score":5.46,"➕ Total Number of Tokens":4203685,"📘 Vocab Size":32000,"Tokenizer Class":"LlamaTokenizerFast"}
+ {"👳 Tokenize Tashkeel":"✅","📛 Models":"01-ai\/Yi-1.5-34B-Chat","🪺 Fertility Score":6.674,"➕ Total Number of Tokens":5138447,"📘 Vocab Size":64000,"Tokenizer Class":"LlamaTokenizerFast"}