from transformers import AutoTokenizer
from tqdm import tqdm
import gradio as gr
import pandas as pd
from datasets import load_dataset
import random
from pathlib import Path

initial_list_of_models = [
    "asafaya/bert-base-arabic",
    "Xenova/gpt-4o",
    "FreedomIntelligence/AceGPT-v1.5-13B-Chat",
    "FreedomIntelligence/AceGPT-13B",
    "Qwen/Qwen1.5-7B-Chat",
    "Qwen/Qwen1.5-110B-Chat",
    "microsoft/Phi-3-mini-128k-instruct",
    "unsloth/gemma-2b-bnb-4bit",
    "NousResearch/Meta-Llama-3-8B",
    "CohereForAI/c4ai-command-r-v01",
    "CohereForAI/c4ai-command-r-plus",
    "core42/jais-13b",
    "core42/jais-30b-chat-v3",
]

dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
if dataframe_path.exists():
    df = pd.read_json(dataframe_path, lines=True)
else:
    df = pd.DataFrame(
        columns=[
            "👳 Tokenize Tashkeel",
            "📛 Models",
            "🪺 Fertility Score",
            "➕ Total Number of Tokens",
            "📘 Vocab Size",
            "Tokenizer Class",
        ]
    )

# Datasets used for calculating the number of tokens
arabic_dataset1 = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
arabic_dataset2 = load_dataset("HeshamHaroon/arabic-quotes", split="train")["quote"]
arabic_dataset3 = load_dataset("SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", split="train")["text"]
all_data = arabic_dataset1 + arabic_dataset2 + arabic_dataset3
print(f"Total number of samples: {len(all_data)}") 
all_text = " ".join(all_data)
all_words = all_text.split()

def benchmark_tokenizer(model_name) -> float:
    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    total_number_of_tokens = len(tokenizer.tokenize(all_text))

    # Check if the tokenizer maintains the tashkeel
    dummy_text = "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللَّهِ وَبَرَكَاتُهُ"
    tokenized_text = tokenizer.decode(tokenizer.encode(dummy_text), skip_special_tokens=True)
    tashkeel_maintainer = "✅" if tokenized_text == dummy_text else "❌"

    return {
        "👳 Tokenize Tashkeel": tashkeel_maintainer,
        "📛 Models": model_name,
        "🪺 Fertility Score": round(total_number_of_tokens / len(all_words), 3),
        "📘 Vocab Size": vocab_size,
        "➕ Total Number of Tokens": total_number_of_tokens,
        "Tokenizer Class": tokenizer.__class__.__name__,
    }


for model_name in tqdm(initial_list_of_models):
    if model_name in df["📛 Models"].values:
        continue
    
    benchmark_data = benchmark_tokenizer(model_name)
    df = df._append(benchmark_data, ignore_index=True)

# Sort the dataframe by the number of tokens
df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)

# Save the dataframe to a csv file
df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)


def submit(model_name):
    global df
    if model_name in df["📛 Models"].values:
        return (
            gr.Dataframe(df),
            gr.BarPlot(df),
            gr.Dropdown(choices=df["📛 Models"].tolist()),
        )
    benchmark_data = benchmark_tokenizer(model_name)
    df = df._append(benchmark_data, ignore_index=True)
    df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
    df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
    return (
        gr.Dataframe(df),
        gr.BarPlot(df),
        gr.Dropdown(choices=df["📛 Models"].tolist()),
    )


def generate_distinct_colors(n):
    """Generate n visually distinct colors in hexadecimal format."""
    if n > 256**3:
        raise ValueError("Cannot generate more than 16,777,216 unique colors.")

    # To ensure colors are distinct, calculate an appropriate distance between colors
    # The cube root of 256**3 (total colors) divided by n gives a crude initial spacing estimate
    spacing = int((256 * 256 * 256) ** (1 / 3) / n ** (1 / 3))
    max_val = 256 - spacing

    # Set to keep track of used colors
    used_colors = set()

    # List to store the result colors
    result = []

    attempts = 0
    while len(result) < n:
        # Generate a color with a random start and controlled spacing
        r = random.randint(0, max_val)
        g = random.randint(0, max_val)
        b = random.randint(0, max_val)

        # Scale up by spacing to ensure minimum distance between colors
        r = min(255, r * spacing)
        g = min(255, g * spacing)
        b = min(255, b * spacing)

        # Format the color in hexadecimal
        color = f"#{r:02X}{g:02X}{b:02X}"

        # Ensure this color hasn't been used
        if color not in used_colors:
            used_colors.add(color)
            result.append(color)
        else:
            attempts += 1
            if attempts > 50:
                # Dynamically adjust spacing if stuck
                spacing = max(1, spacing - 1)
                max_val = 256 - spacing
                attempts = 0

    return result


def decode_bpe_tokens(tokens):
    fixed_tokens = []
    for token in tokens:
        # Check if the token starts with the special BPE space character 'Ġ'
        if token.startswith("Ġ"):
            # Process the rest of the token
            try:
                # Decode the rest of the token from UTF-8 bytes understood as Latin-1 characters
                fixed_token = " " + token[1:].encode("utf-8").decode("utf-8")
            except UnicodeDecodeError:
                fixed_token = token  # Use the original token if decoding fails
        else:
            try:
                # Directly encode and decode without misinterpretation steps
                fixed_token = token.encode("utf-8").decode("utf-8")
            except UnicodeDecodeError:
                fixed_token = token  # Use the original token if decoding fails
        fixed_tokens.append(fixed_token)
    return fixed_tokens


def tokenize_text(text, chosen_model, better_tokenization=False):
    tokenizer = AutoTokenizer.from_pretrained(chosen_model)
    tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
    random_colors = generate_distinct_colors(len(tokenized_text))

    if better_tokenization:
        final_tokenized_text = []
        for token in tokenized_text:
            correct_tokenized_text = ""
            for char in text:
                correct_tokenized_text += char
                current_token = decode_bpe_tokens(
                    tokenizer.tokenize(correct_tokenized_text)
                )
                if current_token[0] == token:
                    final_tokenized_text.append(correct_tokenized_text)
                    text = text[len(correct_tokenized_text) :]
                    break
    else:
        final_tokenized_text = tokenized_text
    print(final_tokenized_text)

    output = []
    color_map = {}
    for idx, token in enumerate(final_tokenized_text):
        output.append((token, str(idx)))
        color_map[str(idx + 1)] = random_colors[idx % len(random_colors)]

    return gr.HighlightedText(output, color_map)


def refresh():
    global df
    df = pd.read_json(dataframe_path, lines=True)
    return (
        gr.Dataframe(df),
        gr.BarPlot(df),
        gr.Dropdown(choices=df["📛 Models"].tolist()),
    )

leaderboard_description = """The `Total Number of Tokens` in this leaderboard is based on the total number of tokens got from the Arabic section of [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset (This dataset was chosen because it represents Arabic Fusha text in a small and concentrated manner).

**A tokenizer that scores high in this leaderboard should be efficient in parsing Arabic in its different dialects and forms.**

## Updates/Notes:
1. New datasets is added for the evaluation (e.g. [arabic-quotes](https://huggingface.co/datasets/HeshamHaroon/arabic-quotes), [Moroccan_Arabic_Wikipedia_20230101_nobots](https://huggingface.co/datasets/SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots)).
1. `Fertility Score` is calculated by dividing the total number of tokens by the total number of words in the dataset (Lower is better).
1. `Tokenize Tashkeel` is an indicator of whether the tokenizer maintains the tashkeel when tokenizing or not (`✅` for yes, `❌` for no).
1. `Vocab Size` is the total number of tokens in the tokenizer's vocabulary (e.g. `10000` tokens).
1. `Tokenizer Class` is the class of the tokenizer (e.g. `BertTokenizer` or `GPT2Tokenizer`)
1. `Total Number of Tokens` is the total number of tokens in the dataset after tokenization (Lower is better).

**Note**: Press `Refresh` to get the latest data available in the leaderboard (The initial state may be deceiving).
"""

with gr.Blocks() as demo:
    gr.HTML("<center><h1>Arabic Tokenizers Leaderboard</h1></center>")
    gr.Markdown("## What is the best tokenizer for Arabic?")
    gr.Markdown(leaderboard_description)
    with gr.Tab(label="Leaderboard"):
        dataframe = gr.Dataframe(df)
        with gr.Accordion("Barplot", open=False):
            barplot = gr.BarPlot(
                df,
                x="📛 Models",
                y="➕ Total Number of Tokens",
                x_title=" ",
                y_title=" ",
                width=1000,
                height=400,
                tooltip=["📘 Vocab Size", "🪺 Fertility Score"],
                vertical=False,
                x_label_angle=30,
            )
        model_name = gr.Textbox(
            label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
        )
        with gr.Row():
            submit_new_model_btn = gr.Button(
                value="Submit New Model", variant="primary", scale=3
            )
            refresh_btn = gr.Button(value="Refresh", variant="secondary", scale=1)
    with gr.Tab(label="Try tokenizers"):
        text = gr.Textbox(
            label="Enter a text",
            lines=5,
            value="السلام عليكم ورحمة الله",
            rtl=True,
            text_align="right",
        )
        dropdown = gr.Dropdown(
            label="Select a model",
            choices=df["📛 Models"].tolist(),
            value=df["📛 Models"].tolist()[0],
        )
        with gr.Row():
            submit_text_btn = gr.Button(value="Submit", variant="primary", scale=3)
            checkbox = gr.Checkbox(
                label="Better tokenization for Arabic Text", value=False, scale=1
            )
        tokenized_textbox = gr.HighlightedText(label="Tokenized text")

    submit_new_model_btn.click(
        submit, model_name, outputs=[dataframe, barplot, dropdown]
    )
    refresh_btn.click(refresh, outputs=[dataframe, barplot, dropdown])
    submit_text_btn.click(
        tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox]
    )


demo.launch()