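"""Gradio app that estimates how many training steps are needed for one full pass
over a (possibly streaming) Hugging Face dataset, given a tokenizer, block size,
and distributed training configuration."""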
from math import ceil

import gradio as gr
from datasets import load_dataset, IterableDataset
from transformers import AutoTokenizer, PreTrainedTokenizer


def count_tokens(batch, tokenizer, text_column):
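    """Tokenize a batch of texts and return the number of tokens in each example."""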
    encoded = tokenizer(batch[text_column])
    return {"num_tokens": [len(input_ids) for input_ids in encoded["input_ids"]]}


def get_dataset_num_tokens(
    dataset: IterableDataset, tokenizer: PreTrainedTokenizer, text_column: str, progress=gr.Progress()
) -> int:
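    """Stream through the dataset once, summing per-example token counts while reporting progress."""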
    progress((0, None), desc="Counting tokens", unit="tokens")
    ds = dataset.map(
        count_tokens, batched=True, batch_size=1000, fn_kwargs={"tokenizer": tokenizer, "text_column": text_column}
    )

    total_num_tokens = 0
    for sample in ds:
        total_num_tokens += sample["num_tokens"]
        progress((total_num_tokens, None), desc="Counting tokens", unit="tokens")

    return total_num_tokens


def calculate_steps(
    dataset_name: str,
    dataset_split: str,
    dataset_config: str | None,
    tokenizer_name: str,
    num_gpus_per_node: int,
    num_nodes: int,
    batch_size: int,
    grad_accum: int,
    block_size: int,
    text_column: str = "text",
    token: str | None = None,
):
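    """Count the tokens in the dataset and return the number of block-sized samples
    and the number of training steps needed for one pass over them."""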
    # Treat empty or whitespace-only optional inputs as "not provided".
    dataset_config = (dataset_config or "").strip() or None
    text_column = (text_column or "").strip() or "text"
    token = (token or "").strip() or None
    try:
        dataset = load_dataset(dataset_name, dataset_config, streaming=True, token=token, split=dataset_split)
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, token=token)
        total_num_tokens = get_dataset_num_tokens(dataset, tokenizer, text_column)
    except Exception as exc:
        raise gr.Error(str(exc)) from exc
    else:
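        # Pack the token stream into block_size-length samples, then divide by the
        # effective global batch size (GPUs per node x nodes x batch size x grad accumulation).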
        dataset_size = ceil(total_num_tokens / block_size)
        world_size = num_gpus_per_node * num_nodes
        num_steps = ceil(dataset_size / (world_size * batch_size * grad_accum))
        return dataset_size, num_steps


with gr.Blocks() as demo:
    gr.Markdown(
        """# Steps Calculator
    
Calculate the number of steps required to run through your whole dataset with a given sequence length. This is \
especially useful when training with a streaming dataset and you're not sure how many steps you need to run through \
the dataset with a given tokenizer and block size."""
    )

    with gr.Row():
        dataset_name = gr.Text(label="Dataset name")
        dataset_split = gr.Text(label="Dataset split", value="train")
        dataset_config = gr.Text(label="Dataset config (optional)")
        tokenizer_name = gr.Text(label="Tokenizer name")

    with gr.Row():
        num_gpus_per_node = gr.Number(value=1, minimum=1, precision=0, label="Number of GPUs per node")
        num_nodes = gr.Number(value=1, minimum=1, precision=0, label="Number of nodes")
        batch_size = gr.Number(value=8, minimum=1, precision=0, label="Batch size")
        grad_accum = gr.Number(value=1, minimum=1, precision=0, label="Gradient accumulation steps")
        block_size = gr.Number(value=2048, minimum=1, precision=0, label="Block size")
        text_column = gr.Text(value="text", label="Text column")
        token = gr.Text(label="HF acces token (optional)")

    with gr.Row():
        with gr.Column():
            calculate_btn = gr.Button(value="Calculate")
        with gr.Column():
            samples = gr.Number(value=None, minimum=1, label="Total block-sized samples", interactive=False)
            steps = gr.Number(value=None, minimum=1, label="Total steps needed", interactive=False)

    calculate_btn.click(
        calculate_steps,
        inputs=[
            dataset_name,
            dataset_split,
            dataset_config,
            tokenizer_name,
            num_gpus_per_node,
            num_nodes,
            batch_size,
            grad_accum,
            block_size,
            text_column,
            token,
        ],
        outputs=[samples, steps],
        api_name="calculate-training-steps",
    )


if __name__ == "__main__":
    demo.queue().launch()