# Gradio Space: training-steps calculator app.
from math import ceil | |
import gradio as gr | |
from datasets import load_dataset, IterableDataset | |
from transformers import AutoTokenizer, PreTrainedTokenizer | |
def count_tokens(batch, tokenizer, text_column):
    """Tokenize a batch of texts and emit a per-example token count.

    Meant for `datasets.map(batched=True)`: takes a dict of columns and
    returns a new "num_tokens" column aligned with the input batch.
    """
    token_ids = tokenizer(batch[text_column])["input_ids"]
    return {"num_tokens": [len(ids) for ids in token_ids]}
def get_dataset_num_tokens(
    dataset: IterableDataset, tokenizer: PreTrainedTokenizer, text_column: str, progress=gr.Progress()
) -> int:
    """Stream through *dataset* and return the total number of tokens in *text_column*.

    Progress is pushed to the Gradio UI after every consumed sample; the upper
    bound is `None` because a streaming dataset's length is unknown up front.
    """
    progress((0, None), desc="Counting tokens", unit="tokens")
    counted = dataset.map(
        count_tokens, batched=True, batch_size=1000, fn_kwargs={"tokenizer": tokenizer, "text_column": text_column}
    )
    running_total = 0
    for example in counted:
        running_total += example["num_tokens"]
        progress((running_total, None), desc="Counting tokens", unit="tokens")
    return running_total
def calculate_steps(
    dataset_name: str,
    dataset_split: str,
    dataset_config: str | None,
    tokenizer_name: str,
    num_gpus_per_node: int,
    num_nodes: int,
    batch_size: int,
    grad_accum: int,
    block_size: int,
    text_column: str = "text",
    token: str | None = None,
):
    """Compute how many block-sized samples and optimizer steps one epoch needs.

    Loads the dataset in streaming mode, counts its tokens with the given
    tokenizer, then divides by the block size and the effective global batch
    size (GPUs per node * nodes * per-device batch * grad accumulation).

    Returns:
        (dataset_size, num_steps): the number of block-sized samples, and the
        number of optimizer steps required to consume them all once.

    Raises:
        gr.Error: wraps any dataset/tokenizer loading or counting failure so
            the message surfaces in the Gradio UI.
    """
    # Treat None or whitespace-only text-box values as "not provided".
    # The previous unconditional `.strip()` calls raised AttributeError when a
    # None came through, despite the `str | None` annotations allowing it.
    dataset_config = None if not (dataset_config or "").strip() else dataset_config
    text_column = "text" if not (text_column or "").strip() else text_column
    token = None if not (token or "").strip() else token
    try:
        dataset = load_dataset(dataset_name, dataset_config, streaming=True, token=token, split=dataset_split)
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, token=token)
        total_num_tokens = get_dataset_num_tokens(dataset, tokenizer, text_column)
    except Exception as exc:
        # Chain the cause so server logs keep the original traceback.
        raise gr.Error(str(exc)) from exc
    else:
        dataset_size = ceil(total_num_tokens / block_size)  # block-sized samples
        world_size = num_gpus_per_node * num_nodes
        num_steps = ceil(dataset_size / (world_size * batch_size * grad_accum))
        return dataset_size, num_steps
# UI layout: inputs for dataset/tokenizer/parallelism settings, one button,
# and two read-only outputs for the computed sample and step counts.
with gr.Blocks() as demo:
    gr.Markdown(
        """# Steps Calculator

Calculate the number of steps required to run through your whole dataset with a given sequence length. This is \
especially useful when training with a streaming dataset and you're not sure how many steps you need to run through \
the dataset with a given tokenizer and block size."""
    )
    with gr.Row():
        dataset_name = gr.Text(label="Dataset name")
        dataset_split = gr.Text(label="Dataset split", value="train")
        dataset_config = gr.Text(label="Dataset config (optional)")
        tokenizer_name = gr.Text(label="Tokenizer name")
    with gr.Row():
        num_gpus_per_node = gr.Number(value=1, minimum=1, label="Number of GPUs per node")
        num_nodes = gr.Number(value=1, minimum=1, label="Number of nodes")
        batch_size = gr.Number(value=8, minimum=1, label="Batch size")
        grad_accum = gr.Number(value=1, minimum=1, label="Gradient accumulation steps")
        block_size = gr.Number(value=2048, minimum=1, label="Block size")
        text_column = gr.Text(value="text", label="Text column")
        # Fixed label typo: "acces" -> "access".
        token = gr.Text(label="HF access token (optional)")
    with gr.Row():
        with gr.Column():
            calculate_btn = gr.Button(value="Calculate")
        with gr.Column():
            # Outputs are non-interactive: filled in only by calculate_steps.
            samples = gr.Number(value=None, minimum=1, label="Total block-sized samples", interactive=False)
            steps = gr.Number(value=None, minimum=1, label="Total steps needed", interactive=False)
    calculate_btn.click(
        calculate_steps,
        inputs=[
            dataset_name,
            dataset_split,
            dataset_config,
            tokenizer_name,
            num_gpus_per_node,
            num_nodes,
            batch_size,
            grad_accum,
            block_size,
            text_column,
            token,
        ],
        outputs=[samples, steps],
        api_name="calculate-training-steps",
    )
if __name__ == "__main__":
    # Queueing must be enabled for the progress bar updates to stream to the UI.
    app = demo.queue()
    app.launch()