File size: 1,215 Bytes
c41146d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# import torch
# from trl import SFTTrainer
from datasets import load_dataset
# from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
# from unsloth import FastLanguageModel, is_bfloat16_supported


def load_data(dataset, tokenizer, samples=None):
    """Load a ShareGPT-style dataset and render each conversation to plain text.

    The tokenizer is given a ChatML template (base models ship without one),
    then every conversation in the dataset is flattened into a single string
    stored under a new ``"text"`` column.

    Args:
        dataset: Hugging Face dataset identifier passed to ``load_dataset``.
        tokenizer: Tokenizer to attach the ChatML chat template to.
        samples: Optional cap on the number of training rows; ``None`` loads
            the full ``train`` split.

    Returns:
        The mapped dataset with a ``"text"`` column of templated conversations.
    """
    print("Loading finetuning dataset.")

    # Base models don't have chat templates so we can choose any - ChatML is popular.
    # The mapping translates ShareGPT field names (from/value, human/gpt)
    # into the role/content schema the template expects.
    tokenizer = get_chat_template(
        tokenizer,
        mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
        chat_template="chatml",
    )

    # Restrict to a prefix of the train split when a sample cap was requested;
    # this reduces the training load for quick experiments.
    split = "train" if samples is None else f"train[:{int(samples)}]"
    dataset = load_dataset(dataset, split=split)

    def render_batch(batch):
        # Flatten each ShareGPT conversation into one templated string.
        conversations = batch["conversations"]
        rendered = [
            tokenizer.apply_chat_template(conv, tokenize=False, add_generation_prompt=False)
            for conv in conversations
        ]
        return {"text": rendered}

    return dataset.map(render_batch, batched=True)