from datasets import load_dataset
from unsloth.chat_templates import get_chat_template


def load_data(dataset, tokenizer, samples=None):
    """Load a ShareGPT-format dataset and render each conversation as ChatML text."""
    print("Loading finetuning dataset.")

    # Base models don't ship with a chat template, so we can choose any - ChatML is popular.
    # The mapping translates ShareGPT keys ("from"/"value", "human"/"gpt") into the
    # standard "role"/"content" schema that apply_chat_template expects.
    tokenizer = get_chat_template(
        tokenizer,
        mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
        chat_template="chatml",
    )

    def apply_template(examples):
        # Render each ShareGPT-style conversation into a single ChatML-formatted string.
        messages = examples["conversations"]
        text = [
            tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False)
            for message in messages
        ]
        return {"text": text}

    if samples is not None:
        # Reduce the training load by keeping only the first `samples` examples.
        dataset = load_dataset(dataset, split=f"train[:{int(samples)}]")
    else:
        dataset = load_dataset(dataset, split="train")

    return dataset.map(apply_template, batched=True)
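

# A minimal usage sketch, not part of the original file: it assumes Unsloth's
# FastLanguageModel.from_pretrained to obtain a tokenizer, and uses the base model
# "unsloth/Llama-3.2-3B" and the ShareGPT-format dataset "mlabonne/FineTome-100k"
# purely as illustrative choices - swap in whatever model and dataset you are using.
if __name__ == "__main__":
    from unsloth import FastLanguageModel

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Llama-3.2-3B",  # hypothetical base model choice
        max_seq_length=2048,
        load_in_4bit=True,
    )
    dataset = load_data("mlabonne/FineTome-100k", tokenizer, samples=1000)
    print(dataset[0]["text"])  # inspect one ChatML-formatted training example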