# LLMTesting/data/fine_tune_dataset.py
# import torch
# from trl import SFTTrainer
from datasets import load_dataset
# from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
# from unsloth import FastLanguageModel, is_bfloat16_supported

def load_data(dataset, tokenizer, samples=None):
    print("Loading finetuning dataset.")

    # Base models don't ship with a chat template, so we can choose any - ChatML is popular.
    tokenizer = get_chat_template(
        tokenizer,
        mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
        chat_template="chatml",
    )

    def apply_template(examples):
        # Render each ShareGPT-format conversation into a single "text" string.
        messages = examples["conversations"]
        text = [tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False) for message in messages]
        return {"text": text}

    if samples is not None:
        # Reduce the training load by only training on a subset of the split.
        dataset = load_dataset(dataset, split=f"train[:{int(samples)}]")
    else:
        dataset = load_dataset(dataset, split="train")
    return dataset.map(apply_template, batched=True)
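

# --- Usage sketch (not part of the original file). The model and dataset names
# below are illustrative assumptions, not values taken from this repo; any
# ShareGPT-format dataset with a "conversations" column should work.
if __name__ == "__main__":
    from unsloth import FastLanguageModel

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/llama-3-8b-bnb-4bit",  # assumed base model
        max_seq_length=2048,
        load_in_4bit=True,
    )
    dataset = load_data("mlabonne/FineTome-100k", tokenizer, samples=1000)
    # Each row now carries a ChatML-rendered string, e.g.
    # <|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n...<|im_end|>
    print(dataset[0]["text"])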