import inspect
import warnings
from dataclasses import FrozenInstanceError, replace
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import torch
import torch.nn as nn
from datasets import Dataset
from transformers import DataCollator, PreTrainedModel, PreTrainedTokenizerBase, Trainer, TrainingArguments
from transformers.trainer_callback import TrainerCallback
from transformers.trainer_pt_utils import nested_detach
from transformers.trainer_utils import EvalPrediction

from ..import_utils import is_peft_available
from .reward_config import RewardConfig
from .utils import RewardDataCollatorWithPadding, compute_accuracy


if is_peft_available():
    from peft import PeftModel, get_peft_model, prepare_model_for_kbit_training


class RewardTrainer(Trainer):
    r"""
    The RewardTrainer can be used to train your custom Reward Model. It is a subclass of the
    `transformers.Trainer` class and inherits all of its attributes and methods. It is recommended to use
    an `AutoModelForSequenceClassification` as the reward model. The reward model should be trained on a
    dataset of paired examples, where each example is a tuple of two sequences. The reward model should be
    trained to predict which sequence in the pair is more relevant to the task at hand.

    The reward trainer expects a very specific format for the dataset. The dataset should contain at least
    the following 4 entries if you use the default `RewardDataCollatorWithPadding` data collator:

    - `input_ids_chosen`
    - `attention_mask_chosen`
    - `input_ids_rejected`
    - `attention_mask_rejected`

    Optionally, you can also pass a `margin` entry to the dataset. This entry should contain the margin
    used to modulate the loss of the reward model as outlined in
    https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/.
    If you don't pass a margin, no margin will be used.
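
    Example (a minimal sketch; the model name, texts, and the `tokenize_pair` helper below are
    illustrative assumptions, not part of the trainer API):

    ```python
    from datasets import Dataset
    from transformers import AutoModelForSequenceClassification, AutoTokenizer
    from trl import RewardConfig, RewardTrainer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=1)
    model.config.pad_token_id = tokenizer.pad_token_id

    def tokenize_pair(example):
        # one preferred ("chosen") and one dispreferred ("rejected") completion per row
        chosen = tokenizer(example["chosen"], truncation=True)
        rejected = tokenizer(example["rejected"], truncation=True)
        return {
            "input_ids_chosen": chosen["input_ids"],
            "attention_mask_chosen": chosen["attention_mask"],
            "input_ids_rejected": rejected["input_ids"],
            "attention_mask_rejected": rejected["attention_mask"],
        }

    raw = Dataset.from_dict(
        {"chosen": ["A clear, correct answer."], "rejected": ["An evasive answer."]}
    )
    train_dataset = raw.map(tokenize_pair)

    trainer = RewardTrainer(
        model=model,
        args=RewardConfig(output_dir="reward-model", max_length=512),
        tokenizer=tokenizer,
        train_dataset=train_dataset,
    )
    trainer.train()
    ```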
    """

    def __init__(
        self,
        model: Union[PreTrainedModel, nn.Module] = None,
        args: Optional[RewardConfig] = None,
        data_collator: Optional[DataCollator] = None,
        train_dataset: Optional[Dataset] = None,
        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
        tokenizer: Optional[PreTrainedTokenizerBase] = None,
        model_init: Optional[Callable[[], PreTrainedModel]] = None,
        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
        callbacks: Optional[List[TrainerCallback]] = None,
        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
        preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
        max_length: Optional[int] = None,
        peft_config: Optional[Dict] = None,
    ):
        """
        Initialize RewardTrainer.

        Args:
            model (`transformers.PreTrainedModel`):
                The model to train, preferably an `AutoModelForSequenceClassification`.
            args (`RewardConfig`):
                The arguments to use for training.
            data_collator (`transformers.DataCollator`):
                The data collator to use for training. If None is specified, the default data collator
                (`RewardDataCollatorWithPadding`) will be used, which will pad the sequences to the maximum
                length of the sequences in the batch, given a dataset of paired sequences.
            train_dataset (`datasets.Dataset`):
                The dataset to use for training.
            eval_dataset (`datasets.Dataset`):
                The dataset to use for evaluation.
            tokenizer (`transformers.PreTrainedTokenizerBase`):
                The tokenizer to use for training. This argument is required if you want to use the default data collator.
            model_init (`Callable[[], transformers.PreTrainedModel]`):
                The model initializer to use for training. If None is specified, the default model initializer will be used.
            compute_metrics (`Callable[[transformers.EvalPrediction], Dict]`, *optional*, defaults to `compute_accuracy`):
                The metrics to use for evaluation. If no metrics are specified, the default metric (`compute_accuracy`) will be used.
            callbacks (`List[transformers.TrainerCallback]`):
                The callbacks to use for training.
            optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
                The optimizer and scheduler to use for training.
            preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
                The function to use to preprocess the logits before computing the metrics.
            max_length (`int`, defaults to `None`):
                The maximum length of the sequences in the batch. This argument is required if you want to use the
                default data collator. Deprecated in favor of setting `max_length` on the `RewardConfig`.
            peft_config (`Dict`, defaults to `None`):
                The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in a PEFT model.
        """
        if type(args) is TrainingArguments:
            warnings.warn(
                "Using `transformers.TrainingArguments` for `args` is deprecated and will be removed in a future version. Please use `RewardConfig` instead.",
                FutureWarning,
            )
            if max_length is not None:
                warnings.warn(
                    "The `max_length` argument is deprecated and will be removed in a future version. Please use the `RewardConfig` to set `max_length` instead.",
                    FutureWarning,
                )
        else:
            if max_length is not None and args.max_length is not None:
                raise ValueError(
                    "You cannot specify both `max_length` and `args.max_length`. Please use the `RewardConfig` to set `max_length` once."
                )

            if max_length is not None and args.max_length is None:
                warnings.warn(
                    "The `max_length` argument is deprecated and will be removed in a future version. Please use the `RewardConfig` to set `max_length` instead.",
                    FutureWarning,
                )

        if not is_peft_available() and peft_config is not None:
            raise ValueError(
                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models"
            )
        elif is_peft_available() and peft_config is not None:
            if not isinstance(model, PeftModel):
                if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_quantized", False):
                    # newer peft versions accept gradient-checkpointing kwargs in
                    # `prepare_model_for_kbit_training`; probe the signature to stay backward compatible
                    _supports_gc_kwargs = "gradient_checkpointing_kwargs" in list(
                        inspect.signature(prepare_model_for_kbit_training).parameters
                    )

                    prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing}

                    if not _supports_gc_kwargs and args.gradient_checkpointing_kwargs is not None:
                        warnings.warn(
                            "You passed `gradient_checkpointing_kwargs` in the trainer's kwargs, but your peft version does not support it. "
                            "Please update to the latest version of peft to use `gradient_checkpointing_kwargs`."
                        )
                    elif _supports_gc_kwargs and args.gradient_checkpointing_kwargs is not None:
                        prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs

                    model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)

                model = get_peft_model(model, peft_config)

        if compute_metrics is None:
            compute_metrics = compute_accuracy

        if data_collator is None:
            if tokenizer is None:
                raise ValueError(
                    "A tokenizer must be specified when using the default RewardDataCollatorWithPadding"
                )
            if type(args) is TrainingArguments:
                if max_length is None:
                    warnings.warn(
                        "When using RewardDataCollatorWithPadding, you should set `max_length` in RewardConfig."
                        " It will be set to `512` by default, but you should do it yourself in the future.",
                        UserWarning,
                    )
                    max_length = 512
            else:
                if max_length is None and args.max_length is None:
                    warnings.warn(
                        "When using RewardDataCollatorWithPadding, you should set `max_length` in RewardConfig."
                        " It will be set to `512` by default, but you should do it yourself in the future.",
                        UserWarning,
                    )
                    max_length = 512
                if max_length is None and args.max_length is not None:
                    max_length = args.max_length

            data_collator = RewardDataCollatorWithPadding(tokenizer, max_length=max_length)

            if args.remove_unused_columns:
                try:
                    # `args` may be a frozen dataclass, in which case attribute assignment raises
                    args.remove_unused_columns = False
                except FrozenInstanceError:
                    args = replace(args, remove_unused_columns=False)

                warnings.warn(
                    "When using RewardDataCollatorWithPadding, you should set `remove_unused_columns=False` in your RewardConfig;"
                    " we have set it for you, but you should do it yourself in the future.",
                    UserWarning,
                )

            self.use_reward_data_collator = True
        else:
            self.use_reward_data_collator = False
        super().__init__(
            model,
            args,
            data_collator,
            train_dataset,
            eval_dataset,
            tokenizer,
            model_init,
            compute_metrics,
            callbacks,
            optimizers,
            preprocess_logits_for_metrics,
        )

    def compute_loss(
        self,
        model: Union[PreTrainedModel, nn.Module],
        inputs: Dict[str, Union[torch.Tensor, Any]],
        return_outputs=False,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, Dict[str, torch.Tensor]]]:
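        """
        Compute the pairwise reward-modeling loss. Writing `r_c` and `r_r` for the reward logits of
        the chosen and rejected sequences, the loss is `-log(sigmoid(r_c - r_r - m))` when the batch
        provides a `margin` `m`, and `-log(sigmoid(r_c - r_r))` otherwise.
        """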
        if not self.use_reward_data_collator:
            warnings.warn(
                "The current compute_loss is implemented for RewardDataCollatorWithPadding;"
                " if you are using a custom data collator, make sure you know what you are doing or"
                " implement your own compute_loss method."
            )
        rewards_chosen = model(
            input_ids=inputs["input_ids_chosen"],
            attention_mask=inputs["attention_mask_chosen"],
            return_dict=True,
        )["logits"]
        rewards_rejected = model(
            input_ids=inputs["input_ids_rejected"],
            attention_mask=inputs["attention_mask_rejected"],
            return_dict=True,
        )["logits"]

        if "margin" in inputs:
            loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected - inputs["margin"]).mean()
        else:
            loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected).mean()

        if return_outputs:
            return loss, {
                "rewards_chosen": rewards_chosen,
                "rewards_rejected": rewards_rejected,
            }
        return loss

    def prediction_step(
        self,
        model: Union[PreTrainedModel, nn.Module],
        inputs: Dict[str, Union[torch.Tensor, Any]],
        prediction_loss_only: bool,
        ignore_keys: Optional[List[str]] = None,
    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
        inputs = self._prepare_inputs(inputs)
        if ignore_keys is None:
            if hasattr(self.model, "config"):
                ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
            else:
                ignore_keys = []

        with torch.no_grad():
            loss, logits_dict = self.compute_loss(model, inputs, return_outputs=True)

        if prediction_loss_only:
            return (loss, None, None)

        loss = loss.detach()
        logits = tuple(v for k, v in logits_dict.items() if k not in ignore_keys)
        logits = nested_detach(logits)

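        # For an `AutoModelForSequenceClassification` reward head, the two entries of `logits` are
        # the chosen and rejected reward tensors, each of shape (batch_size, num_labels). Stacking
        # gives (2, batch_size, num_labels); the mean over dim=2 drops the logit dimension, the
        # softmax over dim=0 normalizes across the pair, and the transpose yields (batch_size, 2)
        # preference probabilities with the chosen sequence in column 0.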
        logits = torch.stack(logits).mean(dim=2).softmax(dim=0).T

        # zero labels mark the chosen sequence (column 0) as the correct class, which is what the
        # default `compute_accuracy` metric scores
        labels = torch.zeros(logits.shape[0])
        labels = self._prepare_inputs(labels)

        return loss, logits, labels