{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.997867803837953, "eval_steps": 500, "global_step": 234, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.024216890335083, "logits/rejected": -1.8819010257720947, "logps/chosen": -1305.559326171875, "logps/rejected": -3790.57275390625, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.04, "learning_rate": 2.0833333333333334e-06, "logits/chosen": -2.2936673164367676, "logits/rejected": -2.1570396423339844, "logps/chosen": -1376.771728515625, "logps/rejected": -3041.7578125, "loss": 0.4966, "rewards/accuracies": 0.4305555522441864, "rewards/chosen": -0.037909653037786484, "rewards/margins": 0.009223480708897114, "rewards/rejected": -0.04713314026594162, "step": 10 }, { "epoch": 0.09, "learning_rate": 4.166666666666667e-06, "logits/chosen": -2.1768686771392822, "logits/rejected": -2.084563732147217, "logps/chosen": -2079.7646484375, "logps/rejected": -3319.23583984375, "loss": 0.4897, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.7143774628639221, "rewards/margins": 0.04834098741412163, "rewards/rejected": -0.7627183794975281, "step": 20 }, { "epoch": 0.13, "learning_rate": 4.989935734988098e-06, "logits/chosen": -2.145012617111206, "logits/rejected": -2.027907609939575, "logps/chosen": -2782.030517578125, "logps/rejected": -4380.7275390625, "loss": 0.4871, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -1.3868815898895264, "rewards/margins": 0.28924745321273804, "rewards/rejected": -1.6761291027069092, "step": 30 }, { "epoch": 0.17, "learning_rate": 4.9287250957321685e-06, "logits/chosen": -2.2485415935516357, "logits/rejected": -2.1334125995635986, "logps/chosen": -2663.207763671875, "logps/rejected": -3975.64892578125, "loss": 0.4821, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1883825063705444, "rewards/margins": 0.22184336185455322, "rewards/rejected": -1.4102258682250977, "step": 40 }, { "epoch": 0.21, "learning_rate": 4.813260751184992e-06, "logits/chosen": -2.2985146045684814, "logits/rejected": -2.210111141204834, "logps/chosen": -1755.809326171875, "logps/rejected": -3258.091796875, "loss": 0.4806, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6678739190101624, "rewards/margins": 0.2041586935520172, "rewards/rejected": -0.872032642364502, "step": 50 }, { "epoch": 0.26, "learning_rate": 4.646121984004666e-06, "logits/chosen": -2.2362232208251953, "logits/rejected": -2.1845810413360596, "logps/chosen": -2074.88134765625, "logps/rejected": -3796.244873046875, "loss": 0.473, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.5578988790512085, "rewards/margins": 0.24681946635246277, "rewards/rejected": -0.8047183752059937, "step": 60 }, { "epoch": 0.3, "learning_rate": 4.431042398061499e-06, "logits/chosen": -2.382171392440796, "logits/rejected": -2.307375431060791, "logps/chosen": -1571.745361328125, "logps/rejected": -3834.587158203125, "loss": 0.4815, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.559153139591217, "rewards/margins": 0.5565303564071655, "rewards/rejected": -1.1156834363937378, "step": 70 }, { "epoch": 0.34, "learning_rate": 4.172826515897146e-06, "logits/chosen": -2.408031940460205, "logits/rejected": -2.3543269634246826, "logps/chosen": -2339.724609375, "logps/rejected": -3922.291748046875, "loss": 0.4755, "rewards/accuracies": 0.53125, "rewards/chosen": -1.0054388046264648, "rewards/margins": 0.39337557554244995, "rewards/rejected": -1.39881432056427, "step": 80 }, { "epoch": 0.38, "learning_rate": 3.8772424536302565e-06, "logits/chosen": -2.4260120391845703, "logits/rejected": -2.373760461807251, "logps/chosen": -2360.41552734375, "logps/rejected": -3782.758544921875, "loss": 0.4727, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.8136354684829712, "rewards/margins": 0.4064968228340149, "rewards/rejected": -1.2201323509216309, "step": 90 }, { "epoch": 0.43, "learning_rate": 3.5508930707739143e-06, "logits/chosen": -2.4029459953308105, "logits/rejected": -2.3526365756988525, "logps/chosen": -2014.8980712890625, "logps/rejected": -3683.920654296875, "loss": 0.4696, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.5975068211555481, "rewards/margins": 0.31760281324386597, "rewards/rejected": -0.9151096343994141, "step": 100 }, { "epoch": 0.47, "learning_rate": 3.201068473265007e-06, "logits/chosen": -2.452263355255127, "logits/rejected": -2.4388153553009033, "logps/chosen": -2102.702392578125, "logps/rejected": -4107.12255859375, "loss": 0.4725, "rewards/accuracies": 0.53125, "rewards/chosen": -0.7340173125267029, "rewards/margins": 0.48010140657424927, "rewards/rejected": -1.2141185998916626, "step": 110 }, { "epoch": 0.51, "learning_rate": 2.835583164544139e-06, "logits/chosen": -2.4622802734375, "logits/rejected": -2.453519344329834, "logps/chosen": -2598.88037109375, "logps/rejected": -4125.0986328125, "loss": 0.4738, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9378460049629211, "rewards/margins": 0.3462037444114685, "rewards/rejected": -1.2840497493743896, "step": 120 }, { "epoch": 0.55, "learning_rate": 2.4626014824618418e-06, "logits/chosen": -2.5731091499328613, "logits/rejected": -2.559770107269287, "logps/chosen": -2807.160400390625, "logps/rejected": -4325.0205078125, "loss": 0.4802, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1455607414245605, "rewards/margins": 0.6021825075149536, "rewards/rejected": -1.7477432489395142, "step": 130 }, { "epoch": 0.6, "learning_rate": 2.090455221462156e-06, "logits/chosen": -2.468670606613159, "logits/rejected": -2.4780094623565674, "logps/chosen": -2137.4091796875, "logps/rejected": -4046.2734375, "loss": 0.4714, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8530462980270386, "rewards/margins": 0.3694917857646942, "rewards/rejected": -1.2225382328033447, "step": 140 }, { "epoch": 0.64, "learning_rate": 1.7274575140626318e-06, "logits/chosen": -2.4550023078918457, "logits/rejected": -2.4524893760681152, "logps/chosen": -1734.0328369140625, "logps/rejected": -3453.727783203125, "loss": 0.4732, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5440791845321655, "rewards/margins": 0.4087960124015808, "rewards/rejected": -0.9528751373291016, "step": 150 }, { "epoch": 0.68, "learning_rate": 1.3817171292109182e-06, "logits/chosen": -2.520023822784424, "logits/rejected": -2.549712657928467, "logps/chosen": -2198.69189453125, "logps/rejected": -3628.389892578125, "loss": 0.4758, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.6990865468978882, "rewards/margins": 0.3725363612174988, "rewards/rejected": -1.0716229677200317, "step": 160 }, { "epoch": 0.72, "learning_rate": 1.0609573357858166e-06, "logits/chosen": -2.5392794609069824, "logits/rejected": -2.5447866916656494, "logps/chosen": -1648.3802490234375, "logps/rejected": -4229.2060546875, "loss": 0.4682, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7170668840408325, "rewards/margins": 0.6884183883666992, "rewards/rejected": -1.4054853916168213, "step": 170 }, { "epoch": 0.77, "learning_rate": 7.723433775328385e-07, "logits/chosen": -2.524719476699829, "logits/rejected": -2.5857484340667725, "logps/chosen": -2204.722900390625, "logps/rejected": -4786.68408203125, "loss": 0.47, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9566167593002319, "rewards/margins": 0.7360025644302368, "rewards/rejected": -1.6926193237304688, "step": 180 }, { "epoch": 0.81, "learning_rate": 5.223224133591475e-07, "logits/chosen": -2.5650665760040283, "logits/rejected": -2.5723259449005127, "logps/chosen": -2591.955322265625, "logps/rejected": -4333.18701171875, "loss": 0.4722, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.0086050033569336, "rewards/margins": 0.689775824546814, "rewards/rejected": -1.6983808279037476, "step": 190 }, { "epoch": 0.85, "learning_rate": 3.164794984571759e-07, "logits/chosen": -2.4675166606903076, "logits/rejected": -2.536062717437744, "logps/chosen": -2089.778564453125, "logps/rejected": -4025.212158203125, "loss": 0.474, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.8992789387702942, "rewards/margins": 0.5448096990585327, "rewards/rejected": -1.4440886974334717, "step": 200 }, { "epoch": 0.9, "learning_rate": 1.59412823400657e-07, "logits/chosen": -2.560384750366211, "logits/rejected": -2.6164653301239014, "logps/chosen": -2061.99658203125, "logps/rejected": -3734.251953125, "loss": 0.4744, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.9695846438407898, "rewards/margins": 0.20665684342384338, "rewards/rejected": -1.176241397857666, "step": 210 }, { "epoch": 0.94, "learning_rate": 5.463099816548578e-08, "logits/chosen": -2.4755778312683105, "logits/rejected": -2.5548205375671387, "logps/chosen": -1978.981689453125, "logps/rejected": -4176.359375, "loss": 0.4721, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8206374049186707, "rewards/margins": 0.5447834730148315, "rewards/rejected": -1.365420937538147, "step": 220 }, { "epoch": 0.98, "learning_rate": 4.474675580662113e-09, "logits/chosen": -2.485661745071411, "logits/rejected": -2.55751371383667, "logps/chosen": -2132.883056640625, "logps/rejected": -4433.28271484375, "loss": 0.4707, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.7833830714225769, "rewards/margins": 0.7396507263183594, "rewards/rejected": -1.523033857345581, "step": 230 }, { "epoch": 1.0, "step": 234, "total_flos": 0.0, "train_loss": 0.47607828460187995, "train_runtime": 5505.1583, "train_samples_per_second": 2.725, "train_steps_per_second": 0.043 } ], "logging_steps": 10, "max_steps": 234, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }