{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 3.125e-08, "logits/chosen": -1.4567933082580566, "logits/rejected": -0.871229887008667, "logps/chosen": -244.365234375, "logps/rejected": -212.26486206054688, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -1.83387291431427, "logits/rejected": -1.0804697275161743, "logps/chosen": -206.00912475585938, "logps/rejected": -202.784912109375, "loss": 0.6817, "rewards/accuracies": 0.5486111044883728, "rewards/chosen": -0.039022047072649, "rewards/margins": 0.04178649187088013, "rewards/rejected": -0.08080853521823883, "step": 10 }, { "epoch": 0.13, "learning_rate": 4.989935734988097e-07, "logits/chosen": -1.0675297975540161, "logits/rejected": -0.5359733700752258, "logps/chosen": -237.27444458007812, "logps/rejected": -251.00753784179688, "loss": 0.6561, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.7787758111953735, "rewards/margins": 0.11565746366977692, "rewards/rejected": -0.8944332003593445, "step": 20 }, { "epoch": 0.19, "learning_rate": 4.877641290737883e-07, "logits/chosen": -1.095473289489746, "logits/rejected": -0.37094515562057495, "logps/chosen": -244.32162475585938, "logps/rejected": -296.1733703613281, "loss": 0.5953, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.6706979870796204, "rewards/margins": 0.5164287090301514, "rewards/rejected": -1.187126636505127, "step": 30 }, { "epoch": 0.26, "learning_rate": 4.646121984004665e-07, "logits/chosen": -0.8634458780288696, "logits/rejected": 0.12595783174037933, "logps/chosen": -242.0459442138672, "logps/rejected": -296.41595458984375, "loss": 0.5648, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5677449703216553, "rewards/margins": 0.5976042747497559, "rewards/rejected": -1.1653492450714111, "step": 40 }, { "epoch": 0.32, "learning_rate": 4.3069871595684787e-07, "logits/chosen": -0.6954927444458008, "logits/rejected": 0.03154268115758896, "logps/chosen": -246.68258666992188, "logps/rejected": -295.62884521484375, "loss": 0.5913, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8166979551315308, "rewards/margins": 0.5098680257797241, "rewards/rejected": -1.3265659809112549, "step": 50 }, { "epoch": 0.38, "learning_rate": 3.877242453630256e-07, "logits/chosen": -0.768271267414093, "logits/rejected": 0.022685179486870766, "logps/chosen": -245.92782592773438, "logps/rejected": -300.2510681152344, "loss": 0.5887, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.7170382738113403, "rewards/margins": 0.5133967399597168, "rewards/rejected": -1.2304350137710571, "step": 60 }, { "epoch": 0.45, "learning_rate": 3.378437060203357e-07, "logits/chosen": -0.5168389081954956, "logits/rejected": 0.45852264761924744, "logps/chosen": -256.852294921875, "logps/rejected": -309.4953308105469, "loss": 0.5836, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9044780731201172, "rewards/margins": 0.5655065178871155, "rewards/rejected": -1.4699846506118774, "step": 70 }, { "epoch": 0.51, "learning_rate": 2.8355831645441387e-07, "logits/chosen": -0.3654092848300934, "logits/rejected": 0.10795004665851593, "logps/chosen": -251.9696502685547, "logps/rejected": -292.9334716796875, "loss": 0.5522, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7978931069374084, "rewards/margins": 0.38232654333114624, "rewards/rejected": -1.1802196502685547, "step": 80 }, { "epoch": 0.58, "learning_rate": 2.2759017277414164e-07, "logits/chosen": -0.8108726739883423, "logits/rejected": 0.14660978317260742, "logps/chosen": -273.36419677734375, "logps/rejected": -320.58209228515625, "loss": 0.5671, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.6586915254592896, "rewards/margins": 0.6224299669265747, "rewards/rejected": -1.2811213731765747, "step": 90 }, { "epoch": 0.64, "learning_rate": 1.7274575140626315e-07, "logits/chosen": -0.46979203820228577, "logits/rejected": 0.5494852066040039, "logps/chosen": -272.69427490234375, "logps/rejected": -317.7990417480469, "loss": 0.5547, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9002830386161804, "rewards/margins": 0.5319327116012573, "rewards/rejected": -1.432215690612793, "step": 100 }, { "epoch": 0.7, "learning_rate": 1.2177518064852348e-07, "logits/chosen": -0.3219306170940399, "logits/rejected": 0.26910799741744995, "logps/chosen": -251.5453338623047, "logps/rejected": -299.8834533691406, "loss": 0.56, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7972058057785034, "rewards/margins": 0.43246564269065857, "rewards/rejected": -1.2296714782714844, "step": 110 }, { "epoch": 0.77, "learning_rate": 7.723433775328384e-08, "logits/chosen": -0.37325382232666016, "logits/rejected": 0.5774334669113159, "logps/chosen": -233.79562377929688, "logps/rejected": -328.5582580566406, "loss": 0.5585, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6402639150619507, "rewards/margins": 0.7515830397605896, "rewards/rejected": -1.3918468952178955, "step": 120 }, { "epoch": 0.83, "learning_rate": 4.1356686569674335e-08, "logits/chosen": -0.3119003176689148, "logits/rejected": 0.8427650332450867, "logps/chosen": -233.98971557617188, "logps/rejected": -324.93316650390625, "loss": 0.5265, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7347938418388367, "rewards/margins": 0.7224765419960022, "rewards/rejected": -1.4572702646255493, "step": 130 }, { "epoch": 0.9, "learning_rate": 1.5941282340065697e-08, "logits/chosen": -0.20903070271015167, "logits/rejected": 0.7928945422172546, "logps/chosen": -274.28704833984375, "logps/rejected": -331.6188049316406, "loss": 0.5484, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9145911931991577, "rewards/margins": 0.5992218255996704, "rewards/rejected": -1.5138130187988281, "step": 140 }, { "epoch": 0.96, "learning_rate": 2.2625595580163247e-09, "logits/chosen": 0.10685434192419052, "logits/rejected": 0.766906201839447, "logps/chosen": -257.482666015625, "logps/rejected": -326.8499450683594, "loss": 0.5539, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9127100706100464, "rewards/margins": 0.6432833075523376, "rewards/rejected": -1.5559935569763184, "step": 150 }, { "epoch": 1.0, "step": 156, "total_flos": 0.0, "train_loss": 0.5776262069359804, "train_runtime": 9474.0276, "train_samples_per_second": 2.111, "train_steps_per_second": 0.016 } ], "logging_steps": 10, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }