{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 100, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010468463752944255, "grad_norm": 7.355008386193469, "learning_rate": 2.083333333333333e-09, "logits/chosen": -7.783219814300537, "logits/rejected": -8.072843551635742, "logps/chosen": -335.8546142578125, "logps/rejected": -277.73626708984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.010468463752944255, "grad_norm": 6.230445884319703, "learning_rate": 2.0833333333333335e-08, "logits/chosen": -7.981190204620361, "logits/rejected": -8.130318641662598, "logps/chosen": -306.2852783203125, "logps/rejected": -309.1938781738281, "loss": 0.6928, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.0020241288002580404, "rewards/margins": 0.0018345804419368505, "rewards/rejected": 0.000189548620255664, "step": 10 }, { "epoch": 0.02093692750588851, "grad_norm": 6.688413560820125, "learning_rate": 4.166666666666667e-08, "logits/chosen": -8.508150100708008, "logits/rejected": -8.487831115722656, "logps/chosen": -342.08197021484375, "logps/rejected": -300.66973876953125, "loss": 0.6934, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0012120783794671297, "rewards/margins": -0.0002803016686812043, "rewards/rejected": 0.0014923801645636559, "step": 20 }, { "epoch": 0.031405391258832765, "grad_norm": 8.288905108120925, "learning_rate": 6.25e-08, "logits/chosen": -7.823553562164307, "logits/rejected": -8.030963897705078, "logps/chosen": -294.57769775390625, "logps/rejected": -290.71209716796875, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.001970288809388876, "rewards/margins": 0.0006792033091187477, "rewards/rejected": 0.0012910853838548064, "step": 30 }, { "epoch": 0.04187385501177702, "grad_norm": 6.7092850577908285, "learning_rate": 8.333333333333334e-08, "logits/chosen": -8.137880325317383, "logits/rejected": -8.182608604431152, "logps/chosen": -302.7641906738281, "logps/rejected": -278.7587890625, "loss": 0.6925, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.004604184068739414, "rewards/margins": 0.0005916848895139992, "rewards/rejected": 0.004012499004602432, "step": 40 }, { "epoch": 0.05234231876472128, "grad_norm": 11.714084906492422, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -8.555654525756836, "logits/rejected": -9.059080123901367, "logps/chosen": -318.3633728027344, "logps/rejected": -257.5683288574219, "loss": 0.6916, "rewards/accuracies": 0.5625, "rewards/chosen": 0.010140976868569851, "rewards/margins": 0.002551380079239607, "rewards/rejected": 0.007589596323668957, "step": 50 }, { "epoch": 0.06281078251766553, "grad_norm": 6.7230358924676015, "learning_rate": 1.25e-07, "logits/chosen": -8.04963493347168, "logits/rejected": -8.676374435424805, "logps/chosen": -334.97003173828125, "logps/rejected": -284.1954040527344, "loss": 0.6897, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0205511637032032, "rewards/margins": 0.006578472442924976, "rewards/rejected": 0.013972689397633076, "step": 60 }, { "epoch": 0.07327924627060979, "grad_norm": 6.439781455829167, "learning_rate": 1.4583333333333332e-07, "logits/chosen": -8.384071350097656, "logits/rejected": -8.226759910583496, "logps/chosen": -240.31106567382812, "logps/rejected": -258.47332763671875, "loss": 0.6867, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0298004187643528, "rewards/margins": 0.011175071820616722, "rewards/rejected": 0.018625345081090927, "step": 70 }, { "epoch": 0.08374771002355404, "grad_norm": 6.513071298610303, "learning_rate": 1.6666666666666668e-07, "logits/chosen": -8.502431869506836, "logits/rejected": -9.255155563354492, "logps/chosen": -296.0177307128906, "logps/rejected": -268.50347900390625, "loss": 0.6815, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.04522623121738434, "rewards/margins": 0.025061482563614845, "rewards/rejected": 0.020164750516414642, "step": 80 }, { "epoch": 0.0942161737764983, "grad_norm": 5.920042917298396, "learning_rate": 1.875e-07, "logits/chosen": -8.7146635055542, "logits/rejected": -8.876623153686523, "logps/chosen": -272.12420654296875, "logps/rejected": -255.40225219726562, "loss": 0.6782, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.06685945391654968, "rewards/margins": 0.04631630331277847, "rewards/rejected": 0.02054314874112606, "step": 90 }, { "epoch": 0.10468463752944256, "grad_norm": 7.4287436912349065, "learning_rate": 1.9998929970725745e-07, "logits/chosen": -8.944499969482422, "logits/rejected": -8.88210678100586, "logps/chosen": -293.67266845703125, "logps/rejected": -300.55419921875, "loss": 0.6677, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.049799270927906036, "rewards/margins": 0.04667241498827934, "rewards/rejected": 0.003126861061900854, "step": 100 }, { "epoch": 0.10468463752944256, "eval_logits/chosen": -9.278841972351074, "eval_logits/rejected": -9.52132511138916, "eval_logps/chosen": -311.630126953125, "eval_logps/rejected": -287.41925048828125, "eval_loss": 0.6651390194892883, "eval_rewards/accuracies": 0.6439999938011169, "eval_rewards/chosen": 0.026908237487077713, "eval_rewards/margins": 0.057010453194379807, "eval_rewards/rejected": -0.030102219432592392, "eval_runtime": 151.8713, "eval_samples_per_second": 13.169, "eval_steps_per_second": 0.823, "step": 100 }, { "epoch": 0.11515310128238682, "grad_norm": 7.059170038910516, "learning_rate": 1.9986894771071702e-07, "logits/chosen": -9.338754653930664, "logits/rejected": -9.311285972595215, "logps/chosen": -288.4460144042969, "logps/rejected": -247.97964477539062, "loss": 0.6636, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.021794503554701805, "rewards/margins": 0.10265706479549408, "rewards/rejected": -0.08086254447698593, "step": 110 }, { "epoch": 0.12562156503533106, "grad_norm": 8.852936631402919, "learning_rate": 1.996150298485439e-07, "logits/chosen": -10.604690551757812, "logits/rejected": -10.366097450256348, "logps/chosen": -265.26751708984375, "logps/rejected": -300.37530517578125, "loss": 0.6409, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.0008338313782587647, "rewards/margins": 0.0940166562795639, "rewards/rejected": -0.09485048055648804, "step": 120 }, { "epoch": 0.1360900287882753, "grad_norm": 15.573650791324633, "learning_rate": 1.9922788571337257e-07, "logits/chosen": -10.188752174377441, "logits/rejected": -10.191640853881836, "logps/chosen": -323.96685791015625, "logps/rejected": -297.19439697265625, "loss": 0.6304, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.006395213305950165, "rewards/margins": 0.13638103008270264, "rewards/rejected": -0.1427762359380722, "step": 130 }, { "epoch": 0.14655849254121958, "grad_norm": 20.046359115149162, "learning_rate": 1.9870803307616914e-07, "logits/chosen": -10.423749923706055, "logits/rejected": -10.360530853271484, "logps/chosen": -343.8540954589844, "logps/rejected": -346.18634033203125, "loss": 0.6211, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.14697681367397308, "rewards/margins": 0.1654496192932129, "rewards/rejected": -0.3124264180660248, "step": 140 }, { "epoch": 0.15702695629416383, "grad_norm": 7.638915011188106, "learning_rate": 1.9805616719375848e-07, "logits/chosen": -10.7134370803833, "logits/rejected": -11.019803047180176, "logps/chosen": -330.085205078125, "logps/rejected": -305.45208740234375, "loss": 0.6196, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.15285547077655792, "rewards/margins": 0.1812134087085724, "rewards/rejected": -0.3340689241886139, "step": 150 }, { "epoch": 0.16749542004710807, "grad_norm": 12.849690942872602, "learning_rate": 1.972731598789799e-07, "logits/chosen": -9.68244743347168, "logits/rejected": -10.297185897827148, "logps/chosen": -327.29144287109375, "logps/rejected": -285.8588562011719, "loss": 0.6086, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.04306762292981148, "rewards/margins": 0.2582402527332306, "rewards/rejected": -0.3013078570365906, "step": 160 }, { "epoch": 0.17796388380005235, "grad_norm": 32.698955102660705, "learning_rate": 1.9636005833471467e-07, "logits/chosen": -11.027162551879883, "logits/rejected": -11.326080322265625, "logps/chosen": -288.31817626953125, "logps/rejected": -291.74908447265625, "loss": 0.5927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3270031809806824, "rewards/margins": 0.15106120705604553, "rewards/rejected": -0.4780643582344055, "step": 170 }, { "epoch": 0.1884323475529966, "grad_norm": 11.54139813571567, "learning_rate": 1.9531808375334508e-07, "logits/chosen": -10.985331535339355, "logits/rejected": -11.428260803222656, "logps/chosen": -317.88580322265625, "logps/rejected": -319.74444580078125, "loss": 0.5856, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10588045418262482, "rewards/margins": 0.28938889503479004, "rewards/rejected": -0.39526933431625366, "step": 180 }, { "epoch": 0.19890081130594087, "grad_norm": 10.803006572264644, "learning_rate": 1.9414862968351785e-07, "logits/chosen": -11.358689308166504, "logits/rejected": -11.146299362182617, "logps/chosen": -284.0877380371094, "logps/rejected": -376.7961120605469, "loss": 0.5878, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08324204385280609, "rewards/margins": 0.37117457389831543, "rewards/rejected": -0.4544166624546051, "step": 190 }, { "epoch": 0.2093692750588851, "grad_norm": 11.193966860687885, "learning_rate": 1.9285326016639624e-07, "logits/chosen": -11.543092727661133, "logits/rejected": -11.964883804321289, "logps/chosen": -355.447998046875, "logps/rejected": -314.7781066894531, "loss": 0.5915, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1552293449640274, "rewards/margins": 0.35743778944015503, "rewards/rejected": -0.512667179107666, "step": 200 }, { "epoch": 0.2093692750588851, "eval_logits/chosen": -11.656082153320312, "eval_logits/rejected": -11.956182479858398, "eval_logps/chosen": -347.9349060058594, "eval_logps/rejected": -349.1275939941406, "eval_loss": 0.5919647812843323, "eval_rewards/accuracies": 0.6880000233650208, "eval_rewards/chosen": -0.33613964915275574, "eval_rewards/margins": 0.31104618310928345, "eval_rewards/rejected": -0.6471858620643616, "eval_runtime": 150.3287, "eval_samples_per_second": 13.304, "eval_steps_per_second": 0.832, "step": 200 }, { "epoch": 0.21983773881182936, "grad_norm": 25.282722057255278, "learning_rate": 1.914337076438937e-07, "logits/chosen": -11.216215133666992, "logits/rejected": -11.796686172485352, "logps/chosen": -339.0605163574219, "logps/rejected": -345.29901123046875, "loss": 0.6089, "rewards/accuracies": 0.6875, "rewards/chosen": -0.44959211349487305, "rewards/margins": 0.2086714208126068, "rewards/rejected": -0.6582635641098022, "step": 210 }, { "epoch": 0.23030620256477363, "grad_norm": 14.044802000055538, "learning_rate": 1.898918706416864e-07, "logits/chosen": -11.541923522949219, "logits/rejected": -12.298526763916016, "logps/chosen": -361.52947998046875, "logps/rejected": -334.96905517578125, "loss": 0.5912, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2677989602088928, "rewards/margins": 0.373500794172287, "rewards/rejected": -0.6412997245788574, "step": 220 }, { "epoch": 0.24077466631771788, "grad_norm": 12.45188359067346, "learning_rate": 1.882298112301034e-07, "logits/chosen": -11.14279556274414, "logits/rejected": -11.60063362121582, "logps/chosen": -281.5401916503906, "logps/rejected": -305.64166259765625, "loss": 0.5863, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.23669323325157166, "rewards/margins": 0.28197115659713745, "rewards/rejected": -0.5186644196510315, "step": 230 }, { "epoch": 0.2512431300706621, "grad_norm": 17.19152112354873, "learning_rate": 1.8644975226629022e-07, "logits/chosen": -11.584768295288086, "logits/rejected": -11.848301887512207, "logps/chosen": -325.7696838378906, "logps/rejected": -335.9078369140625, "loss": 0.5696, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2423885315656662, "rewards/margins": 0.3221975266933441, "rewards/rejected": -0.5645860433578491, "step": 240 }, { "epoch": 0.26171159382360637, "grad_norm": 14.325794995366559, "learning_rate": 1.8455407442133465e-07, "logits/chosen": -11.799114227294922, "logits/rejected": -12.152410507202148, "logps/chosen": -329.87677001953125, "logps/rejected": -324.18115234375, "loss": 0.5655, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.24216821789741516, "rewards/margins": 0.43084821105003357, "rewards/rejected": -0.6730164289474487, "step": 250 }, { "epoch": 0.2721800575765506, "grad_norm": 12.117583595266726, "learning_rate": 1.8254531299633004e-07, "logits/chosen": -12.106219291687012, "logits/rejected": -12.641697883605957, "logps/chosen": -344.15032958984375, "logps/rejected": -348.90673828125, "loss": 0.562, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2714720666408539, "rewards/margins": 0.39360731840133667, "rewards/rejected": -0.6650794148445129, "step": 260 }, { "epoch": 0.2826485213294949, "grad_norm": 17.931356253932698, "learning_rate": 1.8042615453163484e-07, "logits/chosen": -12.003366470336914, "logits/rejected": -12.557150840759277, "logps/chosen": -372.60821533203125, "logps/rejected": -349.703125, "loss": 0.5514, "rewards/accuracies": 0.6875, "rewards/chosen": -0.45285406708717346, "rewards/margins": 0.3556319773197174, "rewards/rejected": -0.8084859848022461, "step": 270 }, { "epoch": 0.29311698508243916, "grad_norm": 14.078329477472565, "learning_rate": 1.7819943321386296e-07, "logits/chosen": -12.080374717712402, "logits/rejected": -12.487339973449707, "logps/chosen": -364.81134033203125, "logps/rejected": -383.1141052246094, "loss": 0.5755, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.3204973638057709, "rewards/margins": 0.5626034140586853, "rewards/rejected": -0.883100688457489, "step": 280 }, { "epoch": 0.3035854488353834, "grad_norm": 16.797223533505605, "learning_rate": 1.7586812708541044e-07, "logits/chosen": -12.28768253326416, "logits/rejected": -13.17219066619873, "logps/chosen": -382.7824401855469, "logps/rejected": -368.29449462890625, "loss": 0.5672, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.40373754501342773, "rewards/margins": 0.4520534873008728, "rewards/rejected": -0.8557910919189453, "step": 290 }, { "epoch": 0.31405391258832765, "grad_norm": 17.74775759360231, "learning_rate": 1.7343535406158772e-07, "logits/chosen": -12.226592063903809, "logits/rejected": -12.483770370483398, "logps/chosen": -300.1335144042969, "logps/rejected": -350.2237854003906, "loss": 0.5723, "rewards/accuracies": 0.75, "rewards/chosen": -0.401310533285141, "rewards/margins": 0.4268767833709717, "rewards/rejected": -0.8281872868537903, "step": 300 }, { "epoch": 0.31405391258832765, "eval_logits/chosen": -12.452622413635254, "eval_logits/rejected": -12.687331199645996, "eval_logps/chosen": -353.8749084472656, "eval_logps/rejected": -363.39166259765625, "eval_loss": 0.5674170851707458, "eval_rewards/accuracies": 0.6880000233650208, "eval_rewards/chosen": -0.39553993940353394, "eval_rewards/margins": 0.39428621530532837, "eval_rewards/rejected": -0.7898260354995728, "eval_runtime": 150.1759, "eval_samples_per_second": 13.318, "eval_steps_per_second": 0.832, "step": 300 }, { "epoch": 0.3245223763412719, "grad_norm": 21.123290325111288, "learning_rate": 1.709043677606842e-07, "logits/chosen": -13.11596393585205, "logits/rejected": -13.485511779785156, "logps/chosen": -372.96563720703125, "logps/rejected": -374.05731201171875, "loss": 0.5502, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4754568040370941, "rewards/margins": 0.5059635043144226, "rewards/rejected": -0.9814203381538391, "step": 310 }, { "epoch": 0.33499084009421615, "grad_norm": 21.451163083722758, "learning_rate": 1.6827855315254218e-07, "logits/chosen": -12.301114082336426, "logits/rejected": -11.965790748596191, "logps/chosen": -310.0186462402344, "logps/rejected": -385.1772155761719, "loss": 0.5855, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6075664758682251, "rewards/margins": 0.4841720461845398, "rewards/rejected": -1.0917384624481201, "step": 320 }, { "epoch": 0.34545930384716045, "grad_norm": 18.33219470868773, "learning_rate": 1.6556142203145976e-07, "logits/chosen": -12.740918159484863, "logits/rejected": -13.098161697387695, "logps/chosen": -339.1952819824219, "logps/rejected": -363.0919189453125, "loss": 0.5684, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5905572175979614, "rewards/margins": 0.25887611508369446, "rewards/rejected": -0.8494332432746887, "step": 330 }, { "epoch": 0.3559277676001047, "grad_norm": 11.900265933520593, "learning_rate": 1.6275660831947723e-07, "logits/chosen": -11.391129493713379, "logits/rejected": -11.80584716796875, "logps/chosen": -294.5928955078125, "logps/rejected": -305.561767578125, "loss": 0.579, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.28244853019714355, "rewards/margins": 0.5087226629257202, "rewards/rejected": -0.7911711931228638, "step": 340 }, { "epoch": 0.36639623135304894, "grad_norm": 20.453314348602326, "learning_rate": 1.598678632063284e-07, "logits/chosen": -12.832880973815918, "logits/rejected": -13.420519828796387, "logps/chosen": -368.6810302734375, "logps/rejected": -389.3878479003906, "loss": 0.5505, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4756461977958679, "rewards/margins": 0.39826783537864685, "rewards/rejected": -0.8739139437675476, "step": 350 }, { "epoch": 0.3768646951059932, "grad_norm": 20.124446185453415, "learning_rate": 1.568990501325568e-07, "logits/chosen": -12.703775405883789, "logits/rejected": -12.269942283630371, "logps/chosen": -284.19219970703125, "logps/rejected": -372.7416076660156, "loss": 0.5258, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4747592806816101, "rewards/margins": 0.43383827805519104, "rewards/rejected": -0.9085975885391235, "step": 360 }, { "epoch": 0.38733315885893743, "grad_norm": 12.256605456435558, "learning_rate": 1.5385413962250656e-07, "logits/chosen": -13.217790603637695, "logits/rejected": -13.344598770141602, "logps/chosen": -338.6850280761719, "logps/rejected": -376.1455993652344, "loss": 0.545, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.41851702332496643, "rewards/margins": 0.6487592458724976, "rewards/rejected": -1.0672763586044312, "step": 370 }, { "epoch": 0.39780162261188173, "grad_norm": 23.457846993021622, "learning_rate": 1.507372039740978e-07, "logits/chosen": -13.952138900756836, "logits/rejected": -13.335546493530273, "logps/chosen": -337.4014892578125, "logps/rejected": -424.3997497558594, "loss": 0.5639, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5712438821792603, "rewards/margins": 0.33642515540122986, "rewards/rejected": -0.907668948173523, "step": 380 }, { "epoch": 0.408270086364826, "grad_norm": 34.828863133678425, "learning_rate": 1.475524118124892e-07, "logits/chosen": -12.775480270385742, "logits/rejected": -12.927217483520508, "logps/chosen": -327.45843505859375, "logps/rejected": -354.95562744140625, "loss": 0.5512, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5806573629379272, "rewards/margins": 0.3768925070762634, "rewards/rejected": -0.9575498700141907, "step": 390 }, { "epoch": 0.4187385501177702, "grad_norm": 40.31423860042707, "learning_rate": 1.4430402251491138e-07, "logits/chosen": -13.684167861938477, "logits/rejected": -13.840978622436523, "logps/chosen": -374.80859375, "logps/rejected": -402.02685546875, "loss": 0.5622, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6369796395301819, "rewards/margins": 0.2776455581188202, "rewards/rejected": -0.9146251678466797, "step": 400 }, { "epoch": 0.4187385501177702, "eval_logits/chosen": -13.940086364746094, "eval_logits/rejected": -14.236658096313477, "eval_logps/chosen": -371.2007141113281, "eval_logps/rejected": -392.6759033203125, "eval_loss": 0.5468377470970154, "eval_rewards/accuracies": 0.6800000071525574, "eval_rewards/chosen": -0.5687984228134155, "eval_rewards/margins": 0.5138704180717468, "eval_rewards/rejected": -1.0826687812805176, "eval_runtime": 149.8563, "eval_samples_per_second": 13.346, "eval_steps_per_second": 0.834, "step": 400 }, { "epoch": 0.42920701387071447, "grad_norm": 16.540681614535696, "learning_rate": 1.4099638051412743e-07, "logits/chosen": -13.835235595703125, "logits/rejected": -13.860295295715332, "logps/chosen": -377.9153747558594, "logps/rejected": -418.71014404296875, "loss": 0.565, "rewards/accuracies": 0.75, "rewards/chosen": -0.5656043291091919, "rewards/margins": 0.5640857815742493, "rewards/rejected": -1.129690170288086, "step": 410 }, { "epoch": 0.4396754776236587, "grad_norm": 21.686061060497405, "learning_rate": 1.3763390948813896e-07, "logits/chosen": -13.40911865234375, "logits/rejected": -13.989703178405762, "logps/chosen": -390.37274169921875, "logps/rejected": -365.4905090332031, "loss": 0.5514, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7015476226806641, "rewards/margins": 0.3901844918727875, "rewards/rejected": -1.091732144355774, "step": 420 }, { "epoch": 0.45014394137660296, "grad_norm": 18.49857147965794, "learning_rate": 1.342211064439091e-07, "logits/chosen": -13.539401054382324, "logits/rejected": -13.536432266235352, "logps/chosen": -348.41888427734375, "logps/rejected": -399.24371337890625, "loss": 0.5585, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8608742952346802, "rewards/margins": 0.3582659065723419, "rewards/rejected": -1.2191402912139893, "step": 430 }, { "epoch": 0.46061240512954726, "grad_norm": 28.78489077603455, "learning_rate": 1.3076253570301408e-07, "logits/chosen": -13.883230209350586, "logits/rejected": -13.993196487426758, "logps/chosen": -349.2350769042969, "logps/rejected": -374.9832763671875, "loss": 0.5542, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7180451154708862, "rewards/margins": 0.41989952325820923, "rewards/rejected": -1.1379445791244507, "step": 440 }, { "epoch": 0.4710808688824915, "grad_norm": 18.20248083655786, "learning_rate": 1.2726282279726786e-07, "logits/chosen": -14.505528450012207, "logits/rejected": -14.693391799926758, "logps/chosen": -330.843017578125, "logps/rejected": -396.43011474609375, "loss": 0.5424, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5095499157905579, "rewards/margins": 0.6803628206253052, "rewards/rejected": -1.1899127960205078, "step": 450 }, { "epoch": 0.48154933263543576, "grad_norm": 16.310384829892065, "learning_rate": 1.2372664828248319e-07, "logits/chosen": -14.17590618133545, "logits/rejected": -14.078557014465332, "logps/chosen": -320.20062255859375, "logps/rejected": -423.71160888671875, "loss": 0.5607, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4220674932003021, "rewards/margins": 0.6703753471374512, "rewards/rejected": -1.0924427509307861, "step": 460 }, { "epoch": 0.49201779638838, "grad_norm": 19.430025911116154, "learning_rate": 1.2015874147864312e-07, "logits/chosen": -14.247779846191406, "logits/rejected": -14.570295333862305, "logps/chosen": -338.3775939941406, "logps/rejected": -354.2191467285156, "loss": 0.5134, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4138456881046295, "rewards/margins": 0.44109171628952026, "rewards/rejected": -0.8549374341964722, "step": 470 }, { "epoch": 0.5024862601413242, "grad_norm": 17.019933271716724, "learning_rate": 1.1656387414485477e-07, "logits/chosen": -13.82238483428955, "logits/rejected": -13.82000732421875, "logps/chosen": -318.55438232421875, "logps/rejected": -383.9975280761719, "loss": 0.5281, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5111416578292847, "rewards/margins": 0.5328122973442078, "rewards/rejected": -1.0439538955688477, "step": 480 }, { "epoch": 0.5129547238942685, "grad_norm": 28.874755648357077, "learning_rate": 1.1294685409754433e-07, "logits/chosen": -12.982122421264648, "logits/rejected": -14.110940933227539, "logps/chosen": -390.028076171875, "logps/rejected": -422.0865173339844, "loss": 0.5217, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.47721004486083984, "rewards/margins": 0.8017258644104004, "rewards/rejected": -1.2789360284805298, "step": 490 }, { "epoch": 0.5234231876472127, "grad_norm": 14.310282443089868, "learning_rate": 1.093125187804288e-07, "logits/chosen": -13.757654190063477, "logits/rejected": -13.917083740234375, "logps/chosen": -368.4084777832031, "logps/rejected": -402.80621337890625, "loss": 0.5441, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6358953714370728, "rewards/margins": 0.5089365243911743, "rewards/rejected": -1.144831895828247, "step": 500 }, { "epoch": 0.5234231876472127, "eval_logits/chosen": -14.130770683288574, "eval_logits/rejected": -14.397551536560059, "eval_logps/chosen": -377.0606689453125, "eval_logps/rejected": -405.3188781738281, "eval_loss": 0.5362752676010132, "eval_rewards/accuracies": 0.6679999828338623, "eval_rewards/chosen": -0.6273974180221558, "eval_rewards/margins": 0.5817012786865234, "eval_rewards/rejected": -1.2090985774993896, "eval_runtime": 149.9028, "eval_samples_per_second": 13.342, "eval_steps_per_second": 0.834, "step": 500 }, { "epoch": 0.533891651400157, "grad_norm": 20.201758093145344, "learning_rate": 1.0566572879486386e-07, "logits/chosen": -13.861791610717773, "logits/rejected": -14.239435195922852, "logps/chosen": -357.29888916015625, "logps/rejected": -406.57366943359375, "loss": 0.5252, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.498563289642334, "rewards/margins": 0.7506011724472046, "rewards/rejected": -1.249164342880249, "step": 510 }, { "epoch": 0.5443601151531012, "grad_norm": 29.36802681742468, "learning_rate": 1.0201136139922029e-07, "logits/chosen": -14.109712600708008, "logits/rejected": -14.186027526855469, "logps/chosen": -353.62469482421875, "logps/rejected": -370.9365234375, "loss": 0.551, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5417202711105347, "rewards/margins": 0.6182612180709839, "rewards/rejected": -1.1599814891815186, "step": 520 }, { "epoch": 0.5548285789060455, "grad_norm": 20.995271815473735, "learning_rate": 9.835430398598318e-08, "logits/chosen": -14.159637451171875, "logits/rejected": -14.701879501342773, "logps/chosen": -377.4815979003906, "logps/rejected": -436.66632080078125, "loss": 0.5273, "rewards/accuracies": 0.75, "rewards/chosen": -0.3921489119529724, "rewards/margins": 0.8064279556274414, "rewards/rejected": -1.1985770463943481, "step": 530 }, { "epoch": 0.5652970426589898, "grad_norm": 18.50768834323812, "learning_rate": 9.469944754529784e-08, "logits/chosen": -13.86701488494873, "logits/rejected": -14.368024826049805, "logps/chosen": -343.15997314453125, "logps/rejected": -376.8660583496094, "loss": 0.548, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6364974975585938, "rewards/margins": 0.5513135194778442, "rewards/rejected": -1.187811017036438, "step": 540 }, { "epoch": 0.575765506411934, "grad_norm": 21.357543363893058, "learning_rate": 9.105168012370371e-08, "logits/chosen": -13.735044479370117, "logits/rejected": -14.701571464538574, "logps/chosen": -379.26495361328125, "logps/rejected": -371.4606628417969, "loss": 0.5351, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.69088214635849, "rewards/margins": 0.4814137816429138, "rewards/rejected": -1.1722959280014038, "step": 550 }, { "epoch": 0.5862339701648783, "grad_norm": 30.279493020621196, "learning_rate": 8.741588028680564e-08, "logits/chosen": -14.076385498046875, "logits/rejected": -14.322651863098145, "logps/chosen": -354.1212158203125, "logps/rejected": -378.01654052734375, "loss": 0.538, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.604841411113739, "rewards/margins": 0.5136178135871887, "rewards/rejected": -1.1184592247009277, "step": 560 }, { "epoch": 0.5967024339178225, "grad_norm": 16.735647281024168, "learning_rate": 8.379691059462476e-08, "logits/chosen": -13.99199104309082, "logits/rejected": -14.157026290893555, "logps/chosen": -374.80938720703125, "logps/rejected": -430.4781799316406, "loss": 0.5302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6314277648925781, "rewards/margins": 0.5574957132339478, "rewards/rejected": -1.1889234781265259, "step": 570 }, { "epoch": 0.6071708976707668, "grad_norm": 34.538667245559054, "learning_rate": 8.019961109835518e-08, "logits/chosen": -14.209541320800781, "logits/rejected": -14.323854446411133, "logps/chosen": -338.95855712890625, "logps/rejected": -397.41888427734375, "loss": 0.5399, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6777793765068054, "rewards/margins": 0.6259523630142212, "rewards/rejected": -1.3037316799163818, "step": 580 }, { "epoch": 0.6176393614237111, "grad_norm": 26.479615574042146, "learning_rate": 7.662879286722496e-08, "logits/chosen": -13.325056076049805, "logits/rejected": -13.995033264160156, "logps/chosen": -348.37322998046875, "logps/rejected": -410.867431640625, "loss": 0.5221, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.5271872282028198, "rewards/margins": 0.7677302360534668, "rewards/rejected": -1.2949175834655762, "step": 590 }, { "epoch": 0.6281078251766553, "grad_norm": 17.223129668752573, "learning_rate": 7.308923155411709e-08, "logits/chosen": -14.02897834777832, "logits/rejected": -14.710617065429688, "logps/chosen": -403.61224365234375, "logps/rejected": -422.0875549316406, "loss": 0.5125, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.579394519329071, "rewards/margins": 0.6035684943199158, "rewards/rejected": -1.1829631328582764, "step": 600 }, { "epoch": 0.6281078251766553, "eval_logits/chosen": -14.112022399902344, "eval_logits/rejected": -14.37132453918457, "eval_logps/chosen": -371.89373779296875, "eval_logps/rejected": -401.46051025390625, "eval_loss": 0.5344283580780029, "eval_rewards/accuracies": 0.6840000152587891, "eval_rewards/chosen": -0.5757284164428711, "eval_rewards/margins": 0.5947864651679993, "eval_rewards/rejected": -1.1705149412155151, "eval_runtime": 150.2044, "eval_samples_per_second": 13.315, "eval_steps_per_second": 0.832, "step": 600 }, { "epoch": 0.6385762889295996, "grad_norm": 15.068671602779133, "learning_rate": 6.958566100855715e-08, "logits/chosen": -13.8591890335083, "logits/rejected": -14.538678169250488, "logps/chosen": -354.1308288574219, "logps/rejected": -361.6885070800781, "loss": 0.535, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.481017529964447, "rewards/margins": 0.7814317941665649, "rewards/rejected": -1.2624493837356567, "step": 610 }, { "epoch": 0.6490447526825438, "grad_norm": 21.07918728015492, "learning_rate": 6.612276694560927e-08, "logits/chosen": -13.915349006652832, "logits/rejected": -14.466160774230957, "logps/chosen": -403.4571228027344, "logps/rejected": -412.64520263671875, "loss": 0.501, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.5299188494682312, "rewards/margins": 0.7413763999938965, "rewards/rejected": -1.271295189857483, "step": 620 }, { "epoch": 0.6595132164354881, "grad_norm": 19.103702131187973, "learning_rate": 6.270518067914745e-08, "logits/chosen": -13.852685928344727, "logits/rejected": -14.353567123413086, "logps/chosen": -374.9162902832031, "logps/rejected": -387.75860595703125, "loss": 0.5089, "rewards/accuracies": 0.625, "rewards/chosen": -0.5707219243049622, "rewards/margins": 0.6046653389930725, "rewards/rejected": -1.1753873825073242, "step": 630 }, { "epoch": 0.6699816801884323, "grad_norm": 26.795662212357737, "learning_rate": 5.933747292788368e-08, "logits/chosen": -13.633771896362305, "logits/rejected": -13.929837226867676, "logps/chosen": -328.4835205078125, "logps/rejected": -389.5190734863281, "loss": 0.5141, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6092640161514282, "rewards/margins": 0.7141542434692383, "rewards/rejected": -1.323418378829956, "step": 640 }, { "epoch": 0.6804501439413766, "grad_norm": 26.384619321462868, "learning_rate": 5.6024147702436975e-08, "logits/chosen": -13.914807319641113, "logits/rejected": -14.317163467407227, "logps/chosen": -386.0091247558594, "logps/rejected": -435.4970703125, "loss": 0.5178, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7489283680915833, "rewards/margins": 0.6266080141067505, "rewards/rejected": -1.3755362033843994, "step": 650 }, { "epoch": 0.6909186076943209, "grad_norm": 32.16033561247209, "learning_rate": 5.276963628161832e-08, "logits/chosen": -13.31103229522705, "logits/rejected": -13.394811630249023, "logps/chosen": -323.6170349121094, "logps/rejected": -388.63958740234375, "loss": 0.5084, "rewards/accuracies": 0.8125, "rewards/chosen": -0.49584946036338806, "rewards/margins": 0.8416509628295898, "rewards/rejected": -1.3375004529953003, "step": 660 }, { "epoch": 0.7013870714472651, "grad_norm": 24.949872599196897, "learning_rate": 4.95782912859878e-08, "logits/chosen": -13.573728561401367, "logits/rejected": -13.890779495239258, "logps/chosen": -366.00323486328125, "logps/rejected": -416.9151916503906, "loss": 0.5086, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6050105094909668, "rewards/margins": 0.8032246828079224, "rewards/rejected": -1.4082351922988892, "step": 670 }, { "epoch": 0.7118555352002094, "grad_norm": 38.50640287081186, "learning_rate": 4.645438085661084e-08, "logits/chosen": -14.350041389465332, "logits/rejected": -14.588415145874023, "logps/chosen": -355.0298156738281, "logps/rejected": -394.2617492675781, "loss": 0.5374, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6707770824432373, "rewards/margins": 0.42906999588012695, "rewards/rejected": -1.0998470783233643, "step": 680 }, { "epoch": 0.7223239989531536, "grad_norm": 24.422507206522518, "learning_rate": 4.340208294679745e-08, "logits/chosen": -13.968500137329102, "logits/rejected": -14.3223876953125, "logps/chosen": -341.5804748535156, "logps/rejected": -367.90045166015625, "loss": 0.5023, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6214663982391357, "rewards/margins": 0.579698383808136, "rewards/rejected": -1.201164722442627, "step": 690 }, { "epoch": 0.7327924627060979, "grad_norm": 16.46994924986635, "learning_rate": 4.042547973446017e-08, "logits/chosen": -13.902259826660156, "logits/rejected": -14.259056091308594, "logps/chosen": -358.53607177734375, "logps/rejected": -394.62664794921875, "loss": 0.5158, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6609233617782593, "rewards/margins": 0.5934489369392395, "rewards/rejected": -1.2543723583221436, "step": 700 }, { "epoch": 0.7327924627060979, "eval_logits/chosen": -14.100985527038574, "eval_logits/rejected": -14.28322696685791, "eval_logps/chosen": -376.5181579589844, "eval_logps/rejected": -407.68670654296875, "eval_loss": 0.5316002368927002, "eval_rewards/accuracies": 0.6759999990463257, "eval_rewards/chosen": -0.6219725012779236, "eval_rewards/margins": 0.610804557800293, "eval_rewards/rejected": -1.2327771186828613, "eval_runtime": 149.412, "eval_samples_per_second": 13.386, "eval_steps_per_second": 0.837, "step": 700 }, { "epoch": 0.7432609264590422, "grad_norm": 21.302516106563797, "learning_rate": 3.7528552162562855e-08, "logits/chosen": -14.20964527130127, "logits/rejected": -14.762395858764648, "logps/chosen": -352.42864990234375, "logps/rejected": -357.80755615234375, "loss": 0.5624, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6552818417549133, "rewards/margins": 0.563475489616394, "rewards/rejected": -1.2187573909759521, "step": 710 }, { "epoch": 0.7537293902119864, "grad_norm": 21.028434452167787, "learning_rate": 3.471517461496253e-08, "logits/chosen": -13.895108222961426, "logits/rejected": -14.63810920715332, "logps/chosen": -447.4383850097656, "logps/rejected": -481.58978271484375, "loss": 0.5124, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.5006210207939148, "rewards/margins": 0.9197198152542114, "rewards/rejected": -1.420340895652771, "step": 720 }, { "epoch": 0.7641978539649307, "grad_norm": 14.730020567879517, "learning_rate": 3.198910973476393e-08, "logits/chosen": -13.825230598449707, "logits/rejected": -14.049738883972168, "logps/chosen": -334.89404296875, "logps/rejected": -387.7596740722656, "loss": 0.5295, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.5604512691497803, "rewards/margins": 0.7153105735778809, "rewards/rejected": -1.2757618427276611, "step": 730 }, { "epoch": 0.7746663177178749, "grad_norm": 24.660611283550825, "learning_rate": 2.935400339211841e-08, "logits/chosen": -13.387316703796387, "logits/rejected": -14.199974060058594, "logps/chosen": -387.7333068847656, "logps/rejected": -383.7166748046875, "loss": 0.5282, "rewards/accuracies": 0.75, "rewards/chosen": -0.6691089868545532, "rewards/margins": 0.6580344438552856, "rewards/rejected": -1.3271434307098389, "step": 740 }, { "epoch": 0.7851347814708192, "grad_norm": 13.767843648079221, "learning_rate": 2.6813379808195357e-08, "logits/chosen": -14.288251876831055, "logits/rejected": -14.359420776367188, "logps/chosen": -339.8260498046875, "logps/rejected": -450.6610412597656, "loss": 0.509, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5569700598716736, "rewards/margins": 0.9691677093505859, "rewards/rejected": -1.5261377096176147, "step": 750 }, { "epoch": 0.7956032452237635, "grad_norm": 25.07672836114235, "learning_rate": 2.4370636841848924e-08, "logits/chosen": -13.93481159210205, "logits/rejected": -14.752789497375488, "logps/chosen": -405.8631286621094, "logps/rejected": -400.44769287109375, "loss": 0.5268, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7553908228874207, "rewards/margins": 0.6457816958427429, "rewards/rejected": -1.401172399520874, "step": 760 }, { "epoch": 0.8060717089767077, "grad_norm": 23.918260657300134, "learning_rate": 2.202904144528295e-08, "logits/chosen": -14.572360038757324, "logits/rejected": -14.208137512207031, "logps/chosen": -367.49468994140625, "logps/rejected": -460.3101501464844, "loss": 0.5074, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5758759379386902, "rewards/margins": 0.6287983655929565, "rewards/rejected": -1.204674243927002, "step": 770 }, { "epoch": 0.816540172729652, "grad_norm": 17.468797083009278, "learning_rate": 1.9791725294791928e-08, "logits/chosen": -13.938896179199219, "logits/rejected": -14.410066604614258, "logps/chosen": -356.8143615722656, "logps/rejected": -365.7565612792969, "loss": 0.5266, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4405798017978668, "rewards/margins": 0.6074010133743286, "rewards/rejected": -1.047980785369873, "step": 780 }, { "epoch": 0.8270086364825961, "grad_norm": 25.443458513615624, "learning_rate": 1.766168060242159e-08, "logits/chosen": -14.38599681854248, "logits/rejected": -14.689620971679688, "logps/chosen": -313.2379455566406, "logps/rejected": -354.0423889160156, "loss": 0.5395, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5758354663848877, "rewards/margins": 0.5628177523612976, "rewards/rejected": -1.1386531591415405, "step": 790 }, { "epoch": 0.8374771002355405, "grad_norm": 16.137468651826794, "learning_rate": 1.564175611415055e-08, "logits/chosen": -13.394170761108398, "logits/rejected": -13.763336181640625, "logps/chosen": -310.41693115234375, "logps/rejected": -387.755615234375, "loss": 0.5133, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5183297991752625, "rewards/margins": 0.797841489315033, "rewards/rejected": -1.316171407699585, "step": 800 }, { "epoch": 0.8374771002355405, "eval_logits/chosen": -14.09082317352295, "eval_logits/rejected": -14.274714469909668, "eval_logps/chosen": -376.9043273925781, "eval_logps/rejected": -408.9254150390625, "eval_loss": 0.527787446975708, "eval_rewards/accuracies": 0.6800000071525574, "eval_rewards/chosen": -0.6258336901664734, "eval_rewards/margins": 0.6193299293518066, "eval_rewards/rejected": -1.2451636791229248, "eval_runtime": 149.3494, "eval_samples_per_second": 13.391, "eval_steps_per_second": 0.837, "step": 800 }, { "epoch": 0.8479455639884846, "grad_norm": 22.038018685540855, "learning_rate": 1.3734653299944831e-08, "logits/chosen": -13.653346061706543, "logits/rejected": -14.236944198608398, "logps/chosen": -420.22412109375, "logps/rejected": -421.231201171875, "loss": 0.532, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6482435464859009, "rewards/margins": 0.6211605072021484, "rewards/rejected": -1.2694040536880493, "step": 810 }, { "epoch": 0.8584140277414289, "grad_norm": 25.08502183374406, "learning_rate": 1.1942922740781558e-08, "logits/chosen": -13.920855522155762, "logits/rejected": -14.378143310546875, "logps/chosen": -371.789794921875, "logps/rejected": -409.6880798339844, "loss": 0.5193, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6346549987792969, "rewards/margins": 0.6358563303947449, "rewards/rejected": -1.270511269569397, "step": 820 }, { "epoch": 0.8688824914943732, "grad_norm": 16.83996659028827, "learning_rate": 1.0268960717472741e-08, "logits/chosen": -14.19860553741455, "logits/rejected": -14.166885375976562, "logps/chosen": -347.3482971191406, "logps/rejected": -415.2900390625, "loss": 0.5355, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5952852368354797, "rewards/margins": 0.6116087436676025, "rewards/rejected": -1.2068939208984375, "step": 830 }, { "epoch": 0.8793509552473174, "grad_norm": 16.08177687112279, "learning_rate": 8.715006005852143e-09, "logits/chosen": -14.115530014038086, "logits/rejected": -14.573888778686523, "logps/chosen": -362.61724853515625, "logps/rejected": -457.84942626953125, "loss": 0.4976, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6578099727630615, "rewards/margins": 0.8440145254135132, "rewards/rejected": -1.5018243789672852, "step": 840 }, { "epoch": 0.8898194190002617, "grad_norm": 39.033655096711556, "learning_rate": 7.2831368826110625e-09, "logits/chosen": -14.004640579223633, "logits/rejected": -14.758051872253418, "logps/chosen": -382.56427001953125, "logps/rejected": -397.9862365722656, "loss": 0.5084, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5773628354072571, "rewards/margins": 0.7263933420181274, "rewards/rejected": -1.3037563562393188, "step": 850 }, { "epoch": 0.9002878827532059, "grad_norm": 19.748822633492768, "learning_rate": 5.975268345787455e-09, "logits/chosen": -13.904319763183594, "logits/rejected": -13.906578063964844, "logps/chosen": -398.3722839355469, "logps/rejected": -399.95306396484375, "loss": 0.5106, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6384676694869995, "rewards/margins": 0.6380214691162109, "rewards/rejected": -1.276489019393921, "step": 860 }, { "epoch": 0.9107563465061502, "grad_norm": 59.31572753609887, "learning_rate": 4.793149553625786e-09, "logits/chosen": -14.279424667358398, "logits/rejected": -14.221160888671875, "logps/chosen": -333.5687255859375, "logps/rejected": -372.06304931640625, "loss": 0.5295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5818564295768738, "rewards/margins": 0.6132394075393677, "rewards/rejected": -1.1950958967208862, "step": 870 }, { "epoch": 0.9212248102590945, "grad_norm": 13.400208624148753, "learning_rate": 3.7383614852329214e-09, "logits/chosen": -14.04296588897705, "logits/rejected": -14.601972579956055, "logps/chosen": -374.31024169921875, "logps/rejected": -394.02618408203125, "loss": 0.5216, "rewards/accuracies": 0.75, "rewards/chosen": -0.5409375429153442, "rewards/margins": 0.7117520570755005, "rewards/rejected": -1.2526895999908447, "step": 880 }, { "epoch": 0.9316932740120387, "grad_norm": 23.713770091143388, "learning_rate": 2.812314826158746e-09, "logits/chosen": -13.538187026977539, "logits/rejected": -13.821019172668457, "logps/chosen": -342.670654296875, "logps/rejected": -417.42608642578125, "loss": 0.5146, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5183156728744507, "rewards/margins": 0.8244325518608093, "rewards/rejected": -1.3427482843399048, "step": 890 }, { "epoch": 0.942161737764983, "grad_norm": 17.247650815291333, "learning_rate": 2.016248081729144e-09, "logits/chosen": -14.117114067077637, "logits/rejected": -14.199200630187988, "logps/chosen": -360.4630126953125, "logps/rejected": -437.9790954589844, "loss": 0.5098, "rewards/accuracies": 0.75, "rewards/chosen": -0.5845374464988708, "rewards/margins": 0.8339082598686218, "rewards/rejected": -1.4184458255767822, "step": 900 }, { "epoch": 0.942161737764983, "eval_logits/chosen": -14.10096263885498, "eval_logits/rejected": -14.284878730773926, "eval_logps/chosen": -374.75311279296875, "eval_logps/rejected": -407.1072998046875, "eval_loss": 0.5276437997817993, "eval_rewards/accuracies": 0.6959999799728394, "eval_rewards/chosen": -0.6043218970298767, "eval_rewards/margins": 0.6226609349250793, "eval_rewards/rejected": -1.2269827127456665, "eval_runtime": 149.3336, "eval_samples_per_second": 13.393, "eval_steps_per_second": 0.837, "step": 900 }, { "epoch": 0.9526302015179272, "grad_norm": 38.823289395104936, "learning_rate": 1.3512259206550746e-09, "logits/chosen": -13.173808097839355, "logits/rejected": -14.06005573272705, "logps/chosen": -376.40521240234375, "logps/rejected": -369.52972412109375, "loss": 0.5286, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6154804229736328, "rewards/margins": 0.589421272277832, "rewards/rejected": -1.2049016952514648, "step": 910 }, { "epoch": 0.9630986652708715, "grad_norm": 32.054517995284215, "learning_rate": 8.181377511324306e-10, "logits/chosen": -14.00465202331543, "logits/rejected": -14.40130615234375, "logps/chosen": -339.0160217285156, "logps/rejected": -404.2559814453125, "loss": 0.5282, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.569662868976593, "rewards/margins": 0.7953134179115295, "rewards/rejected": -1.3649762868881226, "step": 920 }, { "epoch": 0.9735671290238157, "grad_norm": 24.37374985565681, "learning_rate": 4.1769653133743035e-10, "logits/chosen": -14.06200885772705, "logits/rejected": -14.65931510925293, "logps/chosen": -348.43487548828125, "logps/rejected": -376.7773132324219, "loss": 0.4989, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5918719172477722, "rewards/margins": 0.6253038644790649, "rewards/rejected": -1.2171757221221924, "step": 930 }, { "epoch": 0.98403559277676, "grad_norm": 17.872968337546382, "learning_rate": 1.5043781590823313e-10, "logits/chosen": -14.47838020324707, "logits/rejected": -14.449295043945312, "logps/chosen": -370.87957763671875, "logps/rejected": -464.7237243652344, "loss": 0.5148, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6222177743911743, "rewards/margins": 0.7162548899650574, "rewards/rejected": -1.338472604751587, "step": 940 }, { "epoch": 0.9945040565297043, "grad_norm": 22.704537546218425, "learning_rate": 1.671903968816224e-11, "logits/chosen": -13.458340644836426, "logits/rejected": -13.864636421203613, "logps/chosen": -367.2121276855469, "logps/rejected": -387.31488037109375, "loss": 0.5191, "rewards/accuracies": 0.75, "rewards/chosen": -0.6486170887947083, "rewards/margins": 0.6029497385025024, "rewards/rejected": -1.2515666484832764, "step": 950 }, { "epoch": 0.9997382884061764, "step": 955, "total_flos": 0.0, "train_loss": 0.560625178402007, "train_runtime": 12630.0326, "train_samples_per_second": 4.84, "train_steps_per_second": 0.076 } ], "logging_steps": 10, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }