{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994111874386653, "eval_steps": 100, "global_step": 1273, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 3.90625e-08, "logits/chosen": 0.8174187541007996, "logits/rejected": 1.0011495351791382, "logps/chosen": -246.74014282226562, "logps/rejected": -187.51809692382812, "loss": 0.0442, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 3.90625e-07, "logits/chosen": 1.018452525138855, "logits/rejected": 1.0691548585891724, "logps/chosen": -264.271240234375, "logps/rejected": -221.92071533203125, "loss": 0.0614, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 1.4848755199636798e-05, "rewards/margins": 9.348281309939921e-05, "rewards/rejected": -7.86340533522889e-05, "step": 10 }, { "epoch": 0.02, "learning_rate": 7.8125e-07, "logits/chosen": 0.962690532207489, "logits/rejected": 0.9728161096572876, "logps/chosen": -255.00430297851562, "logps/rejected": -221.725830078125, "loss": 0.0504, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.0949605893983971e-05, "rewards/margins": 4.187399099464528e-05, "rewards/rejected": -5.282359052216634e-05, "step": 20 }, { "epoch": 0.02, "learning_rate": 1.1718750000000001e-06, "logits/chosen": 1.0629122257232666, "logits/rejected": 1.116347074508667, "logps/chosen": -241.6275177001953, "logps/rejected": -226.95309448242188, "loss": 0.0613, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -4.63350661448203e-05, "rewards/margins": -5.460592728923075e-05, "rewards/rejected": 8.270866601378657e-06, "step": 30 }, { "epoch": 0.03, "learning_rate": 1.5625e-06, "logits/chosen": 0.9653002023696899, "logits/rejected": 1.0042797327041626, "logps/chosen": -252.0692596435547, "logps/rejected": -237.03872680664062, "loss": 0.0455, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 9.635712558520027e-06, "rewards/margins": 7.444314542226493e-05, "rewards/rejected": -6.480743468273431e-05, "step": 40 }, { "epoch": 0.04, "learning_rate": 1.953125e-06, "logits/chosen": 0.9830142259597778, "logits/rejected": 1.056870460510254, "logps/chosen": -256.4426574707031, "logps/rejected": -225.88821411132812, "loss": 0.0536, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -2.2923812139197253e-05, "rewards/margins": 0.00013652857160195708, "rewards/rejected": -0.00015945239283610135, "step": 50 }, { "epoch": 0.05, "learning_rate": 2.3437500000000002e-06, "logits/chosen": 0.9784016609191895, "logits/rejected": 1.1101980209350586, "logps/chosen": -271.16668701171875, "logps/rejected": -249.8975372314453, "loss": 0.0572, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 2.5281901798734907e-06, "rewards/margins": 0.00010906215902650729, "rewards/rejected": -0.00010653395293047652, "step": 60 }, { "epoch": 0.05, "learning_rate": 2.7343750000000004e-06, "logits/chosen": 1.0381357669830322, "logits/rejected": 1.0707155466079712, "logps/chosen": -253.42160034179688, "logps/rejected": -221.3090362548828, "loss": 0.0465, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 9.924682672135532e-06, "rewards/margins": 0.00015668092237319797, "rewards/rejected": -0.0001467562251491472, "step": 70 }, { "epoch": 0.06, "learning_rate": 3.125e-06, "logits/chosen": 0.9301819801330566, "logits/rejected": 1.1145567893981934, "logps/chosen": -291.00421142578125, "logps/rejected": -223.3363800048828, "loss": 0.0539, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 6.633991142734885e-05, "rewards/margins": 0.00031302389106713235, "rewards/rejected": -0.0002466839796397835, "step": 80 }, { "epoch": 0.07, "learning_rate": 3.5156250000000003e-06, "logits/chosen": 1.054889440536499, "logits/rejected": 1.0736507177352905, "logps/chosen": -287.6363830566406, "logps/rejected": -294.4855651855469, "loss": 0.0478, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0002254299761261791, "rewards/margins": 0.0004840657056774944, "rewards/rejected": -0.0002586357877589762, "step": 90 }, { "epoch": 0.08, "learning_rate": 3.90625e-06, "logits/chosen": 0.9826571345329285, "logits/rejected": 1.0136107206344604, "logps/chosen": -248.9982147216797, "logps/rejected": -213.6271514892578, "loss": 0.0569, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.00023196591064333916, "rewards/margins": 0.0005559330456890166, "rewards/rejected": -0.0003239671350456774, "step": 100 }, { "epoch": 0.08, "eval_logits/chosen": 0.987690269947052, "eval_logits/rejected": 1.0547419786453247, "eval_logps/chosen": -269.70703125, "eval_logps/rejected": -248.80368041992188, "eval_loss": 0.053490906953811646, "eval_rewards/accuracies": 0.597305417060852, "eval_rewards/chosen": 0.0002614832192193717, "eval_rewards/margins": 0.0006264070980250835, "eval_rewards/rejected": -0.00036492382059805095, "eval_runtime": 374.224, "eval_samples_per_second": 5.344, "eval_steps_per_second": 0.446, "step": 100 }, { "epoch": 0.09, "learning_rate": 4.296875e-06, "logits/chosen": 1.0466687679290771, "logits/rejected": 0.9887654185295105, "logps/chosen": -299.6304931640625, "logps/rejected": -244.9992218017578, "loss": 0.0627, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0005523829022422433, "rewards/margins": 0.0007637250819243491, "rewards/rejected": -0.0002113422378897667, "step": 110 }, { "epoch": 0.09, "learning_rate": 4.6875000000000004e-06, "logits/chosen": 1.0038639307022095, "logits/rejected": 1.0830211639404297, "logps/chosen": -273.029296875, "logps/rejected": -203.0375518798828, "loss": 0.0646, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0005045495927333832, "rewards/margins": 0.0008580017019994557, "rewards/rejected": -0.0003534521092660725, "step": 120 }, { "epoch": 0.1, "learning_rate": 4.999962359300416e-06, "logits/chosen": 1.0437225103378296, "logits/rejected": 1.1165318489074707, "logps/chosen": -275.69549560546875, "logps/rejected": -231.8827667236328, "loss": 0.0515, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0014680480817332864, "rewards/margins": 0.0015463808085769415, "rewards/rejected": -7.833284325897694e-05, "step": 130 }, { "epoch": 0.11, "learning_rate": 4.998645053824218e-06, "logits/chosen": 1.012627363204956, "logits/rejected": 1.0429273843765259, "logps/chosen": -306.0951232910156, "logps/rejected": -257.19134521484375, "loss": 0.052, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0023107831366360188, "rewards/margins": 0.0023487000726163387, "rewards/rejected": -3.7916899600531906e-05, "step": 140 }, { "epoch": 0.12, "learning_rate": 4.9954468466732145e-06, "logits/chosen": 0.9628115892410278, "logits/rejected": 1.01333749294281, "logps/chosen": -274.50750732421875, "logps/rejected": -251.54348754882812, "loss": 0.0499, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0016668376047164202, "rewards/margins": 0.001496818964369595, "rewards/rejected": 0.00017001861124299467, "step": 150 }, { "epoch": 0.13, "learning_rate": 4.990370145357496e-06, "logits/chosen": 0.9540281295776367, "logits/rejected": 1.0164401531219482, "logps/chosen": -245.35122680664062, "logps/rejected": -222.86331176757812, "loss": 0.0556, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0014593935338780284, "rewards/margins": 0.0026508464943617582, "rewards/rejected": -0.001191452844068408, "step": 160 }, { "epoch": 0.13, "learning_rate": 4.983418771458684e-06, "logits/chosen": 1.0378397703170776, "logits/rejected": 1.0483719110488892, "logps/chosen": -299.852294921875, "logps/rejected": -257.53851318359375, "loss": 0.0533, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.002016807906329632, "rewards/margins": 0.002726987935602665, "rewards/rejected": -0.000710180145688355, "step": 170 }, { "epoch": 0.14, "learning_rate": 4.97459795775315e-06, "logits/chosen": 0.9879986047744751, "logits/rejected": 1.0183862447738647, "logps/chosen": -292.57647705078125, "logps/rejected": -274.23297119140625, "loss": 0.0545, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.003255961462855339, "rewards/margins": 0.004640574567019939, "rewards/rejected": -0.0013846131041646004, "step": 180 }, { "epoch": 0.15, "learning_rate": 4.963914344272961e-06, "logits/chosen": 0.928981602191925, "logits/rejected": 1.0115787982940674, "logps/chosen": -274.09576416015625, "logps/rejected": -235.8800506591797, "loss": 0.0498, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0013791031669825315, "rewards/margins": 0.004472037777304649, "rewards/rejected": -0.0030929348431527615, "step": 190 }, { "epoch": 0.16, "learning_rate": 4.951375973307458e-06, "logits/chosen": 0.9295806884765625, "logits/rejected": 0.9614816904067993, "logps/chosen": -267.1017761230469, "logps/rejected": -243.70095825195312, "loss": 0.0565, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0006091211689636111, "rewards/margins": 0.0036107245832681656, "rewards/rejected": -0.004219844937324524, "step": 200 }, { "epoch": 0.16, "eval_logits/chosen": 0.8997535705566406, "eval_logits/rejected": 0.9705783128738403, "eval_logps/chosen": -271.5317687988281, "eval_logps/rejected": -255.22335815429688, "eval_loss": 0.051502373069524765, "eval_rewards/accuracies": 0.6077844500541687, "eval_rewards/chosen": -0.001563269179314375, "eval_rewards/margins": 0.005221350584179163, "eval_rewards/rejected": -0.006784618832170963, "eval_runtime": 374.3563, "eval_samples_per_second": 5.343, "eval_steps_per_second": 0.446, "step": 200 }, { "epoch": 0.16, "learning_rate": 4.93699228334928e-06, "logits/chosen": 0.9301688075065613, "logits/rejected": 0.9155396223068237, "logps/chosen": -285.2063293457031, "logps/rejected": -238.8046417236328, "loss": 0.0507, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.00031993992161005735, "rewards/margins": 0.006842759437859058, "rewards/rejected": -0.00716269901022315, "step": 210 }, { "epoch": 0.17, "learning_rate": 4.920774101989362e-06, "logits/chosen": 0.9102910757064819, "logits/rejected": 0.9553702473640442, "logps/chosen": -285.9295959472656, "logps/rejected": -256.9110107421875, "loss": 0.0525, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0035760626196861267, "rewards/margins": 0.005626793950796127, "rewards/rejected": -0.00920285563915968, "step": 220 }, { "epoch": 0.18, "learning_rate": 4.902733637766261e-06, "logits/chosen": 0.8825628161430359, "logits/rejected": 0.9233707189559937, "logps/chosen": -299.63818359375, "logps/rejected": -264.28070068359375, "loss": 0.0542, "rewards/accuracies": 0.625, "rewards/chosen": -0.004372667986899614, "rewards/margins": 0.010200297459959984, "rewards/rejected": -0.014572965912520885, "step": 230 }, { "epoch": 0.19, "learning_rate": 4.882884470975954e-06, "logits/chosen": 0.8268105387687683, "logits/rejected": 0.9335943460464478, "logps/chosen": -257.7367248535156, "logps/rejected": -233.0779571533203, "loss": 0.0534, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.010581925511360168, "rewards/margins": 0.006359722465276718, "rewards/rejected": -0.016941647976636887, "step": 240 }, { "epoch": 0.2, "learning_rate": 4.861241543449015e-06, "logits/chosen": 0.8305758237838745, "logits/rejected": 0.833528995513916, "logps/chosen": -299.9106140136719, "logps/rejected": -286.69024658203125, "loss": 0.0455, "rewards/accuracies": 0.59375, "rewards/chosen": -0.014483618550002575, "rewards/margins": 0.007877354510128498, "rewards/rejected": -0.022360973060131073, "step": 250 }, { "epoch": 0.2, "learning_rate": 4.8378211473028755e-06, "logits/chosen": 0.793732762336731, "logits/rejected": 0.7972532510757446, "logps/chosen": -286.5875244140625, "logps/rejected": -260.48895263671875, "loss": 0.048, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.01467740349471569, "rewards/margins": 0.01562904380261898, "rewards/rejected": -0.03030644915997982, "step": 260 }, { "epoch": 0.21, "learning_rate": 4.812640912677624e-06, "logits/chosen": 0.6651993989944458, "logits/rejected": 0.7179329991340637, "logps/chosen": -272.69989013671875, "logps/rejected": -246.05810546875, "loss": 0.0557, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.021714814007282257, "rewards/margins": 0.006616706494241953, "rewards/rejected": -0.028331521898508072, "step": 270 }, { "epoch": 0.22, "learning_rate": 4.785719794464596e-06, "logits/chosen": 0.5646941065788269, "logits/rejected": 0.6421071290969849, "logps/chosen": -334.10302734375, "logps/rejected": -274.1715087890625, "loss": 0.046, "rewards/accuracies": 0.625, "rewards/chosen": -0.023918839171528816, "rewards/margins": 0.012724781408905983, "rewards/rejected": -0.0366436168551445, "step": 280 }, { "epoch": 0.23, "learning_rate": 4.757078058037722e-06, "logits/chosen": 0.5190677046775818, "logits/rejected": 0.6334066987037659, "logps/chosen": -340.30914306640625, "logps/rejected": -305.35662841796875, "loss": 0.0475, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.021180950105190277, "rewards/margins": 0.02034229040145874, "rewards/rejected": -0.04152324050664902, "step": 290 }, { "epoch": 0.24, "learning_rate": 4.72673726399839e-06, "logits/chosen": 0.5193800330162048, "logits/rejected": 0.6493593454360962, "logps/chosen": -261.5180358886719, "logps/rejected": -240.78369140625, "loss": 0.0467, "rewards/accuracies": 0.5625, "rewards/chosen": -0.032134272158145905, "rewards/margins": 0.01347540132701397, "rewards/rejected": -0.045609671622514725, "step": 300 }, { "epoch": 0.24, "eval_logits/chosen": 0.4972836971282959, "eval_logits/rejected": 0.5527552962303162, "eval_logps/chosen": -301.5743103027344, "eval_logps/rejected": -296.9762268066406, "eval_loss": 0.04698715731501579, "eval_rewards/accuracies": 0.6032934188842773, "eval_rewards/chosen": -0.0316057950258255, "eval_rewards/margins": 0.016931703314185143, "eval_rewards/rejected": -0.04853750020265579, "eval_runtime": 374.353, "eval_samples_per_second": 5.343, "eval_steps_per_second": 0.446, "step": 300 }, { "epoch": 0.24, "learning_rate": 4.694720251945298e-06, "logits/chosen": 0.49350661039352417, "logits/rejected": 0.49793070554733276, "logps/chosen": -325.5235900878906, "logps/rejected": -307.2831726074219, "loss": 0.0485, "rewards/accuracies": 0.625, "rewards/chosen": -0.027102241292595863, "rewards/margins": 0.029582733288407326, "rewards/rejected": -0.05668497085571289, "step": 310 }, { "epoch": 0.25, "learning_rate": 4.661051123281528e-06, "logits/chosen": 0.43795689940452576, "logits/rejected": 0.4705297350883484, "logps/chosen": -304.1239318847656, "logps/rejected": -286.34521484375, "loss": 0.0474, "rewards/accuracies": 0.59375, "rewards/chosen": -0.029134873300790787, "rewards/margins": 0.018936321139335632, "rewards/rejected": -0.04807119444012642, "step": 320 }, { "epoch": 0.26, "learning_rate": 4.6257552230717536e-06, "logits/chosen": 0.372799813747406, "logits/rejected": 0.4166027903556824, "logps/chosen": -288.1565856933594, "logps/rejected": -327.458251953125, "loss": 0.0417, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03428940102458, "rewards/margins": 0.02542264387011528, "rewards/rejected": -0.05971204489469528, "step": 330 }, { "epoch": 0.27, "learning_rate": 4.588859120963282e-06, "logits/chosen": 0.27427417039871216, "logits/rejected": 0.2968738377094269, "logps/chosen": -267.9536437988281, "logps/rejected": -281.6087646484375, "loss": 0.049, "rewards/accuracies": 0.59375, "rewards/chosen": -0.03779522702097893, "rewards/margins": 0.023776082322001457, "rewards/rejected": -0.06157131865620613, "step": 340 }, { "epoch": 0.27, "learning_rate": 4.5503905911852435e-06, "logits/chosen": 0.2773832678794861, "logits/rejected": 0.28696539998054504, "logps/chosen": -314.53521728515625, "logps/rejected": -290.0928039550781, "loss": 0.0493, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0507054328918457, "rewards/margins": 0.023872187361121178, "rewards/rejected": -0.07457762211561203, "step": 350 }, { "epoch": 0.28, "learning_rate": 4.510378591641036e-06, "logits/chosen": 0.1799338012933731, "logits/rejected": 0.21671123802661896, "logps/chosen": -355.8804626464844, "logps/rejected": -375.59918212890625, "loss": 0.0388, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.05081228166818619, "rewards/margins": 0.030371317639946938, "rewards/rejected": -0.08118359744548798, "step": 360 }, { "epoch": 0.29, "learning_rate": 4.468853242109712e-06, "logits/chosen": 0.19799312949180603, "logits/rejected": 0.1859453022480011, "logps/chosen": -308.9555969238281, "logps/rejected": -318.88787841796875, "loss": 0.0442, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.04206562787294388, "rewards/margins": 0.02774595282971859, "rewards/rejected": -0.06981157511472702, "step": 370 }, { "epoch": 0.3, "learning_rate": 4.42584580157276e-06, "logits/chosen": 0.2413104772567749, "logits/rejected": 0.3254660964012146, "logps/chosen": -334.237548828125, "logps/rejected": -312.25518798828125, "loss": 0.0446, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.037926387041807175, "rewards/margins": 0.02587224170565605, "rewards/rejected": -0.06379862874746323, "step": 380 }, { "epoch": 0.31, "learning_rate": 4.381388644683317e-06, "logits/chosen": 0.299532949924469, "logits/rejected": 0.3157830536365509, "logps/chosen": -323.3116149902344, "logps/rejected": -300.4678649902344, "loss": 0.0474, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.02991255559027195, "rewards/margins": 0.02541544660925865, "rewards/rejected": -0.05532800033688545, "step": 390 }, { "epoch": 0.31, "learning_rate": 4.33551523739555e-06, "logits/chosen": 0.34920087456703186, "logits/rejected": 0.4129624366760254, "logps/chosen": -321.7519836425781, "logps/rejected": -281.9065246582031, "loss": 0.0467, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.03106621466577053, "rewards/margins": 0.026955511420965195, "rewards/rejected": -0.058021724224090576, "step": 400 }, { "epoch": 0.31, "eval_logits/chosen": 0.3071232736110687, "eval_logits/rejected": 0.34567520022392273, "eval_logps/chosen": -306.9801940917969, "eval_logps/rejected": -306.72412109375, "eval_loss": 0.04428274184465408, "eval_rewards/accuracies": 0.6032934188842773, "eval_rewards/chosen": -0.03701169043779373, "eval_rewards/margins": 0.021273676306009293, "eval_rewards/rejected": -0.058285363018512726, "eval_runtime": 374.0722, "eval_samples_per_second": 5.347, "eval_steps_per_second": 0.446, "step": 400 }, { "epoch": 0.32, "learning_rate": 4.288260111772535e-06, "logits/chosen": 0.37154802680015564, "logits/rejected": 0.3885091245174408, "logps/chosen": -304.3981018066406, "logps/rejected": -325.0479431152344, "loss": 0.0471, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.02885431982576847, "rewards/margins": 0.03877053037285805, "rewards/rejected": -0.06762484461069107, "step": 410 }, { "epoch": 0.33, "learning_rate": 4.239658839991594e-06, "logits/chosen": 0.2403000146150589, "logits/rejected": 0.3043418824672699, "logps/chosen": -321.8272399902344, "logps/rejected": -314.7815246582031, "loss": 0.0413, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.03149569034576416, "rewards/margins": 0.03325023502111435, "rewards/rejected": -0.06474592536687851, "step": 420 }, { "epoch": 0.34, "learning_rate": 4.189748007566686e-06, "logits/chosen": 0.2293013632297516, "logits/rejected": 0.25423663854599, "logps/chosen": -295.85174560546875, "logps/rejected": -301.47723388671875, "loss": 0.0489, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.03744658827781677, "rewards/margins": 0.021524634212255478, "rewards/rejected": -0.05897121876478195, "step": 430 }, { "epoch": 0.35, "learning_rate": 4.138565185807972e-06, "logits/chosen": 0.2203875482082367, "logits/rejected": 0.18498417735099792, "logps/chosen": -288.0199890136719, "logps/rejected": -322.2027282714844, "loss": 0.0492, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04328416287899017, "rewards/margins": 0.039459649473428726, "rewards/rejected": -0.0827438086271286, "step": 440 }, { "epoch": 0.35, "learning_rate": 4.086148903539311e-06, "logits/chosen": 0.22477002441883087, "logits/rejected": 0.24873077869415283, "logps/chosen": -316.47052001953125, "logps/rejected": -299.12127685546875, "loss": 0.0436, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04419877380132675, "rewards/margins": 0.02829798124730587, "rewards/rejected": -0.07249675691127777, "step": 450 }, { "epoch": 0.36, "learning_rate": 4.032538618094972e-06, "logits/chosen": 0.1782715618610382, "logits/rejected": 0.22016020119190216, "logps/chosen": -294.3050842285156, "logps/rejected": -309.0475769042969, "loss": 0.0393, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.03711239993572235, "rewards/margins": 0.03650590404868126, "rewards/rejected": -0.07361830770969391, "step": 460 }, { "epoch": 0.37, "learning_rate": 3.977774685617386e-06, "logits/chosen": 0.13345113396644592, "logits/rejected": 0.18138858675956726, "logps/chosen": -350.42303466796875, "logps/rejected": -348.36273193359375, "loss": 0.0353, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04365909844636917, "rewards/margins": 0.03271768242120743, "rewards/rejected": -0.0763767808675766, "step": 470 }, { "epoch": 0.38, "learning_rate": 3.92189833067831e-06, "logits/chosen": 0.13094931840896606, "logits/rejected": 0.17935091257095337, "logps/chosen": -340.19708251953125, "logps/rejected": -315.94659423828125, "loss": 0.0415, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.045614853501319885, "rewards/margins": 0.03917178884148598, "rewards/rejected": -0.08478663861751556, "step": 480 }, { "epoch": 0.38, "learning_rate": 3.864951615246261e-06, "logits/chosen": 0.1737602949142456, "logits/rejected": 0.2314929962158203, "logps/chosen": -344.6983947753906, "logps/rejected": -356.94989013671875, "loss": 0.0406, "rewards/accuracies": 0.625, "rewards/chosen": -0.04962370544672012, "rewards/margins": 0.04535280913114548, "rewards/rejected": -0.0949765145778656, "step": 490 }, { "epoch": 0.39, "learning_rate": 3.806977407023581e-06, "logits/chosen": 0.17347046732902527, "logits/rejected": 0.15172746777534485, "logps/chosen": -297.4417419433594, "logps/rejected": -321.3406677246094, "loss": 0.0359, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.04872705414891243, "rewards/margins": 0.05000175163149834, "rewards/rejected": -0.09872879832983017, "step": 500 }, { "epoch": 0.39, "eval_logits/chosen": 0.1834835261106491, "eval_logits/rejected": 0.21186047792434692, "eval_logps/chosen": -327.32745361328125, "eval_logps/rejected": -335.3609313964844, "eval_loss": 0.042766325175762177, "eval_rewards/accuracies": 0.598802387714386, "eval_rewards/chosen": -0.05735894665122032, "eval_rewards/margins": 0.029563238844275475, "eval_rewards/rejected": -0.08692218363285065, "eval_runtime": 374.1933, "eval_samples_per_second": 5.345, "eval_steps_per_second": 0.446, "step": 500 }, { "epoch": 0.4, "learning_rate": 3.7480193471769815e-06, "logits/chosen": 0.1817350536584854, "logits/rejected": 0.1976645439863205, "logps/chosen": -316.01422119140625, "logps/rejected": -315.81463623046875, "loss": 0.0487, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.04795747622847557, "rewards/margins": 0.0256235059350729, "rewards/rejected": -0.07358097285032272, "step": 510 }, { "epoch": 0.41, "learning_rate": 3.6881218174858354e-06, "logits/chosen": 0.22255906462669373, "logits/rejected": 0.2185302972793579, "logps/chosen": -336.4140930175781, "logps/rejected": -345.5931091308594, "loss": 0.0464, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.04861742630600929, "rewards/margins": 0.034276556223630905, "rewards/rejected": -0.0828939825296402, "step": 520 }, { "epoch": 0.42, "learning_rate": 3.627329906932964e-06, "logits/chosen": 0.2149231731891632, "logits/rejected": 0.23484310507774353, "logps/chosen": -358.2276611328125, "logps/rejected": -330.24505615234375, "loss": 0.0425, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05005684494972229, "rewards/margins": 0.03168889507651329, "rewards/rejected": -0.08174573630094528, "step": 530 }, { "epoch": 0.42, "learning_rate": 3.5656893777630686e-06, "logits/chosen": 0.2030162811279297, "logits/rejected": 0.25424039363861084, "logps/chosen": -298.22308349609375, "logps/rejected": -316.8375549316406, "loss": 0.0439, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.051857758313417435, "rewards/margins": 0.04128115624189377, "rewards/rejected": -0.0931389182806015, "step": 540 }, { "epoch": 0.43, "learning_rate": 3.503246631034345e-06, "logits/chosen": 0.2722618579864502, "logits/rejected": 0.3168015778064728, "logps/chosen": -299.116943359375, "logps/rejected": -307.1891784667969, "loss": 0.0448, "rewards/accuracies": 0.625, "rewards/chosen": -0.04390637204051018, "rewards/margins": 0.0321880504488945, "rewards/rejected": -0.07609442621469498, "step": 550 }, { "epoch": 0.44, "learning_rate": 3.440048671689219e-06, "logits/chosen": 0.24053359031677246, "logits/rejected": 0.27442485094070435, "logps/chosen": -323.40631103515625, "logps/rejected": -323.588623046875, "loss": 0.0408, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0459776408970356, "rewards/margins": 0.036371566355228424, "rewards/rejected": -0.08234919607639313, "step": 560 }, { "epoch": 0.45, "learning_rate": 3.3761430731705056e-06, "logits/chosen": 0.25353458523750305, "logits/rejected": 0.33455875515937805, "logps/chosen": -323.89892578125, "logps/rejected": -319.5647888183594, "loss": 0.0347, "rewards/accuracies": 0.625, "rewards/chosen": -0.04967617988586426, "rewards/margins": 0.028463760390877724, "rewards/rejected": -0.07813994586467743, "step": 570 }, { "epoch": 0.46, "learning_rate": 3.311577941609604e-06, "logits/chosen": 0.31304338574409485, "logits/rejected": 0.3095194697380066, "logps/chosen": -268.1946105957031, "logps/rejected": -263.7663879394531, "loss": 0.0451, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.043435920029878616, "rewards/margins": 0.024017784744501114, "rewards/rejected": -0.06745370477437973, "step": 580 }, { "epoch": 0.46, "learning_rate": 3.2464018796137157e-06, "logits/chosen": 0.2560151219367981, "logits/rejected": 0.24582383036613464, "logps/chosen": -306.5500183105469, "logps/rejected": -292.2866516113281, "loss": 0.0383, "rewards/accuracies": 0.5625, "rewards/chosen": -0.040644846856594086, "rewards/margins": 0.024079840630292892, "rewards/rejected": -0.06472468376159668, "step": 590 }, { "epoch": 0.47, "learning_rate": 3.1806639496793245e-06, "logits/chosen": 0.29677996039390564, "logits/rejected": 0.3049861490726471, "logps/chosen": -307.2038269042969, "logps/rejected": -326.469970703125, "loss": 0.0431, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.033155038952827454, "rewards/margins": 0.02939048409461975, "rewards/rejected": -0.0625455230474472, "step": 600 }, { "epoch": 0.47, "eval_logits/chosen": 0.2554236948490143, "eval_logits/rejected": 0.28905177116394043, "eval_logps/chosen": -314.9629821777344, "eval_logps/rejected": -320.91607666015625, "eval_loss": 0.041808027774095535, "eval_rewards/accuracies": 0.6032934188842773, "eval_rewards/chosen": -0.04499450698494911, "eval_rewards/margins": 0.02748279832303524, "eval_rewards/rejected": -0.0724773034453392, "eval_runtime": 373.9667, "eval_samples_per_second": 5.348, "eval_steps_per_second": 0.447, "step": 600 }, { "epoch": 0.48, "learning_rate": 3.114413637259484e-06, "logits/chosen": 0.2767940163612366, "logits/rejected": 0.30368971824645996, "logps/chosen": -312.98223876953125, "logps/rejected": -334.64166259765625, "loss": 0.0419, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.040467552840709686, "rewards/margins": 0.03511633723974228, "rewards/rejected": -0.07558389008045197, "step": 610 }, { "epoch": 0.49, "learning_rate": 3.0477008135127247e-06, "logits/chosen": 0.22974368929862976, "logits/rejected": 0.2028408795595169, "logps/chosen": -320.90545654296875, "logps/rejected": -328.469970703125, "loss": 0.0438, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.047637082636356354, "rewards/margins": 0.03967064619064331, "rewards/rejected": -0.08730772882699966, "step": 620 }, { "epoch": 0.49, "learning_rate": 2.980575697761603e-06, "logits/chosen": 0.25714489817619324, "logits/rejected": 0.24402575194835663, "logps/chosen": -298.93994140625, "logps/rejected": -321.35748291015625, "loss": 0.0466, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.05835377424955368, "rewards/margins": 0.031576722860336304, "rewards/rejected": -0.08993048965930939, "step": 630 }, { "epoch": 0.5, "learning_rate": 2.9130888196891755e-06, "logits/chosen": 0.21983161568641663, "logits/rejected": 0.2652173936367035, "logps/chosen": -296.48626708984375, "logps/rejected": -289.70587158203125, "loss": 0.0448, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.049575865268707275, "rewards/margins": 0.019396495074033737, "rewards/rejected": -0.06897236406803131, "step": 640 }, { "epoch": 0.51, "learning_rate": 2.845290981301834e-06, "logits/chosen": 0.30579501390457153, "logits/rejected": 0.27916720509529114, "logps/chosen": -314.1757507324219, "logps/rejected": -312.4922180175781, "loss": 0.0377, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.05070630833506584, "rewards/margins": 0.03026432916522026, "rewards/rejected": -0.0809706375002861, "step": 650 }, { "epoch": 0.52, "learning_rate": 2.7772332186871464e-06, "logits/chosen": 0.2696114182472229, "logits/rejected": 0.30878639221191406, "logps/chosen": -332.66961669921875, "logps/rejected": -331.0030822753906, "loss": 0.0431, "rewards/accuracies": 0.625, "rewards/chosen": -0.05702507495880127, "rewards/margins": 0.0359366312623024, "rewards/rejected": -0.09296169131994247, "step": 660 }, { "epoch": 0.53, "learning_rate": 2.708966763595493e-06, "logits/chosen": 0.2562243938446045, "logits/rejected": 0.2766057848930359, "logps/chosen": -330.1766357421875, "logps/rejected": -306.47894287109375, "loss": 0.0439, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.052916429936885834, "rewards/margins": 0.035131536424160004, "rewards/rejected": -0.08804796636104584, "step": 670 }, { "epoch": 0.53, "learning_rate": 2.640543004874409e-06, "logits/chosen": 0.21568675339221954, "logits/rejected": 0.2647091746330261, "logps/chosen": -336.7883605957031, "logps/rejected": -329.66461181640625, "loss": 0.0448, "rewards/accuracies": 0.59375, "rewards/chosen": -0.050951045006513596, "rewards/margins": 0.03465459495782852, "rewards/rejected": -0.08560563623905182, "step": 680 }, { "epoch": 0.54, "learning_rate": 2.572013449784671e-06, "logits/chosen": 0.2518898844718933, "logits/rejected": 0.22508414089679718, "logps/chosen": -295.5574645996094, "logps/rejected": -297.5489807128906, "loss": 0.0412, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0494476854801178, "rewards/margins": 0.02454470470547676, "rewards/rejected": -0.07399239391088486, "step": 690 }, { "epoch": 0.55, "learning_rate": 2.503429685227245e-06, "logits/chosen": 0.20451202988624573, "logits/rejected": 0.21218447387218475, "logps/chosen": -361.32379150390625, "logps/rejected": -326.19989013671875, "loss": 0.0438, "rewards/accuracies": 0.625, "rewards/chosen": -0.05410303920507431, "rewards/margins": 0.023143690079450607, "rewards/rejected": -0.07724673300981522, "step": 700 }, { "epoch": 0.55, "eval_logits/chosen": 0.204001784324646, "eval_logits/rejected": 0.23564448952674866, "eval_logps/chosen": -327.3254089355469, "eval_logps/rejected": -337.3518981933594, "eval_loss": 0.04129507392644882, "eval_rewards/accuracies": 0.6017963886260986, "eval_rewards/chosen": -0.05735692009329796, "eval_rewards/margins": 0.03155623376369476, "eval_rewards/rejected": -0.08891315013170242, "eval_runtime": 374.0567, "eval_samples_per_second": 5.347, "eval_steps_per_second": 0.446, "step": 700 }, { "epoch": 0.56, "learning_rate": 2.434843338910286e-06, "logits/chosen": 0.1988871991634369, "logits/rejected": 0.2530394494533539, "logps/chosen": -362.696533203125, "logps/rejected": -351.73992919921875, "loss": 0.039, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.04888119548559189, "rewards/margins": 0.0342581570148468, "rewards/rejected": -0.0831393450498581, "step": 710 }, { "epoch": 0.57, "learning_rate": 2.3663060404854155e-06, "logits/chosen": 0.19785156846046448, "logits/rejected": 0.2040957510471344, "logps/chosen": -303.4078674316406, "logps/rejected": -326.0086669921875, "loss": 0.0398, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.05251380056142807, "rewards/margins": 0.047541745007038116, "rewards/rejected": -0.10005555301904678, "step": 720 }, { "epoch": 0.57, "learning_rate": 2.2978693826825406e-06, "logits/chosen": 0.21812646090984344, "logits/rejected": 0.2294880896806717, "logps/chosen": -347.4920959472656, "logps/rejected": -328.907958984375, "loss": 0.0374, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.05341614410281181, "rewards/margins": 0.031886156648397446, "rewards/rejected": -0.08530230075120926, "step": 730 }, { "epoch": 0.58, "learning_rate": 2.2295848824724612e-06, "logits/chosen": 0.21070751547813416, "logits/rejected": 0.25296124815940857, "logps/chosen": -295.4853210449219, "logps/rejected": -316.27044677734375, "loss": 0.0525, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05813244730234146, "rewards/margins": 0.0497218519449234, "rewards/rejected": -0.10785429179668427, "step": 740 }, { "epoch": 0.59, "learning_rate": 2.1615039422865136e-06, "logits/chosen": 0.25933319330215454, "logits/rejected": 0.31064844131469727, "logps/chosen": -309.44964599609375, "logps/rejected": -307.4235534667969, "loss": 0.0474, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.05073714256286621, "rewards/margins": 0.03562631830573082, "rewards/rejected": -0.08636346459388733, "step": 750 }, { "epoch": 0.6, "learning_rate": 2.0936778113224253e-06, "logits/chosen": 0.3158121109008789, "logits/rejected": 0.31753307580947876, "logps/chosen": -302.07086181640625, "logps/rejected": -300.81793212890625, "loss": 0.0404, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.04117770120501518, "rewards/margins": 0.04220277816057205, "rewards/rejected": -0.08338047564029694, "step": 760 }, { "epoch": 0.6, "learning_rate": 2.0261575469655304e-06, "logits/chosen": 0.2294500768184662, "logits/rejected": 0.2536849081516266, "logps/chosen": -320.7763671875, "logps/rejected": -321.6782531738281, "loss": 0.0389, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.053652387112379074, "rewards/margins": 0.030662674456834793, "rewards/rejected": -0.08431507647037506, "step": 770 }, { "epoch": 0.61, "learning_rate": 1.9589939763543693e-06, "logits/chosen": 0.26485732197761536, "logits/rejected": 0.21588650345802307, "logps/chosen": -304.3012390136719, "logps/rejected": -291.78192138671875, "loss": 0.0431, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.04734267294406891, "rewards/margins": 0.03097641095519066, "rewards/rejected": -0.07831908762454987, "step": 780 }, { "epoch": 0.62, "learning_rate": 1.8922376581196107e-06, "logits/chosen": 0.28727906942367554, "logits/rejected": 0.27608275413513184, "logps/chosen": -343.40472412109375, "logps/rejected": -321.67230224609375, "loss": 0.0498, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.048335738480091095, "rewards/margins": 0.030372479930520058, "rewards/rejected": -0.078708216547966, "step": 790 }, { "epoch": 0.63, "learning_rate": 1.8259388443250993e-06, "logits/chosen": 0.23254342377185822, "logits/rejected": 0.2663263976573944, "logps/chosen": -303.95733642578125, "logps/rejected": -291.7852478027344, "loss": 0.0446, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03941676765680313, "rewards/margins": 0.02897232212126255, "rewards/rejected": -0.06838909536600113, "step": 800 }, { "epoch": 0.63, "eval_logits/chosen": 0.22356104850769043, "eval_logits/rejected": 0.2566269636154175, "eval_logps/chosen": -322.177734375, "eval_logps/rejected": -332.6603088378906, "eval_loss": 0.040854789316654205, "eval_rewards/accuracies": 0.6047903895378113, "eval_rewards/chosen": -0.05220925062894821, "eval_rewards/margins": 0.03201233595609665, "eval_rewards/rejected": -0.08422157913446426, "eval_runtime": 374.1827, "eval_samples_per_second": 5.345, "eval_steps_per_second": 0.446, "step": 800 }, { "epoch": 0.64, "learning_rate": 1.760147442639679e-06, "logits/chosen": 0.18028207123279572, "logits/rejected": 0.24531738460063934, "logps/chosen": -317.4986267089844, "logps/rejected": -286.6923522949219, "loss": 0.045, "rewards/accuracies": 0.625, "rewards/chosen": -0.04818979650735855, "rewards/margins": 0.03410783410072327, "rewards/rejected": -0.08229763805866241, "step": 810 }, { "epoch": 0.64, "learning_rate": 1.6949129787682628e-06, "logits/chosen": 0.1836201250553131, "logits/rejected": 0.19242016971111298, "logps/chosen": -358.3663024902344, "logps/rejected": -320.7300720214844, "loss": 0.056, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.05281525105237961, "rewards/margins": 0.030187761411070824, "rewards/rejected": -0.08300301432609558, "step": 820 }, { "epoch": 0.65, "learning_rate": 1.6302845591704348e-06, "logits/chosen": 0.23501524329185486, "logits/rejected": 0.253219872713089, "logps/chosen": -301.3849182128906, "logps/rejected": -324.7459716796875, "loss": 0.0483, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.047373272478580475, "rewards/margins": 0.03554272651672363, "rewards/rejected": -0.08291599899530411, "step": 830 }, { "epoch": 0.66, "learning_rate": 1.5663108340946465e-06, "logits/chosen": 0.27467894554138184, "logits/rejected": 0.29102447628974915, "logps/chosen": -297.8023376464844, "logps/rejected": -317.1583251953125, "loss": 0.0398, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0461508110165596, "rewards/margins": 0.037653181701898575, "rewards/rejected": -0.08380399644374847, "step": 840 }, { "epoch": 0.67, "learning_rate": 1.5030399609558364e-06, "logits/chosen": 0.22526296973228455, "logits/rejected": 0.2629440426826477, "logps/chosen": -314.87005615234375, "logps/rejected": -327.7728576660156, "loss": 0.0462, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05448513478040695, "rewards/margins": 0.03063533641397953, "rewards/rejected": -0.08512047678232193, "step": 850 }, { "epoch": 0.68, "learning_rate": 1.4405195680840357e-06, "logits/chosen": 0.21215219795703888, "logits/rejected": 0.24855566024780273, "logps/chosen": -299.90167236328125, "logps/rejected": -298.214111328125, "loss": 0.0409, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.04720069095492363, "rewards/margins": 0.04011126235127449, "rewards/rejected": -0.08731194585561752, "step": 860 }, { "epoch": 0.68, "learning_rate": 1.378796718871252e-06, "logits/chosen": 0.22330372035503387, "logits/rejected": 0.27839145064353943, "logps/chosen": -352.99810791015625, "logps/rejected": -318.991943359375, "loss": 0.0429, "rewards/accuracies": 0.625, "rewards/chosen": -0.04801223799586296, "rewards/margins": 0.03813933581113815, "rewards/rejected": -0.08615156263113022, "step": 870 }, { "epoch": 0.69, "learning_rate": 1.3179178763436302e-06, "logits/chosen": 0.2164164036512375, "logits/rejected": 0.2723962664604187, "logps/chosen": -296.51507568359375, "logps/rejected": -311.3013610839844, "loss": 0.0331, "rewards/accuracies": 0.5625, "rewards/chosen": -0.045425109565258026, "rewards/margins": 0.043222926557064056, "rewards/rejected": -0.08864804357290268, "step": 880 }, { "epoch": 0.7, "learning_rate": 1.2579288681855364e-06, "logits/chosen": 0.2106570303440094, "logits/rejected": 0.20854637026786804, "logps/chosen": -300.02044677734375, "logps/rejected": -311.44976806640625, "loss": 0.0433, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.051221005618572235, "rewards/margins": 0.03814007714390755, "rewards/rejected": -0.08936108648777008, "step": 890 }, { "epoch": 0.71, "learning_rate": 1.1988748522419163e-06, "logits/chosen": 0.23926115036010742, "logits/rejected": 0.26336807012557983, "logps/chosen": -332.501953125, "logps/rejected": -341.2127685546875, "loss": 0.0426, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.055366236716508865, "rewards/margins": 0.04208333045244217, "rewards/rejected": -0.09744956344366074, "step": 900 }, { "epoch": 0.71, "eval_logits/chosen": 0.17968809604644775, "eval_logits/rejected": 0.20887503027915955, "eval_logps/chosen": -332.3423767089844, "eval_logps/rejected": -346.1493835449219, "eval_loss": 0.04077794775366783, "eval_rewards/accuracies": 0.6047903895378113, "eval_rewards/chosen": -0.06237388774752617, "eval_rewards/margins": 0.03533677011728287, "eval_rewards/rejected": -0.09771064668893814, "eval_runtime": 374.5567, "eval_samples_per_second": 5.34, "eval_steps_per_second": 0.446, "step": 900 }, { "epoch": 0.71, "learning_rate": 1.1408002825248842e-06, "logits/chosen": 0.17859011888504028, "logits/rejected": 0.1722099781036377, "logps/chosen": -349.0040588378906, "logps/rejected": -347.06195068359375, "loss": 0.036, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.055424802005290985, "rewards/margins": 0.03749425336718559, "rewards/rejected": -0.09291905909776688, "step": 910 }, { "epoch": 0.72, "learning_rate": 1.0837488757501369e-06, "logits/chosen": 0.1789877861738205, "logits/rejected": 0.2344355285167694, "logps/chosen": -317.1181640625, "logps/rejected": -326.86566162109375, "loss": 0.039, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.053631655871868134, "rewards/margins": 0.03638458251953125, "rewards/rejected": -0.09001623094081879, "step": 920 }, { "epoch": 0.73, "learning_rate": 1.027763578428379e-06, "logits/chosen": 0.18564803898334503, "logits/rejected": 0.1936606466770172, "logps/chosen": -293.61920166015625, "logps/rejected": -302.94744873046875, "loss": 0.0454, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.06316958367824554, "rewards/margins": 0.03379397466778755, "rewards/rejected": -0.09696356952190399, "step": 930 }, { "epoch": 0.74, "learning_rate": 9.728865345365379e-07, "logits/chosen": 0.19354644417762756, "logits/rejected": 0.21065323054790497, "logps/chosen": -340.0384826660156, "logps/rejected": -319.240966796875, "loss": 0.0495, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.05120069906115532, "rewards/margins": 0.04345576837658882, "rewards/rejected": -0.09465646743774414, "step": 940 }, { "epoch": 0.75, "learning_rate": 9.191590537930975e-07, "logits/chosen": 0.169469952583313, "logits/rejected": 0.20710797607898712, "logps/chosen": -331.83740234375, "logps/rejected": -317.3833312988281, "loss": 0.0433, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04739029332995415, "rewards/margins": 0.049665920436382294, "rewards/rejected": -0.09705621004104614, "step": 950 }, { "epoch": 0.75, "learning_rate": 8.666215805614373e-07, "logits/chosen": 0.17803916335105896, "logits/rejected": 0.24121513962745667, "logps/chosen": -340.6368713378906, "logps/rejected": -323.7513732910156, "loss": 0.0448, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.06622298806905746, "rewards/margins": 0.023390918970108032, "rewards/rejected": -0.0896139144897461, "step": 960 }, { "epoch": 0.76, "learning_rate": 8.153136634045844e-07, "logits/chosen": 0.19488176703453064, "logits/rejected": 0.2004108428955078, "logps/chosen": -308.45074462890625, "logps/rejected": -309.8042297363281, "loss": 0.0388, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.05112530663609505, "rewards/margins": 0.04051050543785095, "rewards/rejected": -0.0916358157992363, "step": 970 }, { "epoch": 0.77, "learning_rate": 7.652739253142915e-07, "logits/chosen": 0.24685409665107727, "logits/rejected": 0.24745607376098633, "logps/chosen": -322.38018798828125, "logps/rejected": -314.3333435058594, "loss": 0.0421, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.05213675647974014, "rewards/margins": 0.03133903816342354, "rewards/rejected": -0.08347579836845398, "step": 980 }, { "epoch": 0.78, "learning_rate": 7.165400346368648e-07, "logits/chosen": 0.2231186181306839, "logits/rejected": 0.2616454064846039, "logps/chosen": -344.0087890625, "logps/rejected": -308.85052490234375, "loss": 0.0467, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05246872827410698, "rewards/margins": 0.03322756290435791, "rewards/rejected": -0.08569629490375519, "step": 990 }, { "epoch": 0.79, "learning_rate": 6.691486767176092e-07, "logits/chosen": 0.2258315533399582, "logits/rejected": 0.24323305487632751, "logps/chosen": -289.4067077636719, "logps/rejected": -317.7018737792969, "loss": 0.0448, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.04625866562128067, "rewards/margins": 0.02912392094731331, "rewards/rejected": -0.07538258284330368, "step": 1000 }, { "epoch": 0.79, "eval_logits/chosen": 0.21414244174957275, "eval_logits/rejected": 0.2463187277317047, "eval_logps/chosen": -324.4480285644531, "eval_logps/rejected": -335.3596496582031, "eval_loss": 0.040262602269649506, "eval_rewards/accuracies": 0.60628741979599, "eval_rewards/chosen": -0.054479535669088364, "eval_rewards/margins": 0.03244137018918991, "eval_rewards/rejected": -0.08692090213298798, "eval_runtime": 374.7358, "eval_samples_per_second": 5.337, "eval_steps_per_second": 0.446, "step": 1000 }, { "epoch": 0.79, "learning_rate": 6.231355262852529e-07, "logits/chosen": 0.25781863927841187, "logits/rejected": 0.2564670443534851, "logps/chosen": -372.15350341796875, "logps/rejected": -347.10711669921875, "loss": 0.043, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05305807664990425, "rewards/margins": 0.025652442127466202, "rewards/rejected": -0.07871051132678986, "step": 1010 }, { "epoch": 0.8, "learning_rate": 5.785352205971275e-07, "logits/chosen": 0.19803152978420258, "logits/rejected": 0.23860549926757812, "logps/chosen": -348.16656494140625, "logps/rejected": -346.49188232421875, "loss": 0.0446, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.046660203486680984, "rewards/margins": 0.04340515658259392, "rewards/rejected": -0.0900653675198555, "step": 1020 }, { "epoch": 0.81, "learning_rate": 5.353813333653287e-07, "logits/chosen": 0.18666525185108185, "logits/rejected": 0.2377820461988449, "logps/chosen": -311.5859680175781, "logps/rejected": -325.674072265625, "loss": 0.0346, "rewards/accuracies": 0.65625, "rewards/chosen": -0.04294878989458084, "rewards/margins": 0.04405210167169571, "rewards/rejected": -0.08700089901685715, "step": 1030 }, { "epoch": 0.82, "learning_rate": 4.937063494834774e-07, "logits/chosen": 0.21636883914470673, "logits/rejected": 0.1967061161994934, "logps/chosen": -352.9900207519531, "logps/rejected": -355.19720458984375, "loss": 0.0415, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.052512962371110916, "rewards/margins": 0.03489057347178459, "rewards/rejected": -0.0874035432934761, "step": 1040 }, { "epoch": 0.82, "learning_rate": 4.5354164057310857e-07, "logits/chosen": 0.19166240096092224, "logits/rejected": 0.2557406723499298, "logps/chosen": -322.88140869140625, "logps/rejected": -345.00189208984375, "loss": 0.03, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05283380672335625, "rewards/margins": 0.04377451539039612, "rewards/rejected": -0.09660832583904266, "step": 1050 }, { "epoch": 0.83, "learning_rate": 4.1491744136810066e-07, "logits/chosen": 0.18374313414096832, "logits/rejected": 0.24601753056049347, "logps/chosen": -308.51995849609375, "logps/rejected": -320.12347412109375, "loss": 0.0355, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.04724350571632385, "rewards/margins": 0.03354518115520477, "rewards/rejected": -0.08078868687152863, "step": 1060 }, { "epoch": 0.84, "learning_rate": 3.7786282695491313e-07, "logits/chosen": 0.21392662823200226, "logits/rejected": 0.2224198579788208, "logps/chosen": -315.51959228515625, "logps/rejected": -305.6190490722656, "loss": 0.0421, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.03758303076028824, "rewards/margins": 0.04133836179971695, "rewards/rejected": -0.07892139256000519, "step": 1070 }, { "epoch": 0.85, "learning_rate": 3.4240569088577564e-07, "logits/chosen": 0.2256053388118744, "logits/rejected": 0.23919573426246643, "logps/chosen": -322.1871337890625, "logps/rejected": -313.2369079589844, "loss": 0.0389, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.048378705978393555, "rewards/margins": 0.029405321925878525, "rewards/rejected": -0.07778403162956238, "step": 1080 }, { "epoch": 0.86, "learning_rate": 3.0857272418129136e-07, "logits/chosen": 0.1939636766910553, "logits/rejected": 0.22864237427711487, "logps/chosen": -305.79156494140625, "logps/rejected": -313.49688720703125, "loss": 0.0375, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.04112850874662399, "rewards/margins": 0.034687552601099014, "rewards/rejected": -0.07581605762243271, "step": 1090 }, { "epoch": 0.86, "learning_rate": 2.7638939523827956e-07, "logits/chosen": 0.2385842353105545, "logits/rejected": 0.24533557891845703, "logps/chosen": -291.5062561035156, "logps/rejected": -320.8490295410156, "loss": 0.0411, "rewards/accuracies": 0.65625, "rewards/chosen": -0.04673101380467415, "rewards/margins": 0.043406564742326736, "rewards/rejected": -0.09013758599758148, "step": 1100 }, { "epoch": 0.86, "eval_logits/chosen": 0.197061225771904, "eval_logits/rejected": 0.22829273343086243, "eval_logps/chosen": -324.82574462890625, "eval_logps/rejected": -336.86572265625, "eval_loss": 0.040152475237846375, "eval_rewards/accuracies": 0.6017963886260986, "eval_rewards/chosen": -0.05485724285244942, "eval_rewards/margins": 0.033569734543561935, "eval_rewards/rejected": -0.08842697739601135, "eval_runtime": 374.6149, "eval_samples_per_second": 5.339, "eval_steps_per_second": 0.446, "step": 1100 }, { "epoch": 0.87, "learning_rate": 2.4587993065795983e-07, "logits/chosen": 0.23787248134613037, "logits/rejected": 0.25482678413391113, "logps/chosen": -336.42578125, "logps/rejected": -350.364990234375, "loss": 0.0449, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.047406163066625595, "rewards/margins": 0.04243654012680054, "rewards/rejected": -0.08984269946813583, "step": 1110 }, { "epoch": 0.88, "learning_rate": 2.170672970089291e-07, "logits/chosen": 0.21486134827136993, "logits/rejected": 0.1970510333776474, "logps/chosen": -346.44989013671875, "logps/rejected": -328.222412109375, "loss": 0.0414, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.05092350393533707, "rewards/margins": 0.022183910012245178, "rewards/rejected": -0.07310740649700165, "step": 1120 }, { "epoch": 0.89, "learning_rate": 1.8997318353864673e-07, "logits/chosen": 0.18899409472942352, "logits/rejected": 0.21485233306884766, "logps/chosen": -315.77069091796875, "logps/rejected": -317.7997741699219, "loss": 0.0418, "rewards/accuracies": 0.65625, "rewards/chosen": -0.044594474136829376, "rewards/margins": 0.03823274374008179, "rewards/rejected": -0.08282722532749176, "step": 1130 }, { "epoch": 0.89, "learning_rate": 1.6461798584644944e-07, "logits/chosen": 0.21227781474590302, "logits/rejected": 0.24548833072185516, "logps/chosen": -336.7071228027344, "logps/rejected": -352.16741943359375, "loss": 0.042, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.056326769292354584, "rewards/margins": 0.03633355721831322, "rewards/rejected": -0.0926603227853775, "step": 1140 }, { "epoch": 0.9, "learning_rate": 1.4102079053038454e-07, "logits/chosen": 0.22621779143810272, "logits/rejected": 0.19496549665927887, "logps/chosen": -321.82904052734375, "logps/rejected": -300.1529846191406, "loss": 0.0416, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.055761467665433884, "rewards/margins": 0.024841610342264175, "rewards/rejected": -0.08060307800769806, "step": 1150 }, { "epoch": 0.91, "learning_rate": 1.1919936081941585e-07, "logits/chosen": 0.20498497784137726, "logits/rejected": 0.2435878962278366, "logps/chosen": -287.63934326171875, "logps/rejected": -335.10400390625, "loss": 0.039, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0522816963493824, "rewards/margins": 0.04128851369023323, "rewards/rejected": -0.09357021003961563, "step": 1160 }, { "epoch": 0.92, "learning_rate": 9.917012320182245e-08, "logits/chosen": 0.18155111372470856, "logits/rejected": 0.2222495973110199, "logps/chosen": -305.6929626464844, "logps/rejected": -338.02276611328125, "loss": 0.0464, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.04758179932832718, "rewards/margins": 0.03497401624917984, "rewards/rejected": -0.08255580812692642, "step": 1170 }, { "epoch": 0.93, "learning_rate": 8.094815505985315e-08, "logits/chosen": 0.2116193324327469, "logits/rejected": 0.21422095596790314, "logps/chosen": -321.56341552734375, "logps/rejected": -308.92572021484375, "loss": 0.0384, "rewards/accuracies": 0.53125, "rewards/chosen": -0.05282514542341232, "rewards/margins": 0.021240225061774254, "rewards/rejected": -0.07406537234783173, "step": 1180 }, { "epoch": 0.93, "learning_rate": 6.454717331994542e-08, "logits/chosen": 0.18755567073822021, "logits/rejected": 0.203456848859787, "logps/chosen": -321.9530334472656, "logps/rejected": -328.56524658203125, "loss": 0.0416, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04693279415369034, "rewards/margins": 0.028221463784575462, "rewards/rejected": -0.07515425980091095, "step": 1190 }, { "epoch": 0.94, "learning_rate": 4.9979524127052595e-08, "logits/chosen": 0.20014750957489014, "logits/rejected": 0.21643492579460144, "logps/chosen": -325.4107666015625, "logps/rejected": -321.22515869140625, "loss": 0.0459, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.04905826970934868, "rewards/margins": 0.03321439027786255, "rewards/rejected": -0.08227266371250153, "step": 1200 }, { "epoch": 0.94, "eval_logits/chosen": 0.18975894153118134, "eval_logits/rejected": 0.22052054107189178, "eval_logps/chosen": -325.7256774902344, "eval_logps/rejected": -338.00421142578125, "eval_loss": 0.04012565314769745, "eval_rewards/accuracies": 0.6032934188842773, "eval_rewards/chosen": -0.05575716868042946, "eval_rewards/margins": 0.03380833566188812, "eval_rewards/rejected": -0.08956550806760788, "eval_runtime": 374.5923, "eval_samples_per_second": 5.339, "eval_steps_per_second": 0.446, "step": 1200 }, { "epoch": 0.95, "learning_rate": 3.725617355085476e-08, "logits/chosen": 0.22028300166130066, "logits/rejected": 0.26091286540031433, "logps/chosen": -323.34808349609375, "logps/rejected": -322.25201416015625, "loss": 0.038, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.049160152673721313, "rewards/margins": 0.03982594236731529, "rewards/rejected": -0.08898608386516571, "step": 1210 }, { "epoch": 0.96, "learning_rate": 2.63866993308437e-08, "logits/chosen": 0.21131488680839539, "logits/rejected": 0.25622323155403137, "logps/chosen": -347.6703796386719, "logps/rejected": -305.52484130859375, "loss": 0.0427, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.04987090080976486, "rewards/margins": 0.02888522483408451, "rewards/rejected": -0.07875612378120422, "step": 1220 }, { "epoch": 0.97, "learning_rate": 1.737928366650099e-08, "logits/chosen": 0.1875266432762146, "logits/rejected": 0.23892006278038025, "logps/chosen": -362.18450927734375, "logps/rejected": -347.82489013671875, "loss": 0.0424, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05079431086778641, "rewards/margins": 0.03944983333349228, "rewards/rejected": -0.09024415165185928, "step": 1230 }, { "epoch": 0.97, "learning_rate": 1.0240707057995735e-08, "logits/chosen": 0.1739097535610199, "logits/rejected": 0.18980395793914795, "logps/chosen": -297.0783996582031, "logps/rejected": -328.2581787109375, "loss": 0.039, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.04551629722118378, "rewards/margins": 0.04395337030291557, "rewards/rejected": -0.08946967869997025, "step": 1240 }, { "epoch": 0.98, "learning_rate": 4.976343202034717e-09, "logits/chosen": 0.20210778713226318, "logits/rejected": 0.21424946188926697, "logps/chosen": -314.2582092285156, "logps/rejected": -324.7920837402344, "loss": 0.0368, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.048636116087436676, "rewards/margins": 0.03599988669157028, "rewards/rejected": -0.08463600277900696, "step": 1250 }, { "epoch": 0.99, "learning_rate": 1.5901549467139953e-09, "logits/chosen": 0.2138410359621048, "logits/rejected": 0.24039845168590546, "logps/chosen": -289.2809143066406, "logps/rejected": -290.346923828125, "loss": 0.0383, "rewards/accuracies": 0.625, "rewards/chosen": -0.045187849551439285, "rewards/margins": 0.036808837205171585, "rewards/rejected": -0.08199669420719147, "step": 1260 }, { "epoch": 1.0, "learning_rate": 8.469130840960127e-11, "logits/chosen": 0.22721895575523376, "logits/rejected": 0.24769964814186096, "logps/chosen": -305.83270263671875, "logps/rejected": -334.0370788574219, "loss": 0.0414, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.048570211976766586, "rewards/margins": 0.0430789478123188, "rewards/rejected": -0.09164915978908539, "step": 1270 }, { "epoch": 1.0, "step": 1273, "total_flos": 0.0, "train_loss": 0.044965725131654775, "train_runtime": 22244.9719, "train_samples_per_second": 2.748, "train_steps_per_second": 0.057 } ], "logging_steps": 10, "max_steps": 1273, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }