{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 7.936507936507937e-08, "logits/chosen": -0.23160123825073242, "logits/rejected": -0.22930899262428284, "logps/chosen": -3480.55908203125, "logps/rejected": -3249.86767578125, "loss": 0.142, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 7.936507936507937e-07, "logits/chosen": -0.11704830080270767, "logits/rejected": -0.1438639611005783, "logps/chosen": -3806.41015625, "logps/rejected": -3513.546630859375, "loss": 0.1495, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": 4.7100489609874785e-05, "rewards/margins": 5.338901246432215e-05, "rewards/rejected": -6.288506483542733e-06, "step": 10 }, { "epoch": 0.03, "learning_rate": 1.5873015873015873e-06, "logits/chosen": -0.10943388938903809, "logits/rejected": -0.17128220200538635, "logps/chosen": -3805.66162109375, "logps/rejected": -3291.211669921875, "loss": 0.1431, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00036988104693591595, "rewards/margins": 0.00021256152831483632, "rewards/rejected": 0.00015731954772491008, "step": 20 }, { "epoch": 0.05, "learning_rate": 2.380952380952381e-06, "logits/chosen": -0.12024154514074326, "logits/rejected": -0.18497827649116516, "logps/chosen": -3670.22998046875, "logps/rejected": -3192.833251953125, "loss": 0.1446, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.003737862454727292, "rewards/margins": 0.002499702852219343, "rewards/rejected": 0.0012381596025079489, "step": 30 }, { "epoch": 0.06, "learning_rate": 3.1746031746031746e-06, "logits/chosen": -0.04808305948972702, "logits/rejected": -0.11066482216119766, "logps/chosen": -3818.647705078125, "logps/rejected": -3423.680908203125, "loss": 0.1288, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.016436848789453506, "rewards/margins": 0.013193143531680107, "rewards/rejected": 0.0032437038607895374, "step": 40 }, { "epoch": 0.08, "learning_rate": 3.968253968253968e-06, "logits/chosen": -0.04479307308793068, "logits/rejected": -0.11236592382192612, "logps/chosen": -3831.635498046875, "logps/rejected": -3337.180419921875, "loss": 0.1342, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.03949115425348282, "rewards/margins": 0.035656191408634186, "rewards/rejected": 0.0038349695969372988, "step": 50 }, { "epoch": 0.1, "learning_rate": 4.761904761904762e-06, "logits/chosen": 0.016891960054636, "logits/rejected": -0.06644205749034882, "logps/chosen": -3750.132080078125, "logps/rejected": -3339.59716796875, "loss": 0.1044, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.08823395520448685, "rewards/margins": 0.08895573765039444, "rewards/rejected": -0.0007217812235467136, "step": 60 }, { "epoch": 0.11, "learning_rate": 4.998086282661188e-06, "logits/chosen": 0.012894670478999615, "logits/rejected": -0.07027357816696167, "logps/chosen": -3487.27490234375, "logps/rejected": -3026.343994140625, "loss": 0.1132, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.14417621493339539, "rewards/margins": 0.09235814958810806, "rewards/rejected": 0.05181806534528732, "step": 70 }, { "epoch": 0.13, "learning_rate": 4.988720025682995e-06, "logits/chosen": 0.07670871168375015, "logits/rejected": 0.027111554518342018, "logps/chosen": -3489.33447265625, "logps/rejected": -3210.24951171875, "loss": 0.1089, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.18418748676776886, "rewards/margins": 0.1092456802725792, "rewards/rejected": 0.07494180649518967, "step": 80 }, { "epoch": 0.14, "learning_rate": 4.9715789537359126e-06, "logits/chosen": 0.13590265810489655, "logits/rejected": 0.06752701103687286, "logps/chosen": -3365.836669921875, "logps/rejected": -3066.538330078125, "loss": 0.0979, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.2274446189403534, "rewards/margins": 0.12401048094034195, "rewards/rejected": 0.10343412309885025, "step": 90 }, { "epoch": 0.16, "learning_rate": 4.946716615897932e-06, "logits/chosen": 0.13531382381916046, "logits/rejected": 0.05753606557846069, "logps/chosen": -3525.33837890625, "logps/rejected": -3141.802490234375, "loss": 0.1015, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.2502983510494232, "rewards/margins": 0.10583023726940155, "rewards/rejected": 0.14446814358234406, "step": 100 }, { "epoch": 0.16, "eval_logits/chosen": -0.07401716709136963, "eval_logits/rejected": -0.07821200788021088, "eval_logps/chosen": -2059.03076171875, "eval_logps/rejected": -1741.1519775390625, "eval_loss": 0.05870958790183067, "eval_rewards/accuracies": 0.5009999871253967, "eval_rewards/chosen": 0.15737517178058624, "eval_rewards/margins": 0.02211475744843483, "eval_rewards/rejected": 0.13526040315628052, "eval_runtime": 510.3402, "eval_samples_per_second": 3.919, "eval_steps_per_second": 0.98, "step": 100 }, { "epoch": 0.18, "learning_rate": 4.9142106826480114e-06, "logits/chosen": 0.1461654156446457, "logits/rejected": 0.052746810019016266, "logps/chosen": -3436.0546875, "logps/rejected": -2908.600341796875, "loss": 0.1062, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.2711651027202606, "rewards/margins": 0.10762874782085419, "rewards/rejected": 0.16353633999824524, "step": 110 }, { "epoch": 0.19, "learning_rate": 4.874162703221823e-06, "logits/chosen": 0.1735331267118454, "logits/rejected": 0.09441524744033813, "logps/chosen": -3563.86962890625, "logps/rejected": -3134.721923828125, "loss": 0.0962, "rewards/accuracies": 0.78125, "rewards/chosen": 0.33711010217666626, "rewards/margins": 0.11728501319885254, "rewards/rejected": 0.21982510387897491, "step": 120 }, { "epoch": 0.21, "learning_rate": 4.826697788369752e-06, "logits/chosen": 0.1857551634311676, "logits/rejected": 0.1031508594751358, "logps/chosen": -3260.365966796875, "logps/rejected": -2722.746337890625, "loss": 0.112, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3159078359603882, "rewards/margins": 0.09563833475112915, "rewards/rejected": 0.22026947140693665, "step": 130 }, { "epoch": 0.22, "learning_rate": 4.7719642195082224e-06, "logits/chosen": 0.16249768435955048, "logits/rejected": 0.08739353716373444, "logps/chosen": -3427.86962890625, "logps/rejected": -3068.870361328125, "loss": 0.1026, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.38163864612579346, "rewards/margins": 0.13598579168319702, "rewards/rejected": 0.24565282464027405, "step": 140 }, { "epoch": 0.24, "learning_rate": 4.710132985485355e-06, "logits/chosen": 0.1962042599916458, "logits/rejected": 0.11324436962604523, "logps/chosen": -3498.05859375, "logps/rejected": -3110.60302734375, "loss": 0.0823, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.3349873423576355, "rewards/margins": 0.14911258220672607, "rewards/rejected": 0.1858747899532318, "step": 150 }, { "epoch": 0.26, "learning_rate": 4.641397248408122e-06, "logits/chosen": 0.23380589485168457, "logits/rejected": 0.15552836656570435, "logps/chosen": -3649.421875, "logps/rejected": -3149.082763671875, "loss": 0.1018, "rewards/accuracies": 0.78125, "rewards/chosen": 0.319149911403656, "rewards/margins": 0.12320425361394882, "rewards/rejected": 0.1959456205368042, "step": 160 }, { "epoch": 0.27, "learning_rate": 4.5659717401997655e-06, "logits/chosen": 0.22691388428211212, "logits/rejected": 0.15275689959526062, "logps/chosen": -3225.24560546875, "logps/rejected": -2747.160400390625, "loss": 0.083, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.29939597845077515, "rewards/margins": 0.14731459319591522, "rewards/rejected": 0.15208135545253754, "step": 170 }, { "epoch": 0.29, "learning_rate": 4.4840920917726425e-06, "logits/chosen": 0.18859973549842834, "logits/rejected": 0.0886358767747879, "logps/chosen": -3401.942138671875, "logps/rejected": -2845.151611328125, "loss": 0.1065, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.34265846014022827, "rewards/margins": 0.14708396792411804, "rewards/rejected": 0.19557449221611023, "step": 180 }, { "epoch": 0.3, "learning_rate": 4.396014096912182e-06, "logits/chosen": 0.22746041417121887, "logits/rejected": 0.12188839912414551, "logps/chosen": -3502.93798828125, "logps/rejected": -3021.38720703125, "loss": 0.1044, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.345414400100708, "rewards/margins": 0.12817564606666565, "rewards/rejected": 0.21723875403404236, "step": 190 }, { "epoch": 0.32, "learning_rate": 4.302012913171584e-06, "logits/chosen": 0.18395040929317474, "logits/rejected": 0.11513110250234604, "logps/chosen": -3445.876220703125, "logps/rejected": -3028.39013671875, "loss": 0.1063, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.3484806418418884, "rewards/margins": 0.12718692421913147, "rewards/rejected": 0.22129373252391815, "step": 200 }, { "epoch": 0.32, "eval_logits/chosen": 0.03075716644525528, "eval_logits/rejected": 0.024621596559882164, "eval_logps/chosen": -2008.238037109375, "eval_logps/rejected": -1698.23681640625, "eval_loss": 0.0662032961845398, "eval_rewards/accuracies": 0.5034999847412109, "eval_rewards/chosen": 0.20816779136657715, "eval_rewards/margins": 0.02999204210937023, "eval_rewards/rejected": 0.17817574739456177, "eval_runtime": 510.2322, "eval_samples_per_second": 3.92, "eval_steps_per_second": 0.98, "step": 200 }, { "epoch": 0.34, "learning_rate": 4.202382202273702e-06, "logits/chosen": 0.2171505242586136, "logits/rejected": 0.15343636274337769, "logps/chosen": -3308.73681640625, "logps/rejected": -2953.523681640625, "loss": 0.1031, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.33611437678337097, "rewards/margins": 0.12898366153240204, "rewards/rejected": 0.20713074505329132, "step": 210 }, { "epoch": 0.35, "learning_rate": 4.097433212705492e-06, "logits/chosen": 0.24402043223381042, "logits/rejected": 0.1750088632106781, "logps/chosen": -3587.706298828125, "logps/rejected": -3236.89990234375, "loss": 0.0884, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.3563935160636902, "rewards/margins": 0.1363053321838379, "rewards/rejected": 0.2200881987810135, "step": 220 }, { "epoch": 0.37, "learning_rate": 3.987493807371033e-06, "logits/chosen": 0.2694666087627411, "logits/rejected": 0.19900746643543243, "logps/chosen": -3532.60791015625, "logps/rejected": -3166.88232421875, "loss": 0.0953, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.37613242864608765, "rewards/margins": 0.14061644673347473, "rewards/rejected": 0.23551598191261292, "step": 230 }, { "epoch": 0.38, "learning_rate": 3.872907439340758e-06, "logits/chosen": 0.2892386019229889, "logits/rejected": 0.2168089896440506, "logps/chosen": -3366.72607421875, "logps/rejected": -2973.12109375, "loss": 0.0925, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3254829943180084, "rewards/margins": 0.1489279717206955, "rewards/rejected": 0.17655499279499054, "step": 240 }, { "epoch": 0.4, "learning_rate": 3.75403207889666e-06, "logits/chosen": 0.31099653244018555, "logits/rejected": 0.25657492876052856, "logps/chosen": -3562.685546875, "logps/rejected": -3312.733154296875, "loss": 0.1022, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3441573977470398, "rewards/margins": 0.13442568480968475, "rewards/rejected": 0.20973166823387146, "step": 250 }, { "epoch": 0.42, "learning_rate": 3.631239095225417e-06, "logits/chosen": 0.29758816957473755, "logits/rejected": 0.21104487776756287, "logps/chosen": -3387.165283203125, "logps/rejected": -2942.01806640625, "loss": 0.1014, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.3527730405330658, "rewards/margins": 0.1494169980287552, "rewards/rejected": 0.2033560574054718, "step": 260 }, { "epoch": 0.43, "learning_rate": 3.5049120962530608e-06, "logits/chosen": 0.249090313911438, "logits/rejected": 0.2008696049451828, "logps/chosen": -3297.40185546875, "logps/rejected": -3002.595703125, "loss": 0.099, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.3519158959388733, "rewards/margins": 0.11539413779973984, "rewards/rejected": 0.23652176558971405, "step": 270 }, { "epoch": 0.45, "learning_rate": 3.3754457302455464e-06, "logits/chosen": 0.26629170775413513, "logits/rejected": 0.23264212906360626, "logps/chosen": -3481.889892578125, "logps/rejected": -3254.90234375, "loss": 0.0962, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.3905375301837921, "rewards/margins": 0.14240308105945587, "rewards/rejected": 0.24813446402549744, "step": 280 }, { "epoch": 0.46, "learning_rate": 3.2432444529190714e-06, "logits/chosen": 0.2690260112285614, "logits/rejected": 0.21250610053539276, "logps/chosen": -3407.96533203125, "logps/rejected": -3032.994873046875, "loss": 0.0939, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.3550907075405121, "rewards/margins": 0.14574530720710754, "rewards/rejected": 0.20934538543224335, "step": 290 }, { "epoch": 0.48, "learning_rate": 3.1087212639117057e-06, "logits/chosen": 0.3150966465473175, "logits/rejected": 0.253526508808136, "logps/chosen": -3427.98974609375, "logps/rejected": -3210.477294921875, "loss": 0.089, "rewards/accuracies": 0.8125, "rewards/chosen": 0.35706156492233276, "rewards/margins": 0.13845902681350708, "rewards/rejected": 0.21860253810882568, "step": 300 }, { "epoch": 0.48, "eval_logits/chosen": 0.1094050332903862, "eval_logits/rejected": 0.10683323442935944, "eval_logps/chosen": -2008.3831787109375, "eval_logps/rejected": -1698.5457763671875, "eval_loss": 0.06738738715648651, "eval_rewards/accuracies": 0.5024999976158142, "eval_rewards/chosen": 0.20802262425422668, "eval_rewards/margins": 0.03015574812889099, "eval_rewards/rejected": 0.1778668612241745, "eval_runtime": 510.4867, "eval_samples_per_second": 3.918, "eval_steps_per_second": 0.979, "step": 300 }, { "epoch": 0.5, "learning_rate": 2.9722964165636263e-06, "logits/chosen": 0.3106716573238373, "logits/rejected": 0.23689217865467072, "logps/chosen": -3217.514892578125, "logps/rejected": -2867.49072265625, "loss": 0.0983, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.35535794496536255, "rewards/margins": 0.15059463679790497, "rewards/rejected": 0.20476332306861877, "step": 310 }, { "epoch": 0.51, "learning_rate": 2.8343961050366275e-06, "logits/chosen": 0.3136465549468994, "logits/rejected": 0.27693477272987366, "logps/chosen": -3478.74365234375, "logps/rejected": -3246.188720703125, "loss": 0.0922, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.36403846740722656, "rewards/margins": 0.13698884844779968, "rewards/rejected": 0.22704961895942688, "step": 320 }, { "epoch": 0.53, "learning_rate": 2.695451132874385e-06, "logits/chosen": 0.31543809175491333, "logits/rejected": 0.2502642273902893, "logps/chosen": -3176.200439453125, "logps/rejected": -2867.706298828125, "loss": 0.0749, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.34216076135635376, "rewards/margins": 0.13852015137672424, "rewards/rejected": 0.20364060997962952, "step": 330 }, { "epoch": 0.54, "learning_rate": 2.5558955671628964e-06, "logits/chosen": 0.32941514253616333, "logits/rejected": 0.28318849205970764, "logps/chosen": -3559.00244140625, "logps/rejected": -3265.51611328125, "loss": 0.0784, "rewards/accuracies": 0.8125, "rewards/chosen": 0.41226086020469666, "rewards/margins": 0.1618272215127945, "rewards/rejected": 0.25043365359306335, "step": 340 }, { "epoch": 0.56, "learning_rate": 2.4161653824955654e-06, "logits/chosen": 0.3133377432823181, "logits/rejected": 0.22994990646839142, "logps/chosen": -3413.45068359375, "logps/rejected": -2901.52197265625, "loss": 0.0864, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.380751371383667, "rewards/margins": 0.1573660671710968, "rewards/rejected": 0.2233853042125702, "step": 350 }, { "epoch": 0.58, "learning_rate": 2.2766970989791697e-06, "logits/chosen": 0.3129468560218811, "logits/rejected": 0.26054853200912476, "logps/chosen": -3439.70654296875, "logps/rejected": -3168.26904296875, "loss": 0.0866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.3977072536945343, "rewards/margins": 0.15224188566207886, "rewards/rejected": 0.24546535313129425, "step": 360 }, { "epoch": 0.59, "learning_rate": 2.1379264185356545e-06, "logits/chosen": 0.33266085386276245, "logits/rejected": 0.2755834758281708, "logps/chosen": -3541.54736328125, "logps/rejected": -3230.013671875, "loss": 0.0873, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3881675899028778, "rewards/margins": 0.15043120086193085, "rewards/rejected": 0.23773638904094696, "step": 370 }, { "epoch": 0.61, "learning_rate": 2.000286863759934e-06, "logits/chosen": 0.333050400018692, "logits/rejected": 0.2647174596786499, "logps/chosen": -3345.2734375, "logps/rejected": -2988.4150390625, "loss": 0.0888, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3639519512653351, "rewards/margins": 0.14749667048454285, "rewards/rejected": 0.21645526587963104, "step": 380 }, { "epoch": 0.62, "learning_rate": 1.8642084235859764e-06, "logits/chosen": 0.31277281045913696, "logits/rejected": 0.2777637541294098, "logps/chosen": -3408.678466796875, "logps/rejected": -3292.940673828125, "loss": 0.0876, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.36375516653060913, "rewards/margins": 0.14496205747127533, "rewards/rejected": 0.218793123960495, "step": 390 }, { "epoch": 0.64, "learning_rate": 1.7301162099921013e-06, "logits/chosen": 0.340940922498703, "logits/rejected": 0.23637180030345917, "logps/chosen": -3567.47021484375, "logps/rejected": -3109.775634765625, "loss": 0.0836, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.34295541048049927, "rewards/margins": 0.16309013962745667, "rewards/rejected": 0.1798652708530426, "step": 400 }, { "epoch": 0.64, "eval_logits/chosen": 0.09923748672008514, "eval_logits/rejected": 0.09380433708429337, "eval_logps/chosen": -2003.3719482421875, "eval_logps/rejected": -1694.8785400390625, "eval_loss": 0.06775122135877609, "eval_rewards/accuracies": 0.5044999718666077, "eval_rewards/chosen": 0.2130337953567505, "eval_rewards/margins": 0.03149950131773949, "eval_rewards/rejected": 0.1815343052148819, "eval_runtime": 510.3736, "eval_samples_per_second": 3.919, "eval_steps_per_second": 0.98, "step": 400 }, { "epoch": 0.66, "learning_rate": 1.5984291299420117e-06, "logits/chosen": 0.27606526017189026, "logits/rejected": 0.17855815589427948, "logps/chosen": -3422.63818359375, "logps/rejected": -2952.826416015625, "loss": 0.0769, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.3608594834804535, "rewards/margins": 0.17766425013542175, "rewards/rejected": 0.18319520354270935, "step": 410 }, { "epoch": 0.67, "learning_rate": 1.4695585767104092e-06, "logits/chosen": 0.28741782903671265, "logits/rejected": 0.2132318764925003, "logps/chosen": -3324.41796875, "logps/rejected": -2952.7109375, "loss": 0.1042, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.3597557246685028, "rewards/margins": 0.1408061981201172, "rewards/rejected": 0.2189495861530304, "step": 420 }, { "epoch": 0.69, "learning_rate": 1.3439071446815452e-06, "logits/chosen": 0.32707011699676514, "logits/rejected": 0.24214036762714386, "logps/chosen": -3380.89404296875, "logps/rejected": -2977.55419921875, "loss": 0.0882, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3879029154777527, "rewards/margins": 0.1662425994873047, "rewards/rejected": 0.2216603308916092, "step": 430 }, { "epoch": 0.7, "learning_rate": 1.2218673716356919e-06, "logits/chosen": 0.33859843015670776, "logits/rejected": 0.29151710867881775, "logps/chosen": -3435.18701171875, "logps/rejected": -3207.23095703125, "loss": 0.0851, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3895798325538635, "rewards/margins": 0.17013905942440033, "rewards/rejected": 0.2194407880306244, "step": 440 }, { "epoch": 0.72, "learning_rate": 1.103820512452661e-06, "logits/chosen": 0.32453230023384094, "logits/rejected": 0.21598629653453827, "logps/chosen": -3430.122314453125, "logps/rejected": -2931.8564453125, "loss": 0.0862, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.37726718187332153, "rewards/margins": 0.16498428583145142, "rewards/rejected": 0.2122829258441925, "step": 450 }, { "epoch": 0.74, "learning_rate": 9.901353480633468e-07, "logits/chosen": 0.33777526021003723, "logits/rejected": 0.24668912589550018, "logps/chosen": -3424.164794921875, "logps/rejected": -2975.840087890625, "loss": 0.0831, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.3875207304954529, "rewards/margins": 0.15877413749694824, "rewards/rejected": 0.22874662280082703, "step": 460 }, { "epoch": 0.75, "learning_rate": 8.811670333701544e-07, "logits/chosen": 0.34649503231048584, "logits/rejected": 0.2827189862728119, "logps/chosen": -3244.47607421875, "logps/rejected": -2961.621826171875, "loss": 0.0838, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.3770023584365845, "rewards/margins": 0.13464799523353577, "rewards/rejected": 0.2423543930053711, "step": 470 }, { "epoch": 0.77, "learning_rate": 7.772559877354341e-07, "logits/chosen": 0.34769752621650696, "logits/rejected": 0.2576979994773865, "logps/chosen": -3430.348876953125, "logps/rejected": -2924.820068359375, "loss": 0.0947, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.3593669831752777, "rewards/margins": 0.13479986786842346, "rewards/rejected": 0.22456713020801544, "step": 480 }, { "epoch": 0.78, "learning_rate": 6.787268315040604e-07, "logits/chosen": 0.29419490694999695, "logits/rejected": 0.22983236610889435, "logps/chosen": -3424.70849609375, "logps/rejected": -3202.71044921875, "loss": 0.1013, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.3848579525947571, "rewards/margins": 0.16225770115852356, "rewards/rejected": 0.22260025143623352, "step": 490 }, { "epoch": 0.8, "learning_rate": 5.858873718824829e-07, "logits/chosen": 0.3868134319782257, "logits/rejected": 0.3190605640411377, "logps/chosen": -3566.057861328125, "logps/rejected": -3229.52783203125, "loss": 0.0823, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.40095120668411255, "rewards/margins": 0.1678159087896347, "rewards/rejected": 0.23313529789447784, "step": 500 }, { "epoch": 0.8, "eval_logits/chosen": 0.11726534366607666, "eval_logits/rejected": 0.11339528858661652, "eval_logps/chosen": -2001.7003173828125, "eval_logps/rejected": -1693.5775146484375, "eval_loss": 0.06754996627569199, "eval_rewards/accuracies": 0.503000020980835, "eval_rewards/chosen": 0.21470554172992706, "eval_rewards/margins": 0.03187066689133644, "eval_rewards/rejected": 0.18283487856388092, "eval_runtime": 510.0656, "eval_samples_per_second": 3.921, "eval_steps_per_second": 0.98, "step": 500 }, { "epoch": 0.82, "learning_rate": 4.990276413423817e-07, "logits/chosen": 0.3040073812007904, "logits/rejected": 0.21434268355369568, "logps/chosen": -3250.26806640625, "logps/rejected": -2840.75439453125, "loss": 0.091, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.3842509388923645, "rewards/margins": 0.17019297182559967, "rewards/rejected": 0.21405795216560364, "step": 510 }, { "epoch": 0.83, "learning_rate": 4.184189915529796e-07, "logits/chosen": 0.2848634123802185, "logits/rejected": 0.26763054728507996, "logps/chosen": -3411.203857421875, "logps/rejected": -3323.831298828125, "loss": 0.086, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.3964161276817322, "rewards/margins": 0.14584572613239288, "rewards/rejected": 0.2505704164505005, "step": 520 }, { "epoch": 0.85, "learning_rate": 3.4431324567258176e-07, "logits/chosen": 0.3816804885864258, "logits/rejected": 0.27739661931991577, "logps/chosen": -3334.935546875, "logps/rejected": -2875.797607421875, "loss": 0.0886, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.3654884994029999, "rewards/margins": 0.1512555629014969, "rewards/rejected": 0.2142329216003418, "step": 530 }, { "epoch": 0.86, "learning_rate": 2.769419116476052e-07, "logits/chosen": 0.32581084966659546, "logits/rejected": 0.21518585085868835, "logps/chosen": -3319.97119140625, "logps/rejected": -2907.535888671875, "loss": 0.0828, "rewards/accuracies": 0.8125, "rewards/chosen": 0.382728636264801, "rewards/margins": 0.15941424667835236, "rewards/rejected": 0.22331435978412628, "step": 540 }, { "epoch": 0.88, "learning_rate": 2.1651545897676512e-07, "logits/chosen": 0.3343524932861328, "logits/rejected": 0.25169938802719116, "logps/chosen": -3317.521484375, "logps/rejected": -2890.90771484375, "loss": 0.0925, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.38326671719551086, "rewards/margins": 0.16836629807949066, "rewards/rejected": 0.214900404214859, "step": 550 }, { "epoch": 0.9, "learning_rate": 1.6322266119983222e-07, "logits/chosen": 0.3282504975795746, "logits/rejected": 0.26233971118927, "logps/chosen": -3273.356689453125, "logps/rejected": -2951.46875, "loss": 0.0996, "rewards/accuracies": 0.75, "rewards/chosen": 0.3491683006286621, "rewards/margins": 0.13011577725410461, "rewards/rejected": 0.2190525084733963, "step": 560 }, { "epoch": 0.91, "learning_rate": 1.1723000616502167e-07, "logits/chosen": 0.3535611033439636, "logits/rejected": 0.28358811140060425, "logps/chosen": -3421.57958984375, "logps/rejected": -3138.271484375, "loss": 0.0916, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.37185394763946533, "rewards/margins": 0.15074050426483154, "rewards/rejected": 0.2211134135723114, "step": 570 }, { "epoch": 0.93, "learning_rate": 7.868117591737585e-08, "logits/chosen": 0.3682214617729187, "logits/rejected": 0.2664671540260315, "logps/chosen": -3439.334716796875, "logps/rejected": -3021.247314453125, "loss": 0.0863, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.3696097433567047, "rewards/margins": 0.15690948069095612, "rewards/rejected": 0.2127002775669098, "step": 580 }, { "epoch": 0.94, "learning_rate": 4.769659783295383e-08, "logits/chosen": 0.2923109531402588, "logits/rejected": 0.25769466161727905, "logps/chosen": -3376.895751953125, "logps/rejected": -3227.916748046875, "loss": 0.0956, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.38735488057136536, "rewards/margins": 0.1430138349533081, "rewards/rejected": 0.24434101581573486, "step": 590 }, { "epoch": 0.96, "learning_rate": 2.4373068401120358e-08, "logits/chosen": 0.3152271807193756, "logits/rejected": 0.21970419585704803, "logps/chosen": -3313.78125, "logps/rejected": -2851.50830078125, "loss": 0.095, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3634759187698364, "rewards/margins": 0.14873453974723816, "rewards/rejected": 0.21474137902259827, "step": 600 }, { "epoch": 0.96, "eval_logits/chosen": 0.12296131998300552, "eval_logits/rejected": 0.11926081776618958, "eval_logps/chosen": -2002.1124267578125, "eval_logps/rejected": -1693.8876953125, "eval_loss": 0.06743565201759338, "eval_rewards/accuracies": 0.5024999976158142, "eval_rewards/chosen": 0.2142937034368515, "eval_rewards/margins": 0.03176878020167351, "eval_rewards/rejected": 0.1825249046087265, "eval_runtime": 510.4313, "eval_samples_per_second": 3.918, "eval_steps_per_second": 0.98, "step": 600 }, { "epoch": 0.98, "learning_rate": 8.78345083022425e-09, "logits/chosen": 0.32932108640670776, "logits/rejected": 0.2576533555984497, "logps/chosen": -3335.27587890625, "logps/rejected": -3016.89599609375, "loss": 0.0844, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.37068045139312744, "rewards/margins": 0.1670517921447754, "rewards/rejected": 0.20362868905067444, "step": 610 }, { "epoch": 0.99, "learning_rate": 9.764474213677654e-10, "logits/chosen": 0.3449278175830841, "logits/rejected": 0.29252105951309204, "logps/chosen": -3274.77978515625, "logps/rejected": -2965.578125, "loss": 0.0871, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.37701472640037537, "rewards/margins": 0.1436210572719574, "rewards/rejected": 0.23339366912841797, "step": 620 }, { "epoch": 1.0, "step": 625, "total_flos": 0.0, "train_loss": 0.09710332241058349, "train_runtime": 8719.9742, "train_samples_per_second": 1.147, "train_steps_per_second": 0.072 } ], "logging_steps": 10, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }