{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 500, "global_step": 1910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.617801047120419e-08, "logits/chosen": 0.8436492085456848, "logits/rejected": 1.1560968160629272, "logps/chosen": -330.2955322265625, "logps/rejected": -239.8994140625, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 2.617801047120419e-07, "logits/chosen": 1.0090492963790894, "logits/rejected": 1.0627849102020264, "logps/chosen": -279.4153137207031, "logps/rejected": -249.27322387695312, "loss": 0.5, "rewards/accuracies": 0.375, "rewards/chosen": -8.76396952662617e-05, "rewards/margins": -9.456619591219351e-05, "rewards/rejected": 6.926496553205652e-06, "step": 10 }, { "epoch": 0.01, "learning_rate": 5.235602094240838e-07, "logits/chosen": 1.0303412675857544, "logits/rejected": 1.0532195568084717, "logps/chosen": -321.72723388671875, "logps/rejected": -270.56353759765625, "loss": 0.5, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -6.834287341916934e-05, "rewards/margins": -4.8897858505370095e-05, "rewards/rejected": -1.9445011275820434e-05, "step": 20 }, { "epoch": 0.02, "learning_rate": 7.853403141361258e-07, "logits/chosen": 1.002454400062561, "logits/rejected": 1.06557297706604, "logps/chosen": -252.0704345703125, "logps/rejected": -246.32705688476562, "loss": 0.5, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 1.5753510524518788e-05, "rewards/margins": 5.4146301408763975e-05, "rewards/rejected": -3.83927981602028e-05, "step": 30 }, { "epoch": 0.02, "learning_rate": 1.0471204188481676e-06, "logits/chosen": 1.0041682720184326, "logits/rejected": 1.1504443883895874, "logps/chosen": -235.38217163085938, "logps/rejected": -230.2617645263672, "loss": 0.5, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 7.3400560722802766e-06, "rewards/margins": 2.9947289021947654e-06, "rewards/rejected": 4.3453355829115026e-06, "step": 40 }, { "epoch": 0.03, "learning_rate": 1.3089005235602096e-06, "logits/chosen": 0.9595837593078613, "logits/rejected": 1.0130202770233154, "logps/chosen": -294.26007080078125, "logps/rejected": -249.2256317138672, "loss": 0.5, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.00017269175441469997, "rewards/margins": 9.17307916097343e-05, "rewards/rejected": 8.096096280496567e-05, "step": 50 }, { "epoch": 0.03, "learning_rate": 1.5706806282722515e-06, "logits/chosen": 0.9245076179504395, "logits/rejected": 1.023485779762268, "logps/chosen": -242.47689819335938, "logps/rejected": -230.57373046875, "loss": 0.5, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0002746728132478893, "rewards/margins": 0.00012865502503700554, "rewards/rejected": 0.00014601778821088374, "step": 60 }, { "epoch": 0.04, "learning_rate": 1.8324607329842933e-06, "logits/chosen": 0.9357272386550903, "logits/rejected": 1.0410839319229126, "logps/chosen": -257.8460388183594, "logps/rejected": -238.37973022460938, "loss": 0.5, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.00047300319420173764, "rewards/margins": 0.00019578025967348367, "rewards/rejected": 0.0002772229490801692, "step": 70 }, { "epoch": 0.04, "learning_rate": 2.094240837696335e-06, "logits/chosen": 1.0097007751464844, "logits/rejected": 1.0268934965133667, "logps/chosen": -263.69903564453125, "logps/rejected": -256.5643615722656, "loss": 0.4999, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0005936628440394998, "rewards/margins": 0.00022301140415947884, "rewards/rejected": 0.0003706514835357666, "step": 80 }, { "epoch": 0.05, "learning_rate": 2.356020942408377e-06, "logits/chosen": 0.9857368469238281, "logits/rejected": 1.050782561302185, "logps/chosen": -252.1823272705078, "logps/rejected": -253.6891326904297, "loss": 0.4999, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.000997263239696622, "rewards/margins": 0.00041304732440039515, "rewards/rejected": 0.0005842159152962267, "step": 90 }, { "epoch": 0.05, "learning_rate": 2.617801047120419e-06, "logits/chosen": 1.0416964292526245, "logits/rejected": 1.0389362573623657, "logps/chosen": -254.76235961914062, "logps/rejected": -224.39559936523438, "loss": 0.4998, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.0013410584069788456, "rewards/margins": 0.0005450797034427524, "rewards/rejected": 0.0007959787035360932, "step": 100 }, { "epoch": 0.06, "learning_rate": 2.8795811518324613e-06, "logits/chosen": 1.0654562711715698, "logits/rejected": 1.1301515102386475, "logps/chosen": -294.14031982421875, "logps/rejected": -258.11077880859375, "loss": 0.4998, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.001866974518634379, "rewards/margins": 0.0006641600048169494, "rewards/rejected": 0.0012028145138174295, "step": 110 }, { "epoch": 0.06, "learning_rate": 3.141361256544503e-06, "logits/chosen": 0.9807151556015015, "logits/rejected": 1.125035285949707, "logps/chosen": -303.8504943847656, "logps/rejected": -249.7647705078125, "loss": 0.4997, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.002772308187559247, "rewards/margins": 0.0014164599124342203, "rewards/rejected": 0.001355848042294383, "step": 120 }, { "epoch": 0.07, "learning_rate": 3.403141361256545e-06, "logits/chosen": 1.096975564956665, "logits/rejected": 1.1348248720169067, "logps/chosen": -278.3834533691406, "logps/rejected": -245.82968139648438, "loss": 0.4995, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.003960819449275732, "rewards/margins": 0.0022526984103024006, "rewards/rejected": 0.0017081208061426878, "step": 130 }, { "epoch": 0.07, "learning_rate": 3.6649214659685865e-06, "logits/chosen": 1.0514932870864868, "logits/rejected": 1.1338948011398315, "logps/chosen": -275.76031494140625, "logps/rejected": -258.5254821777344, "loss": 0.4995, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.004108738619834185, "rewards/margins": 0.001808557310141623, "rewards/rejected": 0.0023001814261078835, "step": 140 }, { "epoch": 0.08, "learning_rate": 3.926701570680629e-06, "logits/chosen": 0.9971652030944824, "logits/rejected": 1.0917918682098389, "logps/chosen": -291.89044189453125, "logps/rejected": -254.80679321289062, "loss": 0.4993, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0049656108021736145, "rewards/margins": 0.002658768789842725, "rewards/rejected": 0.0023068420123308897, "step": 150 }, { "epoch": 0.08, "learning_rate": 4.18848167539267e-06, "logits/chosen": 1.0320146083831787, "logits/rejected": 1.053504228591919, "logps/chosen": -285.04559326171875, "logps/rejected": -244.1322784423828, "loss": 0.4993, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.005488889757543802, "rewards/margins": 0.002789679216220975, "rewards/rejected": 0.00269921007566154, "step": 160 }, { "epoch": 0.09, "learning_rate": 4.450261780104713e-06, "logits/chosen": 1.0273762941360474, "logits/rejected": 1.062558650970459, "logps/chosen": -287.9652099609375, "logps/rejected": -232.247314453125, "loss": 0.4993, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.007104066200554371, "rewards/margins": 0.0033009883482009172, "rewards/rejected": 0.0038030785508453846, "step": 170 }, { "epoch": 0.09, "learning_rate": 4.712041884816754e-06, "logits/chosen": 1.0532909631729126, "logits/rejected": 1.1673284769058228, "logps/chosen": -274.5193786621094, "logps/rejected": -238.21286010742188, "loss": 0.499, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.00742004532366991, "rewards/margins": 0.003918725997209549, "rewards/rejected": 0.003501318860799074, "step": 180 }, { "epoch": 0.1, "learning_rate": 4.9738219895287965e-06, "logits/chosen": 1.1504939794540405, "logits/rejected": 1.1638376712799072, "logps/chosen": -237.76797485351562, "logps/rejected": -211.50613403320312, "loss": 0.499, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.007698298431932926, "rewards/margins": 0.003723274450749159, "rewards/rejected": 0.00397502351552248, "step": 190 }, { "epoch": 0.1, "learning_rate": 4.999661831436499e-06, "logits/chosen": 1.0712188482284546, "logits/rejected": 1.0771671533584595, "logps/chosen": -288.3528747558594, "logps/rejected": -265.5425109863281, "loss": 0.4989, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.008598363026976585, "rewards/margins": 0.005429488606750965, "rewards/rejected": 0.003168874653056264, "step": 200 }, { "epoch": 0.11, "learning_rate": 4.9984929711403395e-06, "logits/chosen": 1.1236344575881958, "logits/rejected": 1.2009334564208984, "logps/chosen": -254.3011932373047, "logps/rejected": -224.9448699951172, "loss": 0.4988, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.007906198501586914, "rewards/margins": 0.005368872079998255, "rewards/rejected": 0.0025373264215886593, "step": 210 }, { "epoch": 0.12, "learning_rate": 4.996489634487865e-06, "logits/chosen": 1.0867538452148438, "logits/rejected": 1.2004356384277344, "logps/chosen": -258.08062744140625, "logps/rejected": -240.8439483642578, "loss": 0.4988, "rewards/accuracies": 0.625, "rewards/chosen": 0.007928581908345222, "rewards/margins": 0.004648840986192226, "rewards/rejected": 0.003279739525169134, "step": 220 }, { "epoch": 0.12, "learning_rate": 4.9936524905772466e-06, "logits/chosen": 1.0192543268203735, "logits/rejected": 1.2005066871643066, "logps/chosen": -274.07037353515625, "logps/rejected": -256.2618713378906, "loss": 0.4988, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.006800153758376837, "rewards/margins": 0.003369166050106287, "rewards/rejected": 0.0034309872426092625, "step": 230 }, { "epoch": 0.13, "learning_rate": 4.9899824869915e-06, "logits/chosen": 1.111426830291748, "logits/rejected": 1.1554086208343506, "logps/chosen": -243.208984375, "logps/rejected": -205.7252655029297, "loss": 0.4984, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.007739508058875799, "rewards/margins": 0.0072770556434988976, "rewards/rejected": 0.0004624520370271057, "step": 240 }, { "epoch": 0.13, "learning_rate": 4.985480849482012e-06, "logits/chosen": 1.1005799770355225, "logits/rejected": 1.230799913406372, "logps/chosen": -272.18597412109375, "logps/rejected": -257.9790954589844, "loss": 0.4988, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.005292638670653105, "rewards/margins": 0.0029095064383000135, "rewards/rejected": 0.0023831322323530912, "step": 250 }, { "epoch": 0.14, "learning_rate": 4.980149081559142e-06, "logits/chosen": 1.0777183771133423, "logits/rejected": 1.155970573425293, "logps/chosen": -294.93328857421875, "logps/rejected": -261.9263610839844, "loss": 0.4982, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.009707033634185791, "rewards/margins": 0.008258306421339512, "rewards/rejected": 0.00144872663076967, "step": 260 }, { "epoch": 0.14, "learning_rate": 4.9739889639900655e-06, "logits/chosen": 1.1088669300079346, "logits/rejected": 1.1434690952301025, "logps/chosen": -254.5012664794922, "logps/rejected": -254.6510009765625, "loss": 0.4979, "rewards/accuracies": 0.65625, "rewards/chosen": 0.009705386124551296, "rewards/margins": 0.009683574549853802, "rewards/rejected": 2.181164381909184e-05, "step": 270 }, { "epoch": 0.15, "learning_rate": 4.967002554204009e-06, "logits/chosen": 1.0548467636108398, "logits/rejected": 1.1509649753570557, "logps/chosen": -245.9481964111328, "logps/rejected": -229.8827362060547, "loss": 0.4985, "rewards/accuracies": 0.59375, "rewards/chosen": 0.009255246259272099, "rewards/margins": 0.006528814323246479, "rewards/rejected": 0.0027264312375336885, "step": 280 }, { "epoch": 0.15, "learning_rate": 4.959192185605089e-06, "logits/chosen": 1.0842396020889282, "logits/rejected": 1.1220932006835938, "logps/chosen": -266.4988708496094, "logps/rejected": -246.9526824951172, "loss": 0.4988, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.009208474308252335, "rewards/margins": 0.007726150564849377, "rewards/rejected": 0.0014823225792497396, "step": 290 }, { "epoch": 0.16, "learning_rate": 4.950560466792969e-06, "logits/chosen": 1.1049131155014038, "logits/rejected": 1.1441484689712524, "logps/chosen": -275.13421630859375, "logps/rejected": -246.1587677001953, "loss": 0.4984, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.006010602228343487, "rewards/margins": 0.00892153661698103, "rewards/rejected": -0.0029109339229762554, "step": 300 }, { "epoch": 0.16, "learning_rate": 4.9411102806916185e-06, "logits/chosen": 1.021583080291748, "logits/rejected": 1.047163963317871, "logps/chosen": -323.06097412109375, "logps/rejected": -254.7588653564453, "loss": 0.4977, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.008619217202067375, "rewards/margins": 0.012051543220877647, "rewards/rejected": -0.003432326018810272, "step": 310 }, { "epoch": 0.17, "learning_rate": 4.930844783586424e-06, "logits/chosen": 1.024611473083496, "logits/rejected": 1.0655776262283325, "logps/chosen": -238.3491668701172, "logps/rejected": -231.0393829345703, "loss": 0.498, "rewards/accuracies": 0.59375, "rewards/chosen": 0.006629918701946735, "rewards/margins": 0.010882768779993057, "rewards/rejected": -0.004252850078046322, "step": 320 }, { "epoch": 0.17, "learning_rate": 4.919767404070033e-06, "logits/chosen": 1.04720139503479, "logits/rejected": 1.0630711317062378, "logps/chosen": -261.62982177734375, "logps/rejected": -247.97607421875, "loss": 0.4981, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0036115895491093397, "rewards/margins": 0.009240304119884968, "rewards/rejected": -0.005628715269267559, "step": 330 }, { "epoch": 0.18, "learning_rate": 4.907881841897216e-06, "logits/chosen": 1.0087223052978516, "logits/rejected": 1.059715986251831, "logps/chosen": -314.62408447265625, "logps/rejected": -248.10879516601562, "loss": 0.4979, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.003107100958004594, "rewards/margins": 0.013965976424515247, "rewards/rejected": -0.010858876630663872, "step": 340 }, { "epoch": 0.18, "learning_rate": 4.89519206674919e-06, "logits/chosen": 0.9633463621139526, "logits/rejected": 1.0100409984588623, "logps/chosen": -241.84793090820312, "logps/rejected": -252.7783203125, "loss": 0.4976, "rewards/accuracies": 0.625, "rewards/chosen": 0.0028146414551883936, "rewards/margins": 0.013054436072707176, "rewards/rejected": -0.010239794850349426, "step": 350 }, { "epoch": 0.19, "learning_rate": 4.881702316907769e-06, "logits/chosen": 0.9069837331771851, "logits/rejected": 1.0270668268203735, "logps/chosen": -210.9730987548828, "logps/rejected": -243.6437225341797, "loss": 0.4983, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.0008282591588795185, "rewards/margins": 0.010188087821006775, "rewards/rejected": -0.009359828196465969, "step": 360 }, { "epoch": 0.19, "learning_rate": 4.86741709783982e-06, "logits/chosen": 0.8630668520927429, "logits/rejected": 0.9914480447769165, "logps/chosen": -332.7330627441406, "logps/rejected": -281.46807861328125, "loss": 0.4977, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0035103503614664078, "rewards/margins": 0.01303508598357439, "rewards/rejected": -0.009524735622107983, "step": 370 }, { "epoch": 0.2, "learning_rate": 4.852341180692471e-06, "logits/chosen": 0.9135398864746094, "logits/rejected": 0.9984884262084961, "logps/chosen": -284.92620849609375, "logps/rejected": -252.03970336914062, "loss": 0.4976, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0040648458525538445, "rewards/margins": 0.0157476756721735, "rewards/rejected": -0.011682827956974506, "step": 380 }, { "epoch": 0.2, "learning_rate": 4.836479600699579e-06, "logits/chosen": 0.9406082034111023, "logits/rejected": 0.9047748446464539, "logps/chosen": -278.61248779296875, "logps/rejected": -284.1888732910156, "loss": 0.4972, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.006144961342215538, "rewards/margins": 0.017049867659807205, "rewards/rejected": -0.010904906317591667, "step": 390 }, { "epoch": 0.21, "learning_rate": 4.819837655500014e-06, "logits/chosen": 0.8400663137435913, "logits/rejected": 0.9222391843795776, "logps/chosen": -230.8615264892578, "logps/rejected": -221.2638397216797, "loss": 0.4984, "rewards/accuracies": 0.625, "rewards/chosen": 0.0005404525436460972, "rewards/margins": 0.011931750923395157, "rewards/rejected": -0.011391298845410347, "step": 400 }, { "epoch": 0.21, "learning_rate": 4.802420903368286e-06, "logits/chosen": 0.8889272809028625, "logits/rejected": 0.8912805318832397, "logps/chosen": -268.0902099609375, "logps/rejected": -250.4331512451172, "loss": 0.4979, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0009850022615864873, "rewards/margins": 0.010063153691589832, "rewards/rejected": -0.009078151546418667, "step": 410 }, { "epoch": 0.22, "learning_rate": 4.784235161358124e-06, "logits/chosen": 0.8787338137626648, "logits/rejected": 0.9284510612487793, "logps/chosen": -288.6819152832031, "logps/rejected": -265.958984375, "loss": 0.4971, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.0032099136151373386, "rewards/margins": 0.021029185503721237, "rewards/rejected": -0.017819274216890335, "step": 420 }, { "epoch": 0.23, "learning_rate": 4.765286503359632e-06, "logits/chosen": 0.8820232152938843, "logits/rejected": 0.9475772976875305, "logps/chosen": -270.6169738769531, "logps/rejected": -259.78839111328125, "loss": 0.4973, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0021267482079565525, "rewards/margins": 0.019616421312093735, "rewards/rejected": -0.021743169054389, "step": 430 }, { "epoch": 0.23, "learning_rate": 4.745581258070654e-06, "logits/chosen": 0.7767919301986694, "logits/rejected": 0.87933349609375, "logps/chosen": -254.14315795898438, "logps/rejected": -252.87222290039062, "loss": 0.498, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0043312786146998405, "rewards/margins": 0.013770043849945068, "rewards/rejected": -0.018101321533322334, "step": 440 }, { "epoch": 0.24, "learning_rate": 4.725126006883047e-06, "logits/chosen": 0.7937654256820679, "logits/rejected": 0.8364180326461792, "logps/chosen": -238.3746337890625, "logps/rejected": -241.1796875, "loss": 0.4977, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.00781493354588747, "rewards/margins": 0.011845615692436695, "rewards/rejected": -0.019660547375679016, "step": 450 }, { "epoch": 0.24, "learning_rate": 4.70392758168454e-06, "logits/chosen": 0.7985974550247192, "logits/rejected": 0.8068701028823853, "logps/chosen": -345.21343994140625, "logps/rejected": -304.43817138671875, "loss": 0.4965, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.007276026997715235, "rewards/margins": 0.02650422975420952, "rewards/rejected": -0.033780258148908615, "step": 460 }, { "epoch": 0.25, "learning_rate": 4.68199306257695e-06, "logits/chosen": 0.7760607004165649, "logits/rejected": 0.773891806602478, "logps/chosen": -327.35369873046875, "logps/rejected": -314.1829528808594, "loss": 0.4961, "rewards/accuracies": 0.65625, "rewards/chosen": -0.014054256491363049, "rewards/margins": 0.03367748484015465, "rewards/rejected": -0.04773174598813057, "step": 470 }, { "epoch": 0.25, "learning_rate": 4.659329775511478e-06, "logits/chosen": 0.7017660140991211, "logits/rejected": 0.7137667536735535, "logps/chosen": -287.37652587890625, "logps/rejected": -271.36358642578125, "loss": 0.497, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.021392906084656715, "rewards/margins": 0.025359559804201126, "rewards/rejected": -0.04675246775150299, "step": 480 }, { "epoch": 0.26, "learning_rate": 4.635945289841902e-06, "logits/chosen": 0.5314046144485474, "logits/rejected": 0.5452633500099182, "logps/chosen": -337.0295104980469, "logps/rejected": -379.64593505859375, "loss": 0.4958, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.05135764926671982, "rewards/margins": 0.04310908168554306, "rewards/rejected": -0.09446673840284348, "step": 490 }, { "epoch": 0.26, "learning_rate": 4.611847415796476e-06, "logits/chosen": 0.29375532269477844, "logits/rejected": 0.2797163724899292, "logps/chosen": -427.3785095214844, "logps/rejected": -405.41461181640625, "loss": 0.4932, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12762612104415894, "rewards/margins": 0.03379129245877266, "rewards/rejected": -0.16141743957996368, "step": 500 }, { "epoch": 0.27, "learning_rate": 4.587044201869378e-06, "logits/chosen": -0.2227209359407425, "logits/rejected": -0.20223090052604675, "logps/chosen": -787.1062622070312, "logps/rejected": -1045.249267578125, "loss": 0.4818, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.5222705602645874, "rewards/margins": 0.26146870851516724, "rewards/rejected": -0.7837392687797546, "step": 510 }, { "epoch": 0.27, "learning_rate": 4.561543932132574e-06, "logits/chosen": -0.11980749666690826, "logits/rejected": -0.12788312137126923, "logps/chosen": -732.790283203125, "logps/rejected": -833.4085693359375, "loss": 0.4873, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.45401230454444885, "rewards/margins": 0.1505609005689621, "rewards/rejected": -0.6045731902122498, "step": 520 }, { "epoch": 0.28, "learning_rate": 4.535355123469009e-06, "logits/chosen": -0.14909827709197998, "logits/rejected": -0.18795037269592285, "logps/chosen": -696.719482421875, "logps/rejected": -1046.4390869140625, "loss": 0.4836, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.42250218987464905, "rewards/margins": 0.3782690167427063, "rewards/rejected": -0.8007712364196777, "step": 530 }, { "epoch": 0.28, "learning_rate": 4.508486522728037e-06, "logits/chosen": -0.18408063054084778, "logits/rejected": -0.14851421117782593, "logps/chosen": -893.5338134765625, "logps/rejected": -1111.295654296875, "loss": 0.4841, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.629914402961731, "rewards/margins": 0.23182418942451477, "rewards/rejected": -0.8617385625839233, "step": 540 }, { "epoch": 0.29, "learning_rate": 4.480947103804044e-06, "logits/chosen": -0.20195765793323517, "logits/rejected": -0.2249602973461151, "logps/chosen": -970.3370971679688, "logps/rejected": -1377.3724365234375, "loss": 0.4747, "rewards/accuracies": 0.625, "rewards/chosen": -0.7055306434631348, "rewards/margins": 0.46008825302124023, "rewards/rejected": -1.165618896484375, "step": 550 }, { "epoch": 0.29, "learning_rate": 4.452746064639239e-06, "logits/chosen": -0.27148136496543884, "logits/rejected": -0.24398574233055115, "logps/chosen": -1213.016357421875, "logps/rejected": -1345.276123046875, "loss": 0.4846, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.8822624087333679, "rewards/margins": 0.20509858429431915, "rewards/rejected": -1.0873609781265259, "step": 560 }, { "epoch": 0.3, "learning_rate": 4.423892824151617e-06, "logits/chosen": -0.32366353273391724, "logits/rejected": -0.3419601321220398, "logps/chosen": -1556.5191650390625, "logps/rejected": -1787.370361328125, "loss": 0.4843, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2833073139190674, "rewards/margins": 0.28631919622421265, "rewards/rejected": -1.5696265697479248, "step": 570 }, { "epoch": 0.3, "learning_rate": 4.3943970190891164e-06, "logits/chosen": -0.23246267437934875, "logits/rejected": -0.25014322996139526, "logps/chosen": -1218.4473876953125, "logps/rejected": -1182.8861083984375, "loss": 0.4809, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.9385588765144348, "rewards/margins": 0.02590467967092991, "rewards/rejected": -0.9644634127616882, "step": 580 }, { "epoch": 0.31, "learning_rate": 4.364268500811025e-06, "logits/chosen": -0.17416557669639587, "logits/rejected": -0.17902135848999023, "logps/chosen": -985.0808715820312, "logps/rejected": -1348.008056640625, "loss": 0.4847, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6908355951309204, "rewards/margins": 0.41670989990234375, "rewards/rejected": -1.1075454950332642, "step": 590 }, { "epoch": 0.31, "learning_rate": 4.333517331997704e-06, "logits/chosen": -0.14922045171260834, "logits/rejected": -0.19132760167121887, "logps/chosen": -1128.3173828125, "logps/rejected": -1489.837158203125, "loss": 0.4745, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.8316856622695923, "rewards/margins": 0.4227636754512787, "rewards/rejected": -1.2544492483139038, "step": 600 }, { "epoch": 0.32, "learning_rate": 4.302153783289737e-06, "logits/chosen": -0.1274535059928894, "logits/rejected": -0.17803938686847687, "logps/chosen": -1048.890625, "logps/rejected": -1506.1158447265625, "loss": 0.4743, "rewards/accuracies": 0.625, "rewards/chosen": -0.7957647442817688, "rewards/margins": 0.4669608175754547, "rewards/rejected": -1.2627254724502563, "step": 610 }, { "epoch": 0.32, "learning_rate": 4.270188329857613e-06, "logits/chosen": -0.14815063774585724, "logits/rejected": -0.15499570965766907, "logps/chosen": -1084.8118896484375, "logps/rejected": -1618.885009765625, "loss": 0.4711, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7856907844543457, "rewards/margins": 0.5791957974433899, "rewards/rejected": -1.3648868799209595, "step": 620 }, { "epoch": 0.33, "learning_rate": 4.237631647903115e-06, "logits/chosen": -0.024261217564344406, "logits/rejected": -0.038342759013175964, "logps/chosen": -723.5900268554688, "logps/rejected": -1155.1717529296875, "loss": 0.4678, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.46949324011802673, "rewards/margins": 0.45854002237319946, "rewards/rejected": -0.9280332326889038, "step": 630 }, { "epoch": 0.33, "learning_rate": 4.204494611093548e-06, "logits/chosen": -0.05518772080540657, "logits/rejected": -0.100825235247612, "logps/chosen": -1270.6005859375, "logps/rejected": -1703.8551025390625, "loss": 0.4819, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.9448369145393372, "rewards/margins": 0.4941697120666504, "rewards/rejected": -1.4390065670013428, "step": 640 }, { "epoch": 0.34, "learning_rate": 4.170788286930024e-06, "logits/chosen": -0.06449203193187714, "logits/rejected": -0.1527264416217804, "logps/chosen": -1250.4991455078125, "logps/rejected": -1752.0111083984375, "loss": 0.4822, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.9863438606262207, "rewards/margins": 0.5237391591072083, "rewards/rejected": -1.5100830793380737, "step": 650 }, { "epoch": 0.35, "learning_rate": 4.136523933051005e-06, "logits/chosen": -0.10980840772390366, "logits/rejected": -0.13391873240470886, "logps/chosen": -1053.7823486328125, "logps/rejected": -1614.2884521484375, "loss": 0.4762, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.8304306864738464, "rewards/margins": 0.5787540078163147, "rewards/rejected": -1.4091846942901611, "step": 660 }, { "epoch": 0.35, "learning_rate": 4.101712993472348e-06, "logits/chosen": -0.10138118267059326, "logits/rejected": -0.13220438361167908, "logps/chosen": -1581.559326171875, "logps/rejected": -1862.4993896484375, "loss": 0.481, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2885770797729492, "rewards/margins": 0.32578420639038086, "rewards/rejected": -1.6143611669540405, "step": 670 }, { "epoch": 0.36, "learning_rate": 4.066367094765091e-06, "logits/chosen": -0.06212924048304558, "logits/rejected": -0.09771373122930527, "logps/chosen": -1470.7352294921875, "logps/rejected": -1844.652587890625, "loss": 0.4783, "rewards/accuracies": 0.53125, "rewards/chosen": -1.204660177230835, "rewards/margins": 0.3980388641357422, "rewards/rejected": -1.6026990413665771, "step": 680 }, { "epoch": 0.36, "learning_rate": 4.030498042172277e-06, "logits/chosen": 0.01754361391067505, "logits/rejected": -0.048445507884025574, "logps/chosen": -979.1268310546875, "logps/rejected": -1244.6566162109375, "loss": 0.4726, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.7009618878364563, "rewards/margins": 0.3003775477409363, "rewards/rejected": -1.0013394355773926, "step": 690 }, { "epoch": 0.37, "learning_rate": 3.994117815666095e-06, "logits/chosen": -0.04728760942816734, "logits/rejected": -0.0919174998998642, "logps/chosen": -1344.916015625, "logps/rejected": -1900.5986328125, "loss": 0.472, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.0581436157226562, "rewards/margins": 0.5791832208633423, "rewards/rejected": -1.6373268365859985, "step": 700 }, { "epoch": 0.37, "learning_rate": 3.957238565946672e-06, "logits/chosen": 0.004687662236392498, "logits/rejected": -0.06074858829379082, "logps/chosen": -1193.521240234375, "logps/rejected": -2065.345947265625, "loss": 0.4653, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8945425152778625, "rewards/margins": 0.8969429731369019, "rewards/rejected": -1.7914857864379883, "step": 710 }, { "epoch": 0.38, "learning_rate": 3.919872610383831e-06, "logits/chosen": 0.07505255192518234, "logits/rejected": -0.015723228454589844, "logps/chosen": -1065.49365234375, "logps/rejected": -1707.6328125, "loss": 0.4739, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7956100702285767, "rewards/margins": 0.6891741752624512, "rewards/rejected": -1.4847842454910278, "step": 720 }, { "epoch": 0.38, "learning_rate": 3.882032428903195e-06, "logits/chosen": 0.02505052089691162, "logits/rejected": -0.009700920432806015, "logps/chosen": -1372.3634033203125, "logps/rejected": -2129.860595703125, "loss": 0.4656, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.0954824686050415, "rewards/margins": 0.7768491506576538, "rewards/rejected": -1.8723316192626953, "step": 730 }, { "epoch": 0.39, "learning_rate": 3.84373065981799e-06, "logits/chosen": 0.1249980553984642, "logits/rejected": 0.04747745767235756, "logps/chosen": -956.18115234375, "logps/rejected": -1541.792724609375, "loss": 0.4693, "rewards/accuracies": 0.625, "rewards/chosen": -0.6708589792251587, "rewards/margins": 0.6129963994026184, "rewards/rejected": -1.2838553190231323, "step": 740 }, { "epoch": 0.39, "learning_rate": 3.8049800956079552e-06, "logits/chosen": 0.23526708781719208, "logits/rejected": 0.19636312127113342, "logps/chosen": -1106.01513671875, "logps/rejected": -1326.5162353515625, "loss": 0.4752, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.8387149572372437, "rewards/margins": 0.24964456260204315, "rewards/rejected": -1.0883597135543823, "step": 750 }, { "epoch": 0.4, "learning_rate": 3.765793678646753e-06, "logits/chosen": 0.19188269972801208, "logits/rejected": 0.1782020926475525, "logps/chosen": -802.7251586914062, "logps/rejected": -1634.812255859375, "loss": 0.4647, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.5356382727622986, "rewards/margins": 0.8580523729324341, "rewards/rejected": -1.3936904668807983, "step": 760 }, { "epoch": 0.4, "learning_rate": 3.726184496879323e-06, "logits/chosen": 0.14159968495368958, "logits/rejected": 0.08811040967702866, "logps/chosen": -1127.4029541015625, "logps/rejected": -1502.1641845703125, "loss": 0.4756, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.8773125410079956, "rewards/margins": 0.3974476158618927, "rewards/rejected": -1.274760365486145, "step": 770 }, { "epoch": 0.41, "learning_rate": 3.686165779450619e-06, "logits/chosen": 0.1939581334590912, "logits/rejected": 0.1522776186466217, "logps/chosen": -968.0919799804688, "logps/rejected": -1507.5386962890625, "loss": 0.4793, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.7149516344070435, "rewards/margins": 0.5672934055328369, "rewards/rejected": -1.2822450399398804, "step": 780 }, { "epoch": 0.41, "learning_rate": 3.645750892287178e-06, "logits/chosen": 0.1306479275226593, "logits/rejected": 0.05887848883867264, "logps/chosen": -1289.082275390625, "logps/rejected": -1864.7164306640625, "loss": 0.4721, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.9888286590576172, "rewards/margins": 0.6331573724746704, "rewards/rejected": -1.6219860315322876, "step": 790 }, { "epoch": 0.42, "learning_rate": 3.604953333633009e-06, "logits/chosen": 0.205116868019104, "logits/rejected": 0.15303435921669006, "logps/chosen": -848.7705078125, "logps/rejected": -1336.090576171875, "loss": 0.4708, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.5991231799125671, "rewards/margins": 0.5247097015380859, "rewards/rejected": -1.1238329410552979, "step": 800 }, { "epoch": 0.42, "learning_rate": 3.56378672954129e-06, "logits/chosen": 0.22229023277759552, "logits/rejected": 0.17705193161964417, "logps/chosen": -1094.6126708984375, "logps/rejected": -1681.7445068359375, "loss": 0.469, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.8267385363578796, "rewards/margins": 0.6382580995559692, "rewards/rejected": -1.4649966955184937, "step": 810 }, { "epoch": 0.43, "learning_rate": 3.5222648293233806e-06, "logits/chosen": 0.1940724402666092, "logits/rejected": 0.1474287211894989, "logps/chosen": -1133.5128173828125, "logps/rejected": -1901.333984375, "loss": 0.4687, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.8616431951522827, "rewards/margins": 0.8072026968002319, "rewards/rejected": -1.6688458919525146, "step": 820 }, { "epoch": 0.43, "learning_rate": 3.4804015009566573e-06, "logits/chosen": 0.14867620170116425, "logits/rejected": 0.050886522978544235, "logps/chosen": -1169.879638671875, "logps/rejected": -2415.080078125, "loss": 0.4639, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9197257161140442, "rewards/margins": 1.2611197233200073, "rewards/rejected": -2.180845260620117, "step": 830 }, { "epoch": 0.44, "learning_rate": 3.4382107264527244e-06, "logits/chosen": 0.16670770943164825, "logits/rejected": 0.11358609050512314, "logps/chosen": -1215.7694091796875, "logps/rejected": -1938.170654296875, "loss": 0.4701, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.9481611251831055, "rewards/margins": 0.7456313967704773, "rewards/rejected": -1.6937923431396484, "step": 840 }, { "epoch": 0.44, "learning_rate": 3.3957065971875387e-06, "logits/chosen": 0.24467554688453674, "logits/rejected": 0.1815129816532135, "logps/chosen": -1700.726806640625, "logps/rejected": -2238.2724609375, "loss": 0.4738, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.4334746599197388, "rewards/margins": 0.5637392997741699, "rewards/rejected": -1.9972139596939087, "step": 850 }, { "epoch": 0.45, "learning_rate": 3.352903309194999e-06, "logits/chosen": 0.25681573152542114, "logits/rejected": 0.22445912659168243, "logps/chosen": -1175.2008056640625, "logps/rejected": -1852.9886474609375, "loss": 0.476, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9056652784347534, "rewards/margins": 0.6970826387405396, "rewards/rejected": -1.6027476787567139, "step": 860 }, { "epoch": 0.46, "learning_rate": 3.309815158425591e-06, "logits/chosen": 0.35658639669418335, "logits/rejected": 0.23468701541423798, "logps/chosen": -1126.1968994140625, "logps/rejected": -1490.5289306640625, "loss": 0.4765, "rewards/accuracies": 0.53125, "rewards/chosen": -0.8638699650764465, "rewards/margins": 0.4017399847507477, "rewards/rejected": -1.265609860420227, "step": 870 }, { "epoch": 0.46, "learning_rate": 3.266456535971654e-06, "logits/chosen": 0.29603832960128784, "logits/rejected": 0.2804957330226898, "logps/chosen": -1391.908447265625, "logps/rejected": -1630.26220703125, "loss": 0.4842, "rewards/accuracies": 0.5625, "rewards/chosen": -1.103570580482483, "rewards/margins": 0.2950761914253235, "rewards/rejected": -1.3986468315124512, "step": 880 }, { "epoch": 0.47, "learning_rate": 3.2228419232608692e-06, "logits/chosen": 0.25324004888534546, "logits/rejected": 0.19424840807914734, "logps/chosen": -1254.45947265625, "logps/rejected": -1625.465087890625, "loss": 0.483, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -1.0103174448013306, "rewards/margins": 0.3879779279232025, "rewards/rejected": -1.3982954025268555, "step": 890 }, { "epoch": 0.47, "learning_rate": 3.1789858872195888e-06, "logits/chosen": 0.35612553358078003, "logits/rejected": 0.2640685737133026, "logps/chosen": -1018.1083984375, "logps/rejected": -1447.966796875, "loss": 0.4713, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7736637592315674, "rewards/margins": 0.4553070068359375, "rewards/rejected": -1.2289707660675049, "step": 900 }, { "epoch": 0.48, "learning_rate": 3.1349030754075945e-06, "logits/chosen": 0.32709187269210815, "logits/rejected": 0.27523329854011536, "logps/chosen": -996.7443237304688, "logps/rejected": -1309.497802734375, "loss": 0.4674, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6882850527763367, "rewards/margins": 0.3720100224018097, "rewards/rejected": -1.0602951049804688, "step": 910 }, { "epoch": 0.48, "learning_rate": 3.0906082111259313e-06, "logits/chosen": 0.28385213017463684, "logits/rejected": 0.26248598098754883, "logps/chosen": -1238.9512939453125, "logps/rejected": -1446.0545654296875, "loss": 0.4729, "rewards/accuracies": 0.53125, "rewards/chosen": -0.9621122479438782, "rewards/margins": 0.24497418105602264, "rewards/rejected": -1.207086443901062, "step": 920 }, { "epoch": 0.49, "learning_rate": 3.046116088499449e-06, "logits/chosen": 0.20961081981658936, "logits/rejected": 0.12288858741521835, "logps/chosen": -1385.43359375, "logps/rejected": -2388.202392578125, "loss": 0.4591, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1082097291946411, "rewards/margins": 1.0280786752700806, "rewards/rejected": -2.1362884044647217, "step": 930 }, { "epoch": 0.49, "learning_rate": 3.0014415675356813e-06, "logits/chosen": 0.2143702507019043, "logits/rejected": 0.12640917301177979, "logps/chosen": -1842.924072265625, "logps/rejected": -2572.03759765625, "loss": 0.4703, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5414974689483643, "rewards/margins": 0.7958974838256836, "rewards/rejected": -2.337394952774048, "step": 940 }, { "epoch": 0.5, "learning_rate": 2.9565995691617242e-06, "logits/chosen": 0.2267983853816986, "logits/rejected": 0.19906947016716003, "logps/chosen": -1659.0390625, "logps/rejected": -1897.612548828125, "loss": 0.4796, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.4338902235031128, "rewards/margins": 0.23841390013694763, "rewards/rejected": -1.672304391860962, "step": 950 }, { "epoch": 0.5, "learning_rate": 2.9116050702407706e-06, "logits/chosen": 0.2076607495546341, "logits/rejected": 0.15953665971755981, "logps/chosen": -1761.4957275390625, "logps/rejected": -2119.157470703125, "loss": 0.4733, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.521126627922058, "rewards/margins": 0.37375563383102417, "rewards/rejected": -1.8948824405670166, "step": 960 }, { "epoch": 0.51, "learning_rate": 2.8664730985699537e-06, "logits/chosen": 0.2155609130859375, "logits/rejected": 0.15363694727420807, "logps/chosen": -1374.277587890625, "logps/rejected": -2335.11962890625, "loss": 0.4691, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.131838083267212, "rewards/margins": 0.9804404973983765, "rewards/rejected": -2.112278461456299, "step": 970 }, { "epoch": 0.51, "learning_rate": 2.8212187278611907e-06, "logits/chosen": 0.3766547739505768, "logits/rejected": 0.23996052145957947, "logps/chosen": -978.6238403320312, "logps/rejected": -1637.2352294921875, "loss": 0.4666, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6910273432731628, "rewards/margins": 0.7083319425582886, "rewards/rejected": -1.3993593454360962, "step": 980 }, { "epoch": 0.52, "learning_rate": 2.7758570727066843e-06, "logits/chosen": 0.3515971899032593, "logits/rejected": 0.2718420922756195, "logps/chosen": -945.19921875, "logps/rejected": -1549.3182373046875, "loss": 0.4667, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.6832455396652222, "rewards/margins": 0.6422259211540222, "rewards/rejected": -1.3254715204238892, "step": 990 }, { "epoch": 0.52, "learning_rate": 2.730403283530767e-06, "logits/chosen": 0.3331068158149719, "logits/rejected": 0.21990351378917694, "logps/chosen": -957.8298950195312, "logps/rejected": -1847.413330078125, "loss": 0.4687, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6945260167121887, "rewards/margins": 0.9040031433105469, "rewards/rejected": -1.5985292196273804, "step": 1000 }, { "epoch": 0.53, "learning_rate": 2.6848725415297888e-06, "logits/chosen": 0.24949748814105988, "logits/rejected": 0.1596693992614746, "logps/chosen": -1084.4876708984375, "logps/rejected": -1898.144287109375, "loss": 0.4618, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8169393539428711, "rewards/margins": 0.8545991778373718, "rewards/rejected": -1.6715381145477295, "step": 1010 }, { "epoch": 0.53, "learning_rate": 2.639280053601719e-06, "logits/chosen": 0.22901049256324768, "logits/rejected": 0.1595744788646698, "logps/chosen": -1491.752197265625, "logps/rejected": -2144.299560546875, "loss": 0.4707, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2246992588043213, "rewards/margins": 0.6539346575737, "rewards/rejected": -1.8786340951919556, "step": 1020 }, { "epoch": 0.54, "learning_rate": 2.59364104726716e-06, "logits/chosen": 0.31597059965133667, "logits/rejected": 0.21497178077697754, "logps/chosen": -1171.93212890625, "logps/rejected": -1925.6861572265625, "loss": 0.465, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.8903388977050781, "rewards/margins": 0.8044350743293762, "rewards/rejected": -1.6947739124298096, "step": 1030 }, { "epoch": 0.54, "learning_rate": 2.547970765583491e-06, "logits/chosen": 0.35459914803504944, "logits/rejected": 0.21209494769573212, "logps/chosen": -1010.3555908203125, "logps/rejected": -1694.515869140625, "loss": 0.4642, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.7338335514068604, "rewards/margins": 0.7184348106384277, "rewards/rejected": -1.452268362045288, "step": 1040 }, { "epoch": 0.55, "learning_rate": 2.502284462053799e-06, "logits/chosen": 0.2834840416908264, "logits/rejected": 0.19832350313663483, "logps/chosen": -1069.5567626953125, "logps/rejected": -1713.046142578125, "loss": 0.4653, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.781063437461853, "rewards/margins": 0.6842104196548462, "rewards/rejected": -1.4652738571166992, "step": 1050 }, { "epoch": 0.55, "learning_rate": 2.456597395532338e-06, "logits/chosen": 0.23369982838630676, "logits/rejected": 0.15703235566616058, "logps/chosen": -1476.560546875, "logps/rejected": -2163.74267578125, "loss": 0.4708, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1821922063827515, "rewards/margins": 0.7186304330825806, "rewards/rejected": -1.900822639465332, "step": 1060 }, { "epoch": 0.56, "learning_rate": 2.4109248251281953e-06, "logits/chosen": 0.2690127491950989, "logits/rejected": 0.1083533763885498, "logps/chosen": -1436.783447265625, "logps/rejected": -2573.591064453125, "loss": 0.4639, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1651248931884766, "rewards/margins": 1.1516902446746826, "rewards/rejected": -2.316815137863159, "step": 1070 }, { "epoch": 0.57, "learning_rate": 2.365282005108875e-06, "logits/chosen": 0.2598133087158203, "logits/rejected": 0.17415449023246765, "logps/chosen": -1348.472412109375, "logps/rejected": -1934.5325927734375, "loss": 0.4721, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.0647830963134766, "rewards/margins": 0.6361646056175232, "rewards/rejected": -1.7009475231170654, "step": 1080 }, { "epoch": 0.57, "learning_rate": 2.319684179805491e-06, "logits/chosen": 0.28293731808662415, "logits/rejected": 0.16613037884235382, "logps/chosen": -1299.3868408203125, "logps/rejected": -2169.75830078125, "loss": 0.4726, "rewards/accuracies": 0.53125, "rewards/chosen": -1.0253424644470215, "rewards/margins": 0.8982712626457214, "rewards/rejected": -1.9236137866973877, "step": 1090 }, { "epoch": 0.58, "learning_rate": 2.2741465785212905e-06, "logits/chosen": 0.3770299553871155, "logits/rejected": 0.3206137418746948, "logps/chosen": -845.8580322265625, "logps/rejected": -1371.0318603515625, "loss": 0.4754, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.573067843914032, "rewards/margins": 0.5587201714515686, "rewards/rejected": -1.1317881345748901, "step": 1100 }, { "epoch": 0.58, "learning_rate": 2.2286844104451848e-06, "logits/chosen": 0.29950836300849915, "logits/rejected": 0.2572200298309326, "logps/chosen": -1225.456298828125, "logps/rejected": -1701.4114990234375, "loss": 0.4717, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.9399654269218445, "rewards/margins": 0.5411953926086426, "rewards/rejected": -1.4811608791351318, "step": 1110 }, { "epoch": 0.59, "learning_rate": 2.183312859572008e-06, "logits/chosen": 0.2056627720594406, "logits/rejected": 0.13243384659290314, "logps/chosen": -1311.5948486328125, "logps/rejected": -2090.031494140625, "loss": 0.473, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.03976309299469, "rewards/margins": 0.8429223895072937, "rewards/rejected": -1.8826854228973389, "step": 1120 }, { "epoch": 0.59, "learning_rate": 2.1380470796311843e-06, "logits/chosen": 0.26897698640823364, "logits/rejected": 0.19322913885116577, "logps/chosen": -1409.248779296875, "logps/rejected": -1968.114013671875, "loss": 0.4624, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1417442560195923, "rewards/margins": 0.591802716255188, "rewards/rejected": -1.7335469722747803, "step": 1130 }, { "epoch": 0.6, "learning_rate": 2.092902189025507e-06, "logits/chosen": 0.298466295003891, "logits/rejected": 0.1567627638578415, "logps/chosen": -1206.5018310546875, "logps/rejected": -2206.86767578125, "loss": 0.4604, "rewards/accuracies": 0.59375, "rewards/chosen": -0.9475823640823364, "rewards/margins": 1.0231791734695435, "rewards/rejected": -1.9707612991333008, "step": 1140 }, { "epoch": 0.6, "learning_rate": 2.0478932657817105e-06, "logits/chosen": 0.31211769580841064, "logits/rejected": 0.1320025771856308, "logps/chosen": -1475.707275390625, "logps/rejected": -2485.997802734375, "loss": 0.4686, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1953158378601074, "rewards/margins": 1.0596123933792114, "rewards/rejected": -2.2549283504486084, "step": 1150 }, { "epoch": 0.61, "learning_rate": 2.0030353425145376e-06, "logits/chosen": 0.29154402017593384, "logits/rejected": 0.20484980940818787, "logps/chosen": -1307.7490234375, "logps/rejected": -1891.5804443359375, "loss": 0.475, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.068193793296814, "rewards/margins": 0.5871996879577637, "rewards/rejected": -1.6553936004638672, "step": 1160 }, { "epoch": 0.61, "learning_rate": 1.958343401405964e-06, "logits/chosen": 0.2675972282886505, "logits/rejected": 0.20726804435253143, "logps/chosen": -1136.7181396484375, "logps/rejected": -1507.15234375, "loss": 0.4705, "rewards/accuracies": 0.53125, "rewards/chosen": -0.8827505111694336, "rewards/margins": 0.3945409953594208, "rewards/rejected": -1.2772915363311768, "step": 1170 }, { "epoch": 0.62, "learning_rate": 1.9138323692012734e-06, "logits/chosen": 0.273415207862854, "logits/rejected": 0.16786028444766998, "logps/chosen": -1736.076416015625, "logps/rejected": -2560.149169921875, "loss": 0.4705, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4262146949768066, "rewards/margins": 0.8943548202514648, "rewards/rejected": -2.3205695152282715, "step": 1180 }, { "epoch": 0.62, "learning_rate": 1.8695171122236443e-06, "logits/chosen": 0.20894399285316467, "logits/rejected": 0.10228965431451797, "logps/chosen": -1324.954345703125, "logps/rejected": -2638.982666015625, "loss": 0.4668, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.0546363592147827, "rewards/margins": 1.3503773212432861, "rewards/rejected": -2.4050137996673584, "step": 1190 }, { "epoch": 0.63, "learning_rate": 1.8254124314089225e-06, "logits/chosen": 0.3430663049221039, "logits/rejected": 0.2673262655735016, "logps/chosen": -861.5838623046875, "logps/rejected": -1974.1458740234375, "loss": 0.4518, "rewards/accuracies": 0.625, "rewards/chosen": -0.5944491624832153, "rewards/margins": 1.1017727851867676, "rewards/rejected": -1.696221947669983, "step": 1200 }, { "epoch": 0.63, "learning_rate": 1.781533057362221e-06, "logits/chosen": 0.3156498670578003, "logits/rejected": 0.185347780585289, "logps/chosen": -1168.6217041015625, "logps/rejected": -1924.7115478515625, "loss": 0.4583, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8821272850036621, "rewards/margins": 0.8161047101020813, "rewards/rejected": -1.6982319355010986, "step": 1210 }, { "epoch": 0.64, "learning_rate": 1.7378936454380277e-06, "logits/chosen": 0.36333876848220825, "logits/rejected": 0.26434630155563354, "logps/chosen": -1027.678466796875, "logps/rejected": -1634.684814453125, "loss": 0.4654, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.7384849786758423, "rewards/margins": 0.6454702615737915, "rewards/rejected": -1.383955478668213, "step": 1220 }, { "epoch": 0.64, "learning_rate": 1.6945087708454273e-06, "logits/chosen": 0.27189189195632935, "logits/rejected": 0.18399885296821594, "logps/chosen": -1334.14990234375, "logps/rejected": -1880.106201171875, "loss": 0.4767, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.090384840965271, "rewards/margins": 0.5727940797805786, "rewards/rejected": -1.66317880153656, "step": 1230 }, { "epoch": 0.65, "learning_rate": 1.651392923780105e-06, "logits/chosen": 0.4100673794746399, "logits/rejected": 0.2657643258571625, "logps/chosen": -1111.376220703125, "logps/rejected": -1941.037353515625, "loss": 0.46, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8117152452468872, "rewards/margins": 0.8833802938461304, "rewards/rejected": -1.695095419883728, "step": 1240 }, { "epoch": 0.65, "learning_rate": 1.608560504584737e-06, "logits/chosen": 0.301455020904541, "logits/rejected": 0.22863301634788513, "logps/chosen": -1159.3509521484375, "logps/rejected": -2089.1953125, "loss": 0.4631, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.8735660314559937, "rewards/margins": 0.9659594297409058, "rewards/rejected": -1.839525580406189, "step": 1250 }, { "epoch": 0.66, "learning_rate": 1.5660258189393945e-06, "logits/chosen": 0.19146260619163513, "logits/rejected": 0.14353962242603302, "logps/chosen": -1484.316650390625, "logps/rejected": -2343.659423828125, "loss": 0.4687, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2383463382720947, "rewards/margins": 0.8734383583068848, "rewards/rejected": -2.1117844581604004, "step": 1260 }, { "epoch": 0.66, "learning_rate": 1.5238030730835578e-06, "logits/chosen": 0.31026071310043335, "logits/rejected": 0.19475135207176208, "logps/chosen": -1738.6246337890625, "logps/rejected": -2328.933349609375, "loss": 0.4693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.462491750717163, "rewards/margins": 0.6261566281318665, "rewards/rejected": -2.0886483192443848, "step": 1270 }, { "epoch": 0.67, "learning_rate": 1.4819063690713565e-06, "logits/chosen": 0.26937440037727356, "logits/rejected": 0.15669001638889313, "logps/chosen": -1396.046630859375, "logps/rejected": -2102.262939453125, "loss": 0.4598, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1547179222106934, "rewards/margins": 0.7373046278953552, "rewards/rejected": -1.892022728919983, "step": 1280 }, { "epoch": 0.68, "learning_rate": 1.4403497000615885e-06, "logits/chosen": 0.3091123700141907, "logits/rejected": 0.204463392496109, "logps/chosen": -1624.702392578125, "logps/rejected": -2571.47412109375, "loss": 0.4654, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3447537422180176, "rewards/margins": 0.9728642702102661, "rewards/rejected": -2.3176181316375732, "step": 1290 }, { "epoch": 0.68, "learning_rate": 1.3991469456441273e-06, "logits/chosen": 0.31638103723526, "logits/rejected": 0.23879094421863556, "logps/chosen": -1413.901123046875, "logps/rejected": -2330.88330078125, "loss": 0.4546, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1106030941009521, "rewards/margins": 0.9651702642440796, "rewards/rejected": -2.075773239135742, "step": 1300 }, { "epoch": 0.69, "learning_rate": 1.3583118672042441e-06, "logits/chosen": 0.274738609790802, "logits/rejected": 0.18945345282554626, "logps/chosen": -1652.8541259765625, "logps/rejected": -2093.89990234375, "loss": 0.4704, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.371697187423706, "rewards/margins": 0.5045391917228699, "rewards/rejected": -1.8762363195419312, "step": 1310 }, { "epoch": 0.69, "learning_rate": 1.3178581033264218e-06, "logits/chosen": 0.27669447660446167, "logits/rejected": 0.16615034639835358, "logps/chosen": -1164.213134765625, "logps/rejected": -2034.477783203125, "loss": 0.4566, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9321501851081848, "rewards/margins": 0.888770580291748, "rewards/rejected": -1.820920705795288, "step": 1320 }, { "epoch": 0.7, "learning_rate": 1.2777991652391757e-06, "logits/chosen": 0.31228479743003845, "logits/rejected": 0.21845977008342743, "logps/chosen": -1202.439697265625, "logps/rejected": -1930.3785400390625, "loss": 0.4661, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.9665547609329224, "rewards/margins": 0.7526635527610779, "rewards/rejected": -1.7192184925079346, "step": 1330 }, { "epoch": 0.7, "learning_rate": 1.2381484323024178e-06, "logits/chosen": 0.35927221179008484, "logits/rejected": 0.2287793606519699, "logps/chosen": -1124.3046875, "logps/rejected": -2056.501708984375, "loss": 0.4623, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.8218294382095337, "rewards/margins": 0.98065185546875, "rewards/rejected": -1.8024810552597046, "step": 1340 }, { "epoch": 0.71, "learning_rate": 1.1989191475388518e-06, "logits/chosen": 0.3698425889015198, "logits/rejected": 0.2954414486885071, "logps/chosen": -1166.0611572265625, "logps/rejected": -1549.630126953125, "loss": 0.4695, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8860333561897278, "rewards/margins": 0.42343559861183167, "rewards/rejected": -1.3094689846038818, "step": 1350 }, { "epoch": 0.71, "learning_rate": 1.160124413210918e-06, "logits/chosen": 0.35506299138069153, "logits/rejected": 0.2409767210483551, "logps/chosen": -1092.040283203125, "logps/rejected": -1912.9970703125, "loss": 0.4582, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.802148163318634, "rewards/margins": 0.8678997755050659, "rewards/rejected": -1.6700481176376343, "step": 1360 }, { "epoch": 0.72, "learning_rate": 1.1217771864447396e-06, "logits/chosen": 0.3243677616119385, "logits/rejected": 0.17696735262870789, "logps/chosen": -991.9781494140625, "logps/rejected": -2300.01806640625, "loss": 0.4563, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6990408897399902, "rewards/margins": 1.3517526388168335, "rewards/rejected": -2.050793409347534, "step": 1370 }, { "epoch": 0.72, "learning_rate": 1.08389027490255e-06, "logits/chosen": 0.27917546033859253, "logits/rejected": 0.13479743897914886, "logps/chosen": -1405.369384765625, "logps/rejected": -2042.785888671875, "loss": 0.4724, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.150667428970337, "rewards/margins": 0.6895908713340759, "rewards/rejected": -1.8402583599090576, "step": 1380 }, { "epoch": 0.73, "learning_rate": 1.046476332505036e-06, "logits/chosen": 0.3343364894390106, "logits/rejected": 0.23037847876548767, "logps/chosen": -1098.7138671875, "logps/rejected": -2268.34130859375, "loss": 0.463, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8331190347671509, "rewards/margins": 1.2149721384048462, "rewards/rejected": -2.048090934753418, "step": 1390 }, { "epoch": 0.73, "learning_rate": 1.0095478552050348e-06, "logits/chosen": 0.26566246151924133, "logits/rejected": 0.2032911777496338, "logps/chosen": -956.0791015625, "logps/rejected": -1707.116943359375, "loss": 0.4575, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7228320837020874, "rewards/margins": 0.797810435295105, "rewards/rejected": -1.5206425189971924, "step": 1400 }, { "epoch": 0.74, "learning_rate": 9.731171768139808e-07, "logits/chosen": 0.3556608557701111, "logits/rejected": 0.28849393129348755, "logps/chosen": -1063.8748779296875, "logps/rejected": -1503.830322265625, "loss": 0.4712, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8245790600776672, "rewards/margins": 0.47296270728111267, "rewards/rejected": -1.2975417375564575, "step": 1410 }, { "epoch": 0.74, "learning_rate": 9.371964648825221e-07, "logits/chosen": 0.3505791425704956, "logits/rejected": 0.22841492295265198, "logps/chosen": -1045.915283203125, "logps/rejected": -2175.98876953125, "loss": 0.4619, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7385074496269226, "rewards/margins": 1.1998847723007202, "rewards/rejected": -1.9383922815322876, "step": 1420 }, { "epoch": 0.75, "learning_rate": 9.017977166366445e-07, "logits/chosen": 0.2420744001865387, "logits/rejected": 0.17516903579235077, "logps/chosen": -1474.8148193359375, "logps/rejected": -1958.3011474609375, "loss": 0.4696, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2239506244659424, "rewards/margins": 0.5244899988174438, "rewards/rejected": -1.7484405040740967, "step": 1430 }, { "epoch": 0.75, "learning_rate": 8.669327549707096e-07, "logits/chosen": 0.2893267571926117, "logits/rejected": 0.18889647722244263, "logps/chosen": -1455.797607421875, "logps/rejected": -2097.127685546875, "loss": 0.467, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1924570798873901, "rewards/margins": 0.6752533912658691, "rewards/rejected": -1.8677103519439697, "step": 1440 }, { "epoch": 0.76, "learning_rate": 8.326132244986932e-07, "logits/chosen": 0.24378347396850586, "logits/rejected": 0.050407588481903076, "logps/chosen": -1516.9559326171875, "logps/rejected": -2759.12353515625, "loss": 0.4595, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2377126216888428, "rewards/margins": 1.2976287603378296, "rewards/rejected": -2.535341262817383, "step": 1450 }, { "epoch": 0.76, "learning_rate": 7.988505876649863e-07, "logits/chosen": 0.2632651925086975, "logits/rejected": 0.18519091606140137, "logps/chosen": -1460.3145751953125, "logps/rejected": -1786.594482421875, "loss": 0.4719, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1815235614776611, "rewards/margins": 0.385366290807724, "rewards/rejected": -1.566890001296997, "step": 1460 }, { "epoch": 0.77, "learning_rate": 7.656561209160248e-07, "logits/chosen": 0.2761257290840149, "logits/rejected": 0.16277745366096497, "logps/chosen": -1284.623291015625, "logps/rejected": -2513.677734375, "loss": 0.4624, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9854960441589355, "rewards/margins": 1.282775640487671, "rewards/rejected": -2.2682716846466064, "step": 1470 }, { "epoch": 0.77, "learning_rate": 7.330409109340563e-07, "logits/chosen": 0.2461864948272705, "logits/rejected": 0.16639626026153564, "logps/chosen": -1436.0191650390625, "logps/rejected": -2242.687744140625, "loss": 0.4672, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1464489698410034, "rewards/margins": 0.8480439186096191, "rewards/rejected": -1.9944928884506226, "step": 1480 }, { "epoch": 0.78, "learning_rate": 7.010158509342682e-07, "logits/chosen": 0.2478228360414505, "logits/rejected": 0.1436949521303177, "logps/chosen": -1367.794921875, "logps/rejected": -2203.85107421875, "loss": 0.4721, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1351174116134644, "rewards/margins": 0.8538748621940613, "rewards/rejected": -1.9889923334121704, "step": 1490 }, { "epoch": 0.79, "learning_rate": 6.695916370265529e-07, "logits/chosen": 0.27428361773490906, "logits/rejected": 0.18057170510292053, "logps/chosen": -1440.5146484375, "logps/rejected": -2161.803466796875, "loss": 0.4628, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1880239248275757, "rewards/margins": 0.7238161563873291, "rewards/rejected": -1.9118402004241943, "step": 1500 }, { "epoch": 0.79, "learning_rate": 6.387787646430854e-07, "logits/chosen": 0.25450989603996277, "logits/rejected": 0.1020331159234047, "logps/chosen": -1392.365966796875, "logps/rejected": -2656.48974609375, "loss": 0.4575, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.118877649307251, "rewards/margins": 1.2837135791778564, "rewards/rejected": -2.4025912284851074, "step": 1510 }, { "epoch": 0.8, "learning_rate": 6.085875250329401e-07, "logits/chosen": 0.3250389099121094, "logits/rejected": 0.23088189959526062, "logps/chosen": -1277.065673828125, "logps/rejected": -2237.5419921875, "loss": 0.4588, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.98078453540802, "rewards/margins": 1.0101826190948486, "rewards/rejected": -1.9909673929214478, "step": 1520 }, { "epoch": 0.8, "learning_rate": 5.79028001824894e-07, "logits/chosen": 0.34990447759628296, "logits/rejected": 0.1642770618200302, "logps/chosen": -1346.687744140625, "logps/rejected": -3187.396484375, "loss": 0.4642, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0415836572647095, "rewards/margins": 1.8741792440414429, "rewards/rejected": -2.9157626628875732, "step": 1530 }, { "epoch": 0.81, "learning_rate": 5.501100676595761e-07, "logits/chosen": 0.2536852955818176, "logits/rejected": 0.1401246041059494, "logps/chosen": -1562.163818359375, "logps/rejected": -2294.75732421875, "loss": 0.4614, "rewards/accuracies": 0.625, "rewards/chosen": -1.2489855289459229, "rewards/margins": 0.7926613092422485, "rewards/rejected": -2.041646957397461, "step": 1540 }, { "epoch": 0.81, "learning_rate": 5.218433808920884e-07, "logits/chosen": 0.2926151752471924, "logits/rejected": 0.09962544590234756, "logps/chosen": -1433.572509765625, "logps/rejected": -2299.615478515625, "loss": 0.4524, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1709363460540771, "rewards/margins": 0.9143635630607605, "rewards/rejected": -2.0853002071380615, "step": 1550 }, { "epoch": 0.82, "learning_rate": 4.942373823661928e-07, "logits/chosen": 0.23216836154460907, "logits/rejected": 0.19754758477210999, "logps/chosen": -1521.939208984375, "logps/rejected": -2178.3291015625, "loss": 0.4698, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2342188358306885, "rewards/margins": 0.6809908151626587, "rewards/rejected": -1.9152095317840576, "step": 1560 }, { "epoch": 0.82, "learning_rate": 4.6730129226114363e-07, "logits/chosen": 0.19226306676864624, "logits/rejected": 0.13501006364822388, "logps/chosen": -1532.320068359375, "logps/rejected": -2355.093505859375, "loss": 0.4712, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2820327281951904, "rewards/margins": 0.8527010679244995, "rewards/rejected": -2.1347339153289795, "step": 1570 }, { "epoch": 0.83, "learning_rate": 4.4104410701222703e-07, "logits/chosen": 0.15366807579994202, "logits/rejected": 0.11835174262523651, "logps/chosen": -1608.6761474609375, "logps/rejected": -2489.91455078125, "loss": 0.469, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.338335394859314, "rewards/margins": 0.8876321911811829, "rewards/rejected": -2.2259676456451416, "step": 1580 }, { "epoch": 0.83, "learning_rate": 4.154745963060197e-07, "logits/chosen": 0.21381524205207825, "logits/rejected": 0.0645713359117508, "logps/chosen": -1354.0247802734375, "logps/rejected": -2909.98828125, "loss": 0.4559, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0845643281936646, "rewards/margins": 1.571176290512085, "rewards/rejected": -2.655740737915039, "step": 1590 }, { "epoch": 0.84, "learning_rate": 3.9060130015138863e-07, "logits/chosen": 0.25924235582351685, "logits/rejected": 0.1109732836484909, "logps/chosen": -1437.39501953125, "logps/rejected": -2759.584228515625, "loss": 0.4559, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1647388935089111, "rewards/margins": 1.3532658815383911, "rewards/rejected": -2.5180046558380127, "step": 1600 }, { "epoch": 0.84, "learning_rate": 3.664325260271953e-07, "logits/chosen": 0.22887060046195984, "logits/rejected": 0.09053263813257217, "logps/chosen": -1473.260009765625, "logps/rejected": -1995.0269775390625, "loss": 0.4712, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2130258083343506, "rewards/margins": 0.5937215089797974, "rewards/rejected": -1.8067471981048584, "step": 1610 }, { "epoch": 0.85, "learning_rate": 3.429763461076677e-07, "logits/chosen": 0.1899276226758957, "logits/rejected": 0.12356813251972198, "logps/chosen": -1743.064453125, "logps/rejected": -2304.783203125, "loss": 0.4677, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4444160461425781, "rewards/margins": 0.6152055859565735, "rewards/rejected": -2.059621572494507, "step": 1620 }, { "epoch": 0.85, "learning_rate": 3.202405945663556e-07, "logits/chosen": 0.22914421558380127, "logits/rejected": 0.09422020614147186, "logps/chosen": -1509.6998291015625, "logps/rejected": -2195.837646484375, "loss": 0.4638, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2225978374481201, "rewards/margins": 0.7319514155387878, "rewards/rejected": -1.9545494318008423, "step": 1630 }, { "epoch": 0.86, "learning_rate": 2.982328649595856e-07, "logits/chosen": 0.24722608923912048, "logits/rejected": 0.10591373592615128, "logps/chosen": -1233.9052734375, "logps/rejected": -2268.322509765625, "loss": 0.4653, "rewards/accuracies": 0.59375, "rewards/chosen": -0.9642356634140015, "rewards/margins": 1.0845736265182495, "rewards/rejected": -2.04880952835083, "step": 1640 }, { "epoch": 0.86, "learning_rate": 2.7696050769026954e-07, "logits/chosen": 0.21008674800395966, "logits/rejected": 0.05934596806764603, "logps/chosen": -1442.0106201171875, "logps/rejected": -2874.48388671875, "loss": 0.4615, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2054154872894287, "rewards/margins": 1.4088830947875977, "rewards/rejected": -2.6142985820770264, "step": 1650 }, { "epoch": 0.87, "learning_rate": 2.564306275529341e-07, "logits/chosen": 0.18529877066612244, "logits/rejected": 0.12559422850608826, "logps/chosen": -1704.0299072265625, "logps/rejected": -2808.08349609375, "loss": 0.4591, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4349021911621094, "rewards/margins": 1.1065049171447754, "rewards/rejected": -2.5414071083068848, "step": 1660 }, { "epoch": 0.87, "learning_rate": 2.3665008136077332e-07, "logits/chosen": 0.19881121814250946, "logits/rejected": 0.17202343046665192, "logps/chosen": -1710.2633056640625, "logps/rejected": -1984.5556640625, "loss": 0.473, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3971118927001953, "rewards/margins": 0.33112001419067383, "rewards/rejected": -1.7282320261001587, "step": 1670 }, { "epoch": 0.88, "learning_rate": 2.1762547565553293e-07, "logits/chosen": 0.17657816410064697, "logits/rejected": 0.11265295743942261, "logps/chosen": -1725.0482177734375, "logps/rejected": -1982.76953125, "loss": 0.466, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.5021684169769287, "rewards/margins": 0.2558698058128357, "rewards/rejected": -1.7580381631851196, "step": 1680 }, { "epoch": 0.88, "learning_rate": 1.993631645009747e-07, "logits/chosen": 0.19522444903850555, "logits/rejected": 0.058800529688596725, "logps/chosen": -1578.4208984375, "logps/rejected": -2554.65185546875, "loss": 0.4675, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2845125198364258, "rewards/margins": 1.006216049194336, "rewards/rejected": -2.290728807449341, "step": 1690 }, { "epoch": 0.89, "learning_rate": 1.818692473606748e-07, "logits/chosen": 0.2271948605775833, "logits/rejected": 0.18108686804771423, "logps/chosen": -1478.1927490234375, "logps/rejected": -2156.734375, "loss": 0.4747, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2414519786834717, "rewards/margins": 0.7026728391647339, "rewards/rejected": -1.9441248178482056, "step": 1700 }, { "epoch": 0.9, "learning_rate": 1.6514956706084885e-07, "logits/chosen": 0.23735575377941132, "logits/rejected": 0.11482490599155426, "logps/chosen": -1801.324462890625, "logps/rejected": -2704.887939453125, "loss": 0.4736, "rewards/accuracies": 0.59375, "rewards/chosen": -1.552073359489441, "rewards/margins": 0.8944045901298523, "rewards/rejected": -2.4464781284332275, "step": 1710 }, { "epoch": 0.9, "learning_rate": 1.4920970783889737e-07, "logits/chosen": 0.22280173003673553, "logits/rejected": 0.11919368803501129, "logps/chosen": -1566.2589111328125, "logps/rejected": -2471.8125, "loss": 0.4684, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2630040645599365, "rewards/margins": 0.966931164264679, "rewards/rejected": -2.229935646057129, "step": 1720 }, { "epoch": 0.91, "learning_rate": 1.340549934783164e-07, "logits/chosen": 0.2689998745918274, "logits/rejected": 0.12245997041463852, "logps/chosen": -1098.0716552734375, "logps/rejected": -2332.2578125, "loss": 0.4659, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8609301447868347, "rewards/margins": 1.2441167831420898, "rewards/rejected": -2.1050469875335693, "step": 1730 }, { "epoch": 0.91, "learning_rate": 1.196904855305961e-07, "logits/chosen": 0.2383730709552765, "logits/rejected": 0.15037932991981506, "logps/chosen": -1544.904052734375, "logps/rejected": -2499.219482421875, "loss": 0.4561, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2684749364852905, "rewards/margins": 0.9886786341667175, "rewards/rejected": -2.2571537494659424, "step": 1740 }, { "epoch": 0.92, "learning_rate": 1.0612098162470302e-07, "logits/chosen": 0.20837631821632385, "logits/rejected": 0.1260487288236618, "logps/chosen": -1376.4371337890625, "logps/rejected": -2311.586669921875, "loss": 0.4467, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.126542329788208, "rewards/margins": 0.9472710490226746, "rewards/rejected": -2.0738134384155273, "step": 1750 }, { "epoch": 0.92, "learning_rate": 9.335101386471285e-08, "logits/chosen": 0.2322504222393036, "logits/rejected": 0.06627029925584793, "logps/chosen": -1435.283447265625, "logps/rejected": -2674.15576171875, "loss": 0.4715, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.151064157485962, "rewards/margins": 1.2704349756240845, "rewards/rejected": -2.421499252319336, "step": 1760 }, { "epoch": 0.93, "learning_rate": 8.138484731612273e-08, "logits/chosen": 0.2155352383852005, "logits/rejected": 0.12622274458408356, "logps/chosen": -1182.394287109375, "logps/rejected": -2245.24462890625, "loss": 0.4629, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.9620729684829712, "rewards/margins": 1.0421679019927979, "rewards/rejected": -2.0042405128479004, "step": 1770 }, { "epoch": 0.93, "learning_rate": 7.022647858135501e-08, "logits/chosen": 0.30309510231018066, "logits/rejected": 0.18017789721488953, "logps/chosen": -1599.3291015625, "logps/rejected": -2475.38720703125, "loss": 0.465, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3132779598236084, "rewards/margins": 0.8995476961135864, "rewards/rejected": -2.2128255367279053, "step": 1780 }, { "epoch": 0.94, "learning_rate": 5.987963446492384e-08, "logits/chosen": 0.23334476351737976, "logits/rejected": 0.17126549780368805, "logps/chosen": -1491.8856201171875, "logps/rejected": -2064.55078125, "loss": 0.4679, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2208709716796875, "rewards/margins": 0.6093058586120605, "rewards/rejected": -1.8301767110824585, "step": 1790 }, { "epoch": 0.94, "learning_rate": 5.034777072871394e-08, "logits/chosen": 0.23894283175468445, "logits/rejected": 0.16225464642047882, "logps/chosen": -1209.31494140625, "logps/rejected": -1923.9974365234375, "loss": 0.4748, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.9387443661689758, "rewards/margins": 0.7473281621932983, "rewards/rejected": -1.6860727071762085, "step": 1800 }, { "epoch": 0.95, "learning_rate": 4.163407093778243e-08, "logits/chosen": 0.30054157972335815, "logits/rejected": 0.17386284470558167, "logps/chosen": -1040.991455078125, "logps/rejected": -2445.29541015625, "loss": 0.4516, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7577398419380188, "rewards/margins": 1.4329960346221924, "rewards/rejected": -2.1907360553741455, "step": 1810 }, { "epoch": 0.95, "learning_rate": 3.37414453970758e-08, "logits/chosen": 0.303236186504364, "logits/rejected": 0.1971709430217743, "logps/chosen": -1248.239501953125, "logps/rejected": -2541.384033203125, "loss": 0.4512, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9271729588508606, "rewards/margins": 1.3627898693084717, "rewards/rejected": -2.2899627685546875, "step": 1820 }, { "epoch": 0.96, "learning_rate": 2.6672530179410183e-08, "logits/chosen": 0.25464674830436707, "logits/rejected": 0.13462017476558685, "logps/chosen": -1484.759521484375, "logps/rejected": -2381.3857421875, "loss": 0.4582, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2104871273040771, "rewards/margins": 0.955074667930603, "rewards/rejected": -2.165562152862549, "step": 1830 }, { "epoch": 0.96, "learning_rate": 2.04296862450451e-08, "logits/chosen": 0.34345191717147827, "logits/rejected": 0.1766502857208252, "logps/chosen": -1336.6195068359375, "logps/rejected": -2531.597412109375, "loss": 0.4675, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.0585607290267944, "rewards/margins": 1.2248413562774658, "rewards/rejected": -2.28340220451355, "step": 1840 }, { "epoch": 0.97, "learning_rate": 1.501499865314171e-08, "logits/chosen": 0.31596893072128296, "logits/rejected": 0.17752663791179657, "logps/chosen": -1208.4625244140625, "logps/rejected": -2460.017578125, "loss": 0.4534, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.9176605939865112, "rewards/margins": 1.2910888195037842, "rewards/rejected": -2.208749294281006, "step": 1850 }, { "epoch": 0.97, "learning_rate": 1.0430275865371265e-08, "logits/chosen": 0.30796024203300476, "logits/rejected": 0.15131710469722748, "logps/chosen": -1164.2542724609375, "logps/rejected": -2230.33056640625, "loss": 0.4555, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8924150466918945, "rewards/margins": 1.0938717126846313, "rewards/rejected": -1.9862868785858154, "step": 1860 }, { "epoch": 0.98, "learning_rate": 6.677049141901315e-09, "logits/chosen": 0.26449787616729736, "logits/rejected": 0.12270595878362656, "logps/chosen": -1493.645263671875, "logps/rejected": -2633.17626953125, "loss": 0.4614, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2356170415878296, "rewards/margins": 1.1605656147003174, "rewards/rejected": -2.3961825370788574, "step": 1870 }, { "epoch": 0.98, "learning_rate": 3.756572029968708e-09, "logits/chosen": 0.23211045563220978, "logits/rejected": 0.13400281965732574, "logps/chosen": -1511.829345703125, "logps/rejected": -2489.31494140625, "loss": 0.4594, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2328951358795166, "rewards/margins": 1.0022671222686768, "rewards/rejected": -2.2351622581481934, "step": 1880 }, { "epoch": 0.99, "learning_rate": 1.6698199452053199e-09, "logits/chosen": 0.19983918964862823, "logits/rejected": 0.11516892910003662, "logps/chosen": -1396.664306640625, "logps/rejected": -2378.28857421875, "loss": 0.4543, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1454143524169922, "rewards/margins": 1.0044304132461548, "rewards/rejected": -2.1498446464538574, "step": 1890 }, { "epoch": 0.99, "learning_rate": 4.1748984585560094e-10, "logits/chosen": 0.2773471474647522, "logits/rejected": 0.1175018697977066, "logps/chosen": -1402.577392578125, "logps/rejected": -2661.568603515625, "loss": 0.4649, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.132912039756775, "rewards/margins": 1.262406587600708, "rewards/rejected": -2.3953185081481934, "step": 1900 }, { "epoch": 1.0, "learning_rate": 0.0, "logits/chosen": 0.21633613109588623, "logits/rejected": 0.111175537109375, "logps/chosen": -1688.2955322265625, "logps/rejected": -2615.567626953125, "loss": 0.4707, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.4329384565353394, "rewards/margins": 0.9476302862167358, "rewards/rejected": -2.380568504333496, "step": 1910 }, { "epoch": 1.0, "step": 1910, "total_flos": 0.0, "train_loss": 0.0024642594821790128, "train_runtime": 126.2817, "train_samples_per_second": 484.116, "train_steps_per_second": 15.125 } ], "logging_steps": 10, "max_steps": 1910, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }