{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999183606825047, "eval_steps": 100, "global_step": 1531, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000653114539962446, "grad_norm": 10.320163557955956, "learning_rate": 9.74025974025974e-10, "logits/chosen": -1.751611590385437, "logits/rejected": -1.8014392852783203, "logps/chosen": -475.2503662109375, "logps/rejected": -473.9908752441406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.001306229079924892, "grad_norm": 9.57458924359402, "learning_rate": 1.948051948051948e-09, "logits/chosen": -1.6125147342681885, "logits/rejected": -1.6155394315719604, "logps/chosen": -510.954833984375, "logps/rejected": -456.25836181640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0019593436198873378, "grad_norm": 7.386703525361656, "learning_rate": 2.9220779220779217e-09, "logits/chosen": -1.6310054063796997, "logits/rejected": -1.664994239807129, "logps/chosen": -526.9005737304688, "logps/rejected": -502.1066589355469, "loss": 0.6936, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0020557926036417484, "rewards/margins": -0.0028404928743839264, "rewards/rejected": 0.0007846998050808907, "step": 3 }, { "epoch": 0.002612458159849784, "grad_norm": 8.184972062975513, "learning_rate": 3.896103896103896e-09, "logits/chosen": -1.6248514652252197, "logits/rejected": -1.6408921480178833, "logps/chosen": -452.23651123046875, "logps/rejected": -464.2245178222656, "loss": 0.6927, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0005986166652292013, "rewards/margins": 0.0015139534370973706, "rewards/rejected": -0.0009153365390375257, "step": 4 }, { "epoch": 0.0032655726998122294, "grad_norm": 11.574533999877016, "learning_rate": 4.8701298701298695e-09, "logits/chosen": -1.7286796569824219, "logits/rejected": -1.7585710287094116, "logps/chosen": -462.3341979980469, "logps/rejected": -441.8200378417969, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": -0.00044421182246878743, "rewards/margins": 0.00013704760931432247, "rewards/rejected": -0.0005812598392367363, "step": 5 }, { "epoch": 0.0039186872397746755, "grad_norm": 9.259170331718803, "learning_rate": 5.844155844155843e-09, "logits/chosen": -1.7418220043182373, "logits/rejected": -1.7892532348632812, "logps/chosen": -522.937744140625, "logps/rejected": -552.8088989257812, "loss": 0.6929, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0012052917154505849, "rewards/margins": -0.002461440395563841, "rewards/rejected": 0.0012561489129438996, "step": 6 }, { "epoch": 0.0045718017797371216, "grad_norm": 7.63789930024362, "learning_rate": 6.818181818181818e-09, "logits/chosen": -1.7751970291137695, "logits/rejected": -1.8038901090621948, "logps/chosen": -533.4199829101562, "logps/rejected": -480.9632263183594, "loss": 0.6925, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0024563027545809746, "rewards/margins": 0.0012150906259194016, "rewards/rejected": 0.0012412117794156075, "step": 7 }, { "epoch": 0.005224916319699568, "grad_norm": 9.961517065455444, "learning_rate": 7.792207792207793e-09, "logits/chosen": -1.7411315441131592, "logits/rejected": -1.754071593284607, "logps/chosen": -495.657470703125, "logps/rejected": -475.14959716796875, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00112921220716089, "rewards/margins": -0.0019524528179317713, "rewards/rejected": 0.0008232402033172548, "step": 8 }, { "epoch": 0.005878030859662013, "grad_norm": 6.85854758907971, "learning_rate": 8.766233766233765e-09, "logits/chosen": -1.6711037158966064, "logits/rejected": -1.7036432027816772, "logps/chosen": -506.71331787109375, "logps/rejected": -518.3823852539062, "loss": 0.6932, "rewards/accuracies": 0.46875, "rewards/chosen": 0.00047096985508687794, "rewards/margins": -0.0014907550066709518, "rewards/rejected": 0.0019617248326539993, "step": 9 }, { "epoch": 0.006531145399624459, "grad_norm": 11.552115994362305, "learning_rate": 9.740259740259739e-09, "logits/chosen": -1.6863927841186523, "logits/rejected": -1.6270835399627686, "logps/chosen": -527.3156127929688, "logps/rejected": -595.2274169921875, "loss": 0.6923, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0007424141513183713, "rewards/margins": 0.0002746771788224578, "rewards/rejected": -0.0010170910973101854, "step": 10 }, { "epoch": 0.007184259939586905, "grad_norm": 9.285086199342572, "learning_rate": 1.0714285714285713e-08, "logits/chosen": -1.6245068311691284, "logits/rejected": -1.6487013101577759, "logps/chosen": -552.9703369140625, "logps/rejected": -526.51025390625, "loss": 0.6929, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0022101732902228832, "rewards/margins": 0.0029705429915338755, "rewards/rejected": -0.0007603692938573658, "step": 11 }, { "epoch": 0.007837374479549351, "grad_norm": 6.699656683031416, "learning_rate": 1.1688311688311687e-08, "logits/chosen": -1.7146100997924805, "logits/rejected": -1.7387487888336182, "logps/chosen": -512.6212158203125, "logps/rejected": -510.3314514160156, "loss": 0.6931, "rewards/accuracies": 0.625, "rewards/chosen": -0.001821169862523675, "rewards/margins": 0.0011536835227161646, "rewards/rejected": -0.0029748533852398396, "step": 12 }, { "epoch": 0.008490489019511797, "grad_norm": 7.205708657995355, "learning_rate": 1.2662337662337662e-08, "logits/chosen": -1.7469621896743774, "logits/rejected": -1.7776381969451904, "logps/chosen": -585.2794799804688, "logps/rejected": -501.048828125, "loss": 0.6933, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0012821245472878218, "rewards/margins": 0.001307644764892757, "rewards/rejected": -2.552039222791791e-05, "step": 13 }, { "epoch": 0.009143603559474243, "grad_norm": 17.744991773610803, "learning_rate": 1.3636363636363636e-08, "logits/chosen": -1.704886555671692, "logits/rejected": -1.7239997386932373, "logps/chosen": -522.1668090820312, "logps/rejected": -519.49951171875, "loss": 0.6931, "rewards/accuracies": 0.59375, "rewards/chosen": 0.00018573057604953647, "rewards/margins": 0.0024611069820821285, "rewards/rejected": -0.0022753761149942875, "step": 14 }, { "epoch": 0.00979671809943669, "grad_norm": 6.553103071836206, "learning_rate": 1.461038961038961e-08, "logits/chosen": -1.715399980545044, "logits/rejected": -1.724109411239624, "logps/chosen": -551.634765625, "logps/rejected": -474.11785888671875, "loss": 0.6937, "rewards/accuracies": 0.53125, "rewards/chosen": 0.001085147843696177, "rewards/margins": 0.0005957795074209571, "rewards/rejected": 0.0004893685108982027, "step": 15 }, { "epoch": 0.010449832639399135, "grad_norm": 10.474583885683455, "learning_rate": 1.5584415584415586e-08, "logits/chosen": -1.791701078414917, "logits/rejected": -1.7533316612243652, "logps/chosen": -495.9596252441406, "logps/rejected": -524.2186279296875, "loss": 0.6935, "rewards/accuracies": 0.5, "rewards/chosen": -0.0018585537327453494, "rewards/margins": -0.001228995155543089, "rewards/rejected": -0.0006295585772022605, "step": 16 }, { "epoch": 0.011102947179361581, "grad_norm": 8.585521664394983, "learning_rate": 1.6558441558441556e-08, "logits/chosen": -1.689960241317749, "logits/rejected": -1.660653829574585, "logps/chosen": -493.1070251464844, "logps/rejected": -558.66748046875, "loss": 0.6931, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0004895019810646772, "rewards/margins": 0.0019447661470621824, "rewards/rejected": -0.0014552639331668615, "step": 17 }, { "epoch": 0.011756061719324026, "grad_norm": 7.688284862178414, "learning_rate": 1.753246753246753e-08, "logits/chosen": -1.634385108947754, "logits/rejected": -1.6527695655822754, "logps/chosen": -476.8695068359375, "logps/rejected": -481.3186950683594, "loss": 0.6935, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0008362793596461415, "rewards/margins": -0.0002491784398443997, "rewards/rejected": 0.0010854577412828803, "step": 18 }, { "epoch": 0.012409176259286472, "grad_norm": 11.300638272495242, "learning_rate": 1.8506493506493504e-08, "logits/chosen": -1.7336674928665161, "logits/rejected": -1.7424148321151733, "logps/chosen": -555.4613037109375, "logps/rejected": -531.6415405273438, "loss": 0.693, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0010435438016429543, "rewards/margins": -0.0002537918044254184, "rewards/rejected": 0.001297335489653051, "step": 19 }, { "epoch": 0.013062290799248918, "grad_norm": 7.209810609202922, "learning_rate": 1.9480519480519478e-08, "logits/chosen": -1.6538033485412598, "logits/rejected": -1.676405668258667, "logps/chosen": -535.2705688476562, "logps/rejected": -540.2841186523438, "loss": 0.6926, "rewards/accuracies": 0.4375, "rewards/chosen": -0.000612707226537168, "rewards/margins": -0.0002046869631158188, "rewards/rejected": -0.0004080201033502817, "step": 20 }, { "epoch": 0.013715405339211364, "grad_norm": 9.239564322245995, "learning_rate": 2.0454545454545452e-08, "logits/chosen": -1.6935447454452515, "logits/rejected": -1.7362101078033447, "logps/chosen": -483.9548034667969, "logps/rejected": -443.07958984375, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00011293424176983535, "rewards/margins": 0.0016996811609715223, "rewards/rejected": -0.0018126153154298663, "step": 21 }, { "epoch": 0.01436851987917381, "grad_norm": 11.953662843213952, "learning_rate": 2.1428571428571426e-08, "logits/chosen": -1.649803638458252, "logits/rejected": -1.6609901189804077, "logps/chosen": -556.7703247070312, "logps/rejected": -528.4605102539062, "loss": 0.6941, "rewards/accuracies": 0.375, "rewards/chosen": 0.0003633833257481456, "rewards/margins": -0.001797966891899705, "rewards/rejected": 0.0021613501012325287, "step": 22 }, { "epoch": 0.015021634419136256, "grad_norm": 5.585424980323363, "learning_rate": 2.24025974025974e-08, "logits/chosen": -1.7381466627120972, "logits/rejected": -1.7497470378875732, "logps/chosen": -470.3625183105469, "logps/rejected": -458.3432312011719, "loss": 0.6932, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0006353663047775626, "rewards/margins": -0.0016739725833758712, "rewards/rejected": 0.002309338888153434, "step": 23 }, { "epoch": 0.015674748959098702, "grad_norm": 12.282080216373183, "learning_rate": 2.3376623376623374e-08, "logits/chosen": -1.6787421703338623, "logits/rejected": -1.6916463375091553, "logps/chosen": -500.0643615722656, "logps/rejected": -530.3701171875, "loss": 0.6923, "rewards/accuracies": 0.5, "rewards/chosen": 0.00024698246852494776, "rewards/margins": 0.0018526006024330854, "rewards/rejected": -0.0016056179301813245, "step": 24 }, { "epoch": 0.016327863499061148, "grad_norm": 6.151170677406452, "learning_rate": 2.435064935064935e-08, "logits/chosen": -1.730494499206543, "logits/rejected": -1.7640714645385742, "logps/chosen": -567.3126831054688, "logps/rejected": -524.96630859375, "loss": 0.694, "rewards/accuracies": 0.40625, "rewards/chosen": 0.002176732989028096, "rewards/margins": -0.0018334074411541224, "rewards/rejected": 0.004010140895843506, "step": 25 }, { "epoch": 0.016980978039023594, "grad_norm": 6.027535384301359, "learning_rate": 2.5324675324675325e-08, "logits/chosen": -1.700589895248413, "logits/rejected": -1.655698537826538, "logps/chosen": -525.3576049804688, "logps/rejected": -536.69384765625, "loss": 0.6926, "rewards/accuracies": 0.4375, "rewards/chosen": 0.002966861240565777, "rewards/margins": -0.00015424739103764296, "rewards/rejected": 0.0031211089808493853, "step": 26 }, { "epoch": 0.01763409257898604, "grad_norm": 10.92870471034886, "learning_rate": 2.62987012987013e-08, "logits/chosen": -1.7824229001998901, "logits/rejected": -1.7060723304748535, "logps/chosen": -499.6011962890625, "logps/rejected": -459.31390380859375, "loss": 0.6928, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0014267514925450087, "rewards/margins": -0.000579934217967093, "rewards/rejected": -0.0008468173909932375, "step": 27 }, { "epoch": 0.018287207118948486, "grad_norm": 10.356153154714054, "learning_rate": 2.7272727272727272e-08, "logits/chosen": -1.7105991840362549, "logits/rejected": -1.704366683959961, "logps/chosen": -532.1188354492188, "logps/rejected": -546.0477905273438, "loss": 0.6924, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0009431360522285104, "rewards/margins": 0.0037088487297296524, "rewards/rejected": -0.00276571256108582, "step": 28 }, { "epoch": 0.018940321658910932, "grad_norm": 6.73381910894591, "learning_rate": 2.8246753246753246e-08, "logits/chosen": -1.651279330253601, "logits/rejected": -1.6885946989059448, "logps/chosen": -498.8572998046875, "logps/rejected": -454.43701171875, "loss": 0.6935, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0029693078249692917, "rewards/margins": 0.00039883877616375685, "rewards/rejected": 0.0025704693980515003, "step": 29 }, { "epoch": 0.01959343619887338, "grad_norm": 9.815644247246649, "learning_rate": 2.922077922077922e-08, "logits/chosen": -1.659732699394226, "logits/rejected": -1.6510050296783447, "logps/chosen": -527.5221557617188, "logps/rejected": -572.2091064453125, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": -0.00042819028021767735, "rewards/margins": 0.0010157201904803514, "rewards/rejected": -0.0014439105289056897, "step": 30 }, { "epoch": 0.020246550738835824, "grad_norm": 6.574072733494995, "learning_rate": 3.01948051948052e-08, "logits/chosen": -1.7301993370056152, "logits/rejected": -1.7147951126098633, "logps/chosen": -495.9303894042969, "logps/rejected": -487.27630615234375, "loss": 0.6937, "rewards/accuracies": 0.625, "rewards/chosen": 0.001899483148008585, "rewards/margins": 0.0021509858779609203, "rewards/rejected": -0.00025150307919830084, "step": 31 }, { "epoch": 0.02089966527879827, "grad_norm": 6.336712001931794, "learning_rate": 3.116883116883117e-08, "logits/chosen": -1.769822597503662, "logits/rejected": -1.7909101247787476, "logps/chosen": -466.05523681640625, "logps/rejected": -443.83001708984375, "loss": 0.6939, "rewards/accuracies": 0.34375, "rewards/chosen": -0.002961869351565838, "rewards/margins": -0.004024825058877468, "rewards/rejected": 0.0010629557073116302, "step": 32 }, { "epoch": 0.021552779818760717, "grad_norm": 7.984443377717489, "learning_rate": 3.214285714285714e-08, "logits/chosen": -1.6552331447601318, "logits/rejected": -1.6343588829040527, "logps/chosen": -473.5636901855469, "logps/rejected": -490.8223876953125, "loss": 0.6935, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0031329344492405653, "rewards/margins": 0.0010508847190067172, "rewards/rejected": -0.004183819051831961, "step": 33 }, { "epoch": 0.022205894358723163, "grad_norm": 6.978522846761633, "learning_rate": 3.311688311688311e-08, "logits/chosen": -1.685523271560669, "logits/rejected": -1.6947942972183228, "logps/chosen": -488.18597412109375, "logps/rejected": -450.8877258300781, "loss": 0.6926, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0016193913761526346, "rewards/margins": 0.001421906752511859, "rewards/rejected": 0.00019748439081013203, "step": 34 }, { "epoch": 0.02285900889868561, "grad_norm": 7.493471887712862, "learning_rate": 3.4090909090909086e-08, "logits/chosen": -1.711535096168518, "logits/rejected": -1.7027740478515625, "logps/chosen": -518.72314453125, "logps/rejected": -475.7657165527344, "loss": 0.6931, "rewards/accuracies": 0.625, "rewards/chosen": 0.0004046607355121523, "rewards/margins": 0.001060183160007, "rewards/rejected": -0.0006555224535986781, "step": 35 }, { "epoch": 0.02351212343864805, "grad_norm": 11.205930629830023, "learning_rate": 3.506493506493506e-08, "logits/chosen": -1.7367273569107056, "logits/rejected": -1.762798547744751, "logps/chosen": -453.77130126953125, "logps/rejected": -481.04730224609375, "loss": 0.6929, "rewards/accuracies": 0.53125, "rewards/chosen": -0.00015169614925980568, "rewards/margins": 0.0007772659882903099, "rewards/rejected": -0.0009289622539654374, "step": 36 }, { "epoch": 0.024165237978610497, "grad_norm": 9.467273081236305, "learning_rate": 3.6038961038961034e-08, "logits/chosen": -1.7246737480163574, "logits/rejected": -1.7480111122131348, "logps/chosen": -466.54998779296875, "logps/rejected": -427.57373046875, "loss": 0.6931, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0015318728983402252, "rewards/margins": 0.0017306231893599033, "rewards/rejected": -0.00019875055295415223, "step": 37 }, { "epoch": 0.024818352518572943, "grad_norm": 9.156288457593233, "learning_rate": 3.701298701298701e-08, "logits/chosen": -1.6512861251831055, "logits/rejected": -1.690797209739685, "logps/chosen": -509.2376708984375, "logps/rejected": -528.0012817382812, "loss": 0.6931, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0029702354222536087, "rewards/margins": 0.0016618178924545646, "rewards/rejected": 0.0013084171805530787, "step": 38 }, { "epoch": 0.02547146705853539, "grad_norm": 17.95636045906593, "learning_rate": 3.798701298701298e-08, "logits/chosen": -1.6484848260879517, "logits/rejected": -1.6626923084259033, "logps/chosen": -444.7795715332031, "logps/rejected": -453.7108154296875, "loss": 0.6934, "rewards/accuracies": 0.53125, "rewards/chosen": -0.000330042967107147, "rewards/margins": 0.0012718366924673319, "rewards/rejected": -0.001601879601366818, "step": 39 }, { "epoch": 0.026124581598497836, "grad_norm": 8.845607047635447, "learning_rate": 3.8961038961038956e-08, "logits/chosen": -1.6774462461471558, "logits/rejected": -1.6941050291061401, "logps/chosen": -490.5712890625, "logps/rejected": -565.9147338867188, "loss": 0.6937, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0004610727773979306, "rewards/margins": -0.0016047237440943718, "rewards/rejected": 0.002065796870738268, "step": 40 }, { "epoch": 0.02677769613846028, "grad_norm": 9.919258253862095, "learning_rate": 3.993506493506493e-08, "logits/chosen": -1.6783130168914795, "logits/rejected": -1.6832352876663208, "logps/chosen": -424.7172546386719, "logps/rejected": -463.73919677734375, "loss": 0.6929, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0029250048100948334, "rewards/margins": 0.00039243214996531606, "rewards/rejected": 0.0025325727183371782, "step": 41 }, { "epoch": 0.027430810678422728, "grad_norm": 7.033725232378199, "learning_rate": 4.0909090909090904e-08, "logits/chosen": -1.7260679006576538, "logits/rejected": -1.7201330661773682, "logps/chosen": -540.1764526367188, "logps/rejected": -501.5211486816406, "loss": 0.6926, "rewards/accuracies": 0.5625, "rewards/chosen": 2.3270258679986e-06, "rewards/margins": 0.0007927274564281106, "rewards/rejected": -0.0007904003723524511, "step": 42 }, { "epoch": 0.028083925218385174, "grad_norm": 8.639855374503854, "learning_rate": 4.188311688311688e-08, "logits/chosen": -1.7189686298370361, "logits/rejected": -1.7004797458648682, "logps/chosen": -505.0670471191406, "logps/rejected": -522.9998168945312, "loss": 0.6933, "rewards/accuracies": 0.34375, "rewards/chosen": 0.0024530505761504173, "rewards/margins": -0.0023120976984500885, "rewards/rejected": 0.004765148274600506, "step": 43 }, { "epoch": 0.02873703975834762, "grad_norm": 7.855142409694213, "learning_rate": 4.285714285714285e-08, "logits/chosen": -1.8125633001327515, "logits/rejected": -1.8358653783798218, "logps/chosen": -449.2396545410156, "logps/rejected": -412.31622314453125, "loss": 0.692, "rewards/accuracies": 0.625, "rewards/chosen": -7.799867307767272e-05, "rewards/margins": 0.0013592124450951815, "rewards/rejected": -0.0014372109435498714, "step": 44 }, { "epoch": 0.029390154298310066, "grad_norm": 14.0615352021285, "learning_rate": 4.3831168831168825e-08, "logits/chosen": -1.6212176084518433, "logits/rejected": -1.7029989957809448, "logps/chosen": -485.98822021484375, "logps/rejected": -451.6901550292969, "loss": 0.6925, "rewards/accuracies": 0.5, "rewards/chosen": 0.002106280066072941, "rewards/margins": -0.00013485189992934465, "rewards/rejected": 0.0022411320824176073, "step": 45 }, { "epoch": 0.030043268838272512, "grad_norm": 13.882948188534005, "learning_rate": 4.48051948051948e-08, "logits/chosen": -1.7054741382598877, "logits/rejected": -1.6707385778427124, "logps/chosen": -523.1839599609375, "logps/rejected": -538.1018676757812, "loss": 0.6925, "rewards/accuracies": 0.4375, "rewards/chosen": 6.611342541873455e-05, "rewards/margins": -0.0016670847544446588, "rewards/rejected": 0.0017331981798633933, "step": 46 }, { "epoch": 0.030696383378234958, "grad_norm": 9.843375032358303, "learning_rate": 4.577922077922077e-08, "logits/chosen": -1.7931544780731201, "logits/rejected": -1.8134924173355103, "logps/chosen": -576.3263549804688, "logps/rejected": -531.6103515625, "loss": 0.6925, "rewards/accuracies": 0.40625, "rewards/chosen": -0.00019430162501521409, "rewards/margins": -0.0015074944822117686, "rewards/rejected": 0.0013131927698850632, "step": 47 }, { "epoch": 0.031349497918197404, "grad_norm": 13.08440244244893, "learning_rate": 4.675324675324675e-08, "logits/chosen": -1.728432536125183, "logits/rejected": -1.687523365020752, "logps/chosen": -424.1823425292969, "logps/rejected": -436.7662353515625, "loss": 0.6932, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0026053618639707565, "rewards/margins": 0.0011907146545127034, "rewards/rejected": 0.0014146470930427313, "step": 48 }, { "epoch": 0.03200261245815985, "grad_norm": 9.954241633042399, "learning_rate": 4.772727272727273e-08, "logits/chosen": -1.7366923093795776, "logits/rejected": -1.7875515222549438, "logps/chosen": -482.4845886230469, "logps/rejected": -462.21734619140625, "loss": 0.6931, "rewards/accuracies": 0.46875, "rewards/chosen": 0.002501377835869789, "rewards/margins": 0.0017617797711864114, "rewards/rejected": 0.0007395982975140214, "step": 49 }, { "epoch": 0.032655726998122296, "grad_norm": 7.516649010433926, "learning_rate": 4.87012987012987e-08, "logits/chosen": -1.643943190574646, "logits/rejected": -1.6239250898361206, "logps/chosen": -571.3704223632812, "logps/rejected": -552.835205078125, "loss": 0.6933, "rewards/accuracies": 0.5, "rewards/chosen": 0.0005948520265519619, "rewards/margins": -0.001149489893577993, "rewards/rejected": 0.0017443416872993112, "step": 50 }, { "epoch": 0.03330884153808474, "grad_norm": 8.673266186430727, "learning_rate": 4.9675324675324675e-08, "logits/chosen": -1.654160499572754, "logits/rejected": -1.6642738580703735, "logps/chosen": -568.924072265625, "logps/rejected": -502.4623107910156, "loss": 0.6928, "rewards/accuracies": 0.46875, "rewards/chosen": 0.003631243482232094, "rewards/margins": 0.002229990903288126, "rewards/rejected": 0.0014012528117746115, "step": 51 }, { "epoch": 0.03396195607804719, "grad_norm": 5.760323905203282, "learning_rate": 5.064935064935065e-08, "logits/chosen": -1.7572216987609863, "logits/rejected": -1.746352195739746, "logps/chosen": -500.9621276855469, "logps/rejected": -516.9683837890625, "loss": 0.6923, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0025606201961636543, "rewards/margins": 0.0012761401012539864, "rewards/rejected": 0.0012844798620790243, "step": 52 }, { "epoch": 0.034615070618009634, "grad_norm": 13.745858132264178, "learning_rate": 5.162337662337662e-08, "logits/chosen": -1.6922714710235596, "logits/rejected": -1.6829943656921387, "logps/chosen": -521.5164794921875, "logps/rejected": -511.12115478515625, "loss": 0.6929, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0012258647475391626, "rewards/margins": 0.0017541146371513605, "rewards/rejected": -0.0005282496567815542, "step": 53 }, { "epoch": 0.03526818515797208, "grad_norm": 9.901929656057623, "learning_rate": 5.25974025974026e-08, "logits/chosen": -1.6156866550445557, "logits/rejected": -1.6489068269729614, "logps/chosen": -504.8508605957031, "logps/rejected": -510.78070068359375, "loss": 0.6923, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0022437286097556353, "rewards/margins": 0.0005393887404352427, "rewards/rejected": 0.0017043398693203926, "step": 54 }, { "epoch": 0.035921299697934526, "grad_norm": 7.628193867040592, "learning_rate": 5.357142857142857e-08, "logits/chosen": -1.7694371938705444, "logits/rejected": -1.7641582489013672, "logps/chosen": -497.0212707519531, "logps/rejected": -577.7604370117188, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.00022474752040579915, "rewards/margins": 0.0001437664614059031, "rewards/rejected": 8.098152466118336e-05, "step": 55 }, { "epoch": 0.03657441423789697, "grad_norm": 6.526308952187627, "learning_rate": 5.4545454545454545e-08, "logits/chosen": -1.6863549947738647, "logits/rejected": -1.726689100265503, "logps/chosen": -570.279052734375, "logps/rejected": -529.3751831054688, "loss": 0.693, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0028358842246234417, "rewards/margins": -0.0016305134631693363, "rewards/rejected": -0.00120537041220814, "step": 56 }, { "epoch": 0.03722752877785942, "grad_norm": 7.971501408541291, "learning_rate": 5.551948051948052e-08, "logits/chosen": -1.7459444999694824, "logits/rejected": -1.8165135383605957, "logps/chosen": -500.9310302734375, "logps/rejected": -476.9623107910156, "loss": 0.6933, "rewards/accuracies": 0.46875, "rewards/chosen": -0.002534267958253622, "rewards/margins": -0.00011749513214454055, "rewards/rejected": -0.0024167723022401333, "step": 57 }, { "epoch": 0.037880643317821865, "grad_norm": 19.843826487104433, "learning_rate": 5.649350649350649e-08, "logits/chosen": -1.6915953159332275, "logits/rejected": -1.7043017148971558, "logps/chosen": -437.893310546875, "logps/rejected": -444.86895751953125, "loss": 0.6934, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0005522965220734477, "rewards/margins": -0.0005562017904594541, "rewards/rejected": 3.905253834091127e-06, "step": 58 }, { "epoch": 0.03853375785778431, "grad_norm": 17.45299457584043, "learning_rate": 5.7467532467532466e-08, "logits/chosen": -1.7828876972198486, "logits/rejected": -1.7247426509857178, "logps/chosen": -489.2809753417969, "logps/rejected": -575.0067138671875, "loss": 0.6935, "rewards/accuracies": 0.5, "rewards/chosen": -0.004004344809800386, "rewards/margins": -0.001498160301707685, "rewards/rejected": -0.0025061843916773796, "step": 59 }, { "epoch": 0.03918687239774676, "grad_norm": 7.706775067267686, "learning_rate": 5.844155844155844e-08, "logits/chosen": -1.7627689838409424, "logits/rejected": -1.7395163774490356, "logps/chosen": -543.8458251953125, "logps/rejected": -535.2295532226562, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": 0.002326030982658267, "rewards/margins": 0.0011458850931376219, "rewards/rejected": 0.0011801458895206451, "step": 60 }, { "epoch": 0.0398399869377092, "grad_norm": 6.253034693514966, "learning_rate": 5.9415584415584414e-08, "logits/chosen": -1.6752774715423584, "logits/rejected": -1.6769983768463135, "logps/chosen": -513.5790405273438, "logps/rejected": -491.4671630859375, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 4.048121627420187e-05, "rewards/margins": 0.00097788090351969, "rewards/rejected": -0.0009373998618684709, "step": 61 }, { "epoch": 0.04049310147767165, "grad_norm": 8.25357941061994, "learning_rate": 6.03896103896104e-08, "logits/chosen": -1.7725433111190796, "logits/rejected": -1.8016797304153442, "logps/chosen": -504.8186950683594, "logps/rejected": -534.6757202148438, "loss": 0.6929, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0038966606371104717, "rewards/margins": 0.0019743391312658787, "rewards/rejected": 0.0019223212730139494, "step": 62 }, { "epoch": 0.041146216017634095, "grad_norm": 7.603828708440927, "learning_rate": 6.136363636363636e-08, "logits/chosen": -1.784754991531372, "logits/rejected": -1.7742725610733032, "logps/chosen": -547.2255249023438, "logps/rejected": -546.972412109375, "loss": 0.6936, "rewards/accuracies": 0.46875, "rewards/chosen": 0.002894229721277952, "rewards/margins": -0.0006492708926089108, "rewards/rejected": 0.0035435007885098457, "step": 63 }, { "epoch": 0.04179933055759654, "grad_norm": 15.969939708450756, "learning_rate": 6.233766233766234e-08, "logits/chosen": -1.654356837272644, "logits/rejected": -1.6166218519210815, "logps/chosen": -488.8111877441406, "logps/rejected": -569.5529174804688, "loss": 0.692, "rewards/accuracies": 0.71875, "rewards/chosen": 0.003502319101244211, "rewards/margins": 0.005036606453359127, "rewards/rejected": -0.0015342880506068468, "step": 64 }, { "epoch": 0.04245244509755899, "grad_norm": 22.983780138981498, "learning_rate": 6.331168831168831e-08, "logits/chosen": -1.7115808725357056, "logits/rejected": -1.7262012958526611, "logps/chosen": -499.15399169921875, "logps/rejected": -488.51214599609375, "loss": 0.6929, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0016056394670158625, "rewards/margins": 0.0008727168897166848, "rewards/rejected": 0.0007329225772991776, "step": 65 }, { "epoch": 0.04310555963752143, "grad_norm": 11.044651160280129, "learning_rate": 6.428571428571428e-08, "logits/chosen": -1.795534610748291, "logits/rejected": -1.799638271331787, "logps/chosen": -533.3612670898438, "logps/rejected": -621.8697509765625, "loss": 0.6923, "rewards/accuracies": 0.59375, "rewards/chosen": 0.001967411022633314, "rewards/margins": 0.003664374118670821, "rewards/rejected": -0.001696963096037507, "step": 66 }, { "epoch": 0.04375867417748388, "grad_norm": 25.764603545634195, "learning_rate": 6.525974025974026e-08, "logits/chosen": -1.7212399244308472, "logits/rejected": -1.6900732517242432, "logps/chosen": -433.75335693359375, "logps/rejected": -483.9370422363281, "loss": 0.6929, "rewards/accuracies": 0.6875, "rewards/chosen": 0.005031700246036053, "rewards/margins": 0.00277211656793952, "rewards/rejected": 0.002259582979604602, "step": 67 }, { "epoch": 0.044411788717446325, "grad_norm": 7.6456802404761985, "learning_rate": 6.623376623376622e-08, "logits/chosen": -1.8252774477005005, "logits/rejected": -1.7665594816207886, "logps/chosen": -505.1371765136719, "logps/rejected": -519.2654418945312, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": -7.071479922160506e-05, "rewards/margins": 0.0002764108357951045, "rewards/rejected": -0.00034712543128989637, "step": 68 }, { "epoch": 0.04506490325740877, "grad_norm": 22.249196941026028, "learning_rate": 6.72077922077922e-08, "logits/chosen": -1.7685558795928955, "logits/rejected": -1.754807710647583, "logps/chosen": -498.36309814453125, "logps/rejected": -479.595947265625, "loss": 0.6927, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0044998289085924625, "rewards/margins": 0.0033680484630167484, "rewards/rejected": 0.0011317802127450705, "step": 69 }, { "epoch": 0.04571801779737122, "grad_norm": 18.41888656875299, "learning_rate": 6.818181818181817e-08, "logits/chosen": -1.6847811937332153, "logits/rejected": -1.7150285243988037, "logps/chosen": -521.6116333007812, "logps/rejected": -501.6766662597656, "loss": 0.6926, "rewards/accuracies": 0.65625, "rewards/chosen": 0.00515484344214201, "rewards/margins": 0.0027162095066159964, "rewards/rejected": 0.0024386332370340824, "step": 70 }, { "epoch": 0.046371132337333656, "grad_norm": 7.5323060443272425, "learning_rate": 6.915584415584415e-08, "logits/chosen": -1.6845817565917969, "logits/rejected": -1.6868298053741455, "logps/chosen": -509.47174072265625, "logps/rejected": -497.8253173828125, "loss": 0.6923, "rewards/accuracies": 0.65625, "rewards/chosen": 0.006278076209127903, "rewards/margins": 0.003210592083632946, "rewards/rejected": 0.0030674838926643133, "step": 71 }, { "epoch": 0.0470242468772961, "grad_norm": 11.518462131512745, "learning_rate": 7.012987012987012e-08, "logits/chosen": -1.689769983291626, "logits/rejected": -1.6752269268035889, "logps/chosen": -508.0777587890625, "logps/rejected": -497.0141906738281, "loss": 0.6925, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0023371409624814987, "rewards/margins": -0.0009353661444038153, "rewards/rejected": 0.0032725068740546703, "step": 72 }, { "epoch": 0.04767736141725855, "grad_norm": 13.464519389398141, "learning_rate": 7.11038961038961e-08, "logits/chosen": -1.7256083488464355, "logits/rejected": -1.7064285278320312, "logps/chosen": -528.0186157226562, "logps/rejected": -521.5746459960938, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0029949708841741085, "rewards/margins": 0.0013601587852463126, "rewards/rejected": 0.0016348123317584395, "step": 73 }, { "epoch": 0.048330475957220995, "grad_norm": 20.376786509212838, "learning_rate": 7.207792207792207e-08, "logits/chosen": -1.641348123550415, "logits/rejected": -1.6459161043167114, "logps/chosen": -482.44732666015625, "logps/rejected": -473.39190673828125, "loss": 0.6926, "rewards/accuracies": 0.40625, "rewards/chosen": 0.0051408931612968445, "rewards/margins": -0.0015343116829171777, "rewards/rejected": 0.006675205193459988, "step": 74 }, { "epoch": 0.04898359049718344, "grad_norm": 6.970678593146312, "learning_rate": 7.305194805194805e-08, "logits/chosen": -1.7044446468353271, "logits/rejected": -1.68734610080719, "logps/chosen": -502.9562072753906, "logps/rejected": -517.5908813476562, "loss": 0.6922, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0027034569066017866, "rewards/margins": 0.0027098727878183126, "rewards/rejected": -6.4158812165260315e-06, "step": 75 }, { "epoch": 0.04963670503714589, "grad_norm": 9.6526922150323, "learning_rate": 7.402597402597402e-08, "logits/chosen": -1.7046608924865723, "logits/rejected": -1.7354328632354736, "logps/chosen": -535.493408203125, "logps/rejected": -594.682861328125, "loss": 0.6927, "rewards/accuracies": 0.34375, "rewards/chosen": -0.004995846655219793, "rewards/margins": -0.0013705159071832895, "rewards/rejected": -0.0036253309808671474, "step": 76 }, { "epoch": 0.05028981957710833, "grad_norm": 7.329104460905438, "learning_rate": 7.5e-08, "logits/chosen": -1.7437448501586914, "logits/rejected": -1.786738395690918, "logps/chosen": -503.44281005859375, "logps/rejected": -479.0059814453125, "loss": 0.6907, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0025321291759610176, "rewards/margins": 0.007668951991945505, "rewards/rejected": -0.005136823281645775, "step": 77 }, { "epoch": 0.05094293411707078, "grad_norm": 12.493258401725456, "learning_rate": 7.597402597402596e-08, "logits/chosen": -1.7119678258895874, "logits/rejected": -1.7332830429077148, "logps/chosen": -488.6138916015625, "logps/rejected": -485.7553405761719, "loss": 0.6921, "rewards/accuracies": 0.65625, "rewards/chosen": 0.004445738159120083, "rewards/margins": 0.002824036870151758, "rewards/rejected": 0.001621701754629612, "step": 78 }, { "epoch": 0.051596048657033225, "grad_norm": 8.01136858731629, "learning_rate": 7.694805194805194e-08, "logits/chosen": -1.7600123882293701, "logits/rejected": -1.7657560110092163, "logps/chosen": -527.957763671875, "logps/rejected": -547.0194702148438, "loss": 0.6924, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0005399225628934801, "rewards/margins": 0.0007603027625009418, "rewards/rejected": -0.00022038002498447895, "step": 79 }, { "epoch": 0.05224916319699567, "grad_norm": 15.784986319963485, "learning_rate": 7.792207792207791e-08, "logits/chosen": -1.7285094261169434, "logits/rejected": -1.712064504623413, "logps/chosen": -521.34814453125, "logps/rejected": -482.8376159667969, "loss": 0.6921, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0010725975735113025, "rewards/margins": -0.0014756344025954604, "rewards/rejected": 0.00040303694549947977, "step": 80 }, { "epoch": 0.05290227773695812, "grad_norm": 8.657892032924215, "learning_rate": 7.889610389610389e-08, "logits/chosen": -1.6536437273025513, "logits/rejected": -1.6625897884368896, "logps/chosen": -549.460205078125, "logps/rejected": -534.7042236328125, "loss": 0.6927, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0003026010235771537, "rewards/margins": 0.0008853006293065846, "rewards/rejected": -0.0005827000131830573, "step": 81 }, { "epoch": 0.05355539227692056, "grad_norm": 10.870568795609778, "learning_rate": 7.987012987012986e-08, "logits/chosen": -1.6692464351654053, "logits/rejected": -1.6596240997314453, "logps/chosen": -506.01922607421875, "logps/rejected": -524.4590454101562, "loss": 0.6917, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0017397881019860506, "rewards/margins": 0.005637907888740301, "rewards/rejected": -0.003898120019584894, "step": 82 }, { "epoch": 0.05420850681688301, "grad_norm": 5.851977319869193, "learning_rate": 8.084415584415584e-08, "logits/chosen": -1.738106608390808, "logits/rejected": -1.7303545475006104, "logps/chosen": -482.5537414550781, "logps/rejected": -484.3572998046875, "loss": 0.6922, "rewards/accuracies": 0.625, "rewards/chosen": 0.0022105362731963396, "rewards/margins": 0.001744122477248311, "rewards/rejected": 0.00046641333028674126, "step": 83 }, { "epoch": 0.054861621356845455, "grad_norm": 5.9816373124178375, "learning_rate": 8.181818181818181e-08, "logits/chosen": -1.7219839096069336, "logits/rejected": -1.7243647575378418, "logps/chosen": -531.4603881835938, "logps/rejected": -624.1871337890625, "loss": 0.6912, "rewards/accuracies": 0.625, "rewards/chosen": -0.001187248039059341, "rewards/margins": 0.008130445145070553, "rewards/rejected": -0.009317693300545216, "step": 84 }, { "epoch": 0.0555147358968079, "grad_norm": 17.14950100054283, "learning_rate": 8.279220779220779e-08, "logits/chosen": -1.7397611141204834, "logits/rejected": -1.7358020544052124, "logps/chosen": -475.4699401855469, "logps/rejected": -454.5489501953125, "loss": 0.6923, "rewards/accuracies": 0.46875, "rewards/chosen": 0.004636278375983238, "rewards/margins": 0.0002992916852235794, "rewards/rejected": 0.004336986690759659, "step": 85 }, { "epoch": 0.05616785043677035, "grad_norm": 10.798912022369151, "learning_rate": 8.376623376623376e-08, "logits/chosen": -1.6888103485107422, "logits/rejected": -1.6987148523330688, "logps/chosen": -572.60400390625, "logps/rejected": -599.0338745117188, "loss": 0.691, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0021805190481245518, "rewards/margins": 0.006904205307364464, "rewards/rejected": -0.004723686724901199, "step": 86 }, { "epoch": 0.05682096497673279, "grad_norm": 5.848424010532979, "learning_rate": 8.474025974025974e-08, "logits/chosen": -1.7512600421905518, "logits/rejected": -1.77037513256073, "logps/chosen": -575.0525512695312, "logps/rejected": -551.678955078125, "loss": 0.692, "rewards/accuracies": 0.53125, "rewards/chosen": 0.000223713053856045, "rewards/margins": 6.933440454304218e-05, "rewards/rejected": 0.00015437835827469826, "step": 87 }, { "epoch": 0.05747407951669524, "grad_norm": 9.540765629514766, "learning_rate": 8.57142857142857e-08, "logits/chosen": -1.6761497259140015, "logits/rejected": -1.6493499279022217, "logps/chosen": -565.0869140625, "logps/rejected": -575.0073852539062, "loss": 0.6916, "rewards/accuracies": 0.71875, "rewards/chosen": -0.001043493626639247, "rewards/margins": 0.004106073174625635, "rewards/rejected": -0.0051495665684342384, "step": 88 }, { "epoch": 0.058127194056657686, "grad_norm": 10.17514292490149, "learning_rate": 8.668831168831168e-08, "logits/chosen": -1.8012816905975342, "logits/rejected": -1.7821853160858154, "logps/chosen": -503.46295166015625, "logps/rejected": -492.86474609375, "loss": 0.6923, "rewards/accuracies": 0.375, "rewards/chosen": 0.0033976598642766476, "rewards/margins": -0.0017642759485170245, "rewards/rejected": 0.005161936394870281, "step": 89 }, { "epoch": 0.05878030859662013, "grad_norm": 9.087384783413304, "learning_rate": 8.766233766233765e-08, "logits/chosen": -1.6532089710235596, "logits/rejected": -1.657583475112915, "logps/chosen": -515.2310180664062, "logps/rejected": -509.0292053222656, "loss": 0.6916, "rewards/accuracies": 0.5, "rewards/chosen": 0.0025243638083338737, "rewards/margins": 0.0019023300847038627, "rewards/rejected": 0.0006220341892912984, "step": 90 }, { "epoch": 0.05943342313658258, "grad_norm": 11.600432401053613, "learning_rate": 8.863636363636363e-08, "logits/chosen": -1.7180678844451904, "logits/rejected": -1.702775478363037, "logps/chosen": -534.7321166992188, "logps/rejected": -540.93994140625, "loss": 0.6914, "rewards/accuracies": 0.625, "rewards/chosen": 0.004910244606435299, "rewards/margins": 0.003610987449064851, "rewards/rejected": 0.0012992569245398045, "step": 91 }, { "epoch": 0.060086537676545024, "grad_norm": 19.03876328807656, "learning_rate": 8.96103896103896e-08, "logits/chosen": -1.7077628374099731, "logits/rejected": -1.7351174354553223, "logps/chosen": -529.73681640625, "logps/rejected": -508.55828857421875, "loss": 0.6917, "rewards/accuracies": 0.53125, "rewards/chosen": -0.003127522300928831, "rewards/margins": 0.00047443623770959675, "rewards/rejected": -0.0036019585095345974, "step": 92 }, { "epoch": 0.06073965221650747, "grad_norm": 16.589555191285804, "learning_rate": 9.058441558441558e-08, "logits/chosen": -1.6833391189575195, "logits/rejected": -1.674033284187317, "logps/chosen": -477.49798583984375, "logps/rejected": -464.76336669921875, "loss": 0.6915, "rewards/accuracies": 0.53125, "rewards/chosen": 0.001920571201480925, "rewards/margins": 0.0017885612323880196, "rewards/rejected": 0.00013200979446992278, "step": 93 }, { "epoch": 0.061392766756469916, "grad_norm": 21.483285457055274, "learning_rate": 9.155844155844155e-08, "logits/chosen": -1.6899855136871338, "logits/rejected": -1.7081849575042725, "logps/chosen": -472.30670166015625, "logps/rejected": -473.02532958984375, "loss": 0.6906, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0001794814015738666, "rewards/margins": 0.0037227103020995855, "rewards/rejected": -0.0039021919947117567, "step": 94 }, { "epoch": 0.06204588129643236, "grad_norm": 8.584522992242459, "learning_rate": 9.253246753246754e-08, "logits/chosen": -1.6826878786087036, "logits/rejected": -1.6474887132644653, "logps/chosen": -469.18511962890625, "logps/rejected": -475.0853576660156, "loss": 0.6914, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0038090678863227367, "rewards/margins": 0.005096444860100746, "rewards/rejected": -0.00890551321208477, "step": 95 }, { "epoch": 0.06269899583639481, "grad_norm": 10.611257258997737, "learning_rate": 9.35064935064935e-08, "logits/chosen": -1.7167426347732544, "logits/rejected": -1.745957374572754, "logps/chosen": -415.5352478027344, "logps/rejected": -406.41021728515625, "loss": 0.6919, "rewards/accuracies": 0.5625, "rewards/chosen": -8.110757335089147e-05, "rewards/margins": 0.005273091606795788, "rewards/rejected": -0.005354199092835188, "step": 96 }, { "epoch": 0.06335211037635725, "grad_norm": 17.637332082165788, "learning_rate": 9.448051948051949e-08, "logits/chosen": -1.7745977640151978, "logits/rejected": -1.7798304557800293, "logps/chosen": -471.1458435058594, "logps/rejected": -464.43621826171875, "loss": 0.6916, "rewards/accuracies": 0.5, "rewards/chosen": -0.0014066218864172697, "rewards/margins": 0.002926683286204934, "rewards/rejected": -0.004333305172622204, "step": 97 }, { "epoch": 0.0640052249163197, "grad_norm": 15.475362634607388, "learning_rate": 9.545454545454546e-08, "logits/chosen": -1.757310390472412, "logits/rejected": -1.7441529035568237, "logps/chosen": -543.7720336914062, "logps/rejected": -521.256591796875, "loss": 0.6905, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0013191296020522714, "rewards/margins": 0.006374385207891464, "rewards/rejected": -0.005055255722254515, "step": 98 }, { "epoch": 0.06465833945628215, "grad_norm": 8.90308809277182, "learning_rate": 9.642857142857144e-08, "logits/chosen": -1.7599557638168335, "logits/rejected": -1.7538748979568481, "logps/chosen": -488.3480224609375, "logps/rejected": -493.222900390625, "loss": 0.6907, "rewards/accuracies": 0.65625, "rewards/chosen": 0.005196585785597563, "rewards/margins": 0.00629068398848176, "rewards/rejected": -0.0010940982028841972, "step": 99 }, { "epoch": 0.06531145399624459, "grad_norm": 8.599502814641067, "learning_rate": 9.74025974025974e-08, "logits/chosen": -1.6944297552108765, "logits/rejected": -1.6814221143722534, "logps/chosen": -491.616943359375, "logps/rejected": -464.1621398925781, "loss": 0.6911, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0056236740201711655, "rewards/margins": 0.0013095784233883023, "rewards/rejected": -0.006933252792805433, "step": 100 }, { "epoch": 0.06531145399624459, "eval_logits/chosen": -1.7781137228012085, "eval_logits/rejected": -1.783394694328308, "eval_logps/chosen": -510.6041564941406, "eval_logps/rejected": -502.9036865234375, "eval_loss": 0.6912173628807068, "eval_rewards/accuracies": 0.5640000104904175, "eval_rewards/chosen": -0.0025881431065499783, "eval_rewards/margins": 0.004054033197462559, "eval_rewards/rejected": -0.006642176769673824, "eval_runtime": 309.9462, "eval_samples_per_second": 12.905, "eval_steps_per_second": 0.807, "step": 100 }, { "epoch": 0.06596456853620704, "grad_norm": 11.975392598780218, "learning_rate": 9.837662337662338e-08, "logits/chosen": -1.8105459213256836, "logits/rejected": -1.7418413162231445, "logps/chosen": -495.683837890625, "logps/rejected": -558.681884765625, "loss": 0.6916, "rewards/accuracies": 0.5625, "rewards/chosen": -0.013931737281382084, "rewards/margins": 0.0046969749964773655, "rewards/rejected": -0.018628710880875587, "step": 101 }, { "epoch": 0.06661768307616948, "grad_norm": 19.08100215691415, "learning_rate": 9.935064935064935e-08, "logits/chosen": -1.7734951972961426, "logits/rejected": -1.785585880279541, "logps/chosen": -521.8323974609375, "logps/rejected": -493.7707824707031, "loss": 0.69, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0010571195743978024, "rewards/margins": 0.004120311699807644, "rewards/rejected": -0.0051774317398667336, "step": 102 }, { "epoch": 0.06727079761613193, "grad_norm": 16.407227405047756, "learning_rate": 1.0032467532467532e-07, "logits/chosen": -1.813089370727539, "logits/rejected": -1.8418606519699097, "logps/chosen": -535.99658203125, "logps/rejected": -547.38134765625, "loss": 0.6894, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0009367464226670563, "rewards/margins": 0.004543146584182978, "rewards/rejected": -0.005479893181473017, "step": 103 }, { "epoch": 0.06792391215609438, "grad_norm": 10.784387451558391, "learning_rate": 1.012987012987013e-07, "logits/chosen": -1.6748157739639282, "logits/rejected": -1.6749082803726196, "logps/chosen": -544.4282836914062, "logps/rejected": -502.47491455078125, "loss": 0.6909, "rewards/accuracies": 0.53125, "rewards/chosen": -0.005649018567055464, "rewards/margins": 0.002750987885519862, "rewards/rejected": -0.008400006219744682, "step": 104 }, { "epoch": 0.06857702669605682, "grad_norm": 7.366213696928153, "learning_rate": 1.0227272727272727e-07, "logits/chosen": -1.8434463739395142, "logits/rejected": -1.8662176132202148, "logps/chosen": -499.53240966796875, "logps/rejected": -481.79345703125, "loss": 0.6913, "rewards/accuracies": 0.5625, "rewards/chosen": -0.008984744548797607, "rewards/margins": 0.001937179360538721, "rewards/rejected": -0.010921923443675041, "step": 105 }, { "epoch": 0.06923014123601927, "grad_norm": 8.12342504564889, "learning_rate": 1.0324675324675325e-07, "logits/chosen": -1.7362070083618164, "logits/rejected": -1.7785553932189941, "logps/chosen": -524.1248779296875, "logps/rejected": -506.4564514160156, "loss": 0.6903, "rewards/accuracies": 0.65625, "rewards/chosen": 0.001606869394890964, "rewards/margins": 0.002641630358994007, "rewards/rejected": -0.0010347607312723994, "step": 106 }, { "epoch": 0.06988325577598171, "grad_norm": 6.5649883506893385, "learning_rate": 1.0422077922077921e-07, "logits/chosen": -1.695350170135498, "logits/rejected": -1.6904462575912476, "logps/chosen": -556.2769775390625, "logps/rejected": -519.2373046875, "loss": 0.6906, "rewards/accuracies": 0.6875, "rewards/chosen": -0.012179381214082241, "rewards/margins": 0.008277284912765026, "rewards/rejected": -0.020456667989492416, "step": 107 }, { "epoch": 0.07053637031594416, "grad_norm": 13.47303429680314, "learning_rate": 1.051948051948052e-07, "logits/chosen": -1.6657894849777222, "logits/rejected": -1.6693402528762817, "logps/chosen": -535.307373046875, "logps/rejected": -526.8478393554688, "loss": 0.6906, "rewards/accuracies": 0.65625, "rewards/chosen": -0.006545607931911945, "rewards/margins": 0.008519239723682404, "rewards/rejected": -0.015064846724271774, "step": 108 }, { "epoch": 0.0711894848559066, "grad_norm": 16.04827613099803, "learning_rate": 1.0616883116883116e-07, "logits/chosen": -1.6586048603057861, "logits/rejected": -1.6781889200210571, "logps/chosen": -530.2203979492188, "logps/rejected": -543.2777709960938, "loss": 0.6887, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0075052459724247456, "rewards/margins": 0.00435210857540369, "rewards/rejected": -0.011857354082167149, "step": 109 }, { "epoch": 0.07184259939586905, "grad_norm": 6.9887079248570245, "learning_rate": 1.0714285714285714e-07, "logits/chosen": -1.719894289970398, "logits/rejected": -1.73370361328125, "logps/chosen": -461.96893310546875, "logps/rejected": -460.13861083984375, "loss": 0.6903, "rewards/accuracies": 0.6875, "rewards/chosen": -0.003742053173482418, "rewards/margins": 0.008006452582776546, "rewards/rejected": -0.011748505756258965, "step": 110 }, { "epoch": 0.0724957139358315, "grad_norm": 17.35796671683516, "learning_rate": 1.0811688311688311e-07, "logits/chosen": -1.7226269245147705, "logits/rejected": -1.70708429813385, "logps/chosen": -531.7066650390625, "logps/rejected": -529.4414672851562, "loss": 0.6929, "rewards/accuracies": 0.625, "rewards/chosen": -0.013616499491035938, "rewards/margins": 0.0025032046250998974, "rewards/rejected": -0.01611970365047455, "step": 111 }, { "epoch": 0.07314882847579394, "grad_norm": 8.237208524897573, "learning_rate": 1.0909090909090909e-07, "logits/chosen": -1.6860663890838623, "logits/rejected": -1.6681212186813354, "logps/chosen": -412.7210693359375, "logps/rejected": -408.09210205078125, "loss": 0.6898, "rewards/accuracies": 0.53125, "rewards/chosen": -0.010377769358456135, "rewards/margins": 0.0005328749539330602, "rewards/rejected": -0.010910644195973873, "step": 112 }, { "epoch": 0.07380194301575639, "grad_norm": 14.11440807458263, "learning_rate": 1.1006493506493506e-07, "logits/chosen": -1.7322160005569458, "logits/rejected": -1.7938529253005981, "logps/chosen": -524.374755859375, "logps/rejected": -477.0540771484375, "loss": 0.6909, "rewards/accuracies": 0.5, "rewards/chosen": -0.010047688148915768, "rewards/margins": 0.0021818396635353565, "rewards/rejected": -0.012229528278112411, "step": 113 }, { "epoch": 0.07445505755571884, "grad_norm": 19.41135088594217, "learning_rate": 1.1103896103896104e-07, "logits/chosen": -1.7305679321289062, "logits/rejected": -1.687849521636963, "logps/chosen": -497.699462890625, "logps/rejected": -470.3168640136719, "loss": 0.69, "rewards/accuracies": 0.53125, "rewards/chosen": -0.013295488432049751, "rewards/margins": 0.0023743584752082825, "rewards/rejected": -0.015669845044612885, "step": 114 }, { "epoch": 0.07510817209568128, "grad_norm": 16.441138226102247, "learning_rate": 1.12012987012987e-07, "logits/chosen": -1.7420533895492554, "logits/rejected": -1.7339985370635986, "logps/chosen": -581.6981201171875, "logps/rejected": -568.88623046875, "loss": 0.6899, "rewards/accuracies": 0.5625, "rewards/chosen": -0.014405852183699608, "rewards/margins": 0.004987373016774654, "rewards/rejected": -0.019393224269151688, "step": 115 }, { "epoch": 0.07576128663564373, "grad_norm": 8.586529812749172, "learning_rate": 1.1298701298701299e-07, "logits/chosen": -1.7932806015014648, "logits/rejected": -1.7588953971862793, "logps/chosen": -532.9600830078125, "logps/rejected": -497.33770751953125, "loss": 0.6905, "rewards/accuracies": 0.5, "rewards/chosen": -8.44545429572463e-05, "rewards/margins": 0.00019355083350092173, "rewards/rejected": -0.0002780060167424381, "step": 116 }, { "epoch": 0.07641440117560618, "grad_norm": 11.94419407737389, "learning_rate": 1.1396103896103895e-07, "logits/chosen": -1.7573413848876953, "logits/rejected": -1.7641370296478271, "logps/chosen": -529.7691650390625, "logps/rejected": -494.5425109863281, "loss": 0.6892, "rewards/accuracies": 0.625, "rewards/chosen": -0.009526774287223816, "rewards/margins": 0.004622914828360081, "rewards/rejected": -0.014149690046906471, "step": 117 }, { "epoch": 0.07706751571556862, "grad_norm": 8.288500408438793, "learning_rate": 1.1493506493506493e-07, "logits/chosen": -1.6377328634262085, "logits/rejected": -1.7309472560882568, "logps/chosen": -479.968505859375, "logps/rejected": -474.0822448730469, "loss": 0.6911, "rewards/accuracies": 0.5, "rewards/chosen": -0.0036013172939419746, "rewards/margins": 0.0017371103167533875, "rewards/rejected": -0.005338427610695362, "step": 118 }, { "epoch": 0.07772063025553107, "grad_norm": 7.65494131386099, "learning_rate": 1.159090909090909e-07, "logits/chosen": -1.720550298690796, "logits/rejected": -1.7268420457839966, "logps/chosen": -511.4947814941406, "logps/rejected": -518.540283203125, "loss": 0.6886, "rewards/accuracies": 0.8125, "rewards/chosen": -0.002116803778335452, "rewards/margins": 0.014177341014146805, "rewards/rejected": -0.016294144093990326, "step": 119 }, { "epoch": 0.07837374479549351, "grad_norm": 7.87167890173762, "learning_rate": 1.1688311688311688e-07, "logits/chosen": -1.6774556636810303, "logits/rejected": -1.663172960281372, "logps/chosen": -486.4458923339844, "logps/rejected": -449.217529296875, "loss": 0.6893, "rewards/accuracies": 0.65625, "rewards/chosen": -0.014286703430116177, "rewards/margins": 0.00825961772352457, "rewards/rejected": -0.022546321153640747, "step": 120 }, { "epoch": 0.07902685933545596, "grad_norm": 6.91850162308767, "learning_rate": 1.1785714285714285e-07, "logits/chosen": -1.694111943244934, "logits/rejected": -1.7094212770462036, "logps/chosen": -547.680419921875, "logps/rejected": -498.9229736328125, "loss": 0.6889, "rewards/accuracies": 0.71875, "rewards/chosen": -0.003534555435180664, "rewards/margins": 0.00987851619720459, "rewards/rejected": -0.013413071632385254, "step": 121 }, { "epoch": 0.0796799738754184, "grad_norm": 9.143990922203699, "learning_rate": 1.1883116883116883e-07, "logits/chosen": -1.6494725942611694, "logits/rejected": -1.6601428985595703, "logps/chosen": -484.3493347167969, "logps/rejected": -487.05059814453125, "loss": 0.6897, "rewards/accuracies": 0.53125, "rewards/chosen": -0.014209108427166939, "rewards/margins": 0.0037215554621070623, "rewards/rejected": -0.017930667847394943, "step": 122 }, { "epoch": 0.08033308841538085, "grad_norm": 16.41656810337495, "learning_rate": 1.198051948051948e-07, "logits/chosen": -1.7612290382385254, "logits/rejected": -1.7043516635894775, "logps/chosen": -504.9853210449219, "logps/rejected": -622.8348999023438, "loss": 0.6853, "rewards/accuracies": 0.59375, "rewards/chosen": -0.017526667565107346, "rewards/margins": 0.025080684572458267, "rewards/rejected": -0.04260735213756561, "step": 123 }, { "epoch": 0.0809862029553433, "grad_norm": 25.637817852859747, "learning_rate": 1.207792207792208e-07, "logits/chosen": -1.6937708854675293, "logits/rejected": -1.6883544921875, "logps/chosen": -384.4920959472656, "logps/rejected": -507.6640930175781, "loss": 0.6885, "rewards/accuracies": 0.6875, "rewards/chosen": -0.024008184671401978, "rewards/margins": 0.0192131157964468, "rewards/rejected": -0.04322130233049393, "step": 124 }, { "epoch": 0.08163931749530574, "grad_norm": 22.15806914900568, "learning_rate": 1.2175324675324674e-07, "logits/chosen": -1.5668435096740723, "logits/rejected": -1.5370216369628906, "logps/chosen": -491.66278076171875, "logps/rejected": -550.0761108398438, "loss": 0.6886, "rewards/accuracies": 0.65625, "rewards/chosen": -0.030099207535386086, "rewards/margins": 0.012964273802936077, "rewards/rejected": -0.04306348040699959, "step": 125 }, { "epoch": 0.08229243203526819, "grad_norm": 21.70535951365313, "learning_rate": 1.2272727272727272e-07, "logits/chosen": -1.718205451965332, "logits/rejected": -1.7234244346618652, "logps/chosen": -532.9760131835938, "logps/rejected": -553.8611450195312, "loss": 0.6876, "rewards/accuracies": 0.5625, "rewards/chosen": -0.028685756027698517, "rewards/margins": 0.005702130496501923, "rewards/rejected": -0.03438788652420044, "step": 126 }, { "epoch": 0.08294554657523064, "grad_norm": 16.27604402538048, "learning_rate": 1.237012987012987e-07, "logits/chosen": -1.7105181217193604, "logits/rejected": -1.7274971008300781, "logps/chosen": -452.66326904296875, "logps/rejected": -424.0040588378906, "loss": 0.6882, "rewards/accuracies": 0.59375, "rewards/chosen": -0.023135703057050705, "rewards/margins": 0.00477581936866045, "rewards/rejected": -0.02791152149438858, "step": 127 }, { "epoch": 0.08359866111519308, "grad_norm": 8.360473540141022, "learning_rate": 1.2467532467532469e-07, "logits/chosen": -1.7243715524673462, "logits/rejected": -1.719879388809204, "logps/chosen": -577.6858520507812, "logps/rejected": -574.778564453125, "loss": 0.6875, "rewards/accuracies": 0.71875, "rewards/chosen": -0.01625608652830124, "rewards/margins": 0.01795053295791149, "rewards/rejected": -0.03420662134885788, "step": 128 }, { "epoch": 0.08425177565515553, "grad_norm": 7.426631483908922, "learning_rate": 1.2564935064935064e-07, "logits/chosen": -1.6296292543411255, "logits/rejected": -1.645986557006836, "logps/chosen": -514.0096435546875, "logps/rejected": -507.37237548828125, "loss": 0.6876, "rewards/accuracies": 0.59375, "rewards/chosen": -0.00854131206870079, "rewards/margins": 0.007741453126072884, "rewards/rejected": -0.016282765194773674, "step": 129 }, { "epoch": 0.08490489019511797, "grad_norm": 12.61939534454806, "learning_rate": 1.2662337662337662e-07, "logits/chosen": -1.7628737688064575, "logits/rejected": -1.784444808959961, "logps/chosen": -515.55419921875, "logps/rejected": -445.7431640625, "loss": 0.6873, "rewards/accuracies": 0.53125, "rewards/chosen": -0.026726404204964638, "rewards/margins": 0.007864664308726788, "rewards/rejected": -0.03459106758236885, "step": 130 }, { "epoch": 0.08555800473508042, "grad_norm": 6.940242974613805, "learning_rate": 1.275974025974026e-07, "logits/chosen": -1.723580002784729, "logits/rejected": -1.7397780418395996, "logps/chosen": -490.61956787109375, "logps/rejected": -519.4144287109375, "loss": 0.686, "rewards/accuracies": 0.75, "rewards/chosen": -0.02750040590763092, "rewards/margins": 0.028302742168307304, "rewards/rejected": -0.055803146213293076, "step": 131 }, { "epoch": 0.08621111927504287, "grad_norm": 9.176637689468706, "learning_rate": 1.2857142857142855e-07, "logits/chosen": -1.7846028804779053, "logits/rejected": -1.7753349542617798, "logps/chosen": -537.8035278320312, "logps/rejected": -657.7811889648438, "loss": 0.6844, "rewards/accuracies": 0.5625, "rewards/chosen": -0.039286255836486816, "rewards/margins": 0.02002215012907982, "rewards/rejected": -0.059308405965566635, "step": 132 }, { "epoch": 0.08686423381500531, "grad_norm": 8.420310076033147, "learning_rate": 1.2954545454545453e-07, "logits/chosen": -1.7075575590133667, "logits/rejected": -1.7227400541305542, "logps/chosen": -505.66326904296875, "logps/rejected": -475.65240478515625, "loss": 0.6876, "rewards/accuracies": 0.625, "rewards/chosen": -0.03044586256146431, "rewards/margins": 0.011737257242202759, "rewards/rejected": -0.04218312352895737, "step": 133 }, { "epoch": 0.08751734835496776, "grad_norm": 17.816465764608324, "learning_rate": 1.3051948051948052e-07, "logits/chosen": -1.6720120906829834, "logits/rejected": -1.6279743909835815, "logps/chosen": -497.46038818359375, "logps/rejected": -475.72442626953125, "loss": 0.6897, "rewards/accuracies": 0.5625, "rewards/chosen": -0.02933063544332981, "rewards/margins": 0.009470460936427116, "rewards/rejected": -0.03880109637975693, "step": 134 }, { "epoch": 0.0881704628949302, "grad_norm": 8.48602261690419, "learning_rate": 1.314935064935065e-07, "logits/chosen": -1.6978920698165894, "logits/rejected": -1.7289825677871704, "logps/chosen": -530.339111328125, "logps/rejected": -493.29913330078125, "loss": 0.6867, "rewards/accuracies": 0.71875, "rewards/chosen": -0.02855011634528637, "rewards/margins": 0.0154922716319561, "rewards/rejected": -0.04404238611459732, "step": 135 }, { "epoch": 0.08882357743489265, "grad_norm": 9.366599761861622, "learning_rate": 1.3246753246753245e-07, "logits/chosen": -1.7024781703948975, "logits/rejected": -1.6904100179672241, "logps/chosen": -410.39215087890625, "logps/rejected": -436.4376525878906, "loss": 0.6859, "rewards/accuracies": 0.5625, "rewards/chosen": -0.012004725635051727, "rewards/margins": 0.023064523935317993, "rewards/rejected": -0.03506924957036972, "step": 136 }, { "epoch": 0.0894766919748551, "grad_norm": 17.561997807750693, "learning_rate": 1.3344155844155843e-07, "logits/chosen": -1.6883275508880615, "logits/rejected": -1.6822022199630737, "logps/chosen": -470.7867736816406, "logps/rejected": -476.1307067871094, "loss": 0.6862, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04406014457345009, "rewards/margins": 0.020365647971630096, "rewards/rejected": -0.06442578881978989, "step": 137 }, { "epoch": 0.09012980651481754, "grad_norm": 6.311518850983097, "learning_rate": 1.344155844155844e-07, "logits/chosen": -1.6198405027389526, "logits/rejected": -1.6445224285125732, "logps/chosen": -577.4093627929688, "logps/rejected": -524.4210815429688, "loss": 0.6889, "rewards/accuracies": 0.53125, "rewards/chosen": -0.06365535408258438, "rewards/margins": -0.0076623717322945595, "rewards/rejected": -0.0559929758310318, "step": 138 }, { "epoch": 0.09078292105477999, "grad_norm": 7.902620448969978, "learning_rate": 1.353896103896104e-07, "logits/chosen": -1.626230001449585, "logits/rejected": -1.6832174062728882, "logps/chosen": -452.2579650878906, "logps/rejected": -457.4361267089844, "loss": 0.6848, "rewards/accuracies": 0.6875, "rewards/chosen": -0.046750400215387344, "rewards/margins": 0.018618889153003693, "rewards/rejected": -0.06536928564310074, "step": 139 }, { "epoch": 0.09143603559474243, "grad_norm": 9.061462627285719, "learning_rate": 1.3636363636363635e-07, "logits/chosen": -1.786929965019226, "logits/rejected": -1.748306393623352, "logps/chosen": -477.7335510253906, "logps/rejected": -470.07623291015625, "loss": 0.6823, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05674988403916359, "rewards/margins": 0.019451485946774483, "rewards/rejected": -0.07620137184858322, "step": 140 }, { "epoch": 0.09208915013470488, "grad_norm": 8.27691403538538, "learning_rate": 1.3733766233766233e-07, "logits/chosen": -1.7523396015167236, "logits/rejected": -1.750685691833496, "logps/chosen": -544.35888671875, "logps/rejected": -523.8983154296875, "loss": 0.6861, "rewards/accuracies": 0.75, "rewards/chosen": -0.04826396703720093, "rewards/margins": 0.019393280148506165, "rewards/rejected": -0.06765724718570709, "step": 141 }, { "epoch": 0.09274226467466731, "grad_norm": 40.62514167248459, "learning_rate": 1.383116883116883e-07, "logits/chosen": -1.6654901504516602, "logits/rejected": -1.6239811182022095, "logps/chosen": -479.25079345703125, "logps/rejected": -476.67572021484375, "loss": 0.6897, "rewards/accuracies": 0.40625, "rewards/chosen": -0.045496560633182526, "rewards/margins": 0.007325804326683283, "rewards/rejected": -0.05282236635684967, "step": 142 }, { "epoch": 0.09339537921462976, "grad_norm": 28.906389537639807, "learning_rate": 1.392857142857143e-07, "logits/chosen": -1.805418848991394, "logits/rejected": -1.7966678142547607, "logps/chosen": -521.4232788085938, "logps/rejected": -504.65008544921875, "loss": 0.6839, "rewards/accuracies": 0.75, "rewards/chosen": -0.043921858072280884, "rewards/margins": 0.02651369199156761, "rewards/rejected": -0.0704355537891388, "step": 143 }, { "epoch": 0.0940484937545922, "grad_norm": 7.759027403093762, "learning_rate": 1.4025974025974024e-07, "logits/chosen": -1.808532953262329, "logits/rejected": -1.7720710039138794, "logps/chosen": -530.4207763671875, "logps/rejected": -543.4842529296875, "loss": 0.6823, "rewards/accuracies": 0.71875, "rewards/chosen": -0.058577775955200195, "rewards/margins": 0.01986054703593254, "rewards/rejected": -0.07843831926584244, "step": 144 }, { "epoch": 0.09470160829455465, "grad_norm": 19.65781688946105, "learning_rate": 1.4123376623376622e-07, "logits/chosen": -1.7204667329788208, "logits/rejected": -1.6994764804840088, "logps/chosen": -600.3792724609375, "logps/rejected": -596.1019897460938, "loss": 0.6858, "rewards/accuracies": 0.75, "rewards/chosen": -0.07191239297389984, "rewards/margins": 0.02992035634815693, "rewards/rejected": -0.10183274745941162, "step": 145 }, { "epoch": 0.0953547228345171, "grad_norm": 27.083668342567048, "learning_rate": 1.422077922077922e-07, "logits/chosen": -1.6221792697906494, "logits/rejected": -1.6598668098449707, "logps/chosen": -526.0675048828125, "logps/rejected": -521.8135986328125, "loss": 0.6908, "rewards/accuracies": 0.5625, "rewards/chosen": -0.042592283338308334, "rewards/margins": 0.012567641213536263, "rewards/rejected": -0.055159930139780045, "step": 146 }, { "epoch": 0.09600783737447954, "grad_norm": 8.808375433626706, "learning_rate": 1.4318181818181818e-07, "logits/chosen": -1.8164383172988892, "logits/rejected": -1.8282561302185059, "logps/chosen": -614.2815551757812, "logps/rejected": -643.0169067382812, "loss": 0.6865, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07687513530254364, "rewards/margins": 0.012751526199281216, "rewards/rejected": -0.08962665498256683, "step": 147 }, { "epoch": 0.09666095191444199, "grad_norm": 7.09253863840217, "learning_rate": 1.4415584415584414e-07, "logits/chosen": -1.6832506656646729, "logits/rejected": -1.7021249532699585, "logps/chosen": -498.71942138671875, "logps/rejected": -501.34869384765625, "loss": 0.6838, "rewards/accuracies": 0.625, "rewards/chosen": -0.054647862911224365, "rewards/margins": 0.026497341692447662, "rewards/rejected": -0.08114521205425262, "step": 148 }, { "epoch": 0.09731406645440444, "grad_norm": 16.781059240998136, "learning_rate": 1.4512987012987012e-07, "logits/chosen": -1.715110182762146, "logits/rejected": -1.7118535041809082, "logps/chosen": -474.4345703125, "logps/rejected": -454.8081970214844, "loss": 0.6855, "rewards/accuracies": 0.53125, "rewards/chosen": -0.062017131596803665, "rewards/margins": 0.004093126859515905, "rewards/rejected": -0.06611025333404541, "step": 149 }, { "epoch": 0.09796718099436688, "grad_norm": 20.190214518687835, "learning_rate": 1.461038961038961e-07, "logits/chosen": -1.6403260231018066, "logits/rejected": -1.601685881614685, "logps/chosen": -516.1248168945312, "logps/rejected": -536.0068969726562, "loss": 0.6845, "rewards/accuracies": 0.625, "rewards/chosen": -0.05097360908985138, "rewards/margins": 0.021962041035294533, "rewards/rejected": -0.07293565571308136, "step": 150 }, { "epoch": 0.09862029553432933, "grad_norm": 12.205029148852612, "learning_rate": 1.4707792207792208e-07, "logits/chosen": -1.6476508378982544, "logits/rejected": -1.7027806043624878, "logps/chosen": -488.97259521484375, "logps/rejected": -461.6432800292969, "loss": 0.681, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03291236236691475, "rewards/margins": 0.01708255708217621, "rewards/rejected": -0.04999491944909096, "step": 151 }, { "epoch": 0.09927341007429177, "grad_norm": 22.040695719974778, "learning_rate": 1.4805194805194803e-07, "logits/chosen": -1.7001389265060425, "logits/rejected": -1.7121036052703857, "logps/chosen": -473.0397033691406, "logps/rejected": -513.0086059570312, "loss": 0.6773, "rewards/accuracies": 0.78125, "rewards/chosen": -0.05476941913366318, "rewards/margins": 0.05117000639438629, "rewards/rejected": -0.10593942552804947, "step": 152 }, { "epoch": 0.09992652461425422, "grad_norm": 17.736405376818148, "learning_rate": 1.4902597402597404e-07, "logits/chosen": -1.7151176929473877, "logits/rejected": -1.6587436199188232, "logps/chosen": -528.3651733398438, "logps/rejected": -519.2221069335938, "loss": 0.6872, "rewards/accuracies": 0.75, "rewards/chosen": -0.050955355167388916, "rewards/margins": 0.02759256586432457, "rewards/rejected": -0.07854791730642319, "step": 153 }, { "epoch": 0.10057963915421667, "grad_norm": 20.0575565981101, "learning_rate": 1.5e-07, "logits/chosen": -1.7852433919906616, "logits/rejected": -1.7886905670166016, "logps/chosen": -482.718017578125, "logps/rejected": -482.28759765625, "loss": 0.6837, "rewards/accuracies": 0.625, "rewards/chosen": -0.07595521211624146, "rewards/margins": 0.016735542565584183, "rewards/rejected": -0.09269075095653534, "step": 154 }, { "epoch": 0.10123275369417911, "grad_norm": 6.032126948811811, "learning_rate": 1.499998048075819e-07, "logits/chosen": -1.649161696434021, "logits/rejected": -1.6406943798065186, "logps/chosen": -509.6943359375, "logps/rejected": -504.8258361816406, "loss": 0.6872, "rewards/accuracies": 0.5, "rewards/chosen": -0.07708139717578888, "rewards/margins": 0.013607650063931942, "rewards/rejected": -0.0906890481710434, "step": 155 }, { "epoch": 0.10188586823414156, "grad_norm": 8.923286598087802, "learning_rate": 1.4999921923134367e-07, "logits/chosen": -1.7759987115859985, "logits/rejected": -1.731602668762207, "logps/chosen": -508.0286865234375, "logps/rejected": -549.663330078125, "loss": 0.6864, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04344085976481438, "rewards/margins": 0.024824578315019608, "rewards/rejected": -0.06826544553041458, "step": 156 }, { "epoch": 0.102538982774104, "grad_norm": 18.440082543584566, "learning_rate": 1.499982432743333e-07, "logits/chosen": -1.7433996200561523, "logits/rejected": -1.7404325008392334, "logps/chosen": -483.1097717285156, "logps/rejected": -518.4947509765625, "loss": 0.682, "rewards/accuracies": 0.53125, "rewards/chosen": -0.07219300419092178, "rewards/margins": 0.0072326865047216415, "rewards/rejected": -0.07942568510770798, "step": 157 }, { "epoch": 0.10319209731406645, "grad_norm": 26.950148340749994, "learning_rate": 1.4999687694163071e-07, "logits/chosen": -1.6927975416183472, "logits/rejected": -1.7593554258346558, "logps/chosen": -587.9935302734375, "logps/rejected": -547.7039184570312, "loss": 0.6831, "rewards/accuracies": 0.75, "rewards/chosen": -0.05381292477250099, "rewards/margins": 0.02101278305053711, "rewards/rejected": -0.0748257115483284, "step": 158 }, { "epoch": 0.1038452118540289, "grad_norm": 7.542663745310761, "learning_rate": 1.499951202403479e-07, "logits/chosen": -1.7588621377944946, "logits/rejected": -1.703225016593933, "logps/chosen": -505.0411376953125, "logps/rejected": -599.8470458984375, "loss": 0.6763, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05922378972172737, "rewards/margins": 0.053594160825014114, "rewards/rejected": -0.11281795799732208, "step": 159 }, { "epoch": 0.10449832639399134, "grad_norm": 17.13212774750112, "learning_rate": 1.4999297317962876e-07, "logits/chosen": -1.7104802131652832, "logits/rejected": -1.7329843044281006, "logps/chosen": -514.36279296875, "logps/rejected": -533.5743408203125, "loss": 0.6821, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07126364856958389, "rewards/margins": 0.031774699687957764, "rewards/rejected": -0.10303835570812225, "step": 160 }, { "epoch": 0.10515144093395379, "grad_norm": 10.645728177870987, "learning_rate": 1.4999043577064894e-07, "logits/chosen": -1.826465129852295, "logits/rejected": -1.8719959259033203, "logps/chosen": -616.4337768554688, "logps/rejected": -609.4326171875, "loss": 0.6779, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07613222301006317, "rewards/margins": 0.02693980187177658, "rewards/rejected": -0.10307201743125916, "step": 161 }, { "epoch": 0.10580455547391623, "grad_norm": 31.5882679445136, "learning_rate": 1.4998750802661605e-07, "logits/chosen": -1.727925419807434, "logits/rejected": -1.724989414215088, "logps/chosen": -599.0576171875, "logps/rejected": -563.6280517578125, "loss": 0.683, "rewards/accuracies": 0.65625, "rewards/chosen": -0.08880521357059479, "rewards/margins": 0.025629587471485138, "rewards/rejected": -0.11443479359149933, "step": 162 }, { "epoch": 0.10645767001387868, "grad_norm": 12.592298356038635, "learning_rate": 1.4998418996276933e-07, "logits/chosen": -1.738313913345337, "logits/rejected": -1.753507375717163, "logps/chosen": -579.961669921875, "logps/rejected": -594.0586547851562, "loss": 0.6819, "rewards/accuracies": 0.75, "rewards/chosen": -0.05988356098532677, "rewards/margins": 0.03945886343717575, "rewards/rejected": -0.09934242069721222, "step": 163 }, { "epoch": 0.10711078455384113, "grad_norm": 19.46009999388665, "learning_rate": 1.499804815963798e-07, "logits/chosen": -1.7987678050994873, "logits/rejected": -1.807891845703125, "logps/chosen": -550.672119140625, "logps/rejected": -561.8424072265625, "loss": 0.677, "rewards/accuracies": 0.6875, "rewards/chosen": -0.040339332073926926, "rewards/margins": 0.04034976288676262, "rewards/rejected": -0.08068908751010895, "step": 164 }, { "epoch": 0.10776389909380357, "grad_norm": 27.510358333266502, "learning_rate": 1.4997638294674996e-07, "logits/chosen": -1.7597366571426392, "logits/rejected": -1.7664936780929565, "logps/chosen": -517.5831909179688, "logps/rejected": -524.0874633789062, "loss": 0.6807, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10509812831878662, "rewards/margins": 0.02205245941877365, "rewards/rejected": -0.12715059518814087, "step": 165 }, { "epoch": 0.10841701363376602, "grad_norm": 26.359728514103736, "learning_rate": 1.499718940352138e-07, "logits/chosen": -1.660881519317627, "logits/rejected": -1.6824941635131836, "logps/chosen": -526.3524169921875, "logps/rejected": -505.15814208984375, "loss": 0.6795, "rewards/accuracies": 0.71875, "rewards/chosen": -0.09450910985469818, "rewards/margins": 0.04056902229785919, "rewards/rejected": -0.13507813215255737, "step": 166 }, { "epoch": 0.10907012817372846, "grad_norm": 13.379229704663794, "learning_rate": 1.499670148851367e-07, "logits/chosen": -1.6919353008270264, "logits/rejected": -1.7081630229949951, "logps/chosen": -462.4615783691406, "logps/rejected": -441.3831787109375, "loss": 0.6793, "rewards/accuracies": 0.71875, "rewards/chosen": -0.047392651438713074, "rewards/margins": 0.02708030305802822, "rewards/rejected": -0.07447294890880585, "step": 167 }, { "epoch": 0.10972324271369091, "grad_norm": 23.765383175487347, "learning_rate": 1.4996174552191534e-07, "logits/chosen": -1.6622581481933594, "logits/rejected": -1.679819941520691, "logps/chosen": -464.3515930175781, "logps/rejected": -449.2407531738281, "loss": 0.6781, "rewards/accuracies": 0.75, "rewards/chosen": -0.059018295258283615, "rewards/margins": 0.03328193724155426, "rewards/rejected": -0.09230024367570877, "step": 168 }, { "epoch": 0.11037635725365336, "grad_norm": 7.18145864960063, "learning_rate": 1.4995608597297736e-07, "logits/chosen": -1.7335408926010132, "logits/rejected": -1.7119437456130981, "logps/chosen": -517.2203979492188, "logps/rejected": -603.5662841796875, "loss": 0.6748, "rewards/accuracies": 0.71875, "rewards/chosen": -0.06651943922042847, "rewards/margins": 0.05973542109131813, "rewards/rejected": -0.1262548714876175, "step": 169 }, { "epoch": 0.1110294717936158, "grad_norm": 33.768142398887186, "learning_rate": 1.4995003626778149e-07, "logits/chosen": -1.7551608085632324, "logits/rejected": -1.718156099319458, "logps/chosen": -503.027099609375, "logps/rejected": -557.4558715820312, "loss": 0.685, "rewards/accuracies": 0.5, "rewards/chosen": -0.06958004832267761, "rewards/margins": 0.021374018862843513, "rewards/rejected": -0.09095406532287598, "step": 170 }, { "epoch": 0.11168258633357825, "grad_norm": 13.332558234776263, "learning_rate": 1.4994359643781725e-07, "logits/chosen": -1.6308660507202148, "logits/rejected": -1.6401119232177734, "logps/chosen": -491.20263671875, "logps/rejected": -492.4757995605469, "loss": 0.6805, "rewards/accuracies": 0.75, "rewards/chosen": -0.08171894401311874, "rewards/margins": 0.03456515446305275, "rewards/rejected": -0.1162840873003006, "step": 171 }, { "epoch": 0.1123357008735407, "grad_norm": 11.374539796821198, "learning_rate": 1.4993676651660479e-07, "logits/chosen": -1.7294381856918335, "logits/rejected": -1.7581063508987427, "logps/chosen": -510.65618896484375, "logps/rejected": -494.7225341796875, "loss": 0.6791, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07346140593290329, "rewards/margins": 0.016454335302114487, "rewards/rejected": -0.08991573750972748, "step": 172 }, { "epoch": 0.11298881541350314, "grad_norm": 10.782302320724707, "learning_rate": 1.4992954653969473e-07, "logits/chosen": -1.7590150833129883, "logits/rejected": -1.7065110206604004, "logps/chosen": -579.6332397460938, "logps/rejected": -642.480224609375, "loss": 0.6756, "rewards/accuracies": 0.625, "rewards/chosen": -0.16745489835739136, "rewards/margins": 0.04030359163880348, "rewards/rejected": -0.20775848627090454, "step": 173 }, { "epoch": 0.11364192995346559, "grad_norm": 30.030650307666996, "learning_rate": 1.4992193654466804e-07, "logits/chosen": -1.7184665203094482, "logits/rejected": -1.751639485359192, "logps/chosen": -505.05078125, "logps/rejected": -476.3111572265625, "loss": 0.6774, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07360526919364929, "rewards/margins": 0.02522566169500351, "rewards/rejected": -0.0988309308886528, "step": 174 }, { "epoch": 0.11429504449342803, "grad_norm": 6.801707032164358, "learning_rate": 1.4991393657113566e-07, "logits/chosen": -1.7240628004074097, "logits/rejected": -1.693426489830017, "logps/chosen": -513.940673828125, "logps/rejected": -593.656494140625, "loss": 0.6822, "rewards/accuracies": 0.53125, "rewards/chosen": -0.09224756807088852, "rewards/margins": 0.05458322912454605, "rewards/rejected": -0.14683078229427338, "step": 175 }, { "epoch": 0.11494815903339048, "grad_norm": 6.667305623368847, "learning_rate": 1.499055466607386e-07, "logits/chosen": -1.6916465759277344, "logits/rejected": -1.7375673055648804, "logps/chosen": -533.5892944335938, "logps/rejected": -507.4823913574219, "loss": 0.675, "rewards/accuracies": 0.75, "rewards/chosen": -0.080963134765625, "rewards/margins": 0.053236886858940125, "rewards/rejected": -0.13420002162456512, "step": 176 }, { "epoch": 0.11560127357335293, "grad_norm": 6.771588986223236, "learning_rate": 1.498967668571474e-07, "logits/chosen": -1.735141396522522, "logits/rejected": -1.7742522954940796, "logps/chosen": -510.868408203125, "logps/rejected": -452.6504211425781, "loss": 0.6749, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10077373683452606, "rewards/margins": 0.003773924894630909, "rewards/rejected": -0.1045476570725441, "step": 177 }, { "epoch": 0.11625438811331537, "grad_norm": 9.953482315244722, "learning_rate": 1.4988759720606207e-07, "logits/chosen": -1.7570782899856567, "logits/rejected": -1.8162400722503662, "logps/chosen": -554.0537719726562, "logps/rejected": -541.5122680664062, "loss": 0.6801, "rewards/accuracies": 0.75, "rewards/chosen": -0.10425091534852982, "rewards/margins": 0.060477737337350845, "rewards/rejected": -0.16472867131233215, "step": 178 }, { "epoch": 0.11690750265327782, "grad_norm": 5.973916986801873, "learning_rate": 1.4987803775521184e-07, "logits/chosen": -1.690962791442871, "logits/rejected": -1.689450979232788, "logps/chosen": -643.9844360351562, "logps/rejected": -589.926025390625, "loss": 0.6829, "rewards/accuracies": 0.34375, "rewards/chosen": -0.12561464309692383, "rewards/margins": -0.008268720470368862, "rewards/rejected": -0.11734593659639359, "step": 179 }, { "epoch": 0.11756061719324026, "grad_norm": 14.891949152028998, "learning_rate": 1.4986808855435498e-07, "logits/chosen": -1.6953084468841553, "logits/rejected": -1.6965618133544922, "logps/chosen": -465.82086181640625, "logps/rejected": -438.37799072265625, "loss": 0.6767, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08036669343709946, "rewards/margins": 0.017360081896185875, "rewards/rejected": -0.09772677719593048, "step": 180 }, { "epoch": 0.11821373173320271, "grad_norm": 16.11153067050427, "learning_rate": 1.498577496552783e-07, "logits/chosen": -1.7526249885559082, "logits/rejected": -1.7759355306625366, "logps/chosen": -482.6202697753906, "logps/rejected": -492.8947448730469, "loss": 0.6659, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09437385201454163, "rewards/margins": 0.037018630653619766, "rewards/rejected": -0.1313924789428711, "step": 181 }, { "epoch": 0.11886684627316516, "grad_norm": 22.729237496040554, "learning_rate": 1.4984702111179715e-07, "logits/chosen": -1.6132816076278687, "logits/rejected": -1.655545949935913, "logps/chosen": -527.8369750976562, "logps/rejected": -527.8175048828125, "loss": 0.6727, "rewards/accuracies": 0.875, "rewards/chosen": -0.0954374372959137, "rewards/margins": 0.05533631891012192, "rewards/rejected": -0.1507737785577774, "step": 182 }, { "epoch": 0.1195199608131276, "grad_norm": 26.231353893622398, "learning_rate": 1.4983590297975505e-07, "logits/chosen": -1.6354296207427979, "logits/rejected": -1.6777453422546387, "logps/chosen": -504.5555419921875, "logps/rejected": -538.9359741210938, "loss": 0.6843, "rewards/accuracies": 0.625, "rewards/chosen": -0.12271232157945633, "rewards/margins": 0.04234550893306732, "rewards/rejected": -0.16505783796310425, "step": 183 }, { "epoch": 0.12017307535309005, "grad_norm": 7.114753478180882, "learning_rate": 1.498243953170233e-07, "logits/chosen": -1.69046950340271, "logits/rejected": -1.7043442726135254, "logps/chosen": -530.9853515625, "logps/rejected": -529.0479125976562, "loss": 0.677, "rewards/accuracies": 0.75, "rewards/chosen": -0.06343412399291992, "rewards/margins": 0.035453617572784424, "rewards/rejected": -0.09888774156570435, "step": 184 }, { "epoch": 0.1208261898930525, "grad_norm": 21.787798884018414, "learning_rate": 1.498124981835008e-07, "logits/chosen": -1.7766292095184326, "logits/rejected": -1.74836266040802, "logps/chosen": -475.3532409667969, "logps/rejected": -489.02923583984375, "loss": 0.6776, "rewards/accuracies": 0.75, "rewards/chosen": -0.07969984412193298, "rewards/margins": 0.041919078677892685, "rewards/rejected": -0.12161892652511597, "step": 185 }, { "epoch": 0.12147930443301494, "grad_norm": 23.709784617273577, "learning_rate": 1.4980021164111366e-07, "logits/chosen": -1.6516242027282715, "logits/rejected": -1.673815369606018, "logps/chosen": -455.3662109375, "logps/rejected": -492.9649963378906, "loss": 0.677, "rewards/accuracies": 0.59375, "rewards/chosen": -0.14689013361930847, "rewards/margins": 0.02547987923026085, "rewards/rejected": -0.1723700314760208, "step": 186 }, { "epoch": 0.12213241897297739, "grad_norm": 12.348486008384063, "learning_rate": 1.4978753575381498e-07, "logits/chosen": -1.7523754835128784, "logits/rejected": -1.745179533958435, "logps/chosen": -503.9389343261719, "logps/rejected": -510.8143615722656, "loss": 0.6702, "rewards/accuracies": 0.84375, "rewards/chosen": -0.11973577737808228, "rewards/margins": 0.05588344484567642, "rewards/rejected": -0.1756192147731781, "step": 187 }, { "epoch": 0.12278553351293983, "grad_norm": 13.48324499414794, "learning_rate": 1.4977447058758439e-07, "logits/chosen": -1.7630393505096436, "logits/rejected": -1.7760398387908936, "logps/chosen": -571.8206787109375, "logps/rejected": -541.6646118164062, "loss": 0.6801, "rewards/accuracies": 0.5625, "rewards/chosen": -0.11507945507764816, "rewards/margins": 0.029867036268115044, "rewards/rejected": -0.14494650065898895, "step": 188 }, { "epoch": 0.12343864805290228, "grad_norm": 10.415485511799975, "learning_rate": 1.4976101621042783e-07, "logits/chosen": -1.684414267539978, "logits/rejected": -1.6862590312957764, "logps/chosen": -535.666015625, "logps/rejected": -548.4473876953125, "loss": 0.6756, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1297360062599182, "rewards/margins": 0.05931150168180466, "rewards/rejected": -0.18904751539230347, "step": 189 }, { "epoch": 0.12409176259286472, "grad_norm": 6.350273320282756, "learning_rate": 1.4974717269237708e-07, "logits/chosen": -1.6381902694702148, "logits/rejected": -1.6570154428482056, "logps/chosen": -497.8411560058594, "logps/rejected": -489.9413757324219, "loss": 0.6632, "rewards/accuracies": 0.75, "rewards/chosen": -0.11118555068969727, "rewards/margins": 0.06526104360818863, "rewards/rejected": -0.17644661664962769, "step": 190 }, { "epoch": 0.12474487713282717, "grad_norm": 6.038127824462166, "learning_rate": 1.4973294010548946e-07, "logits/chosen": -1.6311917304992676, "logits/rejected": -1.6577972173690796, "logps/chosen": -518.0223388671875, "logps/rejected": -520.6466674804688, "loss": 0.677, "rewards/accuracies": 0.4375, "rewards/chosen": -0.15784503519535065, "rewards/margins": 0.01725524663925171, "rewards/rejected": -0.17510026693344116, "step": 191 }, { "epoch": 0.12539799167278962, "grad_norm": 26.218558847948582, "learning_rate": 1.4971831852384745e-07, "logits/chosen": -1.6489263772964478, "logits/rejected": -1.6216522455215454, "logps/chosen": -570.0574951171875, "logps/rejected": -592.376708984375, "loss": 0.6842, "rewards/accuracies": 0.59375, "rewards/chosen": -0.17664462327957153, "rewards/margins": 0.014460130594670773, "rewards/rejected": -0.1911047399044037, "step": 192 }, { "epoch": 0.12605110621275206, "grad_norm": 14.358619994205673, "learning_rate": 1.497033080235583e-07, "logits/chosen": -1.754329800605774, "logits/rejected": -1.7733534574508667, "logps/chosen": -515.3424682617188, "logps/rejected": -505.7689208984375, "loss": 0.6687, "rewards/accuracies": 0.59375, "rewards/chosen": -0.12092112749814987, "rewards/margins": 0.024929339066147804, "rewards/rejected": -0.14585046470165253, "step": 193 }, { "epoch": 0.1267042207527145, "grad_norm": 17.358492156799695, "learning_rate": 1.4968790868275365e-07, "logits/chosen": -1.666093349456787, "logits/rejected": -1.6654276847839355, "logps/chosen": -510.59320068359375, "logps/rejected": -497.73431396484375, "loss": 0.6717, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13687899708747864, "rewards/margins": 0.009451786056160927, "rewards/rejected": -0.1463308036327362, "step": 194 }, { "epoch": 0.12735733529267695, "grad_norm": 33.350981012916925, "learning_rate": 1.4967212058158908e-07, "logits/chosen": -1.7516664266586304, "logits/rejected": -1.7618868350982666, "logps/chosen": -574.4312744140625, "logps/rejected": -562.3271484375, "loss": 0.687, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1302211880683899, "rewards/margins": 0.03780483454465866, "rewards/rejected": -0.16802600026130676, "step": 195 }, { "epoch": 0.1280104498326394, "grad_norm": 12.460824021943772, "learning_rate": 1.4965594380224373e-07, "logits/chosen": -1.6539489030838013, "logits/rejected": -1.6902060508728027, "logps/chosen": -540.94873046875, "logps/rejected": -570.3787231445312, "loss": 0.6737, "rewards/accuracies": 0.625, "rewards/chosen": -0.16215386986732483, "rewards/margins": 0.05639547482132912, "rewards/rejected": -0.21854937076568604, "step": 196 }, { "epoch": 0.12866356437260185, "grad_norm": 7.277120819141452, "learning_rate": 1.4963937842891983e-07, "logits/chosen": -1.7623552083969116, "logits/rejected": -1.777740240097046, "logps/chosen": -562.0401611328125, "logps/rejected": -549.0805053710938, "loss": 0.6625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.15102756023406982, "rewards/margins": 0.02262779325246811, "rewards/rejected": -0.17365534603595734, "step": 197 }, { "epoch": 0.1293166789125643, "grad_norm": 6.4071063075287045, "learning_rate": 1.4962242454784235e-07, "logits/chosen": -1.7132699489593506, "logits/rejected": -1.6834791898727417, "logps/chosen": -473.0800476074219, "logps/rejected": -574.1148681640625, "loss": 0.6639, "rewards/accuracies": 0.625, "rewards/chosen": -0.13342300057411194, "rewards/margins": 0.09166872501373291, "rewards/rejected": -0.22509171068668365, "step": 198 }, { "epoch": 0.12996979345252674, "grad_norm": 16.16942335499691, "learning_rate": 1.4960508224725845e-07, "logits/chosen": -1.697445273399353, "logits/rejected": -1.737441897392273, "logps/chosen": -491.8898010253906, "logps/rejected": -465.43133544921875, "loss": 0.6753, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12092840671539307, "rewards/margins": 0.01965322345495224, "rewards/rejected": -0.1405816376209259, "step": 199 }, { "epoch": 0.13062290799248918, "grad_norm": 9.236224313907128, "learning_rate": 1.495873516174371e-07, "logits/chosen": -1.818574070930481, "logits/rejected": -1.7804887294769287, "logps/chosen": -583.37548828125, "logps/rejected": -581.4929809570312, "loss": 0.6703, "rewards/accuracies": 0.375, "rewards/chosen": -0.20678864419460297, "rewards/margins": 0.013942277058959007, "rewards/rejected": -0.22073093056678772, "step": 200 }, { "epoch": 0.13062290799248918, "eval_logits/chosen": -1.7592895030975342, "eval_logits/rejected": -1.7686011791229248, "eval_logps/chosen": -524.639404296875, "eval_logps/rejected": -522.0521240234375, "eval_loss": 0.6713127493858337, "eval_rewards/accuracies": 0.6380000114440918, "eval_rewards/chosen": -0.1429399996995926, "eval_rewards/margins": 0.05518652871251106, "eval_rewards/rejected": -0.19812652468681335, "eval_runtime": 300.5974, "eval_samples_per_second": 13.307, "eval_steps_per_second": 0.832, "step": 200 }, { "epoch": 0.13127602253245163, "grad_norm": 8.660547603879296, "learning_rate": 1.4956923275066855e-07, "logits/chosen": -1.7169371843338013, "logits/rejected": -1.7529581785202026, "logps/chosen": -491.5006408691406, "logps/rejected": -438.540283203125, "loss": 0.6728, "rewards/accuracies": 0.71875, "rewards/chosen": -0.12306279689073563, "rewards/margins": 0.035101909190416336, "rewards/rejected": -0.15816472470760345, "step": 201 }, { "epoch": 0.13192913707241408, "grad_norm": 9.941469801759618, "learning_rate": 1.4955072574126383e-07, "logits/chosen": -1.6325197219848633, "logits/rejected": -1.6774578094482422, "logps/chosen": -501.1746826171875, "logps/rejected": -474.1672668457031, "loss": 0.6778, "rewards/accuracies": 0.5, "rewards/chosen": -0.1591545194387436, "rewards/margins": -0.0037236525677144527, "rewards/rejected": -0.155430868268013, "step": 202 }, { "epoch": 0.13258225161237652, "grad_norm": 6.629751952322596, "learning_rate": 1.4953183068555444e-07, "logits/chosen": -1.6159188747406006, "logits/rejected": -1.655371069908142, "logps/chosen": -511.7205810546875, "logps/rejected": -497.550537109375, "loss": 0.6755, "rewards/accuracies": 0.65625, "rewards/chosen": -0.17229245603084564, "rewards/margins": 0.036124035716056824, "rewards/rejected": -0.20841647684574127, "step": 203 }, { "epoch": 0.13323536615233897, "grad_norm": 14.86037901564549, "learning_rate": 1.4951254768189153e-07, "logits/chosen": -1.7175514698028564, "logits/rejected": -1.7305848598480225, "logps/chosen": -521.8203735351562, "logps/rejected": -473.41363525390625, "loss": 0.6698, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1745811253786087, "rewards/margins": 0.05103900283575058, "rewards/rejected": -0.22562013566493988, "step": 204 }, { "epoch": 0.13388848069230141, "grad_norm": 9.81928586229961, "learning_rate": 1.4949287683064572e-07, "logits/chosen": -1.7592101097106934, "logits/rejected": -1.7103134393692017, "logps/chosen": -506.8877258300781, "logps/rejected": -512.57568359375, "loss": 0.6752, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12894892692565918, "rewards/margins": 0.058204442262649536, "rewards/rejected": -0.1871533840894699, "step": 205 }, { "epoch": 0.13454159523226386, "grad_norm": 12.70868918654191, "learning_rate": 1.4947281823420636e-07, "logits/chosen": -1.74403715133667, "logits/rejected": -1.7773162126541138, "logps/chosen": -549.9198608398438, "logps/rejected": -537.7161865234375, "loss": 0.6726, "rewards/accuracies": 0.5, "rewards/chosen": -0.15765821933746338, "rewards/margins": 0.024023467674851418, "rewards/rejected": -0.18168169260025024, "step": 206 }, { "epoch": 0.1351947097722263, "grad_norm": 9.464198406615232, "learning_rate": 1.4945237199698105e-07, "logits/chosen": -1.7218544483184814, "logits/rejected": -1.7245031595230103, "logps/chosen": -575.4227294921875, "logps/rejected": -613.4913330078125, "loss": 0.6628, "rewards/accuracies": 0.625, "rewards/chosen": -0.2187643200159073, "rewards/margins": 0.05670395866036415, "rewards/rejected": -0.27546826004981995, "step": 207 }, { "epoch": 0.13584782431218875, "grad_norm": 40.46016052410099, "learning_rate": 1.4943153822539518e-07, "logits/chosen": -1.7167556285858154, "logits/rejected": -1.7805320024490356, "logps/chosen": -536.693603515625, "logps/rejected": -531.85986328125, "loss": 0.6732, "rewards/accuracies": 0.75, "rewards/chosen": -0.18354004621505737, "rewards/margins": 0.048775896430015564, "rewards/rejected": -0.23231592774391174, "step": 208 }, { "epoch": 0.1365009388521512, "grad_norm": 22.288492214742288, "learning_rate": 1.4941031702789123e-07, "logits/chosen": -1.6213892698287964, "logits/rejected": -1.63565993309021, "logps/chosen": -565.4679565429688, "logps/rejected": -557.27685546875, "loss": 0.6666, "rewards/accuracies": 0.75, "rewards/chosen": -0.13010086119174957, "rewards/margins": 0.07792194187641144, "rewards/rejected": -0.20802278816699982, "step": 209 }, { "epoch": 0.13715405339211365, "grad_norm": 15.197653036128015, "learning_rate": 1.4938870851492834e-07, "logits/chosen": -1.6974363327026367, "logits/rejected": -1.7302565574645996, "logps/chosen": -496.8112487792969, "logps/rejected": -477.2409362792969, "loss": 0.6659, "rewards/accuracies": 0.5625, "rewards/chosen": -0.15533235669136047, "rewards/margins": 0.022654909640550613, "rewards/rejected": -0.1779872477054596, "step": 210 }, { "epoch": 0.1378071679320761, "grad_norm": 6.3314279991311055, "learning_rate": 1.4936671279898162e-07, "logits/chosen": -1.7678776979446411, "logits/rejected": -1.8228353261947632, "logps/chosen": -473.0333557128906, "logps/rejected": -480.84259033203125, "loss": 0.6704, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2336454689502716, "rewards/margins": 0.01017429493367672, "rewards/rejected": -0.24381977319717407, "step": 211 }, { "epoch": 0.13846028247203854, "grad_norm": 33.62952471238388, "learning_rate": 1.493443299945417e-07, "logits/chosen": -1.674032211303711, "logits/rejected": -1.6666409969329834, "logps/chosen": -487.1645202636719, "logps/rejected": -511.31976318359375, "loss": 0.6672, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17207323014736176, "rewards/margins": 0.0362030491232872, "rewards/rejected": -0.20827627182006836, "step": 212 }, { "epoch": 0.13911339701200098, "grad_norm": 17.559552038629228, "learning_rate": 1.4932156021811393e-07, "logits/chosen": -1.6960467100143433, "logits/rejected": -1.6814875602722168, "logps/chosen": -448.228515625, "logps/rejected": -523.8681640625, "loss": 0.6702, "rewards/accuracies": 0.71875, "rewards/chosen": -0.19846728444099426, "rewards/margins": 0.07104050368070602, "rewards/rejected": -0.2695077657699585, "step": 213 }, { "epoch": 0.13976651155196343, "grad_norm": 21.300381341216802, "learning_rate": 1.492984035882181e-07, "logits/chosen": -1.7540203332901, "logits/rejected": -1.6938629150390625, "logps/chosen": -510.3170166015625, "logps/rejected": -555.1364135742188, "loss": 0.6838, "rewards/accuracies": 0.5, "rewards/chosen": -0.20609605312347412, "rewards/margins": 0.04784277081489563, "rewards/rejected": -0.25393882393836975, "step": 214 }, { "epoch": 0.14041962609192588, "grad_norm": 18.999047527818195, "learning_rate": 1.4927486022538743e-07, "logits/chosen": -1.6708929538726807, "logits/rejected": -1.712494969367981, "logps/chosen": -605.5905151367188, "logps/rejected": -571.502685546875, "loss": 0.6654, "rewards/accuracies": 0.625, "rewards/chosen": -0.21136337518692017, "rewards/margins": 0.03826094791293144, "rewards/rejected": -0.24962429702281952, "step": 215 }, { "epoch": 0.14107274063188832, "grad_norm": 21.481084161977915, "learning_rate": 1.4925093025216822e-07, "logits/chosen": -1.6800730228424072, "logits/rejected": -1.6440744400024414, "logps/chosen": -561.3792724609375, "logps/rejected": -543.5944213867188, "loss": 0.6691, "rewards/accuracies": 0.71875, "rewards/chosen": -0.18766431510448456, "rewards/margins": 0.08474057167768478, "rewards/rejected": -0.27240487933158875, "step": 216 }, { "epoch": 0.14172585517185077, "grad_norm": 36.19470917799039, "learning_rate": 1.4922661379311916e-07, "logits/chosen": -1.7101688385009766, "logits/rejected": -1.7070766687393188, "logps/chosen": -513.0838012695312, "logps/rejected": -580.56982421875, "loss": 0.6719, "rewards/accuracies": 0.6875, "rewards/chosen": -0.19527553021907806, "rewards/margins": 0.07064682245254517, "rewards/rejected": -0.26592230796813965, "step": 217 }, { "epoch": 0.1423789697118132, "grad_norm": 19.07151886221254, "learning_rate": 1.4920191097481055e-07, "logits/chosen": -1.6426142454147339, "logits/rejected": -1.6799873113632202, "logps/chosen": -544.11083984375, "logps/rejected": -474.7537536621094, "loss": 0.6781, "rewards/accuracies": 0.5625, "rewards/chosen": -0.24504226446151733, "rewards/margins": -0.005615321919322014, "rewards/rejected": -0.23942697048187256, "step": 218 }, { "epoch": 0.14303208425177566, "grad_norm": 16.177094066689747, "learning_rate": 1.4917682192582382e-07, "logits/chosen": -1.6813910007476807, "logits/rejected": -1.7110719680786133, "logps/chosen": -583.3339233398438, "logps/rejected": -526.6600341796875, "loss": 0.6832, "rewards/accuracies": 0.625, "rewards/chosen": -0.2589135766029358, "rewards/margins": -0.009151825681328773, "rewards/rejected": -0.24976176023483276, "step": 219 }, { "epoch": 0.1436851987917381, "grad_norm": 7.819380233896718, "learning_rate": 1.4915134677675075e-07, "logits/chosen": -1.6574629545211792, "logits/rejected": -1.6859042644500732, "logps/chosen": -482.84991455078125, "logps/rejected": -488.5449523925781, "loss": 0.6614, "rewards/accuracies": 0.71875, "rewards/chosen": -0.13272731006145477, "rewards/margins": 0.06106055900454521, "rewards/rejected": -0.1937878578901291, "step": 220 }, { "epoch": 0.14433831333170055, "grad_norm": 24.184875753596092, "learning_rate": 1.4912548566019288e-07, "logits/chosen": -1.730996012687683, "logits/rejected": -1.7523002624511719, "logps/chosen": -550.131103515625, "logps/rejected": -511.6734313964844, "loss": 0.6859, "rewards/accuracies": 0.46875, "rewards/chosen": -0.16892360150814056, "rewards/margins": 0.009203894063830376, "rewards/rejected": -0.1781274825334549, "step": 221 }, { "epoch": 0.144991427871663, "grad_norm": 40.392554453725786, "learning_rate": 1.4909923871076067e-07, "logits/chosen": -1.7876818180084229, "logits/rejected": -1.8064486980438232, "logps/chosen": -545.5269775390625, "logps/rejected": -553.6494140625, "loss": 0.663, "rewards/accuracies": 0.78125, "rewards/chosen": -0.21288356184959412, "rewards/margins": 0.10687703639268875, "rewards/rejected": -0.31976059079170227, "step": 222 }, { "epoch": 0.14564454241162544, "grad_norm": 26.592793442117337, "learning_rate": 1.4907260606507294e-07, "logits/chosen": -1.7836247682571411, "logits/rejected": -1.7736763954162598, "logps/chosen": -491.8907470703125, "logps/rejected": -510.7823181152344, "loss": 0.6533, "rewards/accuracies": 0.75, "rewards/chosen": -0.17148932814598083, "rewards/margins": 0.10181709378957748, "rewards/rejected": -0.27330639958381653, "step": 223 }, { "epoch": 0.1462976569515879, "grad_norm": 7.602991933565122, "learning_rate": 1.490455878617561e-07, "logits/chosen": -1.7201324701309204, "logits/rejected": -1.6427278518676758, "logps/chosen": -493.15155029296875, "logps/rejected": -565.255615234375, "loss": 0.6665, "rewards/accuracies": 0.75, "rewards/chosen": -0.16769693791866302, "rewards/margins": 0.1270834058523178, "rewards/rejected": -0.29478034377098083, "step": 224 }, { "epoch": 0.14695077149155034, "grad_norm": 19.894237277380455, "learning_rate": 1.4901818424144348e-07, "logits/chosen": -1.7538783550262451, "logits/rejected": -1.7509037256240845, "logps/chosen": -613.3535766601562, "logps/rejected": -623.0008544921875, "loss": 0.6655, "rewards/accuracies": 0.65625, "rewards/chosen": -0.22273880243301392, "rewards/margins": 0.08194239437580109, "rewards/rejected": -0.3046812117099762, "step": 225 }, { "epoch": 0.14760388603151278, "grad_norm": 27.20495808215994, "learning_rate": 1.4899039534677446e-07, "logits/chosen": -1.678489327430725, "logits/rejected": -1.598201870918274, "logps/chosen": -493.9471435546875, "logps/rejected": -593.92333984375, "loss": 0.6653, "rewards/accuracies": 0.71875, "rewards/chosen": -0.23504018783569336, "rewards/margins": 0.11126020550727844, "rewards/rejected": -0.3463003933429718, "step": 226 }, { "epoch": 0.14825700057147523, "grad_norm": 30.195854234112794, "learning_rate": 1.489622213223939e-07, "logits/chosen": -1.7436487674713135, "logits/rejected": -1.712781310081482, "logps/chosen": -514.0474243164062, "logps/rejected": -565.7003784179688, "loss": 0.6667, "rewards/accuracies": 0.75, "rewards/chosen": -0.1994318962097168, "rewards/margins": 0.1081019788980484, "rewards/rejected": -0.307533860206604, "step": 227 }, { "epoch": 0.14891011511143767, "grad_norm": 6.64214876236842, "learning_rate": 1.4893366231495133e-07, "logits/chosen": -1.6693284511566162, "logits/rejected": -1.6889541149139404, "logps/chosen": -538.619873046875, "logps/rejected": -530.3651123046875, "loss": 0.6658, "rewards/accuracies": 0.6875, "rewards/chosen": -0.18542669713497162, "rewards/margins": 0.06551727652549744, "rewards/rejected": -0.25094395875930786, "step": 228 }, { "epoch": 0.14956322965140012, "grad_norm": 19.211074462031995, "learning_rate": 1.489047184731001e-07, "logits/chosen": -1.784908413887024, "logits/rejected": -1.801963210105896, "logps/chosen": -545.5239868164062, "logps/rejected": -563.0906372070312, "loss": 0.6648, "rewards/accuracies": 0.4375, "rewards/chosen": -0.20398400723934174, "rewards/margins": 0.05913097783923149, "rewards/rejected": -0.26311495900154114, "step": 229 }, { "epoch": 0.15021634419136257, "grad_norm": 18.555655165809004, "learning_rate": 1.488753899474967e-07, "logits/chosen": -1.7692968845367432, "logits/rejected": -1.7724251747131348, "logps/chosen": -505.1445617675781, "logps/rejected": -516.6723022460938, "loss": 0.6724, "rewards/accuracies": 0.625, "rewards/chosen": -0.21376638114452362, "rewards/margins": 0.06437156349420547, "rewards/rejected": -0.2781379520893097, "step": 230 }, { "epoch": 0.150869458731325, "grad_norm": 12.537841664973817, "learning_rate": 1.4884567689079993e-07, "logits/chosen": -1.7249011993408203, "logits/rejected": -1.7343446016311646, "logps/chosen": -559.3654174804688, "logps/rejected": -483.27349853515625, "loss": 0.6834, "rewards/accuracies": 0.59375, "rewards/chosen": -0.23856452107429504, "rewards/margins": -0.02549094706773758, "rewards/rejected": -0.21307358145713806, "step": 231 }, { "epoch": 0.15152257327128746, "grad_norm": 7.974565094982971, "learning_rate": 1.4881557945767017e-07, "logits/chosen": -1.6963640451431274, "logits/rejected": -1.6877349615097046, "logps/chosen": -534.5089111328125, "logps/rejected": -530.7501220703125, "loss": 0.6613, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2620963156223297, "rewards/margins": 0.11971013993024826, "rewards/rejected": -0.38180649280548096, "step": 232 }, { "epoch": 0.1521756878112499, "grad_norm": 18.70437584528313, "learning_rate": 1.4878509780476852e-07, "logits/chosen": -1.6992989778518677, "logits/rejected": -1.7136512994766235, "logps/chosen": -494.5487060546875, "logps/rejected": -466.22174072265625, "loss": 0.6548, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1514841765165329, "rewards/margins": 0.03264841437339783, "rewards/rejected": -0.18413259088993073, "step": 233 }, { "epoch": 0.15282880235121235, "grad_norm": 6.4709344020100925, "learning_rate": 1.4875423209075598e-07, "logits/chosen": -1.7042124271392822, "logits/rejected": -1.6889500617980957, "logps/chosen": -625.1463623046875, "logps/rejected": -610.1585083007812, "loss": 0.6638, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2791493237018585, "rewards/margins": 0.07801420241594315, "rewards/rejected": -0.35716351866722107, "step": 234 }, { "epoch": 0.1534819168911748, "grad_norm": 6.93021377573884, "learning_rate": 1.4872298247629262e-07, "logits/chosen": -1.7167174816131592, "logits/rejected": -1.7292131185531616, "logps/chosen": -540.9368286132812, "logps/rejected": -528.7276611328125, "loss": 0.6536, "rewards/accuracies": 0.75, "rewards/chosen": -0.21813733875751495, "rewards/margins": 0.07377970963716507, "rewards/rejected": -0.29191702604293823, "step": 235 }, { "epoch": 0.15413503143113724, "grad_norm": 17.987164953454663, "learning_rate": 1.486913491240368e-07, "logits/chosen": -1.7713466882705688, "logits/rejected": -1.7571359872817993, "logps/chosen": -569.6904907226562, "logps/rejected": -590.6726684570312, "loss": 0.6518, "rewards/accuracies": 0.625, "rewards/chosen": -0.27502891421318054, "rewards/margins": 0.07631243020296097, "rewards/rejected": -0.3513413667678833, "step": 236 }, { "epoch": 0.1547881459710997, "grad_norm": 13.606732916452634, "learning_rate": 1.4865933219864426e-07, "logits/chosen": -1.687245488166809, "logits/rejected": -1.6311473846435547, "logps/chosen": -535.9907836914062, "logps/rejected": -575.3447875976562, "loss": 0.6569, "rewards/accuracies": 0.78125, "rewards/chosen": -0.24033436179161072, "rewards/margins": 0.06762813031673431, "rewards/rejected": -0.3079625368118286, "step": 237 }, { "epoch": 0.15544126051106213, "grad_norm": 19.472827108008165, "learning_rate": 1.4862693186676727e-07, "logits/chosen": -1.6632615327835083, "logits/rejected": -1.7188615798950195, "logps/chosen": -570.304931640625, "logps/rejected": -528.0740966796875, "loss": 0.6658, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2309207022190094, "rewards/margins": 0.0879175215959549, "rewards/rejected": -0.3188382387161255, "step": 238 }, { "epoch": 0.15609437505102458, "grad_norm": 22.88007198498636, "learning_rate": 1.4859414829705384e-07, "logits/chosen": -1.6684261560440063, "logits/rejected": -1.7101047039031982, "logps/chosen": -567.6971435546875, "logps/rejected": -546.453369140625, "loss": 0.6719, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19033075869083405, "rewards/margins": 0.06966198980808258, "rewards/rejected": -0.2599927484989166, "step": 239 }, { "epoch": 0.15674748959098703, "grad_norm": 36.841939632438105, "learning_rate": 1.4856098166014676e-07, "logits/chosen": -1.7202922105789185, "logits/rejected": -1.6927356719970703, "logps/chosen": -574.1094970703125, "logps/rejected": -533.2037963867188, "loss": 0.6703, "rewards/accuracies": 0.53125, "rewards/chosen": -0.25585952401161194, "rewards/margins": 0.0038572316989302635, "rewards/rejected": -0.2597167491912842, "step": 240 }, { "epoch": 0.15740060413094947, "grad_norm": 21.236611473334218, "learning_rate": 1.4852743212868267e-07, "logits/chosen": -1.653743028640747, "logits/rejected": -1.6406570672988892, "logps/chosen": -491.4779968261719, "logps/rejected": -504.90289306640625, "loss": 0.6619, "rewards/accuracies": 0.40625, "rewards/chosen": -0.26939359307289124, "rewards/margins": -0.0027835238724946976, "rewards/rejected": -0.2666100263595581, "step": 241 }, { "epoch": 0.15805371867091192, "grad_norm": 22.64341132140898, "learning_rate": 1.4849349987729134e-07, "logits/chosen": -1.6768407821655273, "logits/rejected": -1.707101821899414, "logps/chosen": -508.9664306640625, "logps/rejected": -484.92047119140625, "loss": 0.6723, "rewards/accuracies": 0.5625, "rewards/chosen": -0.22562530636787415, "rewards/margins": 0.022399617359042168, "rewards/rejected": -0.24802491068840027, "step": 242 }, { "epoch": 0.15870683321087437, "grad_norm": 38.25143501681014, "learning_rate": 1.4845918508259456e-07, "logits/chosen": -1.5299750566482544, "logits/rejected": -1.5509228706359863, "logps/chosen": -515.6123046875, "logps/rejected": -503.2989196777344, "loss": 0.6704, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2918733060359955, "rewards/margins": 0.03851493448019028, "rewards/rejected": -0.33038821816444397, "step": 243 }, { "epoch": 0.1593599477508368, "grad_norm": 42.26475756226102, "learning_rate": 1.4842448792320532e-07, "logits/chosen": -1.6829333305358887, "logits/rejected": -1.6792709827423096, "logps/chosen": -551.5542602539062, "logps/rejected": -623.4794921875, "loss": 0.6633, "rewards/accuracies": 0.53125, "rewards/chosen": -0.2852548360824585, "rewards/margins": 0.11484323441982269, "rewards/rejected": -0.4000980854034424, "step": 244 }, { "epoch": 0.16001306229079926, "grad_norm": 6.6637868943907135, "learning_rate": 1.4838940857972694e-07, "logits/chosen": -1.7213988304138184, "logits/rejected": -1.7653172016143799, "logps/chosen": -597.0615234375, "logps/rejected": -580.1837158203125, "loss": 0.6619, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2760111391544342, "rewards/margins": 0.08438215404748917, "rewards/rejected": -0.3603932559490204, "step": 245 }, { "epoch": 0.1606661768307617, "grad_norm": 7.466033172797767, "learning_rate": 1.4835394723475195e-07, "logits/chosen": -1.7462300062179565, "logits/rejected": -1.7599581480026245, "logps/chosen": -624.055908203125, "logps/rejected": -556.203369140625, "loss": 0.6582, "rewards/accuracies": 0.78125, "rewards/chosen": -0.32625612616539, "rewards/margins": 0.019012071192264557, "rewards/rejected": -0.3452681601047516, "step": 246 }, { "epoch": 0.16131929137072415, "grad_norm": 18.36532918357786, "learning_rate": 1.4831810407286132e-07, "logits/chosen": -1.7185373306274414, "logits/rejected": -1.6984236240386963, "logps/chosen": -497.5492858886719, "logps/rejected": -549.6527709960938, "loss": 0.657, "rewards/accuracies": 0.8125, "rewards/chosen": -0.21095693111419678, "rewards/margins": 0.10894811898469925, "rewards/rejected": -0.31990501284599304, "step": 247 }, { "epoch": 0.1619724059106866, "grad_norm": 6.712430351467606, "learning_rate": 1.4828187928062343e-07, "logits/chosen": -1.677220344543457, "logits/rejected": -1.6865521669387817, "logps/chosen": -496.3159484863281, "logps/rejected": -491.939208984375, "loss": 0.6605, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2620304822921753, "rewards/margins": 0.05196765065193176, "rewards/rejected": -0.31399816274642944, "step": 248 }, { "epoch": 0.16262552045064904, "grad_norm": 16.60390065867787, "learning_rate": 1.4824527304659303e-07, "logits/chosen": -1.8142789602279663, "logits/rejected": -1.808518648147583, "logps/chosen": -573.7063598632812, "logps/rejected": -586.904541015625, "loss": 0.656, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2939229905605316, "rewards/margins": 0.20337671041488647, "rewards/rejected": -0.4972996711730957, "step": 249 }, { "epoch": 0.1632786349906115, "grad_norm": 8.526623591061165, "learning_rate": 1.4820828556131042e-07, "logits/chosen": -1.682909607887268, "logits/rejected": -1.6931588649749756, "logps/chosen": -553.196044921875, "logps/rejected": -606.1085205078125, "loss": 0.6407, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3215971887111664, "rewards/margins": 0.13771557807922363, "rewards/rejected": -0.4593127965927124, "step": 250 }, { "epoch": 0.16393174953057393, "grad_norm": 31.575444561187965, "learning_rate": 1.4817091701730025e-07, "logits/chosen": -1.7506418228149414, "logits/rejected": -1.7645210027694702, "logps/chosen": -504.40216064453125, "logps/rejected": -466.3576965332031, "loss": 0.6607, "rewards/accuracies": 0.625, "rewards/chosen": -0.2621554732322693, "rewards/margins": 0.02839628979563713, "rewards/rejected": -0.2905517518520355, "step": 251 }, { "epoch": 0.16458486407053638, "grad_norm": 33.198936730829594, "learning_rate": 1.4813316760907073e-07, "logits/chosen": -1.7571806907653809, "logits/rejected": -1.7107410430908203, "logps/chosen": -535.119140625, "logps/rejected": -570.49658203125, "loss": 0.6653, "rewards/accuracies": 0.625, "rewards/chosen": -0.2747470736503601, "rewards/margins": 0.10117386281490326, "rewards/rejected": -0.37592095136642456, "step": 252 }, { "epoch": 0.16523797861049883, "grad_norm": 8.023556183248012, "learning_rate": 1.480950375331125e-07, "logits/chosen": -1.724591851234436, "logits/rejected": -1.724915623664856, "logps/chosen": -533.9208374023438, "logps/rejected": -578.927734375, "loss": 0.6578, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2872743010520935, "rewards/margins": 0.1008455902338028, "rewards/rejected": -0.3881198763847351, "step": 253 }, { "epoch": 0.16589109315046127, "grad_norm": 17.29562845944618, "learning_rate": 1.4805652698789758e-07, "logits/chosen": -1.6281641721725464, "logits/rejected": -1.6535223722457886, "logps/chosen": -501.57318115234375, "logps/rejected": -623.32568359375, "loss": 0.6373, "rewards/accuracies": 0.875, "rewards/chosen": -0.29188841581344604, "rewards/margins": 0.24822545051574707, "rewards/rejected": -0.5401138663291931, "step": 254 }, { "epoch": 0.16654420769042372, "grad_norm": 12.955908095771617, "learning_rate": 1.480176361738784e-07, "logits/chosen": -1.6974416971206665, "logits/rejected": -1.7031437158584595, "logps/chosen": -536.0557250976562, "logps/rejected": -505.6365661621094, "loss": 0.6586, "rewards/accuracies": 0.46875, "rewards/chosen": -0.32581400871276855, "rewards/margins": 0.018291521817445755, "rewards/rejected": -0.3441055715084076, "step": 255 }, { "epoch": 0.16719732223038616, "grad_norm": 10.398012268043638, "learning_rate": 1.4797836529348678e-07, "logits/chosen": -1.6877224445343018, "logits/rejected": -1.6973309516906738, "logps/chosen": -539.1455688476562, "logps/rejected": -539.2821044921875, "loss": 0.6554, "rewards/accuracies": 0.75, "rewards/chosen": -0.30231714248657227, "rewards/margins": 0.07891248911619186, "rewards/rejected": -0.3812296390533447, "step": 256 }, { "epoch": 0.1678504367703486, "grad_norm": 13.095086309931633, "learning_rate": 1.4793871455113277e-07, "logits/chosen": -1.7343683242797852, "logits/rejected": -1.7324395179748535, "logps/chosen": -509.0965576171875, "logps/rejected": -560.6080322265625, "loss": 0.6479, "rewards/accuracies": 0.78125, "rewards/chosen": -0.30541422963142395, "rewards/margins": 0.18525291979312897, "rewards/rejected": -0.4906671643257141, "step": 257 }, { "epoch": 0.16850355131031106, "grad_norm": 8.776278689782488, "learning_rate": 1.478986841532037e-07, "logits/chosen": -1.6799153089523315, "logits/rejected": -1.6792922019958496, "logps/chosen": -507.31787109375, "logps/rejected": -538.5675659179688, "loss": 0.6616, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2834991216659546, "rewards/margins": 0.1000271737575531, "rewards/rejected": -0.3835262954235077, "step": 258 }, { "epoch": 0.1691566658502735, "grad_norm": 12.19544220249078, "learning_rate": 1.4785827430806304e-07, "logits/chosen": -1.6284384727478027, "logits/rejected": -1.5995004177093506, "logps/chosen": -496.8951416015625, "logps/rejected": -489.3295593261719, "loss": 0.6491, "rewards/accuracies": 0.65625, "rewards/chosen": -0.344193696975708, "rewards/margins": 0.09732924401760101, "rewards/rejected": -0.4415229558944702, "step": 259 }, { "epoch": 0.16980978039023595, "grad_norm": 34.82104290571965, "learning_rate": 1.4781748522604932e-07, "logits/chosen": -1.7195165157318115, "logits/rejected": -1.7009164094924927, "logps/chosen": -552.0281982421875, "logps/rejected": -630.2988891601562, "loss": 0.6517, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3577840328216553, "rewards/margins": 0.2054346352815628, "rewards/rejected": -0.5632186532020569, "step": 260 }, { "epoch": 0.1704628949301984, "grad_norm": 30.979244743569765, "learning_rate": 1.4777631711947508e-07, "logits/chosen": -1.6629853248596191, "logits/rejected": -1.6167453527450562, "logps/chosen": -505.7708740234375, "logps/rejected": -532.100830078125, "loss": 0.6485, "rewards/accuracies": 0.75, "rewards/chosen": -0.3221421241760254, "rewards/margins": 0.06837349385023117, "rewards/rejected": -0.39051565527915955, "step": 261 }, { "epoch": 0.17111600947016084, "grad_norm": 41.43490075038442, "learning_rate": 1.4773477020262572e-07, "logits/chosen": -1.728777527809143, "logits/rejected": -1.726311445236206, "logps/chosen": -620.5463256835938, "logps/rejected": -633.969970703125, "loss": 0.6527, "rewards/accuracies": 0.75, "rewards/chosen": -0.4750226140022278, "rewards/margins": 0.15798604488372803, "rewards/rejected": -0.6330086588859558, "step": 262 }, { "epoch": 0.1717691240101233, "grad_norm": 23.108934696505543, "learning_rate": 1.4769284469175835e-07, "logits/chosen": -1.7010533809661865, "logits/rejected": -1.700863242149353, "logps/chosen": -496.9283752441406, "logps/rejected": -496.18096923828125, "loss": 0.6404, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2904779613018036, "rewards/margins": 0.08895879238843918, "rewards/rejected": -0.37943676114082336, "step": 263 }, { "epoch": 0.17242223855008573, "grad_norm": 29.393486906363, "learning_rate": 1.476505408051008e-07, "logits/chosen": -1.7014069557189941, "logits/rejected": -1.785143494606018, "logps/chosen": -577.2012939453125, "logps/rejected": -608.8991088867188, "loss": 0.6534, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3605682849884033, "rewards/margins": 0.11243538558483124, "rewards/rejected": -0.47300365567207336, "step": 264 }, { "epoch": 0.17307535309004818, "grad_norm": 6.776931632936762, "learning_rate": 1.476078587628503e-07, "logits/chosen": -1.7219668626785278, "logits/rejected": -1.7316367626190186, "logps/chosen": -613.4144287109375, "logps/rejected": -600.9342041015625, "loss": 0.6394, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4629693329334259, "rewards/margins": 0.09507772326469421, "rewards/rejected": -0.5580470561981201, "step": 265 }, { "epoch": 0.17372846763001062, "grad_norm": 15.838565452388137, "learning_rate": 1.4756479878717254e-07, "logits/chosen": -1.6845935583114624, "logits/rejected": -1.697797417640686, "logps/chosen": -502.57818603515625, "logps/rejected": -501.9476318359375, "loss": 0.6348, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3347086012363434, "rewards/margins": 0.14087814092636108, "rewards/rejected": -0.4755867123603821, "step": 266 }, { "epoch": 0.17438158216997307, "grad_norm": 22.80772385019741, "learning_rate": 1.4752136110220027e-07, "logits/chosen": -1.689018964767456, "logits/rejected": -1.68304443359375, "logps/chosen": -510.1197509765625, "logps/rejected": -546.5357666015625, "loss": 0.6512, "rewards/accuracies": 0.75, "rewards/chosen": -0.30356815457344055, "rewards/margins": 0.14922784268856049, "rewards/rejected": -0.45279598236083984, "step": 267 }, { "epoch": 0.17503469670993552, "grad_norm": 23.730722811526665, "learning_rate": 1.4747754593403243e-07, "logits/chosen": -1.6035892963409424, "logits/rejected": -1.63158118724823, "logps/chosen": -526.643798828125, "logps/rejected": -575.2221069335938, "loss": 0.6536, "rewards/accuracies": 0.625, "rewards/chosen": -0.3458973467350006, "rewards/margins": 0.19965365529060364, "rewards/rejected": -0.5455510020256042, "step": 268 }, { "epoch": 0.17568781124989796, "grad_norm": 7.0753534582972, "learning_rate": 1.4743335351073263e-07, "logits/chosen": -1.7131729125976562, "logits/rejected": -1.716842532157898, "logps/chosen": -589.8107299804688, "logps/rejected": -586.211669921875, "loss": 0.6484, "rewards/accuracies": 0.625, "rewards/chosen": -0.36673662066459656, "rewards/margins": 0.08891665935516357, "rewards/rejected": -0.45565328001976013, "step": 269 }, { "epoch": 0.1763409257898604, "grad_norm": 13.47915067878089, "learning_rate": 1.4738878406232824e-07, "logits/chosen": -1.6974538564682007, "logits/rejected": -1.7051836252212524, "logps/chosen": -576.85302734375, "logps/rejected": -595.0402221679688, "loss": 0.6588, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4583105444908142, "rewards/margins": 0.09714774042367935, "rewards/rejected": -0.5554581880569458, "step": 270 }, { "epoch": 0.17699404032982285, "grad_norm": 26.554664114423755, "learning_rate": 1.4734383782080914e-07, "logits/chosen": -1.682262659072876, "logits/rejected": -1.653159499168396, "logps/chosen": -584.1964721679688, "logps/rejected": -617.1187744140625, "loss": 0.6637, "rewards/accuracies": 0.53125, "rewards/chosen": -0.42176809906959534, "rewards/margins": 0.07791588455438614, "rewards/rejected": -0.4996839761734009, "step": 271 }, { "epoch": 0.1776471548697853, "grad_norm": 10.821406826613458, "learning_rate": 1.4729851502012636e-07, "logits/chosen": -1.711740255355835, "logits/rejected": -1.7466825246810913, "logps/chosen": -522.8740844726562, "logps/rejected": -510.05743408203125, "loss": 0.6331, "rewards/accuracies": 0.75, "rewards/chosen": -0.388105183839798, "rewards/margins": 0.08373910188674927, "rewards/rejected": -0.47184431552886963, "step": 272 }, { "epoch": 0.17830026940974775, "grad_norm": 6.675378991764519, "learning_rate": 1.4725281589619103e-07, "logits/chosen": -1.6703485250473022, "logits/rejected": -1.6685247421264648, "logps/chosen": -530.2496337890625, "logps/rejected": -535.6815795898438, "loss": 0.6426, "rewards/accuracies": 0.78125, "rewards/chosen": -0.42745375633239746, "rewards/margins": 0.18275927007198334, "rewards/rejected": -0.6102129817008972, "step": 273 }, { "epoch": 0.1789533839497102, "grad_norm": 9.179293872139016, "learning_rate": 1.4720674068687308e-07, "logits/chosen": -1.7541323900222778, "logits/rejected": -1.811903476715088, "logps/chosen": -548.3256225585938, "logps/rejected": -549.988525390625, "loss": 0.6477, "rewards/accuracies": 0.625, "rewards/chosen": -0.4320800304412842, "rewards/margins": 0.04719913750886917, "rewards/rejected": -0.47927919030189514, "step": 274 }, { "epoch": 0.17960649848967264, "grad_norm": 19.694787175563757, "learning_rate": 1.4716028963200005e-07, "logits/chosen": -1.692307472229004, "logits/rejected": -1.6705561876296997, "logps/chosen": -493.9141845703125, "logps/rejected": -527.4869384765625, "loss": 0.6533, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3439943492412567, "rewards/margins": 0.14113494753837585, "rewards/rejected": -0.48512929677963257, "step": 275 }, { "epoch": 0.18025961302963509, "grad_norm": 20.421232540328838, "learning_rate": 1.4711346297335575e-07, "logits/chosen": -1.7493505477905273, "logits/rejected": -1.7484819889068604, "logps/chosen": -562.100341796875, "logps/rejected": -544.1605224609375, "loss": 0.6586, "rewards/accuracies": 0.8125, "rewards/chosen": -0.39796724915504456, "rewards/margins": 0.1226586103439331, "rewards/rejected": -0.5206258296966553, "step": 276 }, { "epoch": 0.18091272756959753, "grad_norm": 7.813830449111613, "learning_rate": 1.4706626095467905e-07, "logits/chosen": -1.6537861824035645, "logits/rejected": -1.6490662097930908, "logps/chosen": -615.1405029296875, "logps/rejected": -640.3934326171875, "loss": 0.6364, "rewards/accuracies": 0.75, "rewards/chosen": -0.39063894748687744, "rewards/margins": 0.22695013880729675, "rewards/rejected": -0.6175890564918518, "step": 277 }, { "epoch": 0.18156584210955998, "grad_norm": 18.137788034555644, "learning_rate": 1.470186838216627e-07, "logits/chosen": -1.7237962484359741, "logits/rejected": -1.7260386943817139, "logps/chosen": -560.9794921875, "logps/rejected": -544.9048461914062, "loss": 0.6549, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4301608204841614, "rewards/margins": 0.10491985827684402, "rewards/rejected": -0.5350806713104248, "step": 278 }, { "epoch": 0.18221895664952242, "grad_norm": 6.203981970076561, "learning_rate": 1.469707318219519e-07, "logits/chosen": -1.7497467994689941, "logits/rejected": -1.7625311613082886, "logps/chosen": -476.921875, "logps/rejected": -468.9197692871094, "loss": 0.6288, "rewards/accuracies": 0.53125, "rewards/chosen": -0.39907845854759216, "rewards/margins": 0.09824170172214508, "rewards/rejected": -0.49732017517089844, "step": 279 }, { "epoch": 0.18287207118948487, "grad_norm": 30.605405460520963, "learning_rate": 1.4692240520514308e-07, "logits/chosen": -1.692575216293335, "logits/rejected": -1.7037385702133179, "logps/chosen": -459.114990234375, "logps/rejected": -475.6065368652344, "loss": 0.6475, "rewards/accuracies": 0.71875, "rewards/chosen": -0.37590938806533813, "rewards/margins": 0.06421739608049393, "rewards/rejected": -0.4401267468929291, "step": 280 }, { "epoch": 0.18352518572944732, "grad_norm": 6.890672032716978, "learning_rate": 1.4687370422278264e-07, "logits/chosen": -1.6996791362762451, "logits/rejected": -1.7280397415161133, "logps/chosen": -567.3660278320312, "logps/rejected": -556.817138671875, "loss": 0.6557, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4747771620750427, "rewards/margins": 0.10796495527029037, "rewards/rejected": -0.5827420949935913, "step": 281 }, { "epoch": 0.18417830026940976, "grad_norm": 7.575771612722592, "learning_rate": 1.4682462912836556e-07, "logits/chosen": -1.6831636428833008, "logits/rejected": -1.659497857093811, "logps/chosen": -591.215576171875, "logps/rejected": -606.3766479492188, "loss": 0.6464, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5020853281021118, "rewards/margins": 0.08574617654085159, "rewards/rejected": -0.5878314971923828, "step": 282 }, { "epoch": 0.18483141480937218, "grad_norm": 14.228779893218709, "learning_rate": 1.4677518017733416e-07, "logits/chosen": -1.620701789855957, "logits/rejected": -1.6582701206207275, "logps/chosen": -493.6382141113281, "logps/rejected": -494.93731689453125, "loss": 0.652, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4052881896495819, "rewards/margins": 0.05283729359507561, "rewards/rejected": -0.4581254720687866, "step": 283 }, { "epoch": 0.18548452934933463, "grad_norm": 9.401784297751998, "learning_rate": 1.467253576270767e-07, "logits/chosen": -1.741771936416626, "logits/rejected": -1.75984787940979, "logps/chosen": -491.3598937988281, "logps/rejected": -527.6268310546875, "loss": 0.6431, "rewards/accuracies": 0.75, "rewards/chosen": -0.45621737837791443, "rewards/margins": 0.12358233332633972, "rewards/rejected": -0.5797997713088989, "step": 284 }, { "epoch": 0.18613764388929707, "grad_norm": 8.738330524484025, "learning_rate": 1.466751617369261e-07, "logits/chosen": -1.7441930770874023, "logits/rejected": -1.8137414455413818, "logps/chosen": -600.4554443359375, "logps/rejected": -585.8037719726562, "loss": 0.6408, "rewards/accuracies": 0.625, "rewards/chosen": -0.5598416924476624, "rewards/margins": 0.06251634657382965, "rewards/rejected": -0.6223580241203308, "step": 285 }, { "epoch": 0.18679075842925952, "grad_norm": 7.921733780275398, "learning_rate": 1.4662459276815857e-07, "logits/chosen": -1.740809679031372, "logits/rejected": -1.6921014785766602, "logps/chosen": -521.9088745117188, "logps/rejected": -613.4127807617188, "loss": 0.6222, "rewards/accuracies": 0.75, "rewards/chosen": -0.5204059481620789, "rewards/margins": 0.23885881900787354, "rewards/rejected": -0.7592648267745972, "step": 286 }, { "epoch": 0.18744387296922196, "grad_norm": 14.993635596662422, "learning_rate": 1.4657365098399217e-07, "logits/chosen": -1.6475145816802979, "logits/rejected": -1.6759244203567505, "logps/chosen": -588.921142578125, "logps/rejected": -621.540771484375, "loss": 0.6506, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5357760787010193, "rewards/margins": 0.17871497571468353, "rewards/rejected": -0.714491069316864, "step": 287 }, { "epoch": 0.1880969875091844, "grad_norm": 13.969578435371153, "learning_rate": 1.4652233664958564e-07, "logits/chosen": -1.6996246576309204, "logits/rejected": -1.6887569427490234, "logps/chosen": -524.6732177734375, "logps/rejected": -578.5980224609375, "loss": 0.6505, "rewards/accuracies": 0.65625, "rewards/chosen": -0.513081431388855, "rewards/margins": 0.24069830775260925, "rewards/rejected": -0.7537796497344971, "step": 288 }, { "epoch": 0.18875010204914686, "grad_norm": 11.610438307536514, "learning_rate": 1.4647065003203673e-07, "logits/chosen": -1.629154086112976, "logits/rejected": -1.675700306892395, "logps/chosen": -522.1669921875, "logps/rejected": -537.0850830078125, "loss": 0.6589, "rewards/accuracies": 0.625, "rewards/chosen": -0.5520819425582886, "rewards/margins": 0.10711211711168289, "rewards/rejected": -0.6591941118240356, "step": 289 }, { "epoch": 0.1894032165891093, "grad_norm": 10.283639619093217, "learning_rate": 1.4641859140038115e-07, "logits/chosen": -1.7125153541564941, "logits/rejected": -1.7900097370147705, "logps/chosen": -481.0557861328125, "logps/rejected": -466.9371643066406, "loss": 0.6447, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4833194613456726, "rewards/margins": 0.10249477624893188, "rewards/rejected": -0.5858142375946045, "step": 290 }, { "epoch": 0.19005633112907175, "grad_norm": 10.863781043680522, "learning_rate": 1.4636616102559085e-07, "logits/chosen": -1.7732642889022827, "logits/rejected": -1.7560861110687256, "logps/chosen": -497.37286376953125, "logps/rejected": -492.2978515625, "loss": 0.6447, "rewards/accuracies": 0.625, "rewards/chosen": -0.5158191919326782, "rewards/margins": 0.12314166873693466, "rewards/rejected": -0.6389608383178711, "step": 291 }, { "epoch": 0.1907094456690342, "grad_norm": 8.437201886799969, "learning_rate": 1.4631335918057284e-07, "logits/chosen": -1.755825161933899, "logits/rejected": -1.7563961744308472, "logps/chosen": -545.761962890625, "logps/rejected": -605.7576904296875, "loss": 0.6288, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5955057740211487, "rewards/margins": 0.1695830523967743, "rewards/rejected": -0.7650887966156006, "step": 292 }, { "epoch": 0.19136256020899664, "grad_norm": 8.28332489348213, "learning_rate": 1.4626018614016762e-07, "logits/chosen": -1.6458687782287598, "logits/rejected": -1.7265384197235107, "logps/chosen": -524.3062744140625, "logps/rejected": -535.7152709960938, "loss": 0.6534, "rewards/accuracies": 0.75, "rewards/chosen": -0.5473222136497498, "rewards/margins": 0.09474081546068192, "rewards/rejected": -0.6420629620552063, "step": 293 }, { "epoch": 0.1920156747489591, "grad_norm": 12.121969382291516, "learning_rate": 1.4620664218114785e-07, "logits/chosen": -1.7038286924362183, "logits/rejected": -1.7111061811447144, "logps/chosen": -608.0626220703125, "logps/rejected": -610.8870849609375, "loss": 0.6474, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6269205212593079, "rewards/margins": 0.192575603723526, "rewards/rejected": -0.8194961547851562, "step": 294 }, { "epoch": 0.19266878928892153, "grad_norm": 8.914872873902894, "learning_rate": 1.4615272758221687e-07, "logits/chosen": -1.6704249382019043, "logits/rejected": -1.6967017650604248, "logps/chosen": -571.5804443359375, "logps/rejected": -568.3060302734375, "loss": 0.6579, "rewards/accuracies": 0.625, "rewards/chosen": -0.5785356163978577, "rewards/margins": 0.08196890354156494, "rewards/rejected": -0.6605044603347778, "step": 295 }, { "epoch": 0.19332190382888398, "grad_norm": 7.935449781786862, "learning_rate": 1.4609844262400722e-07, "logits/chosen": -1.7343144416809082, "logits/rejected": -1.7453346252441406, "logps/chosen": -595.2299194335938, "logps/rejected": -627.33984375, "loss": 0.6345, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6429135799407959, "rewards/margins": 0.16354820132255554, "rewards/rejected": -0.8064618110656738, "step": 296 }, { "epoch": 0.19397501836884642, "grad_norm": 7.485459727998883, "learning_rate": 1.4604378758907928e-07, "logits/chosen": -1.717028021812439, "logits/rejected": -1.7182062864303589, "logps/chosen": -603.7417602539062, "logps/rejected": -610.2848510742188, "loss": 0.6507, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6892397999763489, "rewards/margins": 0.20615926384925842, "rewards/rejected": -0.8953990936279297, "step": 297 }, { "epoch": 0.19462813290880887, "grad_norm": 11.03240678642255, "learning_rate": 1.459887627619196e-07, "logits/chosen": -1.7350809574127197, "logits/rejected": -1.7344433069229126, "logps/chosen": -579.1885986328125, "logps/rejected": -620.0966796875, "loss": 0.6361, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6911602020263672, "rewards/margins": 0.14496688544750214, "rewards/rejected": -0.836126983165741, "step": 298 }, { "epoch": 0.19528124744877132, "grad_norm": 8.53488849066449, "learning_rate": 1.4593336842893963e-07, "logits/chosen": -1.745806336402893, "logits/rejected": -1.6635572910308838, "logps/chosen": -607.8656616210938, "logps/rejected": -672.4678344726562, "loss": 0.6208, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6869965195655823, "rewards/margins": 0.2365533858537674, "rewards/rejected": -0.9235499501228333, "step": 299 }, { "epoch": 0.19593436198873376, "grad_norm": 9.49489920339701, "learning_rate": 1.458776048784742e-07, "logits/chosen": -1.6529350280761719, "logits/rejected": -1.6770867109298706, "logps/chosen": -532.1659545898438, "logps/rejected": -558.4205322265625, "loss": 0.6306, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6781768798828125, "rewards/margins": 0.10716618597507477, "rewards/rejected": -0.7853431701660156, "step": 300 }, { "epoch": 0.19593436198873376, "eval_logits/chosen": -1.7436009645462036, "eval_logits/rejected": -1.7536306381225586, "eval_logps/chosen": -574.737548828125, "eval_logps/rejected": -584.3355712890625, "eval_loss": 0.6346647143363953, "eval_rewards/accuracies": 0.6840000152587891, "eval_rewards/chosen": -0.6439220309257507, "eval_rewards/margins": 0.17703963816165924, "eval_rewards/rejected": -0.8209616541862488, "eval_runtime": 297.2957, "eval_samples_per_second": 13.455, "eval_steps_per_second": 0.841, "step": 300 }, { "epoch": 0.1965874765286962, "grad_norm": 8.98774554330102, "learning_rate": 1.4582147240077982e-07, "logits/chosen": -1.7660400867462158, "logits/rejected": -1.7696928977966309, "logps/chosen": -551.3262939453125, "logps/rejected": -584.052001953125, "loss": 0.6399, "rewards/accuracies": 0.53125, "rewards/chosen": -0.6249464154243469, "rewards/margins": 0.09611751139163971, "rewards/rejected": -0.7210639715194702, "step": 301 }, { "epoch": 0.19724059106865865, "grad_norm": 7.863471372182064, "learning_rate": 1.4576497128803348e-07, "logits/chosen": -1.7265328168869019, "logits/rejected": -1.6853750944137573, "logps/chosen": -536.3348999023438, "logps/rejected": -628.0272827148438, "loss": 0.625, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6308943033218384, "rewards/margins": 0.23925556242465973, "rewards/rejected": -0.8701498508453369, "step": 302 }, { "epoch": 0.1978937056086211, "grad_norm": 28.45275146429454, "learning_rate": 1.4570810183433083e-07, "logits/chosen": -1.6958410739898682, "logits/rejected": -1.7414443492889404, "logps/chosen": -663.8955078125, "logps/rejected": -632.41015625, "loss": 0.6593, "rewards/accuracies": 0.5, "rewards/chosen": -0.8960934281349182, "rewards/margins": 0.07879979908466339, "rewards/rejected": -0.9748932123184204, "step": 303 }, { "epoch": 0.19854682014858355, "grad_norm": 11.410487006682061, "learning_rate": 1.4565086433568487e-07, "logits/chosen": -1.7225995063781738, "logits/rejected": -1.6888340711593628, "logps/chosen": -581.2233276367188, "logps/rejected": -675.6259155273438, "loss": 0.6429, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6933416724205017, "rewards/margins": 0.3561021387577057, "rewards/rejected": -1.0494438409805298, "step": 304 }, { "epoch": 0.199199934688546, "grad_norm": 8.270002666629233, "learning_rate": 1.4559325909002424e-07, "logits/chosen": -1.7664427757263184, "logits/rejected": -1.7638676166534424, "logps/chosen": -560.244873046875, "logps/rejected": -626.065185546875, "loss": 0.6395, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6132575869560242, "rewards/margins": 0.3563953936100006, "rewards/rejected": -0.9696530103683472, "step": 305 }, { "epoch": 0.19985304922850844, "grad_norm": 12.608968024472125, "learning_rate": 1.4553528639719185e-07, "logits/chosen": -1.6360845565795898, "logits/rejected": -1.7036012411117554, "logps/chosen": -494.1507568359375, "logps/rejected": -567.8990478515625, "loss": 0.6366, "rewards/accuracies": 0.75, "rewards/chosen": -0.6473857760429382, "rewards/margins": 0.21676453948020935, "rewards/rejected": -0.8641502857208252, "step": 306 }, { "epoch": 0.20050616376847089, "grad_norm": 17.135231726028064, "learning_rate": 1.4547694655894313e-07, "logits/chosen": -1.7396022081375122, "logits/rejected": -1.6808857917785645, "logps/chosen": -533.951416015625, "logps/rejected": -562.3763427734375, "loss": 0.6188, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6712486743927002, "rewards/margins": 0.19814179837703705, "rewards/rejected": -0.8693904876708984, "step": 307 }, { "epoch": 0.20115927830843333, "grad_norm": 6.928796136463469, "learning_rate": 1.454182398789446e-07, "logits/chosen": -1.776479721069336, "logits/rejected": -1.7864151000976562, "logps/chosen": -642.5709228515625, "logps/rejected": -602.6436157226562, "loss": 0.6323, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7122976779937744, "rewards/margins": 0.10113891959190369, "rewards/rejected": -0.8134365677833557, "step": 308 }, { "epoch": 0.20181239284839578, "grad_norm": 6.579505553675459, "learning_rate": 1.4535916666277225e-07, "logits/chosen": -1.6873722076416016, "logits/rejected": -1.683058500289917, "logps/chosen": -625.4521484375, "logps/rejected": -681.3677978515625, "loss": 0.6223, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7979632616043091, "rewards/margins": 0.28991833329200745, "rewards/rejected": -1.0878815650939941, "step": 309 }, { "epoch": 0.20246550738835822, "grad_norm": 7.215238256234268, "learning_rate": 1.4529972721790987e-07, "logits/chosen": -1.7267826795578003, "logits/rejected": -1.6847716569900513, "logps/chosen": -607.7223510742188, "logps/rejected": -689.0610961914062, "loss": 0.5934, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6881787776947021, "rewards/margins": 0.390371173620224, "rewards/rejected": -1.0785499811172485, "step": 310 }, { "epoch": 0.20311862192832067, "grad_norm": 8.708876826384516, "learning_rate": 1.4523992185374762e-07, "logits/chosen": -1.6836024522781372, "logits/rejected": -1.7239104509353638, "logps/chosen": -581.9091796875, "logps/rejected": -602.4833374023438, "loss": 0.6503, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6792923212051392, "rewards/margins": 0.17103339731693268, "rewards/rejected": -0.8503257632255554, "step": 311 }, { "epoch": 0.20377173646828312, "grad_norm": 15.693301460696688, "learning_rate": 1.4517975088158024e-07, "logits/chosen": -1.6466374397277832, "logits/rejected": -1.7001001834869385, "logps/chosen": -522.967041015625, "logps/rejected": -504.9639892578125, "loss": 0.6303, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6082721948623657, "rewards/margins": 0.13548986613750458, "rewards/rejected": -0.7437620759010315, "step": 312 }, { "epoch": 0.20442485100824556, "grad_norm": 9.45995269518211, "learning_rate": 1.4511921461460552e-07, "logits/chosen": -1.6590129137039185, "logits/rejected": -1.7052656412124634, "logps/chosen": -619.462158203125, "logps/rejected": -658.5916748046875, "loss": 0.6425, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8983247876167297, "rewards/margins": 0.1738293468952179, "rewards/rejected": -1.0721540451049805, "step": 313 }, { "epoch": 0.205077965548208, "grad_norm": 8.619266517151214, "learning_rate": 1.4505831336792268e-07, "logits/chosen": -1.6681677103042603, "logits/rejected": -1.6365997791290283, "logps/chosen": -623.3759155273438, "logps/rejected": -652.1790771484375, "loss": 0.6061, "rewards/accuracies": 0.75, "rewards/chosen": -0.817943811416626, "rewards/margins": 0.2616656422615051, "rewards/rejected": -1.0796093940734863, "step": 314 }, { "epoch": 0.20573108008817045, "grad_norm": 16.305545075033027, "learning_rate": 1.449970474585307e-07, "logits/chosen": -1.7769198417663574, "logits/rejected": -1.7549265623092651, "logps/chosen": -508.7225646972656, "logps/rejected": -541.2574462890625, "loss": 0.6153, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8023353815078735, "rewards/margins": 0.1428290456533432, "rewards/rejected": -0.9451643228530884, "step": 315 }, { "epoch": 0.2063841946281329, "grad_norm": 16.881338125964565, "learning_rate": 1.4493541720532666e-07, "logits/chosen": -1.7054510116577148, "logits/rejected": -1.7318084239959717, "logps/chosen": -577.06787109375, "logps/rejected": -674.53857421875, "loss": 0.5824, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7155259251594543, "rewards/margins": 0.4451526999473572, "rewards/rejected": -1.1606786251068115, "step": 316 }, { "epoch": 0.20703730916809535, "grad_norm": 8.857782623326043, "learning_rate": 1.4487342292910414e-07, "logits/chosen": -1.6867567300796509, "logits/rejected": -1.7076420783996582, "logps/chosen": -634.0771484375, "logps/rejected": -662.7860107421875, "loss": 0.6135, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9092274904251099, "rewards/margins": 0.23942092061042786, "rewards/rejected": -1.1486485004425049, "step": 317 }, { "epoch": 0.2076904237080578, "grad_norm": 32.895897939433254, "learning_rate": 1.4481106495255145e-07, "logits/chosen": -1.6250176429748535, "logits/rejected": -1.6629047393798828, "logps/chosen": -638.931640625, "logps/rejected": -675.3734130859375, "loss": 0.6102, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9061939120292664, "rewards/margins": 0.18294726312160492, "rewards/rejected": -1.0891411304473877, "step": 318 }, { "epoch": 0.20834353824802024, "grad_norm": 35.209821508774134, "learning_rate": 1.4474834360025005e-07, "logits/chosen": -1.7239129543304443, "logits/rejected": -1.7509957551956177, "logps/chosen": -539.3623046875, "logps/rejected": -585.4423828125, "loss": 0.6255, "rewards/accuracies": 0.75, "rewards/chosen": -0.7921657562255859, "rewards/margins": 0.20182442665100098, "rewards/rejected": -0.9939901232719421, "step": 319 }, { "epoch": 0.20899665278798268, "grad_norm": 22.542720186905754, "learning_rate": 1.446852591986728e-07, "logits/chosen": -1.7275651693344116, "logits/rejected": -1.7619481086730957, "logps/chosen": -574.9896850585938, "logps/rejected": -537.2247924804688, "loss": 0.6222, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7382320761680603, "rewards/margins": 0.09026205539703369, "rewards/rejected": -0.8284941911697388, "step": 320 }, { "epoch": 0.20964976732794513, "grad_norm": 11.689355163454417, "learning_rate": 1.4462181207618226e-07, "logits/chosen": -1.6684459447860718, "logits/rejected": -1.664050817489624, "logps/chosen": -612.190673828125, "logps/rejected": -627.1231079101562, "loss": 0.6396, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9567988514900208, "rewards/margins": 0.15958093106746674, "rewards/rejected": -1.1163798570632935, "step": 321 }, { "epoch": 0.21030288186790758, "grad_norm": 8.28817528253119, "learning_rate": 1.445580025630291e-07, "logits/chosen": -1.696157455444336, "logits/rejected": -1.693131446838379, "logps/chosen": -616.43017578125, "logps/rejected": -605.6445922851562, "loss": 0.6273, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8394571542739868, "rewards/margins": 0.19782942533493042, "rewards/rejected": -1.0372865200042725, "step": 322 }, { "epoch": 0.21095599640787002, "grad_norm": 9.732936932214093, "learning_rate": 1.444938309913501e-07, "logits/chosen": -1.6943405866622925, "logits/rejected": -1.7072209119796753, "logps/chosen": -584.8599853515625, "logps/rejected": -641.0568237304688, "loss": 0.6046, "rewards/accuracies": 0.75, "rewards/chosen": -0.7837256193161011, "rewards/margins": 0.3857865333557129, "rewards/rejected": -1.1695120334625244, "step": 323 }, { "epoch": 0.21160911094783247, "grad_norm": 11.744162208267507, "learning_rate": 1.444292976951668e-07, "logits/chosen": -1.6355708837509155, "logits/rejected": -1.627254843711853, "logps/chosen": -610.7489013671875, "logps/rejected": -618.13330078125, "loss": 0.5927, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8486420512199402, "rewards/margins": 0.18311645090579987, "rewards/rejected": -1.0317585468292236, "step": 324 }, { "epoch": 0.21226222548779491, "grad_norm": 8.36583130670152, "learning_rate": 1.4436440301038337e-07, "logits/chosen": -1.6780744791030884, "logits/rejected": -1.681298017501831, "logps/chosen": -618.8885498046875, "logps/rejected": -628.7433471679688, "loss": 0.6279, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9191569089889526, "rewards/margins": 0.21737900376319885, "rewards/rejected": -1.1365360021591187, "step": 325 }, { "epoch": 0.21291534002775736, "grad_norm": 7.011876209354938, "learning_rate": 1.4429914727478526e-07, "logits/chosen": -1.693228840827942, "logits/rejected": -1.6980655193328857, "logps/chosen": -560.333984375, "logps/rejected": -595.471435546875, "loss": 0.6033, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7711371779441833, "rewards/margins": 0.37155580520629883, "rewards/rejected": -1.142693042755127, "step": 326 }, { "epoch": 0.2135684545677198, "grad_norm": 8.197929978948176, "learning_rate": 1.4423353082803705e-07, "logits/chosen": -1.6932696104049683, "logits/rejected": -1.7340316772460938, "logps/chosen": -592.4081420898438, "logps/rejected": -626.7181396484375, "loss": 0.6079, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9063585996627808, "rewards/margins": 0.23381030559539795, "rewards/rejected": -1.1401689052581787, "step": 327 }, { "epoch": 0.21422156910768225, "grad_norm": 21.483660249834884, "learning_rate": 1.44167554011681e-07, "logits/chosen": -1.6718993186950684, "logits/rejected": -1.6670811176300049, "logps/chosen": -570.2409057617188, "logps/rejected": -584.1919555664062, "loss": 0.6376, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6868249177932739, "rewards/margins": 0.22840677201747894, "rewards/rejected": -0.9152317643165588, "step": 328 }, { "epoch": 0.2148746836476447, "grad_norm": 16.130012234299603, "learning_rate": 1.4410121716913508e-07, "logits/chosen": -1.6927289962768555, "logits/rejected": -1.70664644241333, "logps/chosen": -617.483642578125, "logps/rejected": -620.2203369140625, "loss": 0.6259, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9101148247718811, "rewards/margins": 0.1278352588415146, "rewards/rejected": -1.0379501581192017, "step": 329 }, { "epoch": 0.21552779818760714, "grad_norm": 8.367739313272205, "learning_rate": 1.4403452064569127e-07, "logits/chosen": -1.6701712608337402, "logits/rejected": -1.7202551364898682, "logps/chosen": -661.21435546875, "logps/rejected": -602.512451171875, "loss": 0.6385, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1165893077850342, "rewards/margins": 0.08732178062200546, "rewards/rejected": -1.2039111852645874, "step": 330 }, { "epoch": 0.2161809127275696, "grad_norm": 9.035063267419801, "learning_rate": 1.439674647885137e-07, "logits/chosen": -1.6763889789581299, "logits/rejected": -1.6751902103424072, "logps/chosen": -613.0372314453125, "logps/rejected": -630.5709838867188, "loss": 0.6134, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0509929656982422, "rewards/margins": 0.17839497327804565, "rewards/rejected": -1.2293879985809326, "step": 331 }, { "epoch": 0.21683402726753204, "grad_norm": 9.470243202935096, "learning_rate": 1.439000499466369e-07, "logits/chosen": -1.7101950645446777, "logits/rejected": -1.693764328956604, "logps/chosen": -592.8642578125, "logps/rejected": -629.7588500976562, "loss": 0.6173, "rewards/accuracies": 0.625, "rewards/chosen": -0.9428937435150146, "rewards/margins": 0.1368747353553772, "rewards/rejected": -1.0797685384750366, "step": 332 }, { "epoch": 0.21748714180749448, "grad_norm": 11.187409550489294, "learning_rate": 1.4383227647096393e-07, "logits/chosen": -1.651065707206726, "logits/rejected": -1.6519496440887451, "logps/chosen": -613.1717529296875, "logps/rejected": -579.97265625, "loss": 0.624, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9745721817016602, "rewards/margins": 0.17566874623298645, "rewards/rejected": -1.1502408981323242, "step": 333 }, { "epoch": 0.21814025634745693, "grad_norm": 49.216595878368345, "learning_rate": 1.4376414471426472e-07, "logits/chosen": -1.7066453695297241, "logits/rejected": -1.7159579992294312, "logps/chosen": -645.9775390625, "logps/rejected": -741.1705932617188, "loss": 0.6203, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0998951196670532, "rewards/margins": 0.478588730096817, "rewards/rejected": -1.5784838199615479, "step": 334 }, { "epoch": 0.21879337088741938, "grad_norm": 16.168657963184085, "learning_rate": 1.436956550311739e-07, "logits/chosen": -1.7134239673614502, "logits/rejected": -1.7491358518600464, "logps/chosen": -618.333251953125, "logps/rejected": -704.8095092773438, "loss": 0.613, "rewards/accuracies": 0.71875, "rewards/chosen": -1.038926124572754, "rewards/margins": 0.38716238737106323, "rewards/rejected": -1.426088571548462, "step": 335 }, { "epoch": 0.21944648542738182, "grad_norm": 15.399404851620021, "learning_rate": 1.4362680777818932e-07, "logits/chosen": -1.6947367191314697, "logits/rejected": -1.7013921737670898, "logps/chosen": -616.1701049804688, "logps/rejected": -646.965576171875, "loss": 0.6476, "rewards/accuracies": 0.625, "rewards/chosen": -1.009097695350647, "rewards/margins": 0.12043975293636322, "rewards/rejected": -1.1295373439788818, "step": 336 }, { "epoch": 0.22009959996734427, "grad_norm": 12.377040896457764, "learning_rate": 1.435576033136699e-07, "logits/chosen": -1.72658109664917, "logits/rejected": -1.7111766338348389, "logps/chosen": -691.18701171875, "logps/rejected": -746.902587890625, "loss": 0.6108, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1553032398223877, "rewards/margins": 0.3523827791213989, "rewards/rejected": -1.507685899734497, "step": 337 }, { "epoch": 0.2207527145073067, "grad_norm": 21.708794576351895, "learning_rate": 1.4348804199783397e-07, "logits/chosen": -1.7045814990997314, "logits/rejected": -1.733116865158081, "logps/chosen": -591.8544921875, "logps/rejected": -598.6890869140625, "loss": 0.6173, "rewards/accuracies": 0.75, "rewards/chosen": -0.881680428981781, "rewards/margins": 0.18818916380405426, "rewards/rejected": -1.0698695182800293, "step": 338 }, { "epoch": 0.22140582904726916, "grad_norm": 12.368369376917613, "learning_rate": 1.4341812419275735e-07, "logits/chosen": -1.7070914506912231, "logits/rejected": -1.7090568542480469, "logps/chosen": -598.9154663085938, "logps/rejected": -546.4701538085938, "loss": 0.6502, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9902876019477844, "rewards/margins": 0.08627556264400482, "rewards/rejected": -1.0765631198883057, "step": 339 }, { "epoch": 0.2220589435872316, "grad_norm": 7.332074698031059, "learning_rate": 1.4334785026237135e-07, "logits/chosen": -1.757274866104126, "logits/rejected": -1.7839263677597046, "logps/chosen": -611.2498779296875, "logps/rejected": -621.130859375, "loss": 0.5915, "rewards/accuracies": 0.75, "rewards/chosen": -1.0745751857757568, "rewards/margins": 0.25732651352882385, "rewards/rejected": -1.3319017887115479, "step": 340 }, { "epoch": 0.22271205812719405, "grad_norm": 16.45318119474388, "learning_rate": 1.43277220572461e-07, "logits/chosen": -1.5772120952606201, "logits/rejected": -1.5908713340759277, "logps/chosen": -537.1170043945312, "logps/rejected": -550.2281494140625, "loss": 0.6306, "rewards/accuracies": 0.75, "rewards/chosen": -0.9023160338401794, "rewards/margins": 0.1352323591709137, "rewards/rejected": -1.037548303604126, "step": 341 }, { "epoch": 0.2233651726671565, "grad_norm": 37.88300298632188, "learning_rate": 1.4320623549066308e-07, "logits/chosen": -1.7355406284332275, "logits/rejected": -1.7195450067520142, "logps/chosen": -686.9090576171875, "logps/rejected": -739.3055419921875, "loss": 0.579, "rewards/accuracies": 0.84375, "rewards/chosen": -1.1977063417434692, "rewards/margins": 0.5113601088523865, "rewards/rejected": -1.7090665102005005, "step": 342 }, { "epoch": 0.22401828720711894, "grad_norm": 22.776462467379197, "learning_rate": 1.4313489538646427e-07, "logits/chosen": -1.7235167026519775, "logits/rejected": -1.7320938110351562, "logps/chosen": -663.5242309570312, "logps/rejected": -628.5629272460938, "loss": 0.6169, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0688875913619995, "rewards/margins": 0.16213612258434296, "rewards/rejected": -1.2310236692428589, "step": 343 }, { "epoch": 0.2246714017470814, "grad_norm": 8.488895496301154, "learning_rate": 1.4306320063119916e-07, "logits/chosen": -1.6490345001220703, "logits/rejected": -1.6467983722686768, "logps/chosen": -686.3357543945312, "logps/rejected": -679.5280151367188, "loss": 0.6362, "rewards/accuracies": 0.53125, "rewards/chosen": -1.28061044216156, "rewards/margins": 0.09073764830827713, "rewards/rejected": -1.371348261833191, "step": 344 }, { "epoch": 0.22532451628704384, "grad_norm": 22.168028050033644, "learning_rate": 1.4299115159804836e-07, "logits/chosen": -1.7651727199554443, "logits/rejected": -1.756993293762207, "logps/chosen": -707.9656372070312, "logps/rejected": -787.037109375, "loss": 0.6355, "rewards/accuracies": 0.625, "rewards/chosen": -1.312680959701538, "rewards/margins": 0.39120543003082275, "rewards/rejected": -1.7038863897323608, "step": 345 }, { "epoch": 0.22597763082700628, "grad_norm": 7.58865714123304, "learning_rate": 1.4291874866203655e-07, "logits/chosen": -1.659616231918335, "logits/rejected": -1.637772798538208, "logps/chosen": -658.1541137695312, "logps/rejected": -791.5831909179688, "loss": 0.5703, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1292518377304077, "rewards/margins": 0.6298790574073792, "rewards/rejected": -1.759130835533142, "step": 346 }, { "epoch": 0.22663074536696873, "grad_norm": 26.071692354910912, "learning_rate": 1.428459922000305e-07, "logits/chosen": -1.5885182619094849, "logits/rejected": -1.6136213541030884, "logps/chosen": -555.8087158203125, "logps/rejected": -621.08349609375, "loss": 0.5983, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9678431749343872, "rewards/margins": 0.3450620472431183, "rewards/rejected": -1.312905192375183, "step": 347 }, { "epoch": 0.22728385990693117, "grad_norm": 7.8840447097778545, "learning_rate": 1.4277288259073708e-07, "logits/chosen": -1.568705439567566, "logits/rejected": -1.5865546464920044, "logps/chosen": -589.7760009765625, "logps/rejected": -573.239990234375, "loss": 0.6277, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1354511976242065, "rewards/margins": 0.18996107578277588, "rewards/rejected": -1.3254122734069824, "step": 348 }, { "epoch": 0.22793697444689362, "grad_norm": 10.92884715777762, "learning_rate": 1.4269942021470148e-07, "logits/chosen": -1.6641845703125, "logits/rejected": -1.6769914627075195, "logps/chosen": -672.136474609375, "logps/rejected": -664.6466674804688, "loss": 0.6121, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0376440286636353, "rewards/margins": 0.1383248120546341, "rewards/rejected": -1.1759687662124634, "step": 349 }, { "epoch": 0.22859008898685607, "grad_norm": 31.26872455787033, "learning_rate": 1.4262560545430495e-07, "logits/chosen": -1.5913841724395752, "logits/rejected": -1.6360561847686768, "logps/chosen": -617.2501831054688, "logps/rejected": -550.8568115234375, "loss": 0.6164, "rewards/accuracies": 0.84375, "rewards/chosen": -0.979924738407135, "rewards/margins": 0.24141384661197662, "rewards/rejected": -1.2213386297225952, "step": 350 }, { "epoch": 0.2292432035268185, "grad_norm": 8.368119693346522, "learning_rate": 1.4255143869376301e-07, "logits/chosen": -1.7337816953659058, "logits/rejected": -1.6971759796142578, "logps/chosen": -652.0234375, "logps/rejected": -777.8805541992188, "loss": 0.5913, "rewards/accuracies": 0.875, "rewards/chosen": -1.1779980659484863, "rewards/margins": 0.6400733590126038, "rewards/rejected": -1.8180711269378662, "step": 351 }, { "epoch": 0.22989631806678096, "grad_norm": 7.04954036164927, "learning_rate": 1.424769203191234e-07, "logits/chosen": -1.706908941268921, "logits/rejected": -1.7061164379119873, "logps/chosen": -648.4498291015625, "logps/rejected": -606.88623046875, "loss": 0.6278, "rewards/accuracies": 0.625, "rewards/chosen": -1.2371093034744263, "rewards/margins": 0.11428666114807129, "rewards/rejected": -1.3513959646224976, "step": 352 }, { "epoch": 0.2305494326067434, "grad_norm": 24.647751119336064, "learning_rate": 1.42402050718264e-07, "logits/chosen": -1.6741199493408203, "logits/rejected": -1.743821620941162, "logps/chosen": -677.9364624023438, "logps/rejected": -710.4935302734375, "loss": 0.5985, "rewards/accuracies": 0.625, "rewards/chosen": -1.0869948863983154, "rewards/margins": 0.1914815604686737, "rewards/rejected": -1.278476357460022, "step": 353 }, { "epoch": 0.23120254714670585, "grad_norm": 32.99725275222979, "learning_rate": 1.4232683028089092e-07, "logits/chosen": -1.6567903757095337, "logits/rejected": -1.6555206775665283, "logps/chosen": -587.80029296875, "logps/rejected": -598.2508544921875, "loss": 0.6307, "rewards/accuracies": 0.8125, "rewards/chosen": -1.024768352508545, "rewards/margins": 0.3201058804988861, "rewards/rejected": -1.3448742628097534, "step": 354 }, { "epoch": 0.2318556616866683, "grad_norm": 15.301036178500334, "learning_rate": 1.4225125939853637e-07, "logits/chosen": -1.5989353656768799, "logits/rejected": -1.6277306079864502, "logps/chosen": -566.171875, "logps/rejected": -590.93505859375, "loss": 0.6273, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9104868173599243, "rewards/margins": 0.22141122817993164, "rewards/rejected": -1.1318979263305664, "step": 355 }, { "epoch": 0.23250877622663074, "grad_norm": 8.150144656099924, "learning_rate": 1.4217533846455675e-07, "logits/chosen": -1.6726055145263672, "logits/rejected": -1.6693233251571655, "logps/chosen": -643.8858642578125, "logps/rejected": -628.0254516601562, "loss": 0.6466, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2304469347000122, "rewards/margins": 0.19707579910755157, "rewards/rejected": -1.427522897720337, "step": 356 }, { "epoch": 0.2331618907665932, "grad_norm": 33.25010241380852, "learning_rate": 1.4209906787413047e-07, "logits/chosen": -1.6271083354949951, "logits/rejected": -1.6530542373657227, "logps/chosen": -599.4658813476562, "logps/rejected": -608.0745849609375, "loss": 0.5938, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1141527891159058, "rewards/margins": 0.19912773370742798, "rewards/rejected": -1.3132805824279785, "step": 357 }, { "epoch": 0.23381500530655563, "grad_norm": 14.540966438844277, "learning_rate": 1.420224480242559e-07, "logits/chosen": -1.5631732940673828, "logits/rejected": -1.582556128501892, "logps/chosen": -584.55712890625, "logps/rejected": -616.7429809570312, "loss": 0.6118, "rewards/accuracies": 0.75, "rewards/chosen": -1.0560500621795654, "rewards/margins": 0.2983832359313965, "rewards/rejected": -1.354433298110962, "step": 358 }, { "epoch": 0.23446811984651808, "grad_norm": 11.291749985668568, "learning_rate": 1.4194547931374948e-07, "logits/chosen": -1.6868391036987305, "logits/rejected": -1.7105621099472046, "logps/chosen": -600.7924194335938, "logps/rejected": -677.1724853515625, "loss": 0.6779, "rewards/accuracies": 0.65625, "rewards/chosen": -1.23578941822052, "rewards/margins": 0.33331653475761414, "rewards/rejected": -1.569106101989746, "step": 359 }, { "epoch": 0.23512123438648053, "grad_norm": 22.901901301811037, "learning_rate": 1.418681621432434e-07, "logits/chosen": -1.574560523033142, "logits/rejected": -1.599095106124878, "logps/chosen": -599.9710693359375, "logps/rejected": -652.0952758789062, "loss": 0.6254, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1917017698287964, "rewards/margins": 0.3461110591888428, "rewards/rejected": -1.5378127098083496, "step": 360 }, { "epoch": 0.23577434892644297, "grad_norm": 25.59056348020549, "learning_rate": 1.417904969151837e-07, "logits/chosen": -1.7425224781036377, "logits/rejected": -1.7415771484375, "logps/chosen": -704.6986694335938, "logps/rejected": -693.380126953125, "loss": 0.6119, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2282774448394775, "rewards/margins": 0.26026061177253723, "rewards/rejected": -1.488538146018982, "step": 361 }, { "epoch": 0.23642746346640542, "grad_norm": 8.463533236278087, "learning_rate": 1.4171248403382806e-07, "logits/chosen": -1.6078510284423828, "logits/rejected": -1.700717568397522, "logps/chosen": -629.0587768554688, "logps/rejected": -598.7468872070312, "loss": 0.6081, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0717506408691406, "rewards/margins": 0.29170456528663635, "rewards/rejected": -1.3634551763534546, "step": 362 }, { "epoch": 0.23708057800636786, "grad_norm": 27.979305395002925, "learning_rate": 1.4163412390524378e-07, "logits/chosen": -1.6555612087249756, "logits/rejected": -1.701103925704956, "logps/chosen": -515.6534423828125, "logps/rejected": -615.6427001953125, "loss": 0.5931, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1157307624816895, "rewards/margins": 0.35381343960762024, "rewards/rejected": -1.4695440530776978, "step": 363 }, { "epoch": 0.2377336925463303, "grad_norm": 29.822643828567546, "learning_rate": 1.4155541693730556e-07, "logits/chosen": -1.6813297271728516, "logits/rejected": -1.6906770467758179, "logps/chosen": -571.6758422851562, "logps/rejected": -599.9418334960938, "loss": 0.5808, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0341238975524902, "rewards/margins": 0.39114344120025635, "rewards/rejected": -1.425267219543457, "step": 364 }, { "epoch": 0.23838680708629276, "grad_norm": 8.151382967050962, "learning_rate": 1.414763635396935e-07, "logits/chosen": -1.6014394760131836, "logits/rejected": -1.5656381845474243, "logps/chosen": -540.3904418945312, "logps/rejected": -593.1724853515625, "loss": 0.6379, "rewards/accuracies": 0.75, "rewards/chosen": -1.1642670631408691, "rewards/margins": 0.3421732187271118, "rewards/rejected": -1.506440281867981, "step": 365 }, { "epoch": 0.2390399216262552, "grad_norm": 57.74463360143876, "learning_rate": 1.4139696412389096e-07, "logits/chosen": -1.7312068939208984, "logits/rejected": -1.7102075815200806, "logps/chosen": -643.19091796875, "logps/rejected": -672.47509765625, "loss": 0.5735, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1425721645355225, "rewards/margins": 0.4072319269180298, "rewards/rejected": -1.5498042106628418, "step": 366 }, { "epoch": 0.23969303616621765, "grad_norm": 12.197736908947663, "learning_rate": 1.4131721910318227e-07, "logits/chosen": -1.8108876943588257, "logits/rejected": -1.7958606481552124, "logps/chosen": -733.1993408203125, "logps/rejected": -763.6776733398438, "loss": 0.5912, "rewards/accuracies": 0.6875, "rewards/chosen": -1.482933759689331, "rewards/margins": 0.2555669844150543, "rewards/rejected": -1.7385008335113525, "step": 367 }, { "epoch": 0.2403461507061801, "grad_norm": 19.29177394775571, "learning_rate": 1.4123712889265072e-07, "logits/chosen": -1.6916940212249756, "logits/rejected": -1.7217150926589966, "logps/chosen": -671.9423217773438, "logps/rejected": -675.45361328125, "loss": 0.5719, "rewards/accuracies": 0.625, "rewards/chosen": -1.3399765491485596, "rewards/margins": 0.20047269761562347, "rewards/rejected": -1.5404491424560547, "step": 368 }, { "epoch": 0.24099926524614254, "grad_norm": 25.260138658321655, "learning_rate": 1.4115669390917636e-07, "logits/chosen": -1.6250600814819336, "logits/rejected": -1.6512022018432617, "logps/chosen": -625.4688720703125, "logps/rejected": -614.0571899414062, "loss": 0.5783, "rewards/accuracies": 0.75, "rewards/chosen": -1.2662508487701416, "rewards/margins": 0.20612508058547974, "rewards/rejected": -1.4723761081695557, "step": 369 }, { "epoch": 0.241652379786105, "grad_norm": 9.803497546984937, "learning_rate": 1.4107591457143383e-07, "logits/chosen": -1.5895332098007202, "logits/rejected": -1.602332592010498, "logps/chosen": -675.6551513671875, "logps/rejected": -709.5709838867188, "loss": 0.5924, "rewards/accuracies": 0.75, "rewards/chosen": -1.2006454467773438, "rewards/margins": 0.3013772666454315, "rewards/rejected": -1.5020227432250977, "step": 370 }, { "epoch": 0.24230549432606743, "grad_norm": 8.545956638134795, "learning_rate": 1.409947912998902e-07, "logits/chosen": -1.6708649396896362, "logits/rejected": -1.713352918624878, "logps/chosen": -680.72607421875, "logps/rejected": -674.447265625, "loss": 0.6127, "rewards/accuracies": 0.75, "rewards/chosen": -1.2759678363800049, "rewards/margins": 0.18677686154842377, "rewards/rejected": -1.4627448320388794, "step": 371 }, { "epoch": 0.24295860886602988, "grad_norm": 19.844846501689318, "learning_rate": 1.4091332451680267e-07, "logits/chosen": -1.7042357921600342, "logits/rejected": -1.7265393733978271, "logps/chosen": -675.3814086914062, "logps/rejected": -644.4542846679688, "loss": 0.6427, "rewards/accuracies": 0.53125, "rewards/chosen": -1.4431321620941162, "rewards/margins": 0.07462039589881897, "rewards/rejected": -1.5177525281906128, "step": 372 }, { "epoch": 0.24361172340599233, "grad_norm": 7.920082953904519, "learning_rate": 1.408315146462166e-07, "logits/chosen": -1.721234679222107, "logits/rejected": -1.701398253440857, "logps/chosen": -589.4611206054688, "logps/rejected": -710.4976806640625, "loss": 0.581, "rewards/accuracies": 0.75, "rewards/chosen": -1.1915465593338013, "rewards/margins": 0.42795515060424805, "rewards/rejected": -1.6195017099380493, "step": 373 }, { "epoch": 0.24426483794595477, "grad_norm": 12.858984927661352, "learning_rate": 1.407493621139631e-07, "logits/chosen": -1.713492751121521, "logits/rejected": -1.7290925979614258, "logps/chosen": -641.5909423828125, "logps/rejected": -650.439697265625, "loss": 0.5713, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2154250144958496, "rewards/margins": 0.2901778817176819, "rewards/rejected": -1.5056030750274658, "step": 374 }, { "epoch": 0.24491795248591722, "grad_norm": 15.690657976850302, "learning_rate": 1.406668673476568e-07, "logits/chosen": -1.6640945672988892, "logits/rejected": -1.6718759536743164, "logps/chosen": -649.737060546875, "logps/rejected": -686.3532104492188, "loss": 0.568, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3824304342269897, "rewards/margins": 0.3402037024497986, "rewards/rejected": -1.722634196281433, "step": 375 }, { "epoch": 0.24557106702587966, "grad_norm": 25.733728918193314, "learning_rate": 1.4058403077669386e-07, "logits/chosen": -1.671245813369751, "logits/rejected": -1.6539918184280396, "logps/chosen": -620.85791015625, "logps/rejected": -613.2889404296875, "loss": 0.5943, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2888745069503784, "rewards/margins": 0.22225916385650635, "rewards/rejected": -1.5111336708068848, "step": 376 }, { "epoch": 0.2462241815658421, "grad_norm": 9.763117553021695, "learning_rate": 1.4050085283224946e-07, "logits/chosen": -1.5626500844955444, "logits/rejected": -1.57936429977417, "logps/chosen": -653.4628295898438, "logps/rejected": -741.0684814453125, "loss": 0.6487, "rewards/accuracies": 0.75, "rewards/chosen": -1.4431822299957275, "rewards/margins": 0.3858519494533539, "rewards/rejected": -1.8290340900421143, "step": 377 }, { "epoch": 0.24687729610580456, "grad_norm": 14.715453079588329, "learning_rate": 1.4041733394727567e-07, "logits/chosen": -1.6787821054458618, "logits/rejected": -1.675290584564209, "logps/chosen": -656.50830078125, "logps/rejected": -641.9553833007812, "loss": 0.5851, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2514023780822754, "rewards/margins": 0.16022822260856628, "rewards/rejected": -1.411630630493164, "step": 378 }, { "epoch": 0.247530410645767, "grad_norm": 17.584914736775964, "learning_rate": 1.403334745564993e-07, "logits/chosen": -1.6352498531341553, "logits/rejected": -1.6839510202407837, "logps/chosen": -743.545654296875, "logps/rejected": -792.6776123046875, "loss": 0.587, "rewards/accuracies": 0.65625, "rewards/chosen": -1.677843689918518, "rewards/margins": 0.3219238221645355, "rewards/rejected": -1.999767541885376, "step": 379 }, { "epoch": 0.24818352518572945, "grad_norm": 35.19811670438595, "learning_rate": 1.4024927509641947e-07, "logits/chosen": -1.7035257816314697, "logits/rejected": -1.6846370697021484, "logps/chosen": -731.2345581054688, "logps/rejected": -747.091064453125, "loss": 0.5723, "rewards/accuracies": 0.78125, "rewards/chosen": -1.520798921585083, "rewards/margins": 0.359272301197052, "rewards/rejected": -1.8800714015960693, "step": 380 }, { "epoch": 0.2488366397256919, "grad_norm": 51.54152025955344, "learning_rate": 1.401647360053054e-07, "logits/chosen": -1.6718703508377075, "logits/rejected": -1.7071422338485718, "logps/chosen": -684.9619140625, "logps/rejected": -685.8736572265625, "loss": 0.6127, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1913833618164062, "rewards/margins": 0.4190147817134857, "rewards/rejected": -1.6103980541229248, "step": 381 }, { "epoch": 0.24948975426565434, "grad_norm": 29.30838787108767, "learning_rate": 1.4007985772319414e-07, "logits/chosen": -1.58506441116333, "logits/rejected": -1.576029896736145, "logps/chosen": -592.5045166015625, "logps/rejected": -606.4868774414062, "loss": 0.57, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3507953882217407, "rewards/margins": 0.4049099385738373, "rewards/rejected": -1.7557053565979004, "step": 382 }, { "epoch": 0.25014286880561676, "grad_norm": 30.296371441246823, "learning_rate": 1.3999464069188827e-07, "logits/chosen": -1.511520504951477, "logits/rejected": -1.5346118211746216, "logps/chosen": -610.6934814453125, "logps/rejected": -583.274658203125, "loss": 0.5848, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2899129390716553, "rewards/margins": 0.24026429653167725, "rewards/rejected": -1.5301774740219116, "step": 383 }, { "epoch": 0.25079598334557923, "grad_norm": 22.072613902976816, "learning_rate": 1.3990908535495366e-07, "logits/chosen": -1.6665383577346802, "logits/rejected": -1.7260501384735107, "logps/chosen": -649.5576782226562, "logps/rejected": -632.3207397460938, "loss": 0.5902, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3579076528549194, "rewards/margins": 0.13580122590065002, "rewards/rejected": -1.493708848953247, "step": 384 }, { "epoch": 0.25144909788554165, "grad_norm": 11.159048199299994, "learning_rate": 1.39823192157717e-07, "logits/chosen": -1.7021327018737793, "logits/rejected": -1.7216562032699585, "logps/chosen": -702.6673583984375, "logps/rejected": -677.152587890625, "loss": 0.6105, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3912357091903687, "rewards/margins": 0.28019946813583374, "rewards/rejected": -1.6714351177215576, "step": 385 }, { "epoch": 0.2521022124255041, "grad_norm": 12.67256793359313, "learning_rate": 1.3973696154726372e-07, "logits/chosen": -1.652302622795105, "logits/rejected": -1.6712439060211182, "logps/chosen": -615.6753540039062, "logps/rejected": -612.9768676757812, "loss": 0.535, "rewards/accuracies": 0.78125, "rewards/chosen": -1.381615400314331, "rewards/margins": 0.35143721103668213, "rewards/rejected": -1.7330526113510132, "step": 386 }, { "epoch": 0.25275532696546654, "grad_norm": 9.156751796221311, "learning_rate": 1.396503939724354e-07, "logits/chosen": -1.687638521194458, "logits/rejected": -1.6907660961151123, "logps/chosen": -679.770263671875, "logps/rejected": -704.907470703125, "loss": 0.581, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3277437686920166, "rewards/margins": 0.3244020938873291, "rewards/rejected": -1.6521457433700562, "step": 387 }, { "epoch": 0.253408441505429, "grad_norm": 26.505065205334525, "learning_rate": 1.3956348988382757e-07, "logits/chosen": -1.621063470840454, "logits/rejected": -1.6041771173477173, "logps/chosen": -596.8775634765625, "logps/rejected": -675.8204345703125, "loss": 0.5536, "rewards/accuracies": 0.75, "rewards/chosen": -1.4424089193344116, "rewards/margins": 0.5358285903930664, "rewards/rejected": -1.9782376289367676, "step": 388 }, { "epoch": 0.25406155604539143, "grad_norm": 7.262480595814242, "learning_rate": 1.394762497337875e-07, "logits/chosen": -1.6179628372192383, "logits/rejected": -1.6096928119659424, "logps/chosen": -653.6219482421875, "logps/rejected": -699.051513671875, "loss": 0.6154, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4057137966156006, "rewards/margins": 0.2138241082429886, "rewards/rejected": -1.6195377111434937, "step": 389 }, { "epoch": 0.2547146705853539, "grad_norm": 17.339973588325055, "learning_rate": 1.393886739764116e-07, "logits/chosen": -1.6001747846603394, "logits/rejected": -1.6313351392745972, "logps/chosen": -626.1659545898438, "logps/rejected": -644.4179077148438, "loss": 0.5469, "rewards/accuracies": 0.84375, "rewards/chosen": -1.3461098670959473, "rewards/margins": 0.41640228033065796, "rewards/rejected": -1.76251220703125, "step": 390 }, { "epoch": 0.2553677851253163, "grad_norm": 14.339393954095954, "learning_rate": 1.3930076306754315e-07, "logits/chosen": -1.5895658731460571, "logits/rejected": -1.6347410678863525, "logps/chosen": -611.0591430664062, "logps/rejected": -595.1077880859375, "loss": 0.5883, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2730367183685303, "rewards/margins": 0.2380126416683197, "rewards/rejected": -1.5110492706298828, "step": 391 }, { "epoch": 0.2560208996652788, "grad_norm": 10.18368249263564, "learning_rate": 1.3921251746476998e-07, "logits/chosen": -1.7359168529510498, "logits/rejected": -1.7397726774215698, "logps/chosen": -588.6852416992188, "logps/rejected": -697.5007934570312, "loss": 0.5732, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3043936491012573, "rewards/margins": 0.6203251481056213, "rewards/rejected": -1.9247188568115234, "step": 392 }, { "epoch": 0.2566740142052412, "grad_norm": 18.38247483748203, "learning_rate": 1.39123937627422e-07, "logits/chosen": -1.6276944875717163, "logits/rejected": -1.5889580249786377, "logps/chosen": -620.385498046875, "logps/rejected": -635.3296508789062, "loss": 0.6444, "rewards/accuracies": 0.5, "rewards/chosen": -1.3040493726730347, "rewards/margins": 0.10177471488714218, "rewards/rejected": -1.4058241844177246, "step": 393 }, { "epoch": 0.2573271287452037, "grad_norm": 36.73565911642949, "learning_rate": 1.390350240165689e-07, "logits/chosen": -1.6098779439926147, "logits/rejected": -1.7416176795959473, "logps/chosen": -642.3756713867188, "logps/rejected": -643.9348754882812, "loss": 0.5877, "rewards/accuracies": 0.6875, "rewards/chosen": -1.407647728919983, "rewards/margins": 0.2774192690849304, "rewards/rejected": -1.685067057609558, "step": 394 }, { "epoch": 0.2579802432851661, "grad_norm": 16.023490486119584, "learning_rate": 1.3894577709501766e-07, "logits/chosen": -1.692483901977539, "logits/rejected": -1.7297693490982056, "logps/chosen": -797.02880859375, "logps/rejected": -657.3602294921875, "loss": 0.7438, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8514511585235596, "rewards/margins": -0.006901424378156662, "rewards/rejected": -1.8445496559143066, "step": 395 }, { "epoch": 0.2586333578251286, "grad_norm": 16.55592977846572, "learning_rate": 1.3885619732731024e-07, "logits/chosen": -1.6773359775543213, "logits/rejected": -1.7305378913879395, "logps/chosen": -627.8666381835938, "logps/rejected": -695.2501220703125, "loss": 0.5468, "rewards/accuracies": 0.75, "rewards/chosen": -1.4438410997390747, "rewards/margins": 0.509320855140686, "rewards/rejected": -1.9531618356704712, "step": 396 }, { "epoch": 0.259286472365091, "grad_norm": 31.880923432118284, "learning_rate": 1.3876628517972106e-07, "logits/chosen": -1.5321552753448486, "logits/rejected": -1.52942955493927, "logps/chosen": -627.0950317382812, "logps/rejected": -674.76025390625, "loss": 0.5836, "rewards/accuracies": 0.5625, "rewards/chosen": -1.53360915184021, "rewards/margins": 0.2759949266910553, "rewards/rejected": -1.8096040487289429, "step": 397 }, { "epoch": 0.2599395869050535, "grad_norm": 13.538949029727982, "learning_rate": 1.3867604112025465e-07, "logits/chosen": -1.6481291055679321, "logits/rejected": -1.6133708953857422, "logps/chosen": -633.3253173828125, "logps/rejected": -659.7344970703125, "loss": 0.6281, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4847848415374756, "rewards/margins": 0.1952800452709198, "rewards/rejected": -1.6800646781921387, "step": 398 }, { "epoch": 0.2605927014450159, "grad_norm": 41.823948195416314, "learning_rate": 1.3858546561864315e-07, "logits/chosen": -1.6834887266159058, "logits/rejected": -1.668868064880371, "logps/chosen": -664.4490966796875, "logps/rejected": -718.4891357421875, "loss": 0.5855, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5809217691421509, "rewards/margins": 0.4111039936542511, "rewards/rejected": -1.9920259714126587, "step": 399 }, { "epoch": 0.26124581598497837, "grad_norm": 7.562874624386567, "learning_rate": 1.3849455914634399e-07, "logits/chosen": -1.6510210037231445, "logits/rejected": -1.6854655742645264, "logps/chosen": -694.1316528320312, "logps/rejected": -653.6522216796875, "loss": 0.5831, "rewards/accuracies": 0.625, "rewards/chosen": -1.8670495748519897, "rewards/margins": 0.05141986906528473, "rewards/rejected": -1.9184695482254028, "step": 400 }, { "epoch": 0.26124581598497837, "eval_logits/chosen": -1.6877070665359497, "eval_logits/rejected": -1.6963342428207397, "eval_logps/chosen": -661.8920288085938, "eval_logps/rejected": -689.978759765625, "eval_loss": 0.5931771397590637, "eval_rewards/accuracies": 0.7070000171661377, "eval_rewards/chosen": -1.515466332435608, "eval_rewards/margins": 0.36192643642425537, "eval_rewards/rejected": -1.8773927688598633, "eval_runtime": 300.4429, "eval_samples_per_second": 13.314, "eval_steps_per_second": 0.832, "step": 400 }, { "epoch": 0.2618989305249408, "grad_norm": 9.182566410300494, "learning_rate": 1.3840332217653723e-07, "logits/chosen": -1.673874020576477, "logits/rejected": -1.680929183959961, "logps/chosen": -692.4191284179688, "logps/rejected": -704.0576171875, "loss": 0.6425, "rewards/accuracies": 0.625, "rewards/chosen": -1.688931941986084, "rewards/margins": 0.1834324449300766, "rewards/rejected": -1.8723644018173218, "step": 401 }, { "epoch": 0.26255204506490326, "grad_norm": 8.101445637974606, "learning_rate": 1.3831175518412327e-07, "logits/chosen": -1.6620182991027832, "logits/rejected": -1.6938483715057373, "logps/chosen": -745.3438720703125, "logps/rejected": -760.4389038085938, "loss": 0.6006, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6187502145767212, "rewards/margins": 0.23492467403411865, "rewards/rejected": -1.8536747694015503, "step": 402 }, { "epoch": 0.2632051596048657, "grad_norm": 10.95532049184201, "learning_rate": 1.3821985864572028e-07, "logits/chosen": -1.6001207828521729, "logits/rejected": -1.565683126449585, "logps/chosen": -678.5340576171875, "logps/rejected": -816.8855590820312, "loss": 0.6223, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7041336297988892, "rewards/margins": 0.5798905491828918, "rewards/rejected": -2.284024238586426, "step": 403 }, { "epoch": 0.26385827414482815, "grad_norm": 19.544045940040125, "learning_rate": 1.3812763303966186e-07, "logits/chosen": -1.6810550689697266, "logits/rejected": -1.6656105518341064, "logps/chosen": -635.9982299804688, "logps/rejected": -654.9970703125, "loss": 0.5854, "rewards/accuracies": 0.75, "rewards/chosen": -1.5096485614776611, "rewards/margins": 0.24476324021816254, "rewards/rejected": -1.7544115781784058, "step": 404 }, { "epoch": 0.26451138868479057, "grad_norm": 8.792026250775443, "learning_rate": 1.3803507884599438e-07, "logits/chosen": -1.693314552307129, "logits/rejected": -1.678006887435913, "logps/chosen": -794.29443359375, "logps/rejected": -808.3348388671875, "loss": 0.5861, "rewards/accuracies": 0.59375, "rewards/chosen": -1.7267074584960938, "rewards/margins": 0.24482649564743042, "rewards/rejected": -1.9715338945388794, "step": 405 }, { "epoch": 0.26516450322475305, "grad_norm": 39.2544432402473, "learning_rate": 1.379421965464745e-07, "logits/chosen": -1.672491431236267, "logits/rejected": -1.6746456623077393, "logps/chosen": -739.5106811523438, "logps/rejected": -728.4205322265625, "loss": 0.611, "rewards/accuracies": 0.625, "rewards/chosen": -1.7484608888626099, "rewards/margins": 0.24220341444015503, "rewards/rejected": -1.9906642436981201, "step": 406 }, { "epoch": 0.26581761776471546, "grad_norm": 61.06487102613164, "learning_rate": 1.378489866245668e-07, "logits/chosen": -1.6239569187164307, "logits/rejected": -1.6054996252059937, "logps/chosen": -597.54833984375, "logps/rejected": -692.744384765625, "loss": 0.5651, "rewards/accuracies": 0.75, "rewards/chosen": -1.441511631011963, "rewards/margins": 0.5937218070030212, "rewards/rejected": -2.035233497619629, "step": 407 }, { "epoch": 0.26647073230467794, "grad_norm": 9.653358684629563, "learning_rate": 1.3775544956544115e-07, "logits/chosen": -1.6666955947875977, "logits/rejected": -1.6608633995056152, "logps/chosen": -721.04541015625, "logps/rejected": -931.44482421875, "loss": 0.519, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6330653429031372, "rewards/margins": 1.069937825202942, "rewards/rejected": -2.7030029296875, "step": 408 }, { "epoch": 0.26712384684464036, "grad_norm": 37.20492293139169, "learning_rate": 1.3766158585597024e-07, "logits/chosen": -1.747741460800171, "logits/rejected": -1.681113600730896, "logps/chosen": -684.744384765625, "logps/rejected": -716.8374633789062, "loss": 0.5827, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5082005262374878, "rewards/margins": 0.3761386573314667, "rewards/rejected": -1.8843392133712769, "step": 409 }, { "epoch": 0.26777696138460283, "grad_norm": 14.02260372554619, "learning_rate": 1.3756739598472692e-07, "logits/chosen": -1.6259886026382446, "logits/rejected": -1.6599675416946411, "logps/chosen": -740.4105834960938, "logps/rejected": -777.7703247070312, "loss": 0.5891, "rewards/accuracies": 0.75, "rewards/chosen": -1.6748157739639282, "rewards/margins": 0.48535382747650146, "rewards/rejected": -2.1601696014404297, "step": 410 }, { "epoch": 0.26843007592456525, "grad_norm": 11.140097310760751, "learning_rate": 1.3747288044198186e-07, "logits/chosen": -1.692323088645935, "logits/rejected": -1.705490231513977, "logps/chosen": -676.2076416015625, "logps/rejected": -718.6063232421875, "loss": 0.6076, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5469157695770264, "rewards/margins": 0.4593140482902527, "rewards/rejected": -2.006229877471924, "step": 411 }, { "epoch": 0.2690831904645277, "grad_norm": 21.86521082932002, "learning_rate": 1.373780397197009e-07, "logits/chosen": -1.5999939441680908, "logits/rejected": -1.638975739479065, "logps/chosen": -697.4719848632812, "logps/rejected": -822.9923095703125, "loss": 0.5562, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7497361898422241, "rewards/margins": 0.5286187529563904, "rewards/rejected": -2.278355121612549, "step": 412 }, { "epoch": 0.26973630500449014, "grad_norm": 85.11109239770148, "learning_rate": 1.3728287431154236e-07, "logits/chosen": -1.65316903591156, "logits/rejected": -1.6552778482437134, "logps/chosen": -639.5845947265625, "logps/rejected": -646.5022583007812, "loss": 0.6713, "rewards/accuracies": 0.53125, "rewards/chosen": -1.5560939311981201, "rewards/margins": 0.2231585830450058, "rewards/rejected": -1.779252529144287, "step": 413 }, { "epoch": 0.2703894195444526, "grad_norm": 27.726328984830257, "learning_rate": 1.371873847128547e-07, "logits/chosen": -1.7042299509048462, "logits/rejected": -1.664905309677124, "logps/chosen": -594.562744140625, "logps/rejected": -746.6199951171875, "loss": 0.5604, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5040781497955322, "rewards/margins": 0.6467331647872925, "rewards/rejected": -2.1508116722106934, "step": 414 }, { "epoch": 0.27104253408441503, "grad_norm": 7.783183095188105, "learning_rate": 1.3709157142067383e-07, "logits/chosen": -1.530868649482727, "logits/rejected": -1.5461376905441284, "logps/chosen": -656.3353881835938, "logps/rejected": -642.3773193359375, "loss": 0.6118, "rewards/accuracies": 0.625, "rewards/chosen": -1.472480297088623, "rewards/margins": 0.1450308859348297, "rewards/rejected": -1.61751127243042, "step": 415 }, { "epoch": 0.2716956486243775, "grad_norm": 19.144666486903013, "learning_rate": 1.3699543493372047e-07, "logits/chosen": -1.69258713722229, "logits/rejected": -1.7538609504699707, "logps/chosen": -628.3487548828125, "logps/rejected": -619.4342651367188, "loss": 0.5694, "rewards/accuracies": 0.6875, "rewards/chosen": -1.569846510887146, "rewards/margins": 0.38913729786872864, "rewards/rejected": -1.9589838981628418, "step": 416 }, { "epoch": 0.2723487631643399, "grad_norm": 27.797390348868447, "learning_rate": 1.3689897575239766e-07, "logits/chosen": -1.5702205896377563, "logits/rejected": -1.602936863899231, "logps/chosen": -667.676513671875, "logps/rejected": -693.8980712890625, "loss": 0.5853, "rewards/accuracies": 0.84375, "rewards/chosen": -1.529921531677246, "rewards/margins": 0.45492294430732727, "rewards/rejected": -1.9848445653915405, "step": 417 }, { "epoch": 0.2730018777043024, "grad_norm": 8.991724030392819, "learning_rate": 1.3680219437878805e-07, "logits/chosen": -1.645481824874878, "logits/rejected": -1.6687901020050049, "logps/chosen": -626.5555419921875, "logps/rejected": -623.034912109375, "loss": 0.6408, "rewards/accuracies": 0.53125, "rewards/chosen": -1.4581669569015503, "rewards/margins": 0.075958251953125, "rewards/rejected": -1.5341250896453857, "step": 418 }, { "epoch": 0.2736549922442648, "grad_norm": 34.54942489548843, "learning_rate": 1.3670509131665145e-07, "logits/chosen": -1.6880152225494385, "logits/rejected": -1.7365965843200684, "logps/chosen": -684.6690673828125, "logps/rejected": -660.1009521484375, "loss": 0.6038, "rewards/accuracies": 0.71875, "rewards/chosen": -1.548675298690796, "rewards/margins": 0.23877474665641785, "rewards/rejected": -1.7874499559402466, "step": 419 }, { "epoch": 0.2743081067842273, "grad_norm": 7.725802049377969, "learning_rate": 1.36607667071422e-07, "logits/chosen": -1.60299551486969, "logits/rejected": -1.5734202861785889, "logps/chosen": -566.0638427734375, "logps/rejected": -681.6033325195312, "loss": 0.5367, "rewards/accuracies": 0.71875, "rewards/chosen": -1.342764973640442, "rewards/margins": 0.6366487741470337, "rewards/rejected": -1.9794137477874756, "step": 420 }, { "epoch": 0.2749612213241897, "grad_norm": 29.01805274332031, "learning_rate": 1.3650992215020568e-07, "logits/chosen": -1.6936869621276855, "logits/rejected": -1.7092525959014893, "logps/chosen": -627.9664306640625, "logps/rejected": -691.7342529296875, "loss": 0.5607, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4970836639404297, "rewards/margins": 0.4171164631843567, "rewards/rejected": -1.9142000675201416, "step": 421 }, { "epoch": 0.2756143358641522, "grad_norm": 16.315855308400074, "learning_rate": 1.3641185706177758e-07, "logits/chosen": -1.6888682842254639, "logits/rejected": -1.749967336654663, "logps/chosen": -697.9429321289062, "logps/rejected": -695.5234985351562, "loss": 0.5916, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7964341640472412, "rewards/margins": 0.21294526755809784, "rewards/rejected": -2.0093793869018555, "step": 422 }, { "epoch": 0.2762674504041146, "grad_norm": 14.532459170681967, "learning_rate": 1.3631347231657941e-07, "logits/chosen": -1.5714447498321533, "logits/rejected": -1.5943671464920044, "logps/chosen": -706.9442138671875, "logps/rejected": -723.3374633789062, "loss": 0.5472, "rewards/accuracies": 0.78125, "rewards/chosen": -1.754028558731079, "rewards/margins": 0.30723437666893005, "rewards/rejected": -2.061262845993042, "step": 423 }, { "epoch": 0.2769205649440771, "grad_norm": 17.1924352317544, "learning_rate": 1.3621476842671663e-07, "logits/chosen": -1.5889177322387695, "logits/rejected": -1.607007384300232, "logps/chosen": -727.8897705078125, "logps/rejected": -668.5709838867188, "loss": 0.5935, "rewards/accuracies": 0.53125, "rewards/chosen": -1.8610236644744873, "rewards/margins": 0.0675252377986908, "rewards/rejected": -1.92854905128479, "step": 424 }, { "epoch": 0.2775736794840395, "grad_norm": 40.939411419121505, "learning_rate": 1.3611574590595592e-07, "logits/chosen": -1.6908094882965088, "logits/rejected": -1.6986981630325317, "logps/chosen": -739.2138671875, "logps/rejected": -840.9638061523438, "loss": 0.5882, "rewards/accuracies": 0.84375, "rewards/chosen": -1.968212604522705, "rewards/margins": 0.6422140002250671, "rewards/rejected": -2.610426425933838, "step": 425 }, { "epoch": 0.27822679402400197, "grad_norm": 7.3972481534822405, "learning_rate": 1.3601640526972256e-07, "logits/chosen": -1.6408872604370117, "logits/rejected": -1.6618748903274536, "logps/chosen": -625.466796875, "logps/rejected": -732.3475341796875, "loss": 0.56, "rewards/accuracies": 0.8125, "rewards/chosen": -1.500174641609192, "rewards/margins": 0.6626629829406738, "rewards/rejected": -2.162837505340576, "step": 426 }, { "epoch": 0.2788799085639644, "grad_norm": 17.28581442914079, "learning_rate": 1.3591674703509755e-07, "logits/chosen": -1.6025550365447998, "logits/rejected": -1.6443158388137817, "logps/chosen": -708.13232421875, "logps/rejected": -776.6422729492188, "loss": 0.5629, "rewards/accuracies": 0.75, "rewards/chosen": -1.7544456720352173, "rewards/margins": 0.7180143594741821, "rewards/rejected": -2.4724600315093994, "step": 427 }, { "epoch": 0.27953302310392686, "grad_norm": 21.933643031973503, "learning_rate": 1.3581677172081503e-07, "logits/chosen": -1.705406904220581, "logits/rejected": -1.7163548469543457, "logps/chosen": -776.873291015625, "logps/rejected": -821.5286865234375, "loss": 0.5636, "rewards/accuracies": 0.625, "rewards/chosen": -2.0922696590423584, "rewards/margins": 0.3925325274467468, "rewards/rejected": -2.484802484512329, "step": 428 }, { "epoch": 0.2801861376438893, "grad_norm": 9.981810314745449, "learning_rate": 1.3571647984725965e-07, "logits/chosen": -1.5826154947280884, "logits/rejected": -1.5522079467773438, "logps/chosen": -658.821044921875, "logps/rejected": -708.5677490234375, "loss": 0.5996, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5901877880096436, "rewards/margins": 0.4858096241950989, "rewards/rejected": -2.0759973526000977, "step": 429 }, { "epoch": 0.28083925218385175, "grad_norm": 11.36893380136887, "learning_rate": 1.3561587193646377e-07, "logits/chosen": -1.5808284282684326, "logits/rejected": -1.61090087890625, "logps/chosen": -645.9747314453125, "logps/rejected": -630.6502685546875, "loss": 0.6277, "rewards/accuracies": 0.6875, "rewards/chosen": -1.67696213722229, "rewards/margins": 0.11575008928775787, "rewards/rejected": -1.7927122116088867, "step": 430 }, { "epoch": 0.28149236672381417, "grad_norm": 65.85082801878511, "learning_rate": 1.355149485121048e-07, "logits/chosen": -1.618956446647644, "logits/rejected": -1.6172845363616943, "logps/chosen": -570.473876953125, "logps/rejected": -637.30126953125, "loss": 0.5954, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5234228372573853, "rewards/margins": 0.49459823966026306, "rewards/rejected": -2.0180211067199707, "step": 431 }, { "epoch": 0.28214548126377664, "grad_norm": 16.266142416859648, "learning_rate": 1.3541371009950234e-07, "logits/chosen": -1.552247166633606, "logits/rejected": -1.580383539199829, "logps/chosen": -720.5084228515625, "logps/rejected": -765.5215454101562, "loss": 0.5772, "rewards/accuracies": 0.75, "rewards/chosen": -1.807952880859375, "rewards/margins": 0.48564061522483826, "rewards/rejected": -2.293593406677246, "step": 432 }, { "epoch": 0.28279859580373906, "grad_norm": 42.15718871671079, "learning_rate": 1.3531215722561562e-07, "logits/chosen": -1.6510796546936035, "logits/rejected": -1.6283471584320068, "logps/chosen": -709.1431884765625, "logps/rejected": -720.7800903320312, "loss": 0.6278, "rewards/accuracies": 0.625, "rewards/chosen": -1.7743120193481445, "rewards/margins": 0.24123550951480865, "rewards/rejected": -2.015547513961792, "step": 433 }, { "epoch": 0.28345171034370154, "grad_norm": 19.658520116756474, "learning_rate": 1.3521029041904067e-07, "logits/chosen": -1.7421890497207642, "logits/rejected": -1.7447417974472046, "logps/chosen": -633.9954833984375, "logps/rejected": -745.4783325195312, "loss": 0.5583, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7052303552627563, "rewards/margins": 0.5718401074409485, "rewards/rejected": -2.2770705223083496, "step": 434 }, { "epoch": 0.28410482488366395, "grad_norm": 61.59792122699335, "learning_rate": 1.351081102100076e-07, "logits/chosen": -1.6574376821517944, "logits/rejected": -1.6252573728561401, "logps/chosen": -680.404541015625, "logps/rejected": -763.9954833984375, "loss": 0.5739, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7275390625, "rewards/margins": 0.38947176933288574, "rewards/rejected": -2.117011070251465, "step": 435 }, { "epoch": 0.2847579394236264, "grad_norm": 7.980201871732804, "learning_rate": 1.3500561713037777e-07, "logits/chosen": -1.6319736242294312, "logits/rejected": -1.6381350755691528, "logps/chosen": -719.6119995117188, "logps/rejected": -745.8162841796875, "loss": 0.5692, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7299431562423706, "rewards/margins": 0.5405523180961609, "rewards/rejected": -2.270495653152466, "step": 436 }, { "epoch": 0.28541105396358885, "grad_norm": 8.727533392873836, "learning_rate": 1.3490281171364112e-07, "logits/chosen": -1.6387797594070435, "logits/rejected": -1.6695307493209839, "logps/chosen": -695.0264892578125, "logps/rejected": -734.383544921875, "loss": 0.5652, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7570171356201172, "rewards/margins": 0.508963942527771, "rewards/rejected": -2.2659811973571777, "step": 437 }, { "epoch": 0.2860641685035513, "grad_norm": 28.528027325398355, "learning_rate": 1.3479969449491332e-07, "logits/chosen": -1.600303053855896, "logits/rejected": -1.5852181911468506, "logps/chosen": -685.3209838867188, "logps/rejected": -876.8265380859375, "loss": 0.5698, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7708958387374878, "rewards/margins": 0.7778002619743347, "rewards/rejected": -2.5486960411071777, "step": 438 }, { "epoch": 0.28671728304351374, "grad_norm": 14.87235543409777, "learning_rate": 1.3469626601093301e-07, "logits/chosen": -1.5701959133148193, "logits/rejected": -1.6426581144332886, "logps/chosen": -664.4986572265625, "logps/rejected": -663.914794921875, "loss": 0.6061, "rewards/accuracies": 0.84375, "rewards/chosen": -1.8818445205688477, "rewards/margins": 0.31387796998023987, "rewards/rejected": -2.1957225799560547, "step": 439 }, { "epoch": 0.2873703975834762, "grad_norm": 10.063735379517492, "learning_rate": 1.34592526800059e-07, "logits/chosen": -1.6531732082366943, "logits/rejected": -1.7189072370529175, "logps/chosen": -714.6017456054688, "logps/rejected": -692.885498046875, "loss": 0.5671, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7326029539108276, "rewards/margins": 0.28911852836608887, "rewards/rejected": -2.021721363067627, "step": 440 }, { "epoch": 0.28802351212343863, "grad_norm": 9.184629537387579, "learning_rate": 1.3448847740226753e-07, "logits/chosen": -1.4753351211547852, "logits/rejected": -1.5424079895019531, "logps/chosen": -621.2201538085938, "logps/rejected": -637.651611328125, "loss": 0.6113, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6852307319641113, "rewards/margins": 0.3220943510532379, "rewards/rejected": -2.0073251724243164, "step": 441 }, { "epoch": 0.2886766266634011, "grad_norm": 8.636425870814191, "learning_rate": 1.3438411835914935e-07, "logits/chosen": -1.689327597618103, "logits/rejected": -1.6427044868469238, "logps/chosen": -652.803466796875, "logps/rejected": -845.2495727539062, "loss": 0.5666, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6894198656082153, "rewards/margins": 0.8975628614425659, "rewards/rejected": -2.5869827270507812, "step": 442 }, { "epoch": 0.2893297412033635, "grad_norm": 10.50323658639023, "learning_rate": 1.3427945021390695e-07, "logits/chosen": -1.7043615579605103, "logits/rejected": -1.6875221729278564, "logps/chosen": -757.1970825195312, "logps/rejected": -846.8417358398438, "loss": 0.5605, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8582996129989624, "rewards/margins": 0.7164192795753479, "rewards/rejected": -2.574718952178955, "step": 443 }, { "epoch": 0.289982855743326, "grad_norm": 58.021997546888514, "learning_rate": 1.3417447351135174e-07, "logits/chosen": -1.6309140920639038, "logits/rejected": -1.5757015943527222, "logps/chosen": -656.53466796875, "logps/rejected": -700.3538818359375, "loss": 0.556, "rewards/accuracies": 0.59375, "rewards/chosen": -1.6346073150634766, "rewards/margins": 0.3053905665874481, "rewards/rejected": -1.939997911453247, "step": 444 }, { "epoch": 0.2906359702832884, "grad_norm": 23.765137933971218, "learning_rate": 1.3406918879790125e-07, "logits/chosen": -1.604915976524353, "logits/rejected": -1.5979593992233276, "logps/chosen": -678.1114501953125, "logps/rejected": -687.6436767578125, "loss": 0.5872, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8041229248046875, "rewards/margins": 0.2883044183254242, "rewards/rejected": -2.0924274921417236, "step": 445 }, { "epoch": 0.2912890848232509, "grad_norm": 9.50417097205747, "learning_rate": 1.3396359662157621e-07, "logits/chosen": -1.625780463218689, "logits/rejected": -1.6541712284088135, "logps/chosen": -595.81396484375, "logps/rejected": -702.814453125, "loss": 0.544, "rewards/accuracies": 0.75, "rewards/chosen": -1.58511221408844, "rewards/margins": 0.7033818364143372, "rewards/rejected": -2.288494110107422, "step": 446 }, { "epoch": 0.2919421993632133, "grad_norm": 18.981324007656344, "learning_rate": 1.3385769753199778e-07, "logits/chosen": -1.6669667959213257, "logits/rejected": -1.6522274017333984, "logps/chosen": -686.9908447265625, "logps/rejected": -650.0782470703125, "loss": 0.5556, "rewards/accuracies": 0.71875, "rewards/chosen": -1.9364535808563232, "rewards/margins": 0.2179185450077057, "rewards/rejected": -2.154372215270996, "step": 447 }, { "epoch": 0.2925953139031758, "grad_norm": 10.169059802410512, "learning_rate": 1.3375149208038454e-07, "logits/chosen": -1.6317005157470703, "logits/rejected": -1.6902613639831543, "logps/chosen": -641.2767944335938, "logps/rejected": -646.5491943359375, "loss": 0.5492, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7388436794281006, "rewards/margins": 0.43374961614608765, "rewards/rejected": -2.172593355178833, "step": 448 }, { "epoch": 0.2932484284431382, "grad_norm": 49.39528712371372, "learning_rate": 1.3364498081954984e-07, "logits/chosen": -1.6534464359283447, "logits/rejected": -1.6478941440582275, "logps/chosen": -652.5145263671875, "logps/rejected": -719.8404541015625, "loss": 0.5349, "rewards/accuracies": 0.90625, "rewards/chosen": -1.6955946683883667, "rewards/margins": 0.6352224349975586, "rewards/rejected": -2.3308169841766357, "step": 449 }, { "epoch": 0.29390154298310067, "grad_norm": 22.612708126765373, "learning_rate": 1.3353816430389877e-07, "logits/chosen": -1.6020948886871338, "logits/rejected": -1.5905299186706543, "logps/chosen": -640.2467041015625, "logps/rejected": -654.7265625, "loss": 0.5702, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7803701162338257, "rewards/margins": 0.2993907928466797, "rewards/rejected": -2.079760789871216, "step": 450 }, { "epoch": 0.2945546575230631, "grad_norm": 53.155242423002186, "learning_rate": 1.3343104308942526e-07, "logits/chosen": -1.5806388854980469, "logits/rejected": -1.6192755699157715, "logps/chosen": -638.1632080078125, "logps/rejected": -661.924072265625, "loss": 0.5966, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5359299182891846, "rewards/margins": 0.4746638536453247, "rewards/rejected": -2.0105934143066406, "step": 451 }, { "epoch": 0.29520777206302556, "grad_norm": 27.43879161964594, "learning_rate": 1.3332361773370933e-07, "logits/chosen": -1.615262746810913, "logits/rejected": -1.618033766746521, "logps/chosen": -684.5283203125, "logps/rejected": -777.6100463867188, "loss": 0.5537, "rewards/accuracies": 0.71875, "rewards/chosen": -1.678364634513855, "rewards/margins": 0.7497380971908569, "rewards/rejected": -2.428102731704712, "step": 452 }, { "epoch": 0.295860886602988, "grad_norm": 90.50268890044883, "learning_rate": 1.3321588879591404e-07, "logits/chosen": -1.6059372425079346, "logits/rejected": -1.646728754043579, "logps/chosen": -712.547119140625, "logps/rejected": -678.2857666015625, "loss": 0.6262, "rewards/accuracies": 0.46875, "rewards/chosen": -2.0923614501953125, "rewards/margins": 0.0826776921749115, "rewards/rejected": -2.175039291381836, "step": 453 }, { "epoch": 0.29651400114295046, "grad_norm": 23.544890284443007, "learning_rate": 1.331078568367826e-07, "logits/chosen": -1.6115236282348633, "logits/rejected": -1.6034373044967651, "logps/chosen": -676.42529296875, "logps/rejected": -732.385009765625, "loss": 0.5661, "rewards/accuracies": 0.625, "rewards/chosen": -1.7986388206481934, "rewards/margins": 0.3738027811050415, "rewards/rejected": -2.1724417209625244, "step": 454 }, { "epoch": 0.2971671156829129, "grad_norm": 20.85817286901651, "learning_rate": 1.3299952241863558e-07, "logits/chosen": -1.547767996788025, "logits/rejected": -1.5575647354125977, "logps/chosen": -677.2283325195312, "logps/rejected": -753.81201171875, "loss": 0.5791, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8230202198028564, "rewards/margins": 0.49635809659957886, "rewards/rejected": -2.319378137588501, "step": 455 }, { "epoch": 0.29782023022287535, "grad_norm": 83.24183783338488, "learning_rate": 1.3289088610536775e-07, "logits/chosen": -1.704686164855957, "logits/rejected": -1.7018077373504639, "logps/chosen": -748.0383911132812, "logps/rejected": -787.422607421875, "loss": 0.5771, "rewards/accuracies": 0.65625, "rewards/chosen": -2.033522605895996, "rewards/margins": 0.40751010179519653, "rewards/rejected": -2.441032886505127, "step": 456 }, { "epoch": 0.29847334476283777, "grad_norm": 9.918684990634835, "learning_rate": 1.3278194846244547e-07, "logits/chosen": -1.6032931804656982, "logits/rejected": -1.631669282913208, "logps/chosen": -650.1741943359375, "logps/rejected": -619.4449462890625, "loss": 0.5868, "rewards/accuracies": 0.5, "rewards/chosen": -2.0186896324157715, "rewards/margins": 0.09990565478801727, "rewards/rejected": -2.1185953617095947, "step": 457 }, { "epoch": 0.29912645930280024, "grad_norm": 8.930483236743907, "learning_rate": 1.326727100569034e-07, "logits/chosen": -1.5541692972183228, "logits/rejected": -1.5824774503707886, "logps/chosen": -660.3080444335938, "logps/rejected": -716.2559814453125, "loss": 0.5453, "rewards/accuracies": 0.75, "rewards/chosen": -1.9189565181732178, "rewards/margins": 0.3501700758934021, "rewards/rejected": -2.2691266536712646, "step": 458 }, { "epoch": 0.29977957384276266, "grad_norm": 27.815139403173614, "learning_rate": 1.3256317145734176e-07, "logits/chosen": -1.6188397407531738, "logits/rejected": -1.6419901847839355, "logps/chosen": -769.3443603515625, "logps/rejected": -774.8818969726562, "loss": 0.5426, "rewards/accuracies": 0.65625, "rewards/chosen": -2.362337350845337, "rewards/margins": 0.2768968641757965, "rewards/rejected": -2.6392343044281006, "step": 459 }, { "epoch": 0.30043268838272513, "grad_norm": 32.322236847979745, "learning_rate": 1.3245333323392333e-07, "logits/chosen": -1.575210452079773, "logits/rejected": -1.5853873491287231, "logps/chosen": -663.4458618164062, "logps/rejected": -697.9381103515625, "loss": 0.5825, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7677443027496338, "rewards/margins": 0.5210894346237183, "rewards/rejected": -2.2888338565826416, "step": 460 }, { "epoch": 0.30108580292268755, "grad_norm": 39.148266264726594, "learning_rate": 1.3234319595837053e-07, "logits/chosen": -1.6241929531097412, "logits/rejected": -1.633462905883789, "logps/chosen": -698.6934814453125, "logps/rejected": -783.1587524414062, "loss": 0.5855, "rewards/accuracies": 0.625, "rewards/chosen": -1.971913456916809, "rewards/margins": 0.39414018392562866, "rewards/rejected": -2.366053581237793, "step": 461 }, { "epoch": 0.30173891746265, "grad_norm": 11.953653868367251, "learning_rate": 1.3223276020396224e-07, "logits/chosen": -1.6563340425491333, "logits/rejected": -1.6382731199264526, "logps/chosen": -750.8363647460938, "logps/rejected": -918.314453125, "loss": 0.5138, "rewards/accuracies": 0.75, "rewards/chosen": -2.174654722213745, "rewards/margins": 0.9784154891967773, "rewards/rejected": -3.1530702114105225, "step": 462 }, { "epoch": 0.30239203200261244, "grad_norm": 11.156052343735718, "learning_rate": 1.3212202654553108e-07, "logits/chosen": -1.6498849391937256, "logits/rejected": -1.611333966255188, "logps/chosen": -697.7623291015625, "logps/rejected": -808.7626953125, "loss": 0.5996, "rewards/accuracies": 0.625, "rewards/chosen": -2.0645065307617188, "rewards/margins": 0.5361623167991638, "rewards/rejected": -2.6006689071655273, "step": 463 }, { "epoch": 0.3030451465425749, "grad_norm": 9.56202472673271, "learning_rate": 1.3201099555946027e-07, "logits/chosen": -1.5961620807647705, "logits/rejected": -1.6157381534576416, "logps/chosen": -695.4110717773438, "logps/rejected": -687.124755859375, "loss": 0.5877, "rewards/accuracies": 0.65625, "rewards/chosen": -1.75066339969635, "rewards/margins": 0.38458389043807983, "rewards/rejected": -2.135247230529785, "step": 464 }, { "epoch": 0.30369826108253734, "grad_norm": 16.37722781419385, "learning_rate": 1.3189966782368067e-07, "logits/chosen": -1.478019118309021, "logits/rejected": -1.5088088512420654, "logps/chosen": -654.6697998046875, "logps/rejected": -701.0975952148438, "loss": 0.5274, "rewards/accuracies": 0.75, "rewards/chosen": -1.9592812061309814, "rewards/margins": 0.3919811546802521, "rewards/rejected": -2.351262331008911, "step": 465 }, { "epoch": 0.3043513756224998, "grad_norm": 48.09748648700373, "learning_rate": 1.3178804391766773e-07, "logits/chosen": -1.6397638320922852, "logits/rejected": -1.6730685234069824, "logps/chosen": -689.5550537109375, "logps/rejected": -693.6464233398438, "loss": 0.5573, "rewards/accuracies": 0.75, "rewards/chosen": -1.6381131410598755, "rewards/margins": 0.5276268720626831, "rewards/rejected": -2.1657400131225586, "step": 466 }, { "epoch": 0.3050044901624622, "grad_norm": 38.21088887373832, "learning_rate": 1.3167612442243849e-07, "logits/chosen": -1.6109600067138672, "logits/rejected": -1.588073968887329, "logps/chosen": -679.6129760742188, "logps/rejected": -752.5172119140625, "loss": 0.5441, "rewards/accuracies": 0.65625, "rewards/chosen": -1.768452525138855, "rewards/margins": 0.5461674332618713, "rewards/rejected": -2.314620018005371, "step": 467 }, { "epoch": 0.3056576047024247, "grad_norm": 33.132628684957666, "learning_rate": 1.3156390992054862e-07, "logits/chosen": -1.6505910158157349, "logits/rejected": -1.6523258686065674, "logps/chosen": -707.0516357421875, "logps/rejected": -742.3071899414062, "loss": 0.5618, "rewards/accuracies": 0.65625, "rewards/chosen": -2.175845146179199, "rewards/margins": 0.5027600526809692, "rewards/rejected": -2.678605318069458, "step": 468 }, { "epoch": 0.3063107192423871, "grad_norm": 13.439285792051546, "learning_rate": 1.3145140099608933e-07, "logits/chosen": -1.6851032972335815, "logits/rejected": -1.6651477813720703, "logps/chosen": -708.4338989257812, "logps/rejected": -813.6752319335938, "loss": 0.5774, "rewards/accuracies": 0.71875, "rewards/chosen": -2.052492380142212, "rewards/margins": 0.5105604529380798, "rewards/rejected": -2.5630528926849365, "step": 469 }, { "epoch": 0.3069638337823496, "grad_norm": 21.620672217002696, "learning_rate": 1.3133859823468433e-07, "logits/chosen": -1.5591509342193604, "logits/rejected": -1.620843768119812, "logps/chosen": -712.449951171875, "logps/rejected": -747.5558471679688, "loss": 0.561, "rewards/accuracies": 0.8125, "rewards/chosen": -2.18332576751709, "rewards/margins": 0.5399474501609802, "rewards/rejected": -2.7232730388641357, "step": 470 }, { "epoch": 0.307616948322312, "grad_norm": 7.80149433228074, "learning_rate": 1.3122550222348676e-07, "logits/chosen": -1.6145565509796143, "logits/rejected": -1.612226128578186, "logps/chosen": -602.789794921875, "logps/rejected": -650.0497436523438, "loss": 0.5333, "rewards/accuracies": 0.625, "rewards/chosen": -1.8238171339035034, "rewards/margins": 0.18020805716514587, "rewards/rejected": -2.0040249824523926, "step": 471 }, { "epoch": 0.3082700628622745, "grad_norm": 12.541386961248598, "learning_rate": 1.3111211355117625e-07, "logits/chosen": -1.7079181671142578, "logits/rejected": -1.6550383567810059, "logps/chosen": -733.6851806640625, "logps/rejected": -804.6192626953125, "loss": 0.5624, "rewards/accuracies": 0.71875, "rewards/chosen": -2.264955520629883, "rewards/margins": 0.5682797431945801, "rewards/rejected": -2.833235263824463, "step": 472 }, { "epoch": 0.3089231774022369, "grad_norm": 49.77231044522242, "learning_rate": 1.3099843280795564e-07, "logits/chosen": -1.5972816944122314, "logits/rejected": -1.5726184844970703, "logps/chosen": -697.912109375, "logps/rejected": -767.2205810546875, "loss": 0.5352, "rewards/accuracies": 0.65625, "rewards/chosen": -1.8209738731384277, "rewards/margins": 0.34553292393684387, "rewards/rejected": -2.16650652885437, "step": 473 }, { "epoch": 0.3095762919421994, "grad_norm": 9.964229839524728, "learning_rate": 1.3088446058554813e-07, "logits/chosen": -1.5458612442016602, "logits/rejected": -1.533179759979248, "logps/chosen": -707.8756103515625, "logps/rejected": -787.054443359375, "loss": 0.5863, "rewards/accuracies": 0.84375, "rewards/chosen": -2.043527364730835, "rewards/margins": 0.6540895700454712, "rewards/rejected": -2.6976165771484375, "step": 474 }, { "epoch": 0.3102294064821618, "grad_norm": 45.959503738015734, "learning_rate": 1.3077019747719412e-07, "logits/chosen": -1.595198154449463, "logits/rejected": -1.591080904006958, "logps/chosen": -645.2706298828125, "logps/rejected": -711.2510986328125, "loss": 0.5782, "rewards/accuracies": 0.71875, "rewards/chosen": -2.0279712677001953, "rewards/margins": 0.44245079159736633, "rewards/rejected": -2.4704222679138184, "step": 475 }, { "epoch": 0.31088252102212427, "grad_norm": 44.73324911306505, "learning_rate": 1.3065564407764802e-07, "logits/chosen": -1.7171953916549683, "logits/rejected": -1.707771897315979, "logps/chosen": -780.519775390625, "logps/rejected": -787.8773193359375, "loss": 0.6095, "rewards/accuracies": 0.75, "rewards/chosen": -2.1023976802825928, "rewards/margins": 0.306552529335022, "rewards/rejected": -2.4089503288269043, "step": 476 }, { "epoch": 0.3115356355620867, "grad_norm": 48.62571818947248, "learning_rate": 1.3054080098317535e-07, "logits/chosen": -1.6242291927337646, "logits/rejected": -1.604690432548523, "logps/chosen": -631.8426513671875, "logps/rejected": -682.6934204101562, "loss": 0.5734, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8018213510513306, "rewards/margins": 0.6165929436683655, "rewards/rejected": -2.418414354324341, "step": 477 }, { "epoch": 0.31218875010204916, "grad_norm": 12.529349998561317, "learning_rate": 1.3042566879154942e-07, "logits/chosen": -1.6056309938430786, "logits/rejected": -1.5826270580291748, "logps/chosen": -729.5185546875, "logps/rejected": -871.41064453125, "loss": 0.5912, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2389609813690186, "rewards/margins": 0.8720102310180664, "rewards/rejected": -3.110971450805664, "step": 478 }, { "epoch": 0.3128418646420116, "grad_norm": 25.761326367725754, "learning_rate": 1.3031024810204844e-07, "logits/chosen": -1.652268648147583, "logits/rejected": -1.6630209684371948, "logps/chosen": -703.5462036132812, "logps/rejected": -724.381591796875, "loss": 0.5784, "rewards/accuracies": 0.75, "rewards/chosen": -2.003835439682007, "rewards/margins": 0.37049388885498047, "rewards/rejected": -2.3743293285369873, "step": 479 }, { "epoch": 0.31349497918197405, "grad_norm": 33.561886992169335, "learning_rate": 1.3019453951545222e-07, "logits/chosen": -1.5987491607666016, "logits/rejected": -1.5921955108642578, "logps/chosen": -745.3104248046875, "logps/rejected": -764.4356689453125, "loss": 0.5442, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0976104736328125, "rewards/margins": 0.27280157804489136, "rewards/rejected": -2.3704121112823486, "step": 480 }, { "epoch": 0.31414809372193647, "grad_norm": 36.917374177404064, "learning_rate": 1.3007854363403912e-07, "logits/chosen": -1.4915146827697754, "logits/rejected": -1.4882618188858032, "logps/chosen": -751.360107421875, "logps/rejected": -766.1044311523438, "loss": 0.5453, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2802181243896484, "rewards/margins": 0.4815993905067444, "rewards/rejected": -2.761817693710327, "step": 481 }, { "epoch": 0.31480120826189895, "grad_norm": 13.711790140200893, "learning_rate": 1.2996226106158292e-07, "logits/chosen": -1.5975637435913086, "logits/rejected": -1.6035072803497314, "logps/chosen": -755.90576171875, "logps/rejected": -734.3273315429688, "loss": 0.5692, "rewards/accuracies": 0.6875, "rewards/chosen": -2.2815115451812744, "rewards/margins": 0.35683876276016235, "rewards/rejected": -2.638350486755371, "step": 482 }, { "epoch": 0.31545432280186136, "grad_norm": 10.225083803183653, "learning_rate": 1.2984569240334968e-07, "logits/chosen": -1.6066551208496094, "logits/rejected": -1.6023541688919067, "logps/chosen": -683.2215576171875, "logps/rejected": -708.5989379882812, "loss": 0.643, "rewards/accuracies": 0.5, "rewards/chosen": -2.3892295360565186, "rewards/margins": 0.2522014081478119, "rewards/rejected": -2.6414308547973633, "step": 483 }, { "epoch": 0.31610743734182384, "grad_norm": 43.21606295702285, "learning_rate": 1.297288382660945e-07, "logits/chosen": -1.5631223917007446, "logits/rejected": -1.5718114376068115, "logps/chosen": -689.9915771484375, "logps/rejected": -718.431396484375, "loss": 0.6106, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1257877349853516, "rewards/margins": 0.2984415590763092, "rewards/rejected": -2.424229145050049, "step": 484 }, { "epoch": 0.31676055188178626, "grad_norm": 27.88872175196333, "learning_rate": 1.2961169925805854e-07, "logits/chosen": -1.5951167345046997, "logits/rejected": -1.6255998611450195, "logps/chosen": -788.0588989257812, "logps/rejected": -845.193603515625, "loss": 0.5597, "rewards/accuracies": 0.75, "rewards/chosen": -2.4089677333831787, "rewards/margins": 0.6144492626190186, "rewards/rejected": -3.0234172344207764, "step": 485 }, { "epoch": 0.31741366642174873, "grad_norm": 20.90987797781216, "learning_rate": 1.294942759889657e-07, "logits/chosen": -1.675968050956726, "logits/rejected": -1.7186676263809204, "logps/chosen": -748.28466796875, "logps/rejected": -773.9339599609375, "loss": 0.5613, "rewards/accuracies": 0.71875, "rewards/chosen": -2.358844757080078, "rewards/margins": 0.4122333824634552, "rewards/rejected": -2.771078109741211, "step": 486 }, { "epoch": 0.31806678096171115, "grad_norm": 55.17649113640136, "learning_rate": 1.2937656907001946e-07, "logits/chosen": -1.641244649887085, "logits/rejected": -1.6599280834197998, "logps/chosen": -761.1217041015625, "logps/rejected": -830.0103149414062, "loss": 0.5631, "rewards/accuracies": 0.75, "rewards/chosen": -2.4944777488708496, "rewards/margins": 0.6374845504760742, "rewards/rejected": -3.131962299346924, "step": 487 }, { "epoch": 0.3187198955016736, "grad_norm": 102.83841154795559, "learning_rate": 1.2925857911389977e-07, "logits/chosen": -1.5266590118408203, "logits/rejected": -1.5372052192687988, "logps/chosen": -713.9895629882812, "logps/rejected": -758.6173706054688, "loss": 0.5926, "rewards/accuracies": 0.65625, "rewards/chosen": -2.067730188369751, "rewards/margins": 0.3173166811466217, "rewards/rejected": -2.3850467205047607, "step": 488 }, { "epoch": 0.31937301004163604, "grad_norm": 16.29290504478057, "learning_rate": 1.2914030673475987e-07, "logits/chosen": -1.6052075624465942, "logits/rejected": -1.6125444173812866, "logps/chosen": -788.452392578125, "logps/rejected": -915.9051513671875, "loss": 0.5288, "rewards/accuracies": 0.75, "rewards/chosen": -2.4514353275299072, "rewards/margins": 0.7415469884872437, "rewards/rejected": -3.1929819583892822, "step": 489 }, { "epoch": 0.3200261245815985, "grad_norm": 10.250010376773696, "learning_rate": 1.29021752548223e-07, "logits/chosen": -1.5654120445251465, "logits/rejected": -1.5245580673217773, "logps/chosen": -656.7188720703125, "logps/rejected": -755.3834838867188, "loss": 0.5413, "rewards/accuracies": 0.75, "rewards/chosen": -1.9858925342559814, "rewards/margins": 0.6740937829017639, "rewards/rejected": -2.6599864959716797, "step": 490 }, { "epoch": 0.32067923912156093, "grad_norm": 26.08722651165643, "learning_rate": 1.2890291717137919e-07, "logits/chosen": -1.6398707628250122, "logits/rejected": -1.6489192247390747, "logps/chosen": -740.1271362304688, "logps/rejected": -744.15576171875, "loss": 0.5919, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0526180267333984, "rewards/margins": 0.4571646749973297, "rewards/rejected": -2.509782552719116, "step": 491 }, { "epoch": 0.3213323536615234, "grad_norm": 11.326643733648055, "learning_rate": 1.287838012227822e-07, "logits/chosen": -1.4910850524902344, "logits/rejected": -1.5168672800064087, "logps/chosen": -666.2679443359375, "logps/rejected": -719.11328125, "loss": 0.5579, "rewards/accuracies": 0.65625, "rewards/chosen": -2.113301992416382, "rewards/margins": 0.3083459138870239, "rewards/rejected": -2.4216480255126953, "step": 492 }, { "epoch": 0.3219854682014858, "grad_norm": 10.301719458410284, "learning_rate": 1.2866440532244618e-07, "logits/chosen": -1.6052420139312744, "logits/rejected": -1.5893734693527222, "logps/chosen": -680.7243041992188, "logps/rejected": -705.3173828125, "loss": 0.5927, "rewards/accuracies": 0.5625, "rewards/chosen": -2.155557632446289, "rewards/margins": 0.13128241896629333, "rewards/rejected": -2.2868399620056152, "step": 493 }, { "epoch": 0.3226385827414483, "grad_norm": 23.898710768355055, "learning_rate": 1.2854473009184242e-07, "logits/chosen": -1.582448959350586, "logits/rejected": -1.6095366477966309, "logps/chosen": -674.9298095703125, "logps/rejected": -756.9193725585938, "loss": 0.5823, "rewards/accuracies": 0.75, "rewards/chosen": -2.0278987884521484, "rewards/margins": 0.5040706992149353, "rewards/rejected": -2.5319693088531494, "step": 494 }, { "epoch": 0.3232916972814107, "grad_norm": 38.15944101714436, "learning_rate": 1.2842477615389622e-07, "logits/chosen": -1.582186222076416, "logits/rejected": -1.589294195175171, "logps/chosen": -693.418701171875, "logps/rejected": -733.3966064453125, "loss": 0.5397, "rewards/accuracies": 0.84375, "rewards/chosen": -2.0407207012176514, "rewards/margins": 0.5694603323936462, "rewards/rejected": -2.6101813316345215, "step": 495 }, { "epoch": 0.3239448118213732, "grad_norm": 14.734546679223891, "learning_rate": 1.2830454413298353e-07, "logits/chosen": -1.6934702396392822, "logits/rejected": -1.692880630493164, "logps/chosen": -768.6975708007812, "logps/rejected": -769.83984375, "loss": 0.5351, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1913037300109863, "rewards/margins": 0.30963531136512756, "rewards/rejected": -2.500938892364502, "step": 496 }, { "epoch": 0.3245979263613356, "grad_norm": 57.674097527721926, "learning_rate": 1.2818403465492783e-07, "logits/chosen": -1.5970431566238403, "logits/rejected": -1.6256811618804932, "logps/chosen": -704.041748046875, "logps/rejected": -755.9532470703125, "loss": 0.5673, "rewards/accuracies": 0.6875, "rewards/chosen": -2.2154181003570557, "rewards/margins": 0.48861750960350037, "rewards/rejected": -2.704035520553589, "step": 497 }, { "epoch": 0.3252510409012981, "grad_norm": 51.51237394300094, "learning_rate": 1.280632483469967e-07, "logits/chosen": -1.601000189781189, "logits/rejected": -1.615774393081665, "logps/chosen": -765.2921142578125, "logps/rejected": -783.3229370117188, "loss": 0.5797, "rewards/accuracies": 0.75, "rewards/chosen": -2.198168992996216, "rewards/margins": 0.4726821184158325, "rewards/rejected": -2.670850992202759, "step": 498 }, { "epoch": 0.3259041554412605, "grad_norm": 52.55743751092252, "learning_rate": 1.2794218583789876e-07, "logits/chosen": -1.6243011951446533, "logits/rejected": -1.6285080909729004, "logps/chosen": -646.337890625, "logps/rejected": -663.8938598632812, "loss": 0.5854, "rewards/accuracies": 0.78125, "rewards/chosen": -2.0927515029907227, "rewards/margins": 0.4221267104148865, "rewards/rejected": -2.514878273010254, "step": 499 }, { "epoch": 0.326557269981223, "grad_norm": 40.80464561018148, "learning_rate": 1.278208477577802e-07, "logits/chosen": -1.6202638149261475, "logits/rejected": -1.6449174880981445, "logps/chosen": -826.0245361328125, "logps/rejected": -890.9912109375, "loss": 0.5447, "rewards/accuracies": 0.78125, "rewards/chosen": -2.2344555854797363, "rewards/margins": 0.7165102958679199, "rewards/rejected": -2.9509658813476562, "step": 500 }, { "epoch": 0.326557269981223, "eval_logits/chosen": -1.6206858158111572, "eval_logits/rejected": -1.6248681545257568, "eval_logps/chosen": -728.922119140625, "eval_logps/rejected": -772.7636108398438, "eval_loss": 0.5644757151603699, "eval_rewards/accuracies": 0.7110000252723694, "eval_rewards/chosen": -2.1857683658599854, "eval_rewards/margins": 0.5194733142852783, "eval_rewards/rejected": -2.7052416801452637, "eval_runtime": 296.3263, "eval_samples_per_second": 13.499, "eval_steps_per_second": 0.844, "step": 500 }, { "epoch": 0.3272103845211854, "grad_norm": 18.6916196942294, "learning_rate": 1.276992347382217e-07, "logits/chosen": -1.6296683549880981, "logits/rejected": -1.6314842700958252, "logps/chosen": -768.372314453125, "logps/rejected": -827.50634765625, "loss": 0.5824, "rewards/accuracies": 0.65625, "rewards/chosen": -2.622889280319214, "rewards/margins": 0.44725632667541504, "rewards/rejected": -3.070145606994629, "step": 501 }, { "epoch": 0.32786349906114787, "grad_norm": 38.99333241282542, "learning_rate": 1.2757734741223494e-07, "logits/chosen": -1.598349690437317, "logits/rejected": -1.59029221534729, "logps/chosen": -643.4448852539062, "logps/rejected": -724.112548828125, "loss": 0.584, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8339576721191406, "rewards/margins": 0.5225554704666138, "rewards/rejected": -2.356513023376465, "step": 502 }, { "epoch": 0.3285166136011103, "grad_norm": 57.964045081569594, "learning_rate": 1.2745518641425945e-07, "logits/chosen": -1.5579354763031006, "logits/rejected": -1.5548033714294434, "logps/chosen": -681.3307495117188, "logps/rejected": -820.7464599609375, "loss": 0.559, "rewards/accuracies": 0.84375, "rewards/chosen": -1.9576513767242432, "rewards/margins": 0.9468222856521606, "rewards/rejected": -2.9044735431671143, "step": 503 }, { "epoch": 0.32916972814107276, "grad_norm": 26.127719689786453, "learning_rate": 1.2733275238015923e-07, "logits/chosen": -1.6055808067321777, "logits/rejected": -1.630063772201538, "logps/chosen": -629.9222412109375, "logps/rejected": -704.1810302734375, "loss": 0.5861, "rewards/accuracies": 0.71875, "rewards/chosen": -2.0422046184539795, "rewards/margins": 0.5639029741287231, "rewards/rejected": -2.606107234954834, "step": 504 }, { "epoch": 0.3298228426810352, "grad_norm": 26.745925680747586, "learning_rate": 1.272100459472195e-07, "logits/chosen": -1.4966247081756592, "logits/rejected": -1.51353120803833, "logps/chosen": -733.5635986328125, "logps/rejected": -757.8365478515625, "loss": 0.6147, "rewards/accuracies": 0.6875, "rewards/chosen": -2.2747292518615723, "rewards/margins": 0.3070646822452545, "rewards/rejected": -2.581793785095215, "step": 505 }, { "epoch": 0.33047595722099765, "grad_norm": 15.726026277688065, "learning_rate": 1.2708706775414333e-07, "logits/chosen": -1.562856912612915, "logits/rejected": -1.5949100255966187, "logps/chosen": -742.255859375, "logps/rejected": -773.4927978515625, "loss": 0.5832, "rewards/accuracies": 0.6875, "rewards/chosen": -2.044926643371582, "rewards/margins": 0.5169533491134644, "rewards/rejected": -2.561879873275757, "step": 506 }, { "epoch": 0.33112907176096007, "grad_norm": 18.237745759610128, "learning_rate": 1.269638184410483e-07, "logits/chosen": -1.6007072925567627, "logits/rejected": -1.627900242805481, "logps/chosen": -760.51708984375, "logps/rejected": -827.0796508789062, "loss": 0.5619, "rewards/accuracies": 0.75, "rewards/chosen": -2.159515619277954, "rewards/margins": 0.579525887966156, "rewards/rejected": -2.739041566848755, "step": 507 }, { "epoch": 0.33178218630092254, "grad_norm": 10.09970310506052, "learning_rate": 1.2684029864946334e-07, "logits/chosen": -1.4610607624053955, "logits/rejected": -1.4745755195617676, "logps/chosen": -622.4526977539062, "logps/rejected": -639.34814453125, "loss": 0.5861, "rewards/accuracies": 0.625, "rewards/chosen": -2.053276538848877, "rewards/margins": 0.30753716826438904, "rewards/rejected": -2.360813617706299, "step": 508 }, { "epoch": 0.33243530084088496, "grad_norm": 29.561202029180375, "learning_rate": 1.2671650902232512e-07, "logits/chosen": -1.5951420068740845, "logits/rejected": -1.6463465690612793, "logps/chosen": -701.150634765625, "logps/rejected": -748.044921875, "loss": 0.5395, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1120245456695557, "rewards/margins": 0.4306832551956177, "rewards/rejected": -2.542707681655884, "step": 509 }, { "epoch": 0.33308841538084744, "grad_norm": 20.140045301365248, "learning_rate": 1.2659245020397487e-07, "logits/chosen": -1.6009819507598877, "logits/rejected": -1.6235804557800293, "logps/chosen": -767.361083984375, "logps/rejected": -796.4589233398438, "loss": 0.6337, "rewards/accuracies": 0.8125, "rewards/chosen": -2.34623646736145, "rewards/margins": 0.62890625, "rewards/rejected": -2.975142478942871, "step": 510 }, { "epoch": 0.33374152992080985, "grad_norm": 20.816657867687876, "learning_rate": 1.2646812284015502e-07, "logits/chosen": -1.5827473402023315, "logits/rejected": -1.5781760215759277, "logps/chosen": -730.8731079101562, "logps/rejected": -745.44775390625, "loss": 0.5493, "rewards/accuracies": 0.78125, "rewards/chosen": -2.1604835987091064, "rewards/margins": 0.3516397774219513, "rewards/rejected": -2.5121235847473145, "step": 511 }, { "epoch": 0.33439464446077233, "grad_norm": 27.37026270313324, "learning_rate": 1.263435275780058e-07, "logits/chosen": -1.5628294944763184, "logits/rejected": -1.5843901634216309, "logps/chosen": -666.028076171875, "logps/rejected": -709.0813598632812, "loss": 0.646, "rewards/accuracies": 0.5625, "rewards/chosen": -1.901466965675354, "rewards/margins": 0.44154587388038635, "rewards/rejected": -2.343013048171997, "step": 512 }, { "epoch": 0.33504775900073475, "grad_norm": 35.16457922567049, "learning_rate": 1.262186650660619e-07, "logits/chosen": -1.5470932722091675, "logits/rejected": -1.5630905628204346, "logps/chosen": -718.0603637695312, "logps/rejected": -732.3851928710938, "loss": 0.5527, "rewards/accuracies": 0.75, "rewards/chosen": -2.389463424682617, "rewards/margins": 0.3482608497142792, "rewards/rejected": -2.7377243041992188, "step": 513 }, { "epoch": 0.3357008735406972, "grad_norm": 70.3813885396455, "learning_rate": 1.2609353595424905e-07, "logits/chosen": -1.5562270879745483, "logits/rejected": -1.5387725830078125, "logps/chosen": -601.1290893554688, "logps/rejected": -723.7789916992188, "loss": 0.5382, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8300285339355469, "rewards/margins": 0.7093777060508728, "rewards/rejected": -2.5394062995910645, "step": 514 }, { "epoch": 0.33635398808065964, "grad_norm": 40.988995422153955, "learning_rate": 1.2596814089388074e-07, "logits/chosen": -1.646407127380371, "logits/rejected": -1.622480869293213, "logps/chosen": -724.177001953125, "logps/rejected": -750.5733032226562, "loss": 0.5101, "rewards/accuracies": 0.84375, "rewards/chosen": -2.0493416786193848, "rewards/margins": 0.4716246724128723, "rewards/rejected": -2.520966053009033, "step": 515 }, { "epoch": 0.3370071026206221, "grad_norm": 22.872898048232333, "learning_rate": 1.2584248053765463e-07, "logits/chosen": -1.668154239654541, "logits/rejected": -1.6877217292785645, "logps/chosen": -788.1798095703125, "logps/rejected": -882.4114990234375, "loss": 0.5467, "rewards/accuracies": 0.78125, "rewards/chosen": -2.5617947578430176, "rewards/margins": 0.7210710048675537, "rewards/rejected": -3.2828657627105713, "step": 516 }, { "epoch": 0.33766021716058453, "grad_norm": 62.010832881435796, "learning_rate": 1.257165555396494e-07, "logits/chosen": -1.6311254501342773, "logits/rejected": -1.603393793106079, "logps/chosen": -683.9799194335938, "logps/rejected": -818.5030517578125, "loss": 0.5689, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8681172132492065, "rewards/margins": 0.7638627290725708, "rewards/rejected": -2.6319799423217773, "step": 517 }, { "epoch": 0.338313331700547, "grad_norm": 47.93407261910419, "learning_rate": 1.2559036655532116e-07, "logits/chosen": -1.639786958694458, "logits/rejected": -1.6281380653381348, "logps/chosen": -655.11083984375, "logps/rejected": -711.305419921875, "loss": 0.6042, "rewards/accuracies": 0.65625, "rewards/chosen": -1.9371042251586914, "rewards/margins": 0.29598766565322876, "rewards/rejected": -2.2330920696258545, "step": 518 }, { "epoch": 0.3389664462405094, "grad_norm": 8.481962150515601, "learning_rate": 1.2546391424150015e-07, "logits/chosen": -1.5316431522369385, "logits/rejected": -1.5820224285125732, "logps/chosen": -723.28173828125, "logps/rejected": -765.1573486328125, "loss": 0.5594, "rewards/accuracies": 0.6875, "rewards/chosen": -2.011814594268799, "rewards/margins": 0.4189469814300537, "rewards/rejected": -2.4307615756988525, "step": 519 }, { "epoch": 0.3396195607804719, "grad_norm": 16.110076519841456, "learning_rate": 1.2533719925638722e-07, "logits/chosen": -1.561875820159912, "logits/rejected": -1.563469409942627, "logps/chosen": -703.0348510742188, "logps/rejected": -769.7527465820312, "loss": 0.5457, "rewards/accuracies": 0.71875, "rewards/chosen": -1.9231442213058472, "rewards/margins": 0.762036144733429, "rewards/rejected": -2.685180187225342, "step": 520 }, { "epoch": 0.3402726753204343, "grad_norm": 12.232146508628508, "learning_rate": 1.2521022225955051e-07, "logits/chosen": -1.6638747453689575, "logits/rejected": -1.675110101699829, "logps/chosen": -702.4622802734375, "logps/rejected": -841.2349853515625, "loss": 0.5551, "rewards/accuracies": 0.71875, "rewards/chosen": -2.236776351928711, "rewards/margins": 0.7252943515777588, "rewards/rejected": -2.962070941925049, "step": 521 }, { "epoch": 0.3409257898603968, "grad_norm": 20.099246846498655, "learning_rate": 1.2508298391192192e-07, "logits/chosen": -1.6344259977340698, "logits/rejected": -1.6060516834259033, "logps/chosen": -690.3515625, "logps/rejected": -725.8463134765625, "loss": 0.5615, "rewards/accuracies": 0.78125, "rewards/chosen": -2.1555328369140625, "rewards/margins": 0.24002605676651, "rewards/rejected": -2.3955588340759277, "step": 522 }, { "epoch": 0.3415789044003592, "grad_norm": 28.678293250926668, "learning_rate": 1.2495548487579377e-07, "logits/chosen": -1.6345038414001465, "logits/rejected": -1.6450937986373901, "logps/chosen": -744.6649780273438, "logps/rejected": -786.7367553710938, "loss": 0.561, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3111023902893066, "rewards/margins": 0.4379115700721741, "rewards/rejected": -2.749014377593994, "step": 523 }, { "epoch": 0.3422320189403217, "grad_norm": 55.26079414913828, "learning_rate": 1.248277258148152e-07, "logits/chosen": -1.632555365562439, "logits/rejected": -1.6509549617767334, "logps/chosen": -782.6264038085938, "logps/rejected": -826.6570434570312, "loss": 0.5775, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1081671714782715, "rewards/margins": 0.5822569131851196, "rewards/rejected": -2.6904244422912598, "step": 524 }, { "epoch": 0.3428851334802841, "grad_norm": 25.386727503372587, "learning_rate": 1.2469970739398895e-07, "logits/chosen": -1.5985530614852905, "logits/rejected": -1.6135672330856323, "logps/chosen": -738.0941772460938, "logps/rejected": -705.9267578125, "loss": 0.6116, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2364561557769775, "rewards/margins": 0.3205564022064209, "rewards/rejected": -2.5570127964019775, "step": 525 }, { "epoch": 0.3435382480202466, "grad_norm": 24.51284417618824, "learning_rate": 1.2457143027966763e-07, "logits/chosen": -1.634655475616455, "logits/rejected": -1.6211440563201904, "logps/chosen": -750.4459228515625, "logps/rejected": -730.4688720703125, "loss": 0.5521, "rewards/accuracies": 0.6875, "rewards/chosen": -2.091447353363037, "rewards/margins": 0.35747280716896057, "rewards/rejected": -2.448920249938965, "step": 526 }, { "epoch": 0.344191362560209, "grad_norm": 44.66836450950333, "learning_rate": 1.2444289513955052e-07, "logits/chosen": -1.5693323612213135, "logits/rejected": -1.6260815858840942, "logps/chosen": -779.1264038085938, "logps/rejected": -830.4940185546875, "loss": 0.5846, "rewards/accuracies": 0.71875, "rewards/chosen": -2.496044635772705, "rewards/margins": 0.4798283278942108, "rewards/rejected": -2.975872755050659, "step": 527 }, { "epoch": 0.34484447710017146, "grad_norm": 58.69879433030782, "learning_rate": 1.2431410264267977e-07, "logits/chosen": -1.604842185974121, "logits/rejected": -1.604213833808899, "logps/chosen": -802.912109375, "logps/rejected": -868.535888671875, "loss": 0.5993, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3447303771972656, "rewards/margins": 0.5476406216621399, "rewards/rejected": -2.8923707008361816, "step": 528 }, { "epoch": 0.3454975916401339, "grad_norm": 36.25638810204293, "learning_rate": 1.2418505345943732e-07, "logits/chosen": -1.5913817882537842, "logits/rejected": -1.631279468536377, "logps/chosen": -760.42138671875, "logps/rejected": -691.488037109375, "loss": 0.582, "rewards/accuracies": 0.5625, "rewards/chosen": -2.126579999923706, "rewards/margins": 0.14462479948997498, "rewards/rejected": -2.2712044715881348, "step": 529 }, { "epoch": 0.34615070618009636, "grad_norm": 22.96730047945058, "learning_rate": 1.24055748261541e-07, "logits/chosen": -1.6423602104187012, "logits/rejected": -1.6510244607925415, "logps/chosen": -752.2402954101562, "logps/rejected": -776.55810546875, "loss": 0.5723, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1990549564361572, "rewards/margins": 0.4823690950870514, "rewards/rejected": -2.681424140930176, "step": 530 }, { "epoch": 0.3468038207200588, "grad_norm": 12.704637412947715, "learning_rate": 1.2392618772204144e-07, "logits/chosen": -1.5454316139221191, "logits/rejected": -1.5524265766143799, "logps/chosen": -704.6710815429688, "logps/rejected": -745.8167114257812, "loss": 0.5883, "rewards/accuracies": 0.65625, "rewards/chosen": -2.09757924079895, "rewards/margins": 0.410552978515625, "rewards/rejected": -2.508132219314575, "step": 531 }, { "epoch": 0.34745693526002125, "grad_norm": 39.764198564369345, "learning_rate": 1.2379637251531815e-07, "logits/chosen": -1.6019591093063354, "logits/rejected": -1.6283773183822632, "logps/chosen": -743.0405883789062, "logps/rejected": -784.7008666992188, "loss": 0.5559, "rewards/accuracies": 0.78125, "rewards/chosen": -2.1056554317474365, "rewards/margins": 0.5337550044059753, "rewards/rejected": -2.6394104957580566, "step": 532 }, { "epoch": 0.34811004979998367, "grad_norm": 34.128906613498735, "learning_rate": 1.2366630331707633e-07, "logits/chosen": -1.6114816665649414, "logits/rejected": -1.6253386735916138, "logps/chosen": -733.2994995117188, "logps/rejected": -764.6762084960938, "loss": 0.5313, "rewards/accuracies": 0.71875, "rewards/chosen": -2.0889058113098145, "rewards/margins": 0.47615137696266174, "rewards/rejected": -2.5650570392608643, "step": 533 }, { "epoch": 0.34876316433994614, "grad_norm": 10.132934728994826, "learning_rate": 1.2353598080434324e-07, "logits/chosen": -1.632277250289917, "logits/rejected": -1.6106319427490234, "logps/chosen": -824.1144409179688, "logps/rejected": -907.9232177734375, "loss": 0.556, "rewards/accuracies": 0.8125, "rewards/chosen": -2.394355297088623, "rewards/margins": 0.591341495513916, "rewards/rejected": -2.985696792602539, "step": 534 }, { "epoch": 0.34941627887990856, "grad_norm": 27.01945814074106, "learning_rate": 1.234054056554646e-07, "logits/chosen": -1.5513118505477905, "logits/rejected": -1.5469458103179932, "logps/chosen": -851.3035278320312, "logps/rejected": -767.6931762695312, "loss": 0.598, "rewards/accuracies": 0.71875, "rewards/chosen": -2.3710482120513916, "rewards/margins": 0.34137019515037537, "rewards/rejected": -2.7124183177948, "step": 535 }, { "epoch": 0.35006939341987103, "grad_norm": 20.633383781514684, "learning_rate": 1.2327457855010123e-07, "logits/chosen": -1.5809993743896484, "logits/rejected": -1.5902115106582642, "logps/chosen": -829.6868896484375, "logps/rejected": -818.66455078125, "loss": 0.5935, "rewards/accuracies": 0.59375, "rewards/chosen": -2.530834913253784, "rewards/margins": 0.49049264192581177, "rewards/rejected": -3.021327495574951, "step": 536 }, { "epoch": 0.35072250795983345, "grad_norm": 8.274651730353373, "learning_rate": 1.2314350016922534e-07, "logits/chosen": -1.634692907333374, "logits/rejected": -1.593540072441101, "logps/chosen": -737.755126953125, "logps/rejected": -739.923583984375, "loss": 0.5132, "rewards/accuracies": 0.71875, "rewards/chosen": -2.248922348022461, "rewards/margins": 0.5101175308227539, "rewards/rejected": -2.759039878845215, "step": 537 }, { "epoch": 0.3513756224997959, "grad_norm": 10.192458514719585, "learning_rate": 1.2301217119511708e-07, "logits/chosen": -1.5304259061813354, "logits/rejected": -1.5202865600585938, "logps/chosen": -765.9959106445312, "logps/rejected": -765.4609375, "loss": 0.5613, "rewards/accuracies": 0.65625, "rewards/chosen": -2.386655330657959, "rewards/margins": 0.377580851316452, "rewards/rejected": -2.7642359733581543, "step": 538 }, { "epoch": 0.35202873703975834, "grad_norm": 34.040881919948525, "learning_rate": 1.2288059231136108e-07, "logits/chosen": -1.532372236251831, "logits/rejected": -1.5646611452102661, "logps/chosen": -741.4403686523438, "logps/rejected": -813.6516723632812, "loss": 0.5147, "rewards/accuracies": 0.84375, "rewards/chosen": -2.1854164600372314, "rewards/margins": 0.7029819488525391, "rewards/rejected": -2.8883986473083496, "step": 539 }, { "epoch": 0.3526818515797208, "grad_norm": 12.432751143916052, "learning_rate": 1.2274876420284258e-07, "logits/chosen": -1.6108044385910034, "logits/rejected": -1.6093095541000366, "logps/chosen": -760.4302978515625, "logps/rejected": -788.9061889648438, "loss": 0.5564, "rewards/accuracies": 0.65625, "rewards/chosen": -2.301989793777466, "rewards/margins": 0.3381144404411316, "rewards/rejected": -2.640104293823242, "step": 540 }, { "epoch": 0.35333496611968324, "grad_norm": 30.749409766082465, "learning_rate": 1.2261668755574421e-07, "logits/chosen": -1.5781821012496948, "logits/rejected": -1.6185979843139648, "logps/chosen": -819.251220703125, "logps/rejected": -834.0662841796875, "loss": 0.5351, "rewards/accuracies": 0.78125, "rewards/chosen": -2.3528733253479004, "rewards/margins": 0.5704101324081421, "rewards/rejected": -2.923283338546753, "step": 541 }, { "epoch": 0.3539880806596457, "grad_norm": 29.16774400285802, "learning_rate": 1.2248436305754222e-07, "logits/chosen": -1.648099422454834, "logits/rejected": -1.6576378345489502, "logps/chosen": -712.8369140625, "logps/rejected": -755.3017578125, "loss": 0.5369, "rewards/accuracies": 0.71875, "rewards/chosen": -1.99370276927948, "rewards/margins": 0.4913850426673889, "rewards/rejected": -2.4850876331329346, "step": 542 }, { "epoch": 0.35464119519960813, "grad_norm": 20.69027911099315, "learning_rate": 1.2235179139700304e-07, "logits/chosen": -1.6414737701416016, "logits/rejected": -1.624894142150879, "logps/chosen": -813.76611328125, "logps/rejected": -831.1404418945312, "loss": 0.5981, "rewards/accuracies": 0.75, "rewards/chosen": -2.3536837100982666, "rewards/margins": 0.42595234513282776, "rewards/rejected": -2.7796361446380615, "step": 543 }, { "epoch": 0.3552943097395706, "grad_norm": 29.949634840397668, "learning_rate": 1.222189732641795e-07, "logits/chosen": -1.5087977647781372, "logits/rejected": -1.4944639205932617, "logps/chosen": -627.0595703125, "logps/rejected": -755.531005859375, "loss": 0.5464, "rewards/accuracies": 0.90625, "rewards/chosen": -1.9218859672546387, "rewards/margins": 1.039351463317871, "rewards/rejected": -2.9612374305725098, "step": 544 }, { "epoch": 0.355947424279533, "grad_norm": 14.943355173162043, "learning_rate": 1.220859093504074e-07, "logits/chosen": -1.5890822410583496, "logits/rejected": -1.5994880199432373, "logps/chosen": -857.3690185546875, "logps/rejected": -871.0678100585938, "loss": 0.5952, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6291110515594482, "rewards/margins": 0.46089449524879456, "rewards/rejected": -3.09000563621521, "step": 545 }, { "epoch": 0.3566005388194955, "grad_norm": 15.565475336026891, "learning_rate": 1.2195260034830187e-07, "logits/chosen": -1.6251795291900635, "logits/rejected": -1.6245146989822388, "logps/chosen": -726.9799194335938, "logps/rejected": -732.9411010742188, "loss": 0.5526, "rewards/accuracies": 0.59375, "rewards/chosen": -2.323512077331543, "rewards/margins": 0.2341621220111847, "rewards/rejected": -2.5576741695404053, "step": 546 }, { "epoch": 0.3572536533594579, "grad_norm": 23.991791905870468, "learning_rate": 1.2181904695175374e-07, "logits/chosen": -1.4712308645248413, "logits/rejected": -1.4625868797302246, "logps/chosen": -717.213134765625, "logps/rejected": -820.203857421875, "loss": 0.6012, "rewards/accuracies": 0.71875, "rewards/chosen": -2.27494740486145, "rewards/margins": 0.7880808115005493, "rewards/rejected": -3.063028335571289, "step": 547 }, { "epoch": 0.3579067678994204, "grad_norm": 35.7010085328891, "learning_rate": 1.2168524985592597e-07, "logits/chosen": -1.5415699481964111, "logits/rejected": -1.5597350597381592, "logps/chosen": -772.77978515625, "logps/rejected": -746.6201782226562, "loss": 0.5508, "rewards/accuracies": 0.5625, "rewards/chosen": -2.421182155609131, "rewards/margins": 0.254733145236969, "rewards/rejected": -2.675915241241455, "step": 548 }, { "epoch": 0.3585598824393828, "grad_norm": 14.830303633870002, "learning_rate": 1.2155120975724996e-07, "logits/chosen": -1.5214571952819824, "logits/rejected": -1.5628215074539185, "logps/chosen": -706.9215087890625, "logps/rejected": -791.664794921875, "loss": 0.5483, "rewards/accuracies": 0.875, "rewards/chosen": -2.0701212882995605, "rewards/margins": 0.7649804949760437, "rewards/rejected": -2.83510160446167, "step": 549 }, { "epoch": 0.3592129969793453, "grad_norm": 11.60924722942488, "learning_rate": 1.214169273534221e-07, "logits/chosen": -1.5903964042663574, "logits/rejected": -1.6238888502120972, "logps/chosen": -754.437255859375, "logps/rejected": -765.71435546875, "loss": 0.5506, "rewards/accuracies": 0.75, "rewards/chosen": -2.2985119819641113, "rewards/margins": 0.33853238821029663, "rewards/rejected": -2.6370441913604736, "step": 550 }, { "epoch": 0.3598661115193077, "grad_norm": 19.208616054146393, "learning_rate": 1.2128240334339978e-07, "logits/chosen": -1.6252080202102661, "logits/rejected": -1.6387659311294556, "logps/chosen": -827.2951049804688, "logps/rejected": -865.0391845703125, "loss": 0.5597, "rewards/accuracies": 0.84375, "rewards/chosen": -2.646684408187866, "rewards/margins": 0.5018330812454224, "rewards/rejected": -3.14851713180542, "step": 551 }, { "epoch": 0.36051922605927017, "grad_norm": 15.97827516501005, "learning_rate": 1.211476384273982e-07, "logits/chosen": -1.6076312065124512, "logits/rejected": -1.6076362133026123, "logps/chosen": -735.5797119140625, "logps/rejected": -755.5748291015625, "loss": 0.5435, "rewards/accuracies": 0.71875, "rewards/chosen": -2.272061347961426, "rewards/margins": 0.8418019413948059, "rewards/rejected": -3.113863229751587, "step": 552 }, { "epoch": 0.3611723405992326, "grad_norm": 12.277648543582611, "learning_rate": 1.2101263330688638e-07, "logits/chosen": -1.6942815780639648, "logits/rejected": -1.6733278036117554, "logps/chosen": -777.0508422851562, "logps/rejected": -763.1177368164062, "loss": 0.5924, "rewards/accuracies": 0.625, "rewards/chosen": -2.428075075149536, "rewards/margins": 0.363750159740448, "rewards/rejected": -2.79182505607605, "step": 553 }, { "epoch": 0.36182545513919506, "grad_norm": 10.465467914806046, "learning_rate": 1.208773886845837e-07, "logits/chosen": -1.604886531829834, "logits/rejected": -1.6047513484954834, "logps/chosen": -739.2161865234375, "logps/rejected": -860.6815185546875, "loss": 0.511, "rewards/accuracies": 0.65625, "rewards/chosen": -2.230139970779419, "rewards/margins": 0.676809549331665, "rewards/rejected": -2.906949520111084, "step": 554 }, { "epoch": 0.3624785696791575, "grad_norm": 21.364165430156397, "learning_rate": 1.2074190526445616e-07, "logits/chosen": -1.6185420751571655, "logits/rejected": -1.6467700004577637, "logps/chosen": -812.8031005859375, "logps/rejected": -845.0140991210938, "loss": 0.4957, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4488282203674316, "rewards/margins": 0.606843113899231, "rewards/rejected": -3.055671453475952, "step": 555 }, { "epoch": 0.36313168421911995, "grad_norm": 12.060857675840488, "learning_rate": 1.2060618375171275e-07, "logits/chosen": -1.582899808883667, "logits/rejected": -1.5877315998077393, "logps/chosen": -716.4563598632812, "logps/rejected": -797.8826293945312, "loss": 0.5325, "rewards/accuracies": 0.59375, "rewards/chosen": -2.498225688934326, "rewards/margins": 0.3760990798473358, "rewards/rejected": -2.8743247985839844, "step": 556 }, { "epoch": 0.3637847987590824, "grad_norm": 13.492857611613953, "learning_rate": 1.2047022485280168e-07, "logits/chosen": -1.539080023765564, "logits/rejected": -1.570099115371704, "logps/chosen": -700.08203125, "logps/rejected": -716.7232666015625, "loss": 0.5849, "rewards/accuracies": 0.78125, "rewards/chosen": -2.2837417125701904, "rewards/margins": 0.5059683322906494, "rewards/rejected": -2.78971004486084, "step": 557 }, { "epoch": 0.36443791329904485, "grad_norm": 33.571052467052354, "learning_rate": 1.2033402927540688e-07, "logits/chosen": -1.5120295286178589, "logits/rejected": -1.4916774034500122, "logps/chosen": -786.6419677734375, "logps/rejected": -877.5504760742188, "loss": 0.5212, "rewards/accuracies": 0.84375, "rewards/chosen": -2.3839404582977295, "rewards/margins": 0.8281471133232117, "rewards/rejected": -3.212087631225586, "step": 558 }, { "epoch": 0.36509102783900726, "grad_norm": 49.87336711471519, "learning_rate": 1.2019759772844423e-07, "logits/chosen": -1.570830225944519, "logits/rejected": -1.612029790878296, "logps/chosen": -759.057373046875, "logps/rejected": -869.3032836914062, "loss": 0.608, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1914572715759277, "rewards/margins": 0.8840473890304565, "rewards/rejected": -3.075504779815674, "step": 559 }, { "epoch": 0.36574414237896974, "grad_norm": 74.5712473751751, "learning_rate": 1.2006093092205777e-07, "logits/chosen": -1.4767194986343384, "logits/rejected": -1.4949413537979126, "logps/chosen": -698.3121948242188, "logps/rejected": -743.3089599609375, "loss": 0.5547, "rewards/accuracies": 0.84375, "rewards/chosen": -2.0461912155151367, "rewards/margins": 0.5971631407737732, "rewards/rejected": -2.643354654312134, "step": 560 }, { "epoch": 0.36639725691893216, "grad_norm": 44.74846351811676, "learning_rate": 1.199240295676162e-07, "logits/chosen": -1.5869390964508057, "logits/rejected": -1.600525140762329, "logps/chosen": -759.1281127929688, "logps/rejected": -777.5281372070312, "loss": 0.6048, "rewards/accuracies": 0.71875, "rewards/chosen": -2.285490036010742, "rewards/margins": 0.29206350445747375, "rewards/rejected": -2.5775535106658936, "step": 561 }, { "epoch": 0.36705037145889463, "grad_norm": 10.999921054518422, "learning_rate": 1.1978689437770896e-07, "logits/chosen": -1.5283629894256592, "logits/rejected": -1.5792925357818604, "logps/chosen": -674.9652709960938, "logps/rejected": -813.370849609375, "loss": 0.5122, "rewards/accuracies": 0.875, "rewards/chosen": -2.0200095176696777, "rewards/margins": 0.8305485844612122, "rewards/rejected": -2.850558280944824, "step": 562 }, { "epoch": 0.36770348599885705, "grad_norm": 55.10455252064543, "learning_rate": 1.1964952606614276e-07, "logits/chosen": -1.6453619003295898, "logits/rejected": -1.6702425479888916, "logps/chosen": -794.0926513671875, "logps/rejected": -870.9822998046875, "loss": 0.5045, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6258649826049805, "rewards/margins": 0.731311559677124, "rewards/rejected": -3.3571763038635254, "step": 563 }, { "epoch": 0.3683566005388195, "grad_norm": 24.477947303915716, "learning_rate": 1.1951192534793764e-07, "logits/chosen": -1.5885945558547974, "logits/rejected": -1.5889862775802612, "logps/chosen": -740.5401611328125, "logps/rejected": -770.5545654296875, "loss": 0.5242, "rewards/accuracies": 0.65625, "rewards/chosen": -2.3373959064483643, "rewards/margins": 0.3964667022228241, "rewards/rejected": -2.7338626384735107, "step": 564 }, { "epoch": 0.36900971507878194, "grad_norm": 18.91974104462604, "learning_rate": 1.193740929393234e-07, "logits/chosen": -1.5289589166641235, "logits/rejected": -1.5133321285247803, "logps/chosen": -695.8096923828125, "logps/rejected": -892.4715576171875, "loss": 0.5767, "rewards/accuracies": 0.78125, "rewards/chosen": -2.182143211364746, "rewards/margins": 1.0010870695114136, "rewards/rejected": -3.18323016166687, "step": 565 }, { "epoch": 0.36966282961874436, "grad_norm": 96.53142007828465, "learning_rate": 1.1923602955773583e-07, "logits/chosen": -1.585618257522583, "logits/rejected": -1.580026626586914, "logps/chosen": -676.3846435546875, "logps/rejected": -825.5516357421875, "loss": 0.5286, "rewards/accuracies": 0.8125, "rewards/chosen": -2.03085994720459, "rewards/margins": 0.9540601968765259, "rewards/rejected": -2.984920024871826, "step": 566 }, { "epoch": 0.37031594415870683, "grad_norm": 9.841984604340173, "learning_rate": 1.1909773592181287e-07, "logits/chosen": -1.6656473875045776, "logits/rejected": -1.6531533002853394, "logps/chosen": -783.5050659179688, "logps/rejected": -795.6260986328125, "loss": 0.5741, "rewards/accuracies": 0.8125, "rewards/chosen": -2.256021738052368, "rewards/margins": 0.621117353439331, "rewards/rejected": -2.877139091491699, "step": 567 }, { "epoch": 0.37096905869866925, "grad_norm": 13.260118641189585, "learning_rate": 1.189592127513911e-07, "logits/chosen": -1.5660011768341064, "logits/rejected": -1.5593037605285645, "logps/chosen": -699.3264770507812, "logps/rejected": -718.5197143554688, "loss": 0.5123, "rewards/accuracies": 0.65625, "rewards/chosen": -2.3721680641174316, "rewards/margins": 0.40681201219558716, "rewards/rejected": -2.778980255126953, "step": 568 }, { "epoch": 0.3716221732386317, "grad_norm": 66.7819908806299, "learning_rate": 1.1882046076750176e-07, "logits/chosen": -1.6180167198181152, "logits/rejected": -1.6349365711212158, "logps/chosen": -837.523681640625, "logps/rejected": -804.4877319335938, "loss": 0.5133, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4943768978118896, "rewards/margins": 0.5837184190750122, "rewards/rejected": -3.078094959259033, "step": 569 }, { "epoch": 0.37227528777859414, "grad_norm": 29.37833169883412, "learning_rate": 1.186814806923671e-07, "logits/chosen": -1.5915193557739258, "logits/rejected": -1.5684032440185547, "logps/chosen": -738.4844360351562, "logps/rejected": -740.8411254882812, "loss": 0.5667, "rewards/accuracies": 0.75, "rewards/chosen": -2.2804222106933594, "rewards/margins": 0.37493157386779785, "rewards/rejected": -2.6553537845611572, "step": 570 }, { "epoch": 0.3729284023185566, "grad_norm": 48.10084583273166, "learning_rate": 1.1854227324939669e-07, "logits/chosen": -1.612912654876709, "logits/rejected": -1.5692781209945679, "logps/chosen": -897.7774658203125, "logps/rejected": -913.3131103515625, "loss": 0.5835, "rewards/accuracies": 0.71875, "rewards/chosen": -2.936650276184082, "rewards/margins": 0.400768518447876, "rewards/rejected": -3.337418794631958, "step": 571 }, { "epoch": 0.37358151685851904, "grad_norm": 11.716756841277345, "learning_rate": 1.1840283916318347e-07, "logits/chosen": -1.5072287321090698, "logits/rejected": -1.4601385593414307, "logps/chosen": -745.0501708984375, "logps/rejected": -801.019775390625, "loss": 0.5261, "rewards/accuracies": 0.6875, "rewards/chosen": -2.25480055809021, "rewards/margins": 0.8588010668754578, "rewards/rejected": -3.1136016845703125, "step": 572 }, { "epoch": 0.3742346313984815, "grad_norm": 18.580715359087044, "learning_rate": 1.1826317915950021e-07, "logits/chosen": -1.5958278179168701, "logits/rejected": -1.6139434576034546, "logps/chosen": -767.4638061523438, "logps/rejected": -891.0325927734375, "loss": 0.5267, "rewards/accuracies": 0.78125, "rewards/chosen": -2.512181043624878, "rewards/margins": 0.7721086144447327, "rewards/rejected": -3.284290075302124, "step": 573 }, { "epoch": 0.37488774593844393, "grad_norm": 24.286745313641156, "learning_rate": 1.181232939652955e-07, "logits/chosen": -1.5305031538009644, "logits/rejected": -1.4823194742202759, "logps/chosen": -748.1857299804688, "logps/rejected": -847.1156005859375, "loss": 0.5462, "rewards/accuracies": 0.84375, "rewards/chosen": -2.5048766136169434, "rewards/margins": 0.7592265009880066, "rewards/rejected": -3.2641031742095947, "step": 574 }, { "epoch": 0.3755408604784064, "grad_norm": 29.652610575297764, "learning_rate": 1.1798318430869012e-07, "logits/chosen": -1.5794718265533447, "logits/rejected": -1.5874780416488647, "logps/chosen": -794.8291625976562, "logps/rejected": -825.0755615234375, "loss": 0.5033, "rewards/accuracies": 0.84375, "rewards/chosen": -2.382725238800049, "rewards/margins": 0.7552610039710999, "rewards/rejected": -3.137986183166504, "step": 575 }, { "epoch": 0.3761939750183688, "grad_norm": 58.912701269068016, "learning_rate": 1.1784285091897322e-07, "logits/chosen": -1.6159145832061768, "logits/rejected": -1.6184728145599365, "logps/chosen": -892.6289672851562, "logps/rejected": -935.57568359375, "loss": 0.5889, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7877440452575684, "rewards/margins": 0.5180625319480896, "rewards/rejected": -3.3058066368103027, "step": 576 }, { "epoch": 0.3768470895583313, "grad_norm": 92.84475707292661, "learning_rate": 1.1770229452659852e-07, "logits/chosen": -1.5818086862564087, "logits/rejected": -1.583407998085022, "logps/chosen": -678.228515625, "logps/rejected": -849.5630493164062, "loss": 0.5562, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2353968620300293, "rewards/margins": 1.1506848335266113, "rewards/rejected": -3.386082172393799, "step": 577 }, { "epoch": 0.3775002040982937, "grad_norm": 24.53245073109831, "learning_rate": 1.1756151586318044e-07, "logits/chosen": -1.5623910427093506, "logits/rejected": -1.6067014932632446, "logps/chosen": -806.4697875976562, "logps/rejected": -860.4945068359375, "loss": 0.5023, "rewards/accuracies": 0.78125, "rewards/chosen": -2.364492654800415, "rewards/margins": 0.7060398459434509, "rewards/rejected": -3.07053279876709, "step": 578 }, { "epoch": 0.3781533186382562, "grad_norm": 18.85494518114228, "learning_rate": 1.174205156614904e-07, "logits/chosen": -1.4894192218780518, "logits/rejected": -1.5204534530639648, "logps/chosen": -829.0343017578125, "logps/rejected": -848.0327758789062, "loss": 0.5509, "rewards/accuracies": 0.71875, "rewards/chosen": -2.7499516010284424, "rewards/margins": 0.4761306643486023, "rewards/rejected": -3.2260818481445312, "step": 579 }, { "epoch": 0.3788064331782186, "grad_norm": 15.464999889697193, "learning_rate": 1.1727929465545294e-07, "logits/chosen": -1.5147112607955933, "logits/rejected": -1.6048409938812256, "logps/chosen": -769.5234985351562, "logps/rejected": -810.0481567382812, "loss": 0.6104, "rewards/accuracies": 0.65625, "rewards/chosen": -2.7371156215667725, "rewards/margins": 0.32579949498176575, "rewards/rejected": -3.062915325164795, "step": 580 }, { "epoch": 0.3794595477181811, "grad_norm": 49.00611608700767, "learning_rate": 1.1713785358014193e-07, "logits/chosen": -1.603877305984497, "logits/rejected": -1.6114190816879272, "logps/chosen": -693.6171875, "logps/rejected": -696.5467529296875, "loss": 0.5624, "rewards/accuracies": 0.8125, "rewards/chosen": -2.458164691925049, "rewards/margins": 0.3706192076206207, "rewards/rejected": -2.8287839889526367, "step": 581 }, { "epoch": 0.3801126622581435, "grad_norm": 41.687356680487866, "learning_rate": 1.1699619317177668e-07, "logits/chosen": -1.492578387260437, "logits/rejected": -1.4849687814712524, "logps/chosen": -711.063232421875, "logps/rejected": -779.2449951171875, "loss": 0.5001, "rewards/accuracies": 0.71875, "rewards/chosen": -2.399482250213623, "rewards/margins": 0.7047311663627625, "rewards/rejected": -3.104212999343872, "step": 582 }, { "epoch": 0.38076577679810597, "grad_norm": 40.272642825045395, "learning_rate": 1.1685431416771825e-07, "logits/chosen": -1.5852625370025635, "logits/rejected": -1.6129764318466187, "logps/chosen": -813.8887939453125, "logps/rejected": -817.9891357421875, "loss": 0.5583, "rewards/accuracies": 0.6875, "rewards/chosen": -2.6035213470458984, "rewards/margins": 0.5876621007919312, "rewards/rejected": -3.19118332862854, "step": 583 }, { "epoch": 0.3814188913380684, "grad_norm": 24.667173577610694, "learning_rate": 1.1671221730646543e-07, "logits/chosen": -1.5789833068847656, "logits/rejected": -1.5653014183044434, "logps/chosen": -698.559814453125, "logps/rejected": -799.244384765625, "loss": 0.5128, "rewards/accuracies": 0.75, "rewards/chosen": -2.156489610671997, "rewards/margins": 0.8030795454978943, "rewards/rejected": -2.959568977355957, "step": 584 }, { "epoch": 0.38207200587803086, "grad_norm": 10.056286338983814, "learning_rate": 1.1656990332765101e-07, "logits/chosen": -1.579622745513916, "logits/rejected": -1.5801726579666138, "logps/chosen": -727.0596923828125, "logps/rejected": -783.4789428710938, "loss": 0.5499, "rewards/accuracies": 0.71875, "rewards/chosen": -2.434051036834717, "rewards/margins": 0.5680094957351685, "rewards/rejected": -3.002060651779175, "step": 585 }, { "epoch": 0.3827251204179933, "grad_norm": 10.639118531618282, "learning_rate": 1.1642737297203792e-07, "logits/chosen": -1.550283432006836, "logits/rejected": -1.545919418334961, "logps/chosen": -678.3670043945312, "logps/rejected": -830.1644897460938, "loss": 0.5517, "rewards/accuracies": 0.84375, "rewards/chosen": -2.09962797164917, "rewards/margins": 0.9918718934059143, "rewards/rejected": -3.0914998054504395, "step": 586 }, { "epoch": 0.38337823495795575, "grad_norm": 20.197574332614796, "learning_rate": 1.1628462698151538e-07, "logits/chosen": -1.5244333744049072, "logits/rejected": -1.5570902824401855, "logps/chosen": -690.13720703125, "logps/rejected": -777.196533203125, "loss": 0.5574, "rewards/accuracies": 0.625, "rewards/chosen": -2.285308837890625, "rewards/margins": 0.6684877872467041, "rewards/rejected": -2.95379638671875, "step": 587 }, { "epoch": 0.3840313494979182, "grad_norm": 66.37641884436981, "learning_rate": 1.1614166609909498e-07, "logits/chosen": -1.5359015464782715, "logits/rejected": -1.546217679977417, "logps/chosen": -738.7581176757812, "logps/rejected": -772.0260009765625, "loss": 0.5286, "rewards/accuracies": 0.75, "rewards/chosen": -2.2918949127197266, "rewards/margins": 0.4515775740146637, "rewards/rejected": -2.7434725761413574, "step": 588 }, { "epoch": 0.38468446403788065, "grad_norm": 13.46028213535803, "learning_rate": 1.1599849106890683e-07, "logits/chosen": -1.527596116065979, "logits/rejected": -1.5285677909851074, "logps/chosen": -734.326171875, "logps/rejected": -749.241455078125, "loss": 0.568, "rewards/accuracies": 0.78125, "rewards/chosen": -2.2978053092956543, "rewards/margins": 0.3799334466457367, "rewards/rejected": -2.677738666534424, "step": 589 }, { "epoch": 0.38533757857784307, "grad_norm": 22.97104425543304, "learning_rate": 1.1585510263619577e-07, "logits/chosen": -1.5912518501281738, "logits/rejected": -1.5906767845153809, "logps/chosen": -724.542236328125, "logps/rejected": -752.7775268554688, "loss": 0.6249, "rewards/accuracies": 0.625, "rewards/chosen": -2.344860076904297, "rewards/margins": 0.30979910492897034, "rewards/rejected": -2.6546592712402344, "step": 590 }, { "epoch": 0.38599069311780554, "grad_norm": 10.721634341469683, "learning_rate": 1.157115015473174e-07, "logits/chosen": -1.5702768564224243, "logits/rejected": -1.5887250900268555, "logps/chosen": -711.1195678710938, "logps/rejected": -744.3549194335938, "loss": 0.5678, "rewards/accuracies": 0.75, "rewards/chosen": -2.271317958831787, "rewards/margins": 0.5261479020118713, "rewards/rejected": -2.7974658012390137, "step": 591 }, { "epoch": 0.38664380765776796, "grad_norm": 20.96230357570753, "learning_rate": 1.155676885497342e-07, "logits/chosen": -1.587166666984558, "logits/rejected": -1.5863206386566162, "logps/chosen": -703.5427856445312, "logps/rejected": -760.740234375, "loss": 0.5131, "rewards/accuracies": 0.78125, "rewards/chosen": -2.3268959522247314, "rewards/margins": 0.47407492995262146, "rewards/rejected": -2.800971031188965, "step": 592 }, { "epoch": 0.38729692219773043, "grad_norm": 19.489795146613808, "learning_rate": 1.154236643920117e-07, "logits/chosen": -1.661864161491394, "logits/rejected": -1.6856908798217773, "logps/chosen": -724.410400390625, "logps/rejected": -773.7425537109375, "loss": 0.5241, "rewards/accuracies": 0.8125, "rewards/chosen": -2.489246368408203, "rewards/margins": 0.5750184059143066, "rewards/rejected": -3.064265012741089, "step": 593 }, { "epoch": 0.38795003673769285, "grad_norm": 73.12725772218887, "learning_rate": 1.1527942982381452e-07, "logits/chosen": -1.5697174072265625, "logits/rejected": -1.5859888792037964, "logps/chosen": -754.874267578125, "logps/rejected": -791.1784057617188, "loss": 0.5575, "rewards/accuracies": 0.75, "rewards/chosen": -2.533051013946533, "rewards/margins": 0.5103416442871094, "rewards/rejected": -3.0433928966522217, "step": 594 }, { "epoch": 0.3886031512776553, "grad_norm": 50.458839536472475, "learning_rate": 1.1513498559590252e-07, "logits/chosen": -1.6159231662750244, "logits/rejected": -1.5974745750427246, "logps/chosen": -762.911376953125, "logps/rejected": -743.5185546875, "loss": 0.5855, "rewards/accuracies": 0.78125, "rewards/chosen": -2.3125293254852295, "rewards/margins": 0.30350756645202637, "rewards/rejected": -2.616036891937256, "step": 595 }, { "epoch": 0.38925626581761774, "grad_norm": 10.34566924087224, "learning_rate": 1.1499033246012685e-07, "logits/chosen": -1.5862224102020264, "logits/rejected": -1.5608221292495728, "logps/chosen": -864.4752807617188, "logps/rejected": -983.65087890625, "loss": 0.5411, "rewards/accuracies": 0.84375, "rewards/chosen": -2.47229266166687, "rewards/margins": 0.9073572754859924, "rewards/rejected": -3.379650115966797, "step": 596 }, { "epoch": 0.3899093803575802, "grad_norm": 27.551259833581323, "learning_rate": 1.1484547116942601e-07, "logits/chosen": -1.6429082155227661, "logits/rejected": -1.6100890636444092, "logps/chosen": -772.03076171875, "logps/rejected": -794.9556884765625, "loss": 0.551, "rewards/accuracies": 0.8125, "rewards/chosen": -2.502126693725586, "rewards/margins": 0.5208601951599121, "rewards/rejected": -3.022986888885498, "step": 597 }, { "epoch": 0.39056249489754263, "grad_norm": 21.20639182270992, "learning_rate": 1.1470040247782205e-07, "logits/chosen": -1.5553147792816162, "logits/rejected": -1.5810749530792236, "logps/chosen": -782.7449951171875, "logps/rejected": -797.2606201171875, "loss": 0.5487, "rewards/accuracies": 0.71875, "rewards/chosen": -2.488363742828369, "rewards/margins": 0.4950363039970398, "rewards/rejected": -2.983400344848633, "step": 598 }, { "epoch": 0.3912156094375051, "grad_norm": 61.94478857020836, "learning_rate": 1.1455512714041656e-07, "logits/chosen": -1.567291021347046, "logits/rejected": -1.5616893768310547, "logps/chosen": -682.39306640625, "logps/rejected": -779.3341674804688, "loss": 0.5335, "rewards/accuracies": 0.71875, "rewards/chosen": -2.248433828353882, "rewards/margins": 0.45859482884407043, "rewards/rejected": -2.707028865814209, "step": 599 }, { "epoch": 0.3918687239774675, "grad_norm": 27.096742126205903, "learning_rate": 1.1440964591338669e-07, "logits/chosen": -1.5842782258987427, "logits/rejected": -1.588181734085083, "logps/chosen": -848.2506713867188, "logps/rejected": -845.3145141601562, "loss": 0.5896, "rewards/accuracies": 0.6875, "rewards/chosen": -2.536600112915039, "rewards/margins": 0.5013671517372131, "rewards/rejected": -3.0379669666290283, "step": 600 }, { "epoch": 0.3918687239774675, "eval_logits/chosen": -1.584659457206726, "eval_logits/rejected": -1.5836009979248047, "eval_logps/chosen": -748.0584106445312, "eval_logps/rejected": -799.712158203125, "eval_loss": 0.5452645421028137, "eval_rewards/accuracies": 0.7179999947547913, "eval_rewards/chosen": -2.3771302700042725, "eval_rewards/margins": 0.5975968241691589, "eval_rewards/rejected": -2.9747273921966553, "eval_runtime": 300.4063, "eval_samples_per_second": 13.315, "eval_steps_per_second": 0.832, "step": 600 }, { "epoch": 0.39252183851743, "grad_norm": 12.348978382810378, "learning_rate": 1.142639595539813e-07, "logits/chosen": -1.5698190927505493, "logits/rejected": -1.529393196105957, "logps/chosen": -787.7490234375, "logps/rejected": -788.6868896484375, "loss": 0.5584, "rewards/accuracies": 0.75, "rewards/chosen": -2.350584030151367, "rewards/margins": 0.6392349004745483, "rewards/rejected": -2.989819049835205, "step": 601 }, { "epoch": 0.3931749530573924, "grad_norm": 10.679850846766614, "learning_rate": 1.1411806882051702e-07, "logits/chosen": -1.5929275751113892, "logits/rejected": -1.5805671215057373, "logps/chosen": -723.1961669921875, "logps/rejected": -713.7118530273438, "loss": 0.5297, "rewards/accuracies": 0.71875, "rewards/chosen": -2.31701397895813, "rewards/margins": 0.5196025371551514, "rewards/rejected": -2.836616039276123, "step": 602 }, { "epoch": 0.3938280675973549, "grad_norm": 35.708189094404084, "learning_rate": 1.1397197447237423e-07, "logits/chosen": -1.5436877012252808, "logits/rejected": -1.562942624092102, "logps/chosen": -728.4673461914062, "logps/rejected": -798.8472290039062, "loss": 0.5137, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6131293773651123, "rewards/margins": 0.4990772604942322, "rewards/rejected": -3.11220645904541, "step": 603 }, { "epoch": 0.3944811821373173, "grad_norm": 82.94052901290438, "learning_rate": 1.1382567726999319e-07, "logits/chosen": -1.5964429378509521, "logits/rejected": -1.5941182374954224, "logps/chosen": -796.7896118164062, "logps/rejected": -837.9287109375, "loss": 0.5751, "rewards/accuracies": 0.71875, "rewards/chosen": -2.377279043197632, "rewards/margins": 0.43824002146720886, "rewards/rejected": -2.815519094467163, "step": 604 }, { "epoch": 0.3951342966772798, "grad_norm": 48.03054462615425, "learning_rate": 1.1367917797487002e-07, "logits/chosen": -1.5520787239074707, "logits/rejected": -1.5235430002212524, "logps/chosen": -703.1958618164062, "logps/rejected": -713.5035400390625, "loss": 0.6487, "rewards/accuracies": 0.59375, "rewards/chosen": -2.3644354343414307, "rewards/margins": 0.27925288677215576, "rewards/rejected": -2.643688440322876, "step": 605 }, { "epoch": 0.3957874112172422, "grad_norm": 32.26597426964153, "learning_rate": 1.1353247734955275e-07, "logits/chosen": -1.527002215385437, "logits/rejected": -1.5140247344970703, "logps/chosen": -783.9526977539062, "logps/rejected": -775.091796875, "loss": 0.5689, "rewards/accuracies": 0.65625, "rewards/chosen": -2.74333119392395, "rewards/margins": 0.22180992364883423, "rewards/rejected": -2.9651412963867188, "step": 606 }, { "epoch": 0.3964405257572047, "grad_norm": 10.429513069602487, "learning_rate": 1.133855761576374e-07, "logits/chosen": -1.5365808010101318, "logits/rejected": -1.5101385116577148, "logps/chosen": -651.0933837890625, "logps/rejected": -708.3961181640625, "loss": 0.5409, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9835056066513062, "rewards/margins": 0.3382415473461151, "rewards/rejected": -2.321747303009033, "step": 607 }, { "epoch": 0.3970936402971671, "grad_norm": 19.72537569851452, "learning_rate": 1.1323847516376392e-07, "logits/chosen": -1.5134871006011963, "logits/rejected": -1.545701026916504, "logps/chosen": -684.7098999023438, "logps/rejected": -666.0336303710938, "loss": 0.5726, "rewards/accuracies": 0.5625, "rewards/chosen": -2.495856523513794, "rewards/margins": 0.06803350895643234, "rewards/rejected": -2.563889980316162, "step": 608 }, { "epoch": 0.39774675483712957, "grad_norm": 78.12335940886078, "learning_rate": 1.1309117513361228e-07, "logits/chosen": -1.4830129146575928, "logits/rejected": -1.4823014736175537, "logps/chosen": -758.7376708984375, "logps/rejected": -757.42138671875, "loss": 0.5917, "rewards/accuracies": 0.6875, "rewards/chosen": -2.4975523948669434, "rewards/margins": 0.32604148983955383, "rewards/rejected": -2.823594093322754, "step": 609 }, { "epoch": 0.398399869377092, "grad_norm": 23.094040308095767, "learning_rate": 1.1294367683389848e-07, "logits/chosen": -1.5845017433166504, "logits/rejected": -1.556647539138794, "logps/chosen": -739.6962280273438, "logps/rejected": -748.73876953125, "loss": 0.5258, "rewards/accuracies": 0.625, "rewards/chosen": -2.4809250831604004, "rewards/margins": 0.4438374638557434, "rewards/rejected": -2.924762725830078, "step": 610 }, { "epoch": 0.39905298391705446, "grad_norm": 39.1177725554315, "learning_rate": 1.1279598103237047e-07, "logits/chosen": -1.60092031955719, "logits/rejected": -1.6122784614562988, "logps/chosen": -759.567138671875, "logps/rejected": -808.7623291015625, "loss": 0.5337, "rewards/accuracies": 0.8125, "rewards/chosen": -2.235379695892334, "rewards/margins": 0.5821799039840698, "rewards/rejected": -2.8175594806671143, "step": 611 }, { "epoch": 0.3997060984570169, "grad_norm": 14.877552981780648, "learning_rate": 1.1264808849780429e-07, "logits/chosen": -1.5760529041290283, "logits/rejected": -1.5651963949203491, "logps/chosen": -779.1673583984375, "logps/rejected": -973.266845703125, "loss": 0.5072, "rewards/accuracies": 0.78125, "rewards/chosen": -2.3549463748931885, "rewards/margins": 1.1115797758102417, "rewards/rejected": -3.4665260314941406, "step": 612 }, { "epoch": 0.40035921299697935, "grad_norm": 38.151466897675625, "learning_rate": 1.125e-07, "logits/chosen": -1.5994532108306885, "logits/rejected": -1.62065589427948, "logps/chosen": -740.2821655273438, "logps/rejected": -788.3598022460938, "loss": 0.5041, "rewards/accuracies": 0.875, "rewards/chosen": -2.354248046875, "rewards/margins": 0.6767839193344116, "rewards/rejected": -3.031032085418701, "step": 613 }, { "epoch": 0.40101232753694177, "grad_norm": 31.35161543065443, "learning_rate": 1.123517163097776e-07, "logits/chosen": -1.6029822826385498, "logits/rejected": -1.6242347955703735, "logps/chosen": -775.5657348632812, "logps/rejected": -801.442138671875, "loss": 0.5279, "rewards/accuracies": 0.75, "rewards/chosen": -2.5471549034118652, "rewards/margins": 0.6026932001113892, "rewards/rejected": -3.149848222732544, "step": 614 }, { "epoch": 0.40166544207690424, "grad_norm": 42.24561626020638, "learning_rate": 1.1220323819897319e-07, "logits/chosen": -1.525698184967041, "logits/rejected": -1.528663992881775, "logps/chosen": -715.5992431640625, "logps/rejected": -701.003173828125, "loss": 0.5406, "rewards/accuracies": 0.71875, "rewards/chosen": -2.155259609222412, "rewards/margins": 0.44246095418930054, "rewards/rejected": -2.5977203845977783, "step": 615 }, { "epoch": 0.40231855661686666, "grad_norm": 43.53559559563413, "learning_rate": 1.120545664404348e-07, "logits/chosen": -1.5581973791122437, "logits/rejected": -1.511064052581787, "logps/chosen": -835.9921264648438, "logps/rejected": -890.62744140625, "loss": 0.4782, "rewards/accuracies": 0.84375, "rewards/chosen": -2.300487518310547, "rewards/margins": 0.912307620048523, "rewards/rejected": -3.2127950191497803, "step": 616 }, { "epoch": 0.40297167115682914, "grad_norm": 22.99506036909689, "learning_rate": 1.1190570180801842e-07, "logits/chosen": -1.5422519445419312, "logits/rejected": -1.580669641494751, "logps/chosen": -782.707763671875, "logps/rejected": -868.0604858398438, "loss": 0.5118, "rewards/accuracies": 0.8125, "rewards/chosen": -2.45739483833313, "rewards/margins": 0.641864538192749, "rewards/rejected": -3.0992591381073, "step": 617 }, { "epoch": 0.40362478569679155, "grad_norm": 9.096921333008568, "learning_rate": 1.11756645076584e-07, "logits/chosen": -1.5296615362167358, "logits/rejected": -1.5493112802505493, "logps/chosen": -742.4532470703125, "logps/rejected": -760.0758666992188, "loss": 0.4997, "rewards/accuracies": 0.75, "rewards/chosen": -2.1916160583496094, "rewards/margins": 0.4671185612678528, "rewards/rejected": -2.6587345600128174, "step": 618 }, { "epoch": 0.40427790023675403, "grad_norm": 19.544727760189303, "learning_rate": 1.1160739702199136e-07, "logits/chosen": -1.5654833316802979, "logits/rejected": -1.6011905670166016, "logps/chosen": -678.881103515625, "logps/rejected": -688.1021728515625, "loss": 0.605, "rewards/accuracies": 0.78125, "rewards/chosen": -2.180164098739624, "rewards/margins": 0.3665446937084198, "rewards/rejected": -2.546708583831787, "step": 619 }, { "epoch": 0.40493101477671645, "grad_norm": 33.42410909057005, "learning_rate": 1.1145795842109621e-07, "logits/chosen": -1.4947848320007324, "logits/rejected": -1.5099647045135498, "logps/chosen": -757.1992797851562, "logps/rejected": -829.0934448242188, "loss": 0.6061, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2935428619384766, "rewards/margins": 0.46146607398986816, "rewards/rejected": -2.7550089359283447, "step": 620 }, { "epoch": 0.4055841293166789, "grad_norm": 18.79594230641745, "learning_rate": 1.1130833005174605e-07, "logits/chosen": -1.5316284894943237, "logits/rejected": -1.5386699438095093, "logps/chosen": -728.5615844726562, "logps/rejected": -749.1942138671875, "loss": 0.5481, "rewards/accuracies": 0.59375, "rewards/chosen": -2.2535488605499268, "rewards/margins": 0.5478615760803223, "rewards/rejected": -2.80141019821167, "step": 621 }, { "epoch": 0.40623724385664134, "grad_norm": 8.286798076996332, "learning_rate": 1.1115851269277616e-07, "logits/chosen": -1.6122698783874512, "logits/rejected": -1.6200590133666992, "logps/chosen": -774.5802001953125, "logps/rejected": -867.3875732421875, "loss": 0.5414, "rewards/accuracies": 0.8125, "rewards/chosen": -2.146087884902954, "rewards/margins": 0.6804053783416748, "rewards/rejected": -2.82649302482605, "step": 622 }, { "epoch": 0.4068903583966038, "grad_norm": 36.91218473973044, "learning_rate": 1.1100850712400558e-07, "logits/chosen": -1.392214298248291, "logits/rejected": -1.4010385274887085, "logps/chosen": -678.6796264648438, "logps/rejected": -669.0376586914062, "loss": 0.5327, "rewards/accuracies": 0.59375, "rewards/chosen": -2.425049066543579, "rewards/margins": 0.2620934844017029, "rewards/rejected": -2.6871426105499268, "step": 623 }, { "epoch": 0.40754347293656623, "grad_norm": 7.985703028509244, "learning_rate": 1.1085831412623295e-07, "logits/chosen": -1.5507965087890625, "logits/rejected": -1.5302854776382446, "logps/chosen": -759.2698364257812, "logps/rejected": -793.293701171875, "loss": 0.5103, "rewards/accuracies": 0.75, "rewards/chosen": -2.41506290435791, "rewards/margins": 0.651451826095581, "rewards/rejected": -3.0665149688720703, "step": 624 }, { "epoch": 0.4081965874765287, "grad_norm": 96.15150960782869, "learning_rate": 1.107079344812325e-07, "logits/chosen": -1.5727829933166504, "logits/rejected": -1.583585500717163, "logps/chosen": -742.6324462890625, "logps/rejected": -827.9069213867188, "loss": 0.5681, "rewards/accuracies": 0.59375, "rewards/chosen": -2.324497938156128, "rewards/margins": 0.5150927901268005, "rewards/rejected": -2.839590549468994, "step": 625 }, { "epoch": 0.4088497020164911, "grad_norm": 44.171131036436144, "learning_rate": 1.1055736897175004e-07, "logits/chosen": -1.5920262336730957, "logits/rejected": -1.5330153703689575, "logps/chosen": -811.8800048828125, "logps/rejected": -813.5941772460938, "loss": 0.5467, "rewards/accuracies": 0.625, "rewards/chosen": -2.6739134788513184, "rewards/margins": 0.49917757511138916, "rewards/rejected": -3.173091173171997, "step": 626 }, { "epoch": 0.4095028165564536, "grad_norm": 56.76814949785248, "learning_rate": 1.1040661838149878e-07, "logits/chosen": -1.5579776763916016, "logits/rejected": -1.5597628355026245, "logps/chosen": -807.7301025390625, "logps/rejected": -894.228515625, "loss": 0.4822, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6176559925079346, "rewards/margins": 0.6038171648979187, "rewards/rejected": -3.221472978591919, "step": 627 }, { "epoch": 0.410155931096416, "grad_norm": 26.749818630716884, "learning_rate": 1.1025568349515528e-07, "logits/chosen": -1.4747806787490845, "logits/rejected": -1.4980195760726929, "logps/chosen": -683.0436401367188, "logps/rejected": -686.6530151367188, "loss": 0.5013, "rewards/accuracies": 0.75, "rewards/chosen": -2.168447732925415, "rewards/margins": 0.4241008460521698, "rewards/rejected": -2.5925486087799072, "step": 628 }, { "epoch": 0.4108090456363785, "grad_norm": 76.09203034611112, "learning_rate": 1.1010456509835548e-07, "logits/chosen": -1.6177140474319458, "logits/rejected": -1.6009244918823242, "logps/chosen": -720.8223876953125, "logps/rejected": -741.0955810546875, "loss": 0.5919, "rewards/accuracies": 0.75, "rewards/chosen": -2.544870376586914, "rewards/margins": 0.48653483390808105, "rewards/rejected": -3.031405210494995, "step": 629 }, { "epoch": 0.4114621601763409, "grad_norm": 52.43809285107618, "learning_rate": 1.0995326397769042e-07, "logits/chosen": -1.529783010482788, "logits/rejected": -1.5328000783920288, "logps/chosen": -664.4801025390625, "logps/rejected": -768.4127807617188, "loss": 0.4978, "rewards/accuracies": 0.71875, "rewards/chosen": -2.260840654373169, "rewards/margins": 0.6211092472076416, "rewards/rejected": -2.8819501399993896, "step": 630 }, { "epoch": 0.4121152747163034, "grad_norm": 9.495957129740592, "learning_rate": 1.0980178092070225e-07, "logits/chosen": -1.578983187675476, "logits/rejected": -1.5786319971084595, "logps/chosen": -745.9150390625, "logps/rejected": -808.736083984375, "loss": 0.5322, "rewards/accuracies": 0.71875, "rewards/chosen": -2.5862929821014404, "rewards/margins": 0.4649811387062073, "rewards/rejected": -3.051273822784424, "step": 631 }, { "epoch": 0.4127683892562658, "grad_norm": 40.60424346315586, "learning_rate": 1.0965011671588021e-07, "logits/chosen": -1.5746647119522095, "logits/rejected": -1.554513931274414, "logps/chosen": -797.9632568359375, "logps/rejected": -785.4474487304688, "loss": 0.601, "rewards/accuracies": 0.6875, "rewards/chosen": -2.463163375854492, "rewards/margins": 0.24002833664417267, "rewards/rejected": -2.7031917572021484, "step": 632 }, { "epoch": 0.4134215037962283, "grad_norm": 52.831172156435784, "learning_rate": 1.094982721526563e-07, "logits/chosen": -1.540035367012024, "logits/rejected": -1.5365304946899414, "logps/chosen": -826.9459228515625, "logps/rejected": -893.4970703125, "loss": 0.4984, "rewards/accuracies": 0.75, "rewards/chosen": -3.0703601837158203, "rewards/margins": 0.7675031423568726, "rewards/rejected": -3.8378632068634033, "step": 633 }, { "epoch": 0.4140746183361907, "grad_norm": 15.726901054602733, "learning_rate": 1.0934624802140147e-07, "logits/chosen": -1.5329731702804565, "logits/rejected": -1.5518198013305664, "logps/chosen": -763.7630615234375, "logps/rejected": -806.2711791992188, "loss": 0.5294, "rewards/accuracies": 0.71875, "rewards/chosen": -2.819103479385376, "rewards/margins": 0.638033926486969, "rewards/rejected": -3.457137107849121, "step": 634 }, { "epoch": 0.41472773287615317, "grad_norm": 18.674657678831167, "learning_rate": 1.0919404511342121e-07, "logits/chosen": -1.5634597539901733, "logits/rejected": -1.5584088563919067, "logps/chosen": -730.9223022460938, "logps/rejected": -818.416259765625, "loss": 0.5022, "rewards/accuracies": 0.84375, "rewards/chosen": -2.1986281871795654, "rewards/margins": 0.8553330898284912, "rewards/rejected": -3.0539615154266357, "step": 635 }, { "epoch": 0.4153808474161156, "grad_norm": 19.5510889690524, "learning_rate": 1.0904166422095162e-07, "logits/chosen": -1.5800487995147705, "logits/rejected": -1.5099263191223145, "logps/chosen": -743.9994506835938, "logps/rejected": -761.8692016601562, "loss": 0.5499, "rewards/accuracies": 0.625, "rewards/chosen": -2.8127152919769287, "rewards/margins": 0.35885605216026306, "rewards/rejected": -3.1715714931488037, "step": 636 }, { "epoch": 0.41603396195607806, "grad_norm": 29.4349268646558, "learning_rate": 1.0888910613715523e-07, "logits/chosen": -1.5855112075805664, "logits/rejected": -1.5745254755020142, "logps/chosen": -724.60888671875, "logps/rejected": -817.6202392578125, "loss": 0.5035, "rewards/accuracies": 0.75, "rewards/chosen": -2.4678425788879395, "rewards/margins": 0.9483436346054077, "rewards/rejected": -3.416186571121216, "step": 637 }, { "epoch": 0.4166870764960405, "grad_norm": 27.606512517928415, "learning_rate": 1.0873637165611688e-07, "logits/chosen": -1.5267822742462158, "logits/rejected": -1.5166692733764648, "logps/chosen": -756.2583618164062, "logps/rejected": -905.69287109375, "loss": 0.5459, "rewards/accuracies": 0.65625, "rewards/chosen": -2.325356960296631, "rewards/margins": 0.8004791736602783, "rewards/rejected": -3.125836133956909, "step": 638 }, { "epoch": 0.41734019103600295, "grad_norm": 10.112999624727967, "learning_rate": 1.085834615728396e-07, "logits/chosen": -1.5627224445343018, "logits/rejected": -1.589940071105957, "logps/chosen": -757.5079345703125, "logps/rejected": -797.4879150390625, "loss": 0.5334, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7449193000793457, "rewards/margins": 0.47740480303764343, "rewards/rejected": -3.2223243713378906, "step": 639 }, { "epoch": 0.41799330557596537, "grad_norm": 49.554186475304675, "learning_rate": 1.0843037668324037e-07, "logits/chosen": -1.6192628145217896, "logits/rejected": -1.6073408126831055, "logps/chosen": -692.6961669921875, "logps/rejected": -731.430419921875, "loss": 0.5213, "rewards/accuracies": 0.75, "rewards/chosen": -2.307335376739502, "rewards/margins": 0.42022186517715454, "rewards/rejected": -2.7275571823120117, "step": 640 }, { "epoch": 0.41864642011592784, "grad_norm": 12.12872006128116, "learning_rate": 1.0827711778414616e-07, "logits/chosen": -1.5925276279449463, "logits/rejected": -1.617337703704834, "logps/chosen": -773.3049926757812, "logps/rejected": -886.1497192382812, "loss": 0.5054, "rewards/accuracies": 0.75, "rewards/chosen": -2.7378339767456055, "rewards/margins": 0.9473193287849426, "rewards/rejected": -3.6851532459259033, "step": 641 }, { "epoch": 0.41929953465589026, "grad_norm": 9.420684381655802, "learning_rate": 1.0812368567328965e-07, "logits/chosen": -1.5554149150848389, "logits/rejected": -1.5351797342300415, "logps/chosen": -741.2017211914062, "logps/rejected": -848.9796752929688, "loss": 0.5536, "rewards/accuracies": 0.75, "rewards/chosen": -2.7187960147857666, "rewards/margins": 0.7441635727882385, "rewards/rejected": -3.4629597663879395, "step": 642 }, { "epoch": 0.41995264919585273, "grad_norm": 17.59371870537502, "learning_rate": 1.0797008114930504e-07, "logits/chosen": -1.5738487243652344, "logits/rejected": -1.5535151958465576, "logps/chosen": -871.8766479492188, "logps/rejected": -991.9896240234375, "loss": 0.52, "rewards/accuracies": 0.78125, "rewards/chosen": -2.753142833709717, "rewards/margins": 0.8435327410697937, "rewards/rejected": -3.5966756343841553, "step": 643 }, { "epoch": 0.42060576373581515, "grad_norm": 65.67234375195817, "learning_rate": 1.078163050117241e-07, "logits/chosen": -1.5654664039611816, "logits/rejected": -1.5345994234085083, "logps/chosen": -724.1328735351562, "logps/rejected": -764.9933471679688, "loss": 0.5788, "rewards/accuracies": 0.6875, "rewards/chosen": -2.66972279548645, "rewards/margins": 0.42094868421554565, "rewards/rejected": -3.0906717777252197, "step": 644 }, { "epoch": 0.4212588782757776, "grad_norm": 29.19611600663847, "learning_rate": 1.0766235806097172e-07, "logits/chosen": -1.6037955284118652, "logits/rejected": -1.552761197090149, "logps/chosen": -850.3397216796875, "logps/rejected": -891.075927734375, "loss": 0.498, "rewards/accuracies": 0.8125, "rewards/chosen": -2.936112880706787, "rewards/margins": 0.9705631136894226, "rewards/rejected": -3.9066760540008545, "step": 645 }, { "epoch": 0.42191199281574004, "grad_norm": 55.001913874158426, "learning_rate": 1.0750824109836202e-07, "logits/chosen": -1.63307785987854, "logits/rejected": -1.5964423418045044, "logps/chosen": -867.8717041015625, "logps/rejected": -924.471435546875, "loss": 0.577, "rewards/accuracies": 0.75, "rewards/chosen": -2.914452314376831, "rewards/margins": 0.48612168431282043, "rewards/rejected": -3.40057373046875, "step": 646 }, { "epoch": 0.4225651073557025, "grad_norm": 28.42526755883717, "learning_rate": 1.0735395492609401e-07, "logits/chosen": -1.477207899093628, "logits/rejected": -1.4840601682662964, "logps/chosen": -654.694091796875, "logps/rejected": -722.9656982421875, "loss": 0.5245, "rewards/accuracies": 0.78125, "rewards/chosen": -2.427155017852783, "rewards/margins": 0.7215266227722168, "rewards/rejected": -3.148681879043579, "step": 647 }, { "epoch": 0.42321822189566494, "grad_norm": 20.153695342892174, "learning_rate": 1.0719950034724741e-07, "logits/chosen": -1.6053216457366943, "logits/rejected": -1.58915114402771, "logps/chosen": -833.958251953125, "logps/rejected": -869.42236328125, "loss": 0.5383, "rewards/accuracies": 0.75, "rewards/chosen": -2.8524227142333984, "rewards/margins": 0.6670438051223755, "rewards/rejected": -3.5194666385650635, "step": 648 }, { "epoch": 0.4238713364356274, "grad_norm": 40.45524219360296, "learning_rate": 1.0704487816577857e-07, "logits/chosen": -1.5915526151657104, "logits/rejected": -1.5662232637405396, "logps/chosen": -755.484619140625, "logps/rejected": -798.5493774414062, "loss": 0.5185, "rewards/accuracies": 0.78125, "rewards/chosen": -2.445694923400879, "rewards/margins": 0.6455334424972534, "rewards/rejected": -3.0912282466888428, "step": 649 }, { "epoch": 0.42452445097558983, "grad_norm": 45.48878539730004, "learning_rate": 1.0689008918651624e-07, "logits/chosen": -1.4695310592651367, "logits/rejected": -1.4832361936569214, "logps/chosen": -884.2122802734375, "logps/rejected": -879.8310546875, "loss": 0.5647, "rewards/accuracies": 0.625, "rewards/chosen": -3.048762559890747, "rewards/margins": 0.2833816707134247, "rewards/rejected": -3.332144260406494, "step": 650 }, { "epoch": 0.4251775655155523, "grad_norm": 56.42516159244524, "learning_rate": 1.0673513421515733e-07, "logits/chosen": -1.6036771535873413, "logits/rejected": -1.617321252822876, "logps/chosen": -794.57421875, "logps/rejected": -828.6959838867188, "loss": 0.5217, "rewards/accuracies": 0.78125, "rewards/chosen": -2.69077467918396, "rewards/margins": 0.46351078152656555, "rewards/rejected": -3.154285430908203, "step": 651 }, { "epoch": 0.4258306800555147, "grad_norm": 46.62254829895216, "learning_rate": 1.0658001405826283e-07, "logits/chosen": -1.5648261308670044, "logits/rejected": -1.5797994136810303, "logps/chosen": -772.7039184570312, "logps/rejected": -819.6089477539062, "loss": 0.5396, "rewards/accuracies": 0.75, "rewards/chosen": -2.7385077476501465, "rewards/margins": 0.5473735332489014, "rewards/rejected": -3.285881519317627, "step": 652 }, { "epoch": 0.4264837945954772, "grad_norm": 59.34200258381232, "learning_rate": 1.0642472952325346e-07, "logits/chosen": -1.5929808616638184, "logits/rejected": -1.6374764442443848, "logps/chosen": -735.03662109375, "logps/rejected": -850.94189453125, "loss": 0.4824, "rewards/accuracies": 0.9375, "rewards/chosen": -2.357342004776001, "rewards/margins": 1.2214264869689941, "rewards/rejected": -3.578768253326416, "step": 653 }, { "epoch": 0.4271369091354396, "grad_norm": 37.46472982323093, "learning_rate": 1.062692814184056e-07, "logits/chosen": -1.5046641826629639, "logits/rejected": -1.4834293127059937, "logps/chosen": -670.6259765625, "logps/rejected": -814.8147583007812, "loss": 0.5009, "rewards/accuracies": 0.875, "rewards/chosen": -2.507924795150757, "rewards/margins": 0.8526814579963684, "rewards/rejected": -3.3606061935424805, "step": 654 }, { "epoch": 0.4277900236754021, "grad_norm": 65.41811386558527, "learning_rate": 1.0611367055284704e-07, "logits/chosen": -1.5957469940185547, "logits/rejected": -1.6134015321731567, "logps/chosen": -910.1995239257812, "logps/rejected": -958.0068969726562, "loss": 0.5173, "rewards/accuracies": 0.78125, "rewards/chosen": -2.841102123260498, "rewards/margins": 0.8195388913154602, "rewards/rejected": -3.6606409549713135, "step": 655 }, { "epoch": 0.4284431382153645, "grad_norm": 15.408196632262841, "learning_rate": 1.0595789773655273e-07, "logits/chosen": -1.5747413635253906, "logits/rejected": -1.565165400505066, "logps/chosen": -880.7881469726562, "logps/rejected": -917.05419921875, "loss": 0.5269, "rewards/accuracies": 0.6875, "rewards/chosen": -2.843327760696411, "rewards/margins": 0.5438075661659241, "rewards/rejected": -3.3871350288391113, "step": 656 }, { "epoch": 0.429096252755327, "grad_norm": 12.34615445550627, "learning_rate": 1.0580196378034061e-07, "logits/chosen": -1.5093989372253418, "logits/rejected": -1.528839111328125, "logps/chosen": -804.33837890625, "logps/rejected": -832.3118896484375, "loss": 0.5247, "rewards/accuracies": 0.6875, "rewards/chosen": -2.883572816848755, "rewards/margins": 0.4760682284832001, "rewards/rejected": -3.3596410751342773, "step": 657 }, { "epoch": 0.4297493672952894, "grad_norm": 20.79744618252237, "learning_rate": 1.0564586949586736e-07, "logits/chosen": -1.5655585527420044, "logits/rejected": -1.5473308563232422, "logps/chosen": -784.39208984375, "logps/rejected": -780.73974609375, "loss": 0.5298, "rewards/accuracies": 0.75, "rewards/chosen": -2.7032248973846436, "rewards/margins": 0.7658544182777405, "rewards/rejected": -3.46907901763916, "step": 658 }, { "epoch": 0.43040248183525187, "grad_norm": 9.541823346620628, "learning_rate": 1.0548961569562423e-07, "logits/chosen": -1.551105260848999, "logits/rejected": -1.5648680925369263, "logps/chosen": -728.1370239257812, "logps/rejected": -863.0052490234375, "loss": 0.5929, "rewards/accuracies": 0.6875, "rewards/chosen": -2.4059126377105713, "rewards/margins": 0.7981605529785156, "rewards/rejected": -3.2040724754333496, "step": 659 }, { "epoch": 0.4310555963752143, "grad_norm": 48.77651510525204, "learning_rate": 1.0533320319293272e-07, "logits/chosen": -1.521799087524414, "logits/rejected": -1.5136668682098389, "logps/chosen": -767.982421875, "logps/rejected": -844.113037109375, "loss": 0.5468, "rewards/accuracies": 0.875, "rewards/chosen": -2.578664541244507, "rewards/margins": 0.6596671342849731, "rewards/rejected": -3.2383313179016113, "step": 660 }, { "epoch": 0.43170871091517676, "grad_norm": 14.135228489391437, "learning_rate": 1.0517663280194042e-07, "logits/chosen": -1.541378140449524, "logits/rejected": -1.5287246704101562, "logps/chosen": -854.5809326171875, "logps/rejected": -845.8003540039062, "loss": 0.5743, "rewards/accuracies": 0.6875, "rewards/chosen": -2.7386510372161865, "rewards/margins": 0.5117663145065308, "rewards/rejected": -3.2504169940948486, "step": 661 }, { "epoch": 0.4323618254551392, "grad_norm": 39.96027400878076, "learning_rate": 1.050199053376168e-07, "logits/chosen": -1.5787113904953003, "logits/rejected": -1.5196938514709473, "logps/chosen": -895.7177734375, "logps/rejected": -902.5272827148438, "loss": 0.5603, "rewards/accuracies": 0.71875, "rewards/chosen": -3.0833802223205566, "rewards/margins": 0.44520139694213867, "rewards/rejected": -3.5285820960998535, "step": 662 }, { "epoch": 0.43301493999510166, "grad_norm": 9.336642307745894, "learning_rate": 1.0486302161574876e-07, "logits/chosen": -1.5741958618164062, "logits/rejected": -1.566494345664978, "logps/chosen": -747.9061889648438, "logps/rejected": -897.3228759765625, "loss": 0.5216, "rewards/accuracies": 0.875, "rewards/chosen": -2.3504905700683594, "rewards/margins": 1.2214593887329102, "rewards/rejected": -3.5719499588012695, "step": 663 }, { "epoch": 0.4336680545350641, "grad_norm": 34.74703628355522, "learning_rate": 1.0470598245293676e-07, "logits/chosen": -1.5200121402740479, "logits/rejected": -1.537473201751709, "logps/chosen": -792.9605102539062, "logps/rejected": -799.4251708984375, "loss": 0.5492, "rewards/accuracies": 0.6875, "rewards/chosen": -2.7280704975128174, "rewards/margins": 0.4831348657608032, "rewards/rejected": -3.211205244064331, "step": 664 }, { "epoch": 0.43432116907502655, "grad_norm": 26.166178900391024, "learning_rate": 1.0454878866659017e-07, "logits/chosen": -1.591930627822876, "logits/rejected": -1.5695099830627441, "logps/chosen": -727.5706787109375, "logps/rejected": -792.0450439453125, "loss": 0.5506, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6660943031311035, "rewards/margins": 0.6367244124412537, "rewards/rejected": -3.302818775177002, "step": 665 }, { "epoch": 0.43497428361498897, "grad_norm": 10.372172666812576, "learning_rate": 1.0439144107492328e-07, "logits/chosen": -1.4827499389648438, "logits/rejected": -1.4672393798828125, "logps/chosen": -800.9483032226562, "logps/rejected": -798.546875, "loss": 0.5153, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8575448989868164, "rewards/margins": 0.5324373841285706, "rewards/rejected": -3.389982223510742, "step": 666 }, { "epoch": 0.43562739815495144, "grad_norm": 11.965156702513355, "learning_rate": 1.0423394049695095e-07, "logits/chosen": -1.5964605808258057, "logits/rejected": -1.5629247426986694, "logps/chosen": -812.4568481445312, "logps/rejected": -916.690185546875, "loss": 0.5439, "rewards/accuracies": 0.625, "rewards/chosen": -2.873971939086914, "rewards/margins": 0.766517162322998, "rewards/rejected": -3.640489101409912, "step": 667 }, { "epoch": 0.43628051269491386, "grad_norm": 17.25391970334388, "learning_rate": 1.0407628775248433e-07, "logits/chosen": -1.5258575677871704, "logits/rejected": -1.557664155960083, "logps/chosen": -774.147216796875, "logps/rejected": -935.6986083984375, "loss": 0.4742, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4668354988098145, "rewards/margins": 1.3980622291564941, "rewards/rejected": -3.8648977279663086, "step": 668 }, { "epoch": 0.43693362723487633, "grad_norm": 14.901011998748888, "learning_rate": 1.0391848366212666e-07, "logits/chosen": -1.594072937965393, "logits/rejected": -1.587380051612854, "logps/chosen": -780.095703125, "logps/rejected": -784.4081420898438, "loss": 0.5749, "rewards/accuracies": 0.5625, "rewards/chosen": -2.8471426963806152, "rewards/margins": 0.19949409365653992, "rewards/rejected": -3.0466368198394775, "step": 669 }, { "epoch": 0.43758674177483875, "grad_norm": 46.17531372330182, "learning_rate": 1.0376052904726888e-07, "logits/chosen": -1.6172115802764893, "logits/rejected": -1.5498206615447998, "logps/chosen": -820.4274291992188, "logps/rejected": -908.1333618164062, "loss": 0.5152, "rewards/accuracies": 0.78125, "rewards/chosen": -2.615933656692505, "rewards/margins": 0.7912464737892151, "rewards/rejected": -3.407179832458496, "step": 670 }, { "epoch": 0.4382398563148012, "grad_norm": 51.65597788618139, "learning_rate": 1.0360242473008551e-07, "logits/chosen": -1.5762639045715332, "logits/rejected": -1.594434380531311, "logps/chosen": -824.281982421875, "logps/rejected": -829.8516845703125, "loss": 0.5618, "rewards/accuracies": 0.75, "rewards/chosen": -3.047189712524414, "rewards/margins": 0.6297323703765869, "rewards/rejected": -3.67692232131958, "step": 671 }, { "epoch": 0.43889297085476364, "grad_norm": 33.441136796480215, "learning_rate": 1.0344417153353023e-07, "logits/chosen": -1.4966257810592651, "logits/rejected": -1.459989309310913, "logps/chosen": -817.207763671875, "logps/rejected": -897.0996704101562, "loss": 0.5394, "rewards/accuracies": 0.71875, "rewards/chosen": -2.9682440757751465, "rewards/margins": 0.6141051054000854, "rewards/rejected": -3.5823495388031006, "step": 672 }, { "epoch": 0.4395460853947261, "grad_norm": 31.378606252739424, "learning_rate": 1.0328577028133171e-07, "logits/chosen": -1.5966362953186035, "logits/rejected": -1.6483103036880493, "logps/chosen": -732.49609375, "logps/rejected": -800.7023315429688, "loss": 0.5138, "rewards/accuracies": 0.75, "rewards/chosen": -2.764153003692627, "rewards/margins": 0.5777581930160522, "rewards/rejected": -3.341911554336548, "step": 673 }, { "epoch": 0.44019919993468853, "grad_norm": 34.56431138782349, "learning_rate": 1.0312722179798924e-07, "logits/chosen": -1.598524808883667, "logits/rejected": -1.6316325664520264, "logps/chosen": -787.64892578125, "logps/rejected": -861.8873291015625, "loss": 0.5273, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7730770111083984, "rewards/margins": 0.7392612099647522, "rewards/rejected": -3.5123379230499268, "step": 674 }, { "epoch": 0.440852314474651, "grad_norm": 15.372795733191637, "learning_rate": 1.0296852690876846e-07, "logits/chosen": -1.5273460149765015, "logits/rejected": -1.5593777894973755, "logps/chosen": -790.3528442382812, "logps/rejected": -815.5953979492188, "loss": 0.5533, "rewards/accuracies": 0.71875, "rewards/chosen": -2.494476079940796, "rewards/margins": 0.4577151834964752, "rewards/rejected": -2.95219087600708, "step": 675 }, { "epoch": 0.4415054290146134, "grad_norm": 36.80495254523571, "learning_rate": 1.0280968643969706e-07, "logits/chosen": -1.5882431268692017, "logits/rejected": -1.5868853330612183, "logps/chosen": -863.531005859375, "logps/rejected": -894.946044921875, "loss": 0.5153, "rewards/accuracies": 0.875, "rewards/chosen": -2.805412769317627, "rewards/margins": 0.8240199685096741, "rewards/rejected": -3.629432201385498, "step": 676 }, { "epoch": 0.4421585435545759, "grad_norm": 9.850541147286798, "learning_rate": 1.0265070121756054e-07, "logits/chosen": -1.5755228996276855, "logits/rejected": -1.5751676559448242, "logps/chosen": -729.009521484375, "logps/rejected": -794.595947265625, "loss": 0.5564, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3728232383728027, "rewards/margins": 0.473653107881546, "rewards/rejected": -2.8464765548706055, "step": 677 }, { "epoch": 0.4428116580945383, "grad_norm": 8.929995911050677, "learning_rate": 1.0249157206989785e-07, "logits/chosen": -1.5073692798614502, "logits/rejected": -1.542021632194519, "logps/chosen": -802.9788818359375, "logps/rejected": -878.1073608398438, "loss": 0.4953, "rewards/accuracies": 0.84375, "rewards/chosen": -2.467076301574707, "rewards/margins": 0.7465323209762573, "rewards/rejected": -3.213608503341675, "step": 678 }, { "epoch": 0.4434647726345008, "grad_norm": 17.43336381715629, "learning_rate": 1.0233229982499702e-07, "logits/chosen": -1.5233006477355957, "logits/rejected": -1.575763463973999, "logps/chosen": -728.665771484375, "logps/rejected": -826.36279296875, "loss": 0.5105, "rewards/accuracies": 0.6875, "rewards/chosen": -2.66928768157959, "rewards/margins": 0.6221798062324524, "rewards/rejected": -3.2914676666259766, "step": 679 }, { "epoch": 0.4441178871744632, "grad_norm": 8.33574898866346, "learning_rate": 1.0217288531189101e-07, "logits/chosen": -1.4891436100006104, "logits/rejected": -1.487116813659668, "logps/chosen": -790.6260375976562, "logps/rejected": -806.4535522460938, "loss": 0.5277, "rewards/accuracies": 0.71875, "rewards/chosen": -2.7111411094665527, "rewards/margins": 0.511049747467041, "rewards/rejected": -3.2221908569335938, "step": 680 }, { "epoch": 0.4447710017144257, "grad_norm": 93.25103994990552, "learning_rate": 1.0201332936035328e-07, "logits/chosen": -1.5428980588912964, "logits/rejected": -1.5394231081008911, "logps/chosen": -750.823486328125, "logps/rejected": -812.9085083007812, "loss": 0.6195, "rewards/accuracies": 0.71875, "rewards/chosen": -2.5228466987609863, "rewards/margins": 0.6121708750724792, "rewards/rejected": -3.1350173950195312, "step": 681 }, { "epoch": 0.4454241162543881, "grad_norm": 24.723412198537826, "learning_rate": 1.0185363280089346e-07, "logits/chosen": -1.5300703048706055, "logits/rejected": -1.5563037395477295, "logps/chosen": -808.7451782226562, "logps/rejected": -814.0613403320312, "loss": 0.5547, "rewards/accuracies": 0.6875, "rewards/chosen": -2.553114891052246, "rewards/margins": 0.2794783413410187, "rewards/rejected": -2.8325929641723633, "step": 682 }, { "epoch": 0.4460772307943506, "grad_norm": 18.06928652509424, "learning_rate": 1.0169379646475307e-07, "logits/chosen": -1.5730717182159424, "logits/rejected": -1.553600549697876, "logps/chosen": -827.7550048828125, "logps/rejected": -824.0142211914062, "loss": 0.5089, "rewards/accuracies": 0.65625, "rewards/chosen": -2.8725967407226562, "rewards/margins": 0.5028645992279053, "rewards/rejected": -3.3754611015319824, "step": 683 }, { "epoch": 0.446730345334313, "grad_norm": 13.432734617314106, "learning_rate": 1.0153382118390124e-07, "logits/chosen": -1.517228364944458, "logits/rejected": -1.5097568035125732, "logps/chosen": -828.512451171875, "logps/rejected": -960.2579345703125, "loss": 0.5539, "rewards/accuracies": 0.78125, "rewards/chosen": -3.074080467224121, "rewards/margins": 0.7247848510742188, "rewards/rejected": -3.7988648414611816, "step": 684 }, { "epoch": 0.44738345987427547, "grad_norm": 111.08651677833566, "learning_rate": 1.0137370779103024e-07, "logits/chosen": -1.6381279230117798, "logits/rejected": -1.642504334449768, "logps/chosen": -895.6856079101562, "logps/rejected": -954.673828125, "loss": 0.4931, "rewards/accuracies": 0.75, "rewards/chosen": -2.942159652709961, "rewards/margins": 0.6529674530029297, "rewards/rejected": -3.5951271057128906, "step": 685 }, { "epoch": 0.4480365744142379, "grad_norm": 13.010841457025625, "learning_rate": 1.0121345711955134e-07, "logits/chosen": -1.4719197750091553, "logits/rejected": -1.4533613920211792, "logps/chosen": -753.68408203125, "logps/rejected": -819.1432495117188, "loss": 0.5515, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3883912563323975, "rewards/margins": 0.776418149471283, "rewards/rejected": -3.1648097038269043, "step": 686 }, { "epoch": 0.44868968895420036, "grad_norm": 19.59305991487636, "learning_rate": 1.0105307000359027e-07, "logits/chosen": -1.5581806898117065, "logits/rejected": -1.563573956489563, "logps/chosen": -782.7869873046875, "logps/rejected": -825.7442626953125, "loss": 0.5426, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6787917613983154, "rewards/margins": 0.797275722026825, "rewards/rejected": -3.4760677814483643, "step": 687 }, { "epoch": 0.4493428034941628, "grad_norm": 41.30661441239489, "learning_rate": 1.0089254727798299e-07, "logits/chosen": -1.5456461906433105, "logits/rejected": -1.5804781913757324, "logps/chosen": -802.162841796875, "logps/rejected": -830.0077514648438, "loss": 0.5099, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7756364345550537, "rewards/margins": 0.779686450958252, "rewards/rejected": -3.5553228855133057, "step": 688 }, { "epoch": 0.44999591803412525, "grad_norm": 43.257182510610015, "learning_rate": 1.0073188977827134e-07, "logits/chosen": -1.530835509300232, "logits/rejected": -1.5210024118423462, "logps/chosen": -774.5211791992188, "logps/rejected": -830.6015625, "loss": 0.4923, "rewards/accuracies": 0.75, "rewards/chosen": -2.939332962036133, "rewards/margins": 0.8855284452438354, "rewards/rejected": -3.824861526489258, "step": 689 }, { "epoch": 0.45064903257408767, "grad_norm": 28.06793800668904, "learning_rate": 1.005710983406987e-07, "logits/chosen": -1.5865156650543213, "logits/rejected": -1.536832571029663, "logps/chosen": -714.5545654296875, "logps/rejected": -739.9314575195312, "loss": 0.5408, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3614490032196045, "rewards/margins": 0.4486488997936249, "rewards/rejected": -2.8100976943969727, "step": 690 }, { "epoch": 0.45130214711405015, "grad_norm": 13.710061169800268, "learning_rate": 1.0041017380220558e-07, "logits/chosen": -1.6234124898910522, "logits/rejected": -1.6429170370101929, "logps/chosen": -942.2227783203125, "logps/rejected": -983.3606567382812, "loss": 0.5934, "rewards/accuracies": 0.6875, "rewards/chosen": -2.9503729343414307, "rewards/margins": 0.5114213824272156, "rewards/rejected": -3.461794376373291, "step": 691 }, { "epoch": 0.45195526165401256, "grad_norm": 42.74504837687908, "learning_rate": 1.002491170004253e-07, "logits/chosen": -1.5679419040679932, "logits/rejected": -1.5443594455718994, "logps/chosen": -842.61474609375, "logps/rejected": -882.8880004882812, "loss": 0.5011, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7981529235839844, "rewards/margins": 0.8635019659996033, "rewards/rejected": -3.6616549491882324, "step": 692 }, { "epoch": 0.45260837619397504, "grad_norm": 51.01361393063693, "learning_rate": 1.0008792877367964e-07, "logits/chosen": -1.6048243045806885, "logits/rejected": -1.5834286212921143, "logps/chosen": -800.6567993164062, "logps/rejected": -932.0943603515625, "loss": 0.6234, "rewards/accuracies": 0.65625, "rewards/chosen": -2.7766685485839844, "rewards/margins": 0.7954378128051758, "rewards/rejected": -3.57210636138916, "step": 693 }, { "epoch": 0.45326149073393746, "grad_norm": 26.952246598872957, "learning_rate": 9.992660996097447e-08, "logits/chosen": -1.487755537033081, "logits/rejected": -1.475599765777588, "logps/chosen": -764.441162109375, "logps/rejected": -858.9654541015625, "loss": 0.471, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6644558906555176, "rewards/margins": 0.7122585773468018, "rewards/rejected": -3.3767144680023193, "step": 694 }, { "epoch": 0.45391460527389993, "grad_norm": 11.341690238802435, "learning_rate": 9.976516140199535e-08, "logits/chosen": -1.654636025428772, "logits/rejected": -1.6279263496398926, "logps/chosen": -713.5647583007812, "logps/rejected": -748.8384399414062, "loss": 0.5867, "rewards/accuracies": 0.6875, "rewards/chosen": -2.517662525177002, "rewards/margins": 0.454134076833725, "rewards/rejected": -2.971796989440918, "step": 695 }, { "epoch": 0.45456771981386235, "grad_norm": 40.33369925086035, "learning_rate": 9.960358393710321e-08, "logits/chosen": -1.528939962387085, "logits/rejected": -1.5497705936431885, "logps/chosen": -764.209716796875, "logps/rejected": -920.8458862304688, "loss": 0.4745, "rewards/accuracies": 0.8125, "rewards/chosen": -2.5999767780303955, "rewards/margins": 1.0393584966659546, "rewards/rejected": -3.6393353939056396, "step": 696 }, { "epoch": 0.4552208343538248, "grad_norm": 12.320269188700282, "learning_rate": 9.944187840732994e-08, "logits/chosen": -1.5356950759887695, "logits/rejected": -1.5276272296905518, "logps/chosen": -824.4510498046875, "logps/rejected": -828.6773681640625, "loss": 0.4832, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7461459636688232, "rewards/margins": 0.7607426047325134, "rewards/rejected": -3.5068888664245605, "step": 697 }, { "epoch": 0.45587394889378724, "grad_norm": 20.693131756565563, "learning_rate": 9.928004565437409e-08, "logits/chosen": -1.5997885465621948, "logits/rejected": -1.596414566040039, "logps/chosen": -749.1096801757812, "logps/rejected": -785.3474731445312, "loss": 0.5544, "rewards/accuracies": 0.84375, "rewards/chosen": -2.6378190517425537, "rewards/margins": 0.5456668138504028, "rewards/rejected": -3.183485746383667, "step": 698 }, { "epoch": 0.4565270634337497, "grad_norm": 31.393879848402715, "learning_rate": 9.911808652059627e-08, "logits/chosen": -1.5327155590057373, "logits/rejected": -1.5094130039215088, "logps/chosen": -713.3516235351562, "logps/rejected": -709.5831298828125, "loss": 0.565, "rewards/accuracies": 0.78125, "rewards/chosen": -2.3836562633514404, "rewards/margins": 0.40900933742523193, "rewards/rejected": -2.792665719985962, "step": 699 }, { "epoch": 0.45718017797371213, "grad_norm": 44.56122798633119, "learning_rate": 9.895600184901504e-08, "logits/chosen": -1.6142834424972534, "logits/rejected": -1.5989428758621216, "logps/chosen": -814.4269409179688, "logps/rejected": -822.01513671875, "loss": 0.5342, "rewards/accuracies": 0.75, "rewards/chosen": -2.7036519050598145, "rewards/margins": 0.4850347638130188, "rewards/rejected": -3.1886863708496094, "step": 700 }, { "epoch": 0.45718017797371213, "eval_logits/chosen": -1.5524276494979858, "eval_logits/rejected": -1.5454105138778687, "eval_logps/chosen": -772.6592407226562, "eval_logps/rejected": -832.8743896484375, "eval_loss": 0.5305144190788269, "eval_rewards/accuracies": 0.7350000143051147, "eval_rewards/chosen": -2.623138427734375, "eval_rewards/margins": 0.6832101345062256, "eval_rewards/rejected": -3.3063488006591797, "eval_runtime": 296.7195, "eval_samples_per_second": 13.481, "eval_steps_per_second": 0.843, "step": 700 }, { "epoch": 0.4578332925136746, "grad_norm": 8.9780374884061, "learning_rate": 9.879379248330239e-08, "logits/chosen": -1.5523990392684937, "logits/rejected": -1.5356321334838867, "logps/chosen": -745.2019653320312, "logps/rejected": -832.36376953125, "loss": 0.5029, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4654135704040527, "rewards/margins": 0.739882230758667, "rewards/rejected": -3.2052958011627197, "step": 701 }, { "epoch": 0.458486407053637, "grad_norm": 13.764196067118446, "learning_rate": 9.863145926777934e-08, "logits/chosen": -1.5704573392868042, "logits/rejected": -1.5471737384796143, "logps/chosen": -749.1688232421875, "logps/rejected": -803.7322998046875, "loss": 0.5578, "rewards/accuracies": 0.6875, "rewards/chosen": -2.7841954231262207, "rewards/margins": 0.7516999840736389, "rewards/rejected": -3.535895347595215, "step": 702 }, { "epoch": 0.4591395215935995, "grad_norm": 28.637922274224316, "learning_rate": 9.846900304741157e-08, "logits/chosen": -1.5777729749679565, "logits/rejected": -1.5776381492614746, "logps/chosen": -775.2662353515625, "logps/rejected": -831.3828735351562, "loss": 0.5208, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6320080757141113, "rewards/margins": 0.6068910360336304, "rewards/rejected": -3.238898992538452, "step": 703 }, { "epoch": 0.4597926361335619, "grad_norm": 19.220774272497522, "learning_rate": 9.830642466780502e-08, "logits/chosen": -1.6617802381515503, "logits/rejected": -1.6172996759414673, "logps/chosen": -792.265380859375, "logps/rejected": -793.1304931640625, "loss": 0.5726, "rewards/accuracies": 0.75, "rewards/chosen": -2.6746013164520264, "rewards/margins": 0.42366841435432434, "rewards/rejected": -3.0982699394226074, "step": 704 }, { "epoch": 0.4604457506735244, "grad_norm": 9.690502883669247, "learning_rate": 9.814372497520143e-08, "logits/chosen": -1.59645414352417, "logits/rejected": -1.5628377199172974, "logps/chosen": -759.610595703125, "logps/rejected": -795.2174072265625, "loss": 0.4766, "rewards/accuracies": 0.75, "rewards/chosen": -2.5422470569610596, "rewards/margins": 0.4729628562927246, "rewards/rejected": -3.015209674835205, "step": 705 }, { "epoch": 0.4610988652134868, "grad_norm": 29.25714731199898, "learning_rate": 9.798090481647411e-08, "logits/chosen": -1.5286840200424194, "logits/rejected": -1.5571849346160889, "logps/chosen": -745.8056640625, "logps/rejected": -824.3461303710938, "loss": 0.5433, "rewards/accuracies": 0.875, "rewards/chosen": -2.467949867248535, "rewards/margins": 0.9363622665405273, "rewards/rejected": -3.4043126106262207, "step": 706 }, { "epoch": 0.4617519797534493, "grad_norm": 11.95355469721655, "learning_rate": 9.781796503912328e-08, "logits/chosen": -1.5327837467193604, "logits/rejected": -1.5477138757705688, "logps/chosen": -788.408447265625, "logps/rejected": -918.1892700195312, "loss": 0.5018, "rewards/accuracies": 0.8125, "rewards/chosen": -2.641738176345825, "rewards/margins": 0.9118731617927551, "rewards/rejected": -3.5536112785339355, "step": 707 }, { "epoch": 0.4624050942934117, "grad_norm": 9.057095892979188, "learning_rate": 9.765490649127187e-08, "logits/chosen": -1.58297860622406, "logits/rejected": -1.5312621593475342, "logps/chosen": -640.93798828125, "logps/rejected": -716.5929565429688, "loss": 0.5052, "rewards/accuracies": 0.65625, "rewards/chosen": -2.1810810565948486, "rewards/margins": 0.6035025119781494, "rewards/rejected": -2.784583568572998, "step": 708 }, { "epoch": 0.4630582088333742, "grad_norm": 17.256671471200423, "learning_rate": 9.749173002166101e-08, "logits/chosen": -1.4686298370361328, "logits/rejected": -1.5486806631088257, "logps/chosen": -787.2227172851562, "logps/rejected": -868.33056640625, "loss": 0.5082, "rewards/accuracies": 0.71875, "rewards/chosen": -2.844477415084839, "rewards/margins": 0.6099371314048767, "rewards/rejected": -3.4544146060943604, "step": 709 }, { "epoch": 0.4637113233733366, "grad_norm": 11.493776981683995, "learning_rate": 9.732843647964563e-08, "logits/chosen": -1.3998236656188965, "logits/rejected": -1.3739439249038696, "logps/chosen": -645.9183959960938, "logps/rejected": -692.6560668945312, "loss": 0.5008, "rewards/accuracies": 0.78125, "rewards/chosen": -2.068920135498047, "rewards/margins": 0.3378119170665741, "rewards/rejected": -2.4067318439483643, "step": 710 }, { "epoch": 0.46436443791329907, "grad_norm": 19.407875344745552, "learning_rate": 9.716502671519003e-08, "logits/chosen": -1.594405174255371, "logits/rejected": -1.5376571416854858, "logps/chosen": -743.6107177734375, "logps/rejected": -760.2347412109375, "loss": 0.5606, "rewards/accuracies": 0.6875, "rewards/chosen": -2.476472854614258, "rewards/margins": 0.6245778799057007, "rewards/rejected": -3.101050853729248, "step": 711 }, { "epoch": 0.4650175524532615, "grad_norm": 30.65022124829921, "learning_rate": 9.700150157886345e-08, "logits/chosen": -1.4861985445022583, "logits/rejected": -1.4618767499923706, "logps/chosen": -715.8246459960938, "logps/rejected": -819.9178466796875, "loss": 0.4772, "rewards/accuracies": 0.78125, "rewards/chosen": -2.780965805053711, "rewards/margins": 0.6235308647155762, "rewards/rejected": -3.404496669769287, "step": 712 }, { "epoch": 0.46567066699322396, "grad_norm": 66.74879517812167, "learning_rate": 9.683786192183569e-08, "logits/chosen": -1.5658459663391113, "logits/rejected": -1.5532565116882324, "logps/chosen": -791.1273803710938, "logps/rejected": -858.0960083007812, "loss": 0.4781, "rewards/accuracies": 0.875, "rewards/chosen": -2.74170184135437, "rewards/margins": 0.8308650255203247, "rewards/rejected": -3.5725669860839844, "step": 713 }, { "epoch": 0.4663237815331864, "grad_norm": 58.623748201586416, "learning_rate": 9.667410859587261e-08, "logits/chosen": -1.5724753141403198, "logits/rejected": -1.581168532371521, "logps/chosen": -814.78125, "logps/rejected": -851.2848510742188, "loss": 0.5508, "rewards/accuracies": 0.625, "rewards/chosen": -2.974850654602051, "rewards/margins": 0.2711948752403259, "rewards/rejected": -3.2460455894470215, "step": 714 }, { "epoch": 0.46697689607314885, "grad_norm": 12.956291639260309, "learning_rate": 9.651024245333177e-08, "logits/chosen": -1.5934419631958008, "logits/rejected": -1.581264615058899, "logps/chosen": -838.2115478515625, "logps/rejected": -835.7385864257812, "loss": 0.5465, "rewards/accuracies": 0.75, "rewards/chosen": -2.9340994358062744, "rewards/margins": 0.6708505749702454, "rewards/rejected": -3.604949951171875, "step": 715 }, { "epoch": 0.46763001061311127, "grad_norm": 11.679151198008364, "learning_rate": 9.634626434715791e-08, "logits/chosen": -1.5744848251342773, "logits/rejected": -1.5612497329711914, "logps/chosen": -834.1600341796875, "logps/rejected": -887.748779296875, "loss": 0.5068, "rewards/accuracies": 0.6875, "rewards/chosen": -2.9672343730926514, "rewards/margins": 0.5058416128158569, "rewards/rejected": -3.4730756282806396, "step": 716 }, { "epoch": 0.46828312515307374, "grad_norm": 16.13898005368633, "learning_rate": 9.618217513087857e-08, "logits/chosen": -1.5531257390975952, "logits/rejected": -1.5761160850524902, "logps/chosen": -858.4584350585938, "logps/rejected": -849.4945678710938, "loss": 0.5375, "rewards/accuracies": 0.75, "rewards/chosen": -2.655836820602417, "rewards/margins": 0.46926283836364746, "rewards/rejected": -3.1250996589660645, "step": 717 }, { "epoch": 0.46893623969303616, "grad_norm": 29.758612473106314, "learning_rate": 9.601797565859966e-08, "logits/chosen": -1.5403785705566406, "logits/rejected": -1.5410677194595337, "logps/chosen": -761.43896484375, "logps/rejected": -858.6434936523438, "loss": 0.508, "rewards/accuracies": 0.71875, "rewards/chosen": -2.7271745204925537, "rewards/margins": 0.8027867078781128, "rewards/rejected": -3.529961585998535, "step": 718 }, { "epoch": 0.46958935423299863, "grad_norm": 12.15073631227884, "learning_rate": 9.585366678500099e-08, "logits/chosen": -1.5667126178741455, "logits/rejected": -1.5372388362884521, "logps/chosen": -725.1177368164062, "logps/rejected": -834.06494140625, "loss": 0.528, "rewards/accuracies": 0.875, "rewards/chosen": -2.6709213256835938, "rewards/margins": 0.7990758419036865, "rewards/rejected": -3.469996929168701, "step": 719 }, { "epoch": 0.47024246877296105, "grad_norm": 24.52142611093719, "learning_rate": 9.568924936533176e-08, "logits/chosen": -1.5142229795455933, "logits/rejected": -1.5096334218978882, "logps/chosen": -788.1256713867188, "logps/rejected": -853.0916748046875, "loss": 0.4793, "rewards/accuracies": 0.84375, "rewards/chosen": -3.0224575996398926, "rewards/margins": 0.8449983596801758, "rewards/rejected": -3.8674559593200684, "step": 720 }, { "epoch": 0.4708955833129235, "grad_norm": 12.298496353962463, "learning_rate": 9.552472425540622e-08, "logits/chosen": -1.4641120433807373, "logits/rejected": -1.4405646324157715, "logps/chosen": -814.8626708984375, "logps/rejected": -852.576904296875, "loss": 0.5757, "rewards/accuracies": 0.6875, "rewards/chosen": -2.7843127250671387, "rewards/margins": 0.5765048265457153, "rewards/rejected": -3.3608171939849854, "step": 721 }, { "epoch": 0.47154869785288595, "grad_norm": 9.717229033768323, "learning_rate": 9.536009231159913e-08, "logits/chosen": -1.5470542907714844, "logits/rejected": -1.5365655422210693, "logps/chosen": -879.9993896484375, "logps/rejected": -939.4688720703125, "loss": 0.4599, "rewards/accuracies": 0.875, "rewards/chosen": -2.9776620864868164, "rewards/margins": 0.7803228497505188, "rewards/rejected": -3.7579846382141113, "step": 722 }, { "epoch": 0.4722018123928484, "grad_norm": 36.798678834078146, "learning_rate": 9.519535439084134e-08, "logits/chosen": -1.5727907419204712, "logits/rejected": -1.5672595500946045, "logps/chosen": -777.5218505859375, "logps/rejected": -853.3557739257812, "loss": 0.5406, "rewards/accuracies": 0.6875, "rewards/chosen": -2.753126859664917, "rewards/margins": 0.6771417260169983, "rewards/rejected": -3.4302685260772705, "step": 723 }, { "epoch": 0.47285492693281084, "grad_norm": 25.32155426640716, "learning_rate": 9.503051135061538e-08, "logits/chosen": -1.5253040790557861, "logits/rejected": -1.5683470964431763, "logps/chosen": -827.9707641601562, "logps/rejected": -843.636962890625, "loss": 0.4839, "rewards/accuracies": 0.8125, "rewards/chosen": -2.8832221031188965, "rewards/margins": 0.7474449276924133, "rewards/rejected": -3.630666732788086, "step": 724 }, { "epoch": 0.4735080414727733, "grad_norm": 29.38483596431046, "learning_rate": 9.486556404895083e-08, "logits/chosen": -1.4884456396102905, "logits/rejected": -1.4345768690109253, "logps/chosen": -738.1644897460938, "logps/rejected": -887.9599609375, "loss": 0.5609, "rewards/accuracies": 0.65625, "rewards/chosen": -2.7885422706604004, "rewards/margins": 1.1899745464324951, "rewards/rejected": -3.9785170555114746, "step": 725 }, { "epoch": 0.47416115601273573, "grad_norm": 10.62636396986744, "learning_rate": 9.470051334442008e-08, "logits/chosen": -1.477259874343872, "logits/rejected": -1.5057060718536377, "logps/chosen": -770.3033447265625, "logps/rejected": -863.6229248046875, "loss": 0.5272, "rewards/accuracies": 0.78125, "rewards/chosen": -2.945749044418335, "rewards/margins": 0.7120295763015747, "rewards/rejected": -3.657778739929199, "step": 726 }, { "epoch": 0.4748142705526982, "grad_norm": 17.901405492456256, "learning_rate": 9.453536009613367e-08, "logits/chosen": -1.49770987033844, "logits/rejected": -1.5056265592575073, "logps/chosen": -741.2904052734375, "logps/rejected": -932.2896118164062, "loss": 0.4609, "rewards/accuracies": 0.625, "rewards/chosen": -2.7706236839294434, "rewards/margins": 0.9994168281555176, "rewards/rejected": -3.770040988922119, "step": 727 }, { "epoch": 0.4754673850926606, "grad_norm": 12.867336823169316, "learning_rate": 9.437010516373592e-08, "logits/chosen": -1.4868743419647217, "logits/rejected": -1.5124531984329224, "logps/chosen": -870.4682006835938, "logps/rejected": -907.4706420898438, "loss": 0.4915, "rewards/accuracies": 0.71875, "rewards/chosen": -3.0645503997802734, "rewards/margins": 0.6983194351196289, "rewards/rejected": -3.7628698348999023, "step": 728 }, { "epoch": 0.4761204996326231, "grad_norm": 50.44249826662269, "learning_rate": 9.420474940740042e-08, "logits/chosen": -1.635010838508606, "logits/rejected": -1.6417289972305298, "logps/chosen": -834.1351318359375, "logps/rejected": -948.0263061523438, "loss": 0.5263, "rewards/accuracies": 0.90625, "rewards/chosen": -3.062831401824951, "rewards/margins": 0.9943631887435913, "rewards/rejected": -4.057194709777832, "step": 729 }, { "epoch": 0.4767736141725855, "grad_norm": 14.64756449278543, "learning_rate": 9.403929368782558e-08, "logits/chosen": -1.5567909479141235, "logits/rejected": -1.6151758432388306, "logps/chosen": -768.6214599609375, "logps/rejected": -870.6768798828125, "loss": 0.4699, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4706671237945557, "rewards/margins": 0.9328286647796631, "rewards/rejected": -3.4034957885742188, "step": 730 }, { "epoch": 0.477426728712548, "grad_norm": 13.947518321539896, "learning_rate": 9.387373886623012e-08, "logits/chosen": -1.51140558719635, "logits/rejected": -1.536016583442688, "logps/chosen": -860.7777709960938, "logps/rejected": -902.8041381835938, "loss": 0.52, "rewards/accuracies": 0.6875, "rewards/chosen": -3.0753116607666016, "rewards/margins": 0.5363348722457886, "rewards/rejected": -3.6116464138031006, "step": 731 }, { "epoch": 0.4780798432525104, "grad_norm": 16.851055770042308, "learning_rate": 9.37080858043486e-08, "logits/chosen": -1.4963881969451904, "logits/rejected": -1.4485998153686523, "logps/chosen": -813.0571899414062, "logps/rejected": -845.43408203125, "loss": 0.519, "rewards/accuracies": 0.6875, "rewards/chosen": -2.9917490482330322, "rewards/margins": 0.81999272108078, "rewards/rejected": -3.811741352081299, "step": 732 }, { "epoch": 0.4787329577924729, "grad_norm": 36.178464401201914, "learning_rate": 9.354233536442691e-08, "logits/chosen": -1.5164326429367065, "logits/rejected": -1.5172199010849, "logps/chosen": -872.368408203125, "logps/rejected": -948.313720703125, "loss": 0.5787, "rewards/accuracies": 0.71875, "rewards/chosen": -3.244874954223633, "rewards/margins": 0.6580997705459595, "rewards/rejected": -3.9029746055603027, "step": 733 }, { "epoch": 0.4793860723324353, "grad_norm": 12.484932403976112, "learning_rate": 9.337648840921784e-08, "logits/chosen": -1.6157573461532593, "logits/rejected": -1.5741209983825684, "logps/chosen": -884.84912109375, "logps/rejected": -1095.204345703125, "loss": 0.4598, "rewards/accuracies": 0.71875, "rewards/chosen": -3.276892900466919, "rewards/margins": 1.09320068359375, "rewards/rejected": -4.370093822479248, "step": 734 }, { "epoch": 0.48003918687239777, "grad_norm": 14.089376755304505, "learning_rate": 9.321054580197656e-08, "logits/chosen": -1.4722511768341064, "logits/rejected": -1.474959135055542, "logps/chosen": -730.8760375976562, "logps/rejected": -825.123779296875, "loss": 0.5249, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8380813598632812, "rewards/margins": 0.6124694347381592, "rewards/rejected": -3.4505507946014404, "step": 735 }, { "epoch": 0.4806923014123602, "grad_norm": 16.35406356020656, "learning_rate": 9.304450840645609e-08, "logits/chosen": -1.4326503276824951, "logits/rejected": -1.4325727224349976, "logps/chosen": -839.5531005859375, "logps/rejected": -894.9701538085938, "loss": 0.5027, "rewards/accuracies": 0.59375, "rewards/chosen": -3.4309120178222656, "rewards/margins": 0.5214735865592957, "rewards/rejected": -3.952385663986206, "step": 736 }, { "epoch": 0.48134541595232266, "grad_norm": 17.20276393860269, "learning_rate": 9.287837708690284e-08, "logits/chosen": -1.5666112899780273, "logits/rejected": -1.524141550064087, "logps/chosen": -820.8998413085938, "logps/rejected": -992.0237426757812, "loss": 0.4732, "rewards/accuracies": 0.75, "rewards/chosen": -2.9690146446228027, "rewards/margins": 0.996252179145813, "rewards/rejected": -3.9652669429779053, "step": 737 }, { "epoch": 0.4819985304922851, "grad_norm": 43.29010354161253, "learning_rate": 9.271215270805212e-08, "logits/chosen": -1.596786379814148, "logits/rejected": -1.5615063905715942, "logps/chosen": -804.3712768554688, "logps/rejected": -849.1760864257812, "loss": 0.56, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8177123069763184, "rewards/margins": 0.5579187273979187, "rewards/rejected": -3.3756308555603027, "step": 738 }, { "epoch": 0.48265164503224756, "grad_norm": 15.346572016217646, "learning_rate": 9.254583613512365e-08, "logits/chosen": -1.6470632553100586, "logits/rejected": -1.6171380281448364, "logps/chosen": -913.9931640625, "logps/rejected": -999.4883422851562, "loss": 0.4807, "rewards/accuracies": 0.75, "rewards/chosen": -3.2777669429779053, "rewards/margins": 0.7959187626838684, "rewards/rejected": -4.073686122894287, "step": 739 }, { "epoch": 0.48330475957221, "grad_norm": 9.556534348064194, "learning_rate": 9.237942823381696e-08, "logits/chosen": -1.573829174041748, "logits/rejected": -1.5064740180969238, "logps/chosen": -777.1859741210938, "logps/rejected": -792.814208984375, "loss": 0.469, "rewards/accuracies": 0.84375, "rewards/chosen": -2.7544057369232178, "rewards/margins": 0.8636622428894043, "rewards/rejected": -3.618068218231201, "step": 740 }, { "epoch": 0.48395787411217245, "grad_norm": 25.011865185818184, "learning_rate": 9.221292987030702e-08, "logits/chosen": -1.5379745960235596, "logits/rejected": -1.4926586151123047, "logps/chosen": -820.5584716796875, "logps/rejected": -835.9033203125, "loss": 0.4647, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9160995483398438, "rewards/margins": 0.7988985180854797, "rewards/rejected": -3.7149980068206787, "step": 741 }, { "epoch": 0.48461098865213487, "grad_norm": 13.575787231912816, "learning_rate": 9.204634191123965e-08, "logits/chosen": -1.508238673210144, "logits/rejected": -1.4783196449279785, "logps/chosen": -902.4708251953125, "logps/rejected": -893.9943237304688, "loss": 0.5139, "rewards/accuracies": 0.8125, "rewards/chosen": -3.170164108276367, "rewards/margins": 0.5966829061508179, "rewards/rejected": -3.7668471336364746, "step": 742 }, { "epoch": 0.48526410319209734, "grad_norm": 15.830542176100348, "learning_rate": 9.187966522372705e-08, "logits/chosen": -1.534610390663147, "logits/rejected": -1.5084702968597412, "logps/chosen": -787.0113525390625, "logps/rejected": -803.7102661132812, "loss": 0.5124, "rewards/accuracies": 0.78125, "rewards/chosen": -2.9289166927337646, "rewards/margins": 0.520931601524353, "rewards/rejected": -3.4498486518859863, "step": 743 }, { "epoch": 0.48591721773205976, "grad_norm": 21.140461323350923, "learning_rate": 9.17129006753432e-08, "logits/chosen": -1.525702714920044, "logits/rejected": -1.5293265581130981, "logps/chosen": -725.1962280273438, "logps/rejected": -809.5682983398438, "loss": 0.4683, "rewards/accuracies": 0.90625, "rewards/chosen": -2.3667490482330322, "rewards/margins": 0.9049954414367676, "rewards/rejected": -3.271744728088379, "step": 744 }, { "epoch": 0.48657033227202223, "grad_norm": 38.71867984311598, "learning_rate": 9.154604913411943e-08, "logits/chosen": -1.5762661695480347, "logits/rejected": -1.5938408374786377, "logps/chosen": -876.135986328125, "logps/rejected": -978.6011962890625, "loss": 0.4793, "rewards/accuracies": 0.8125, "rewards/chosen": -3.281604528427124, "rewards/margins": 0.8068190813064575, "rewards/rejected": -4.088423252105713, "step": 745 }, { "epoch": 0.48722344681198465, "grad_norm": 29.90445121131915, "learning_rate": 9.137911146853995e-08, "logits/chosen": -1.5684157609939575, "logits/rejected": -1.5826231241226196, "logps/chosen": -795.3310546875, "logps/rejected": -863.130859375, "loss": 0.572, "rewards/accuracies": 0.625, "rewards/chosen": -3.2306532859802246, "rewards/margins": 0.5794581770896912, "rewards/rejected": -3.8101112842559814, "step": 746 }, { "epoch": 0.4878765613519471, "grad_norm": 20.953402023971254, "learning_rate": 9.121208854753716e-08, "logits/chosen": -1.511757493019104, "logits/rejected": -1.4341886043548584, "logps/chosen": -716.0580444335938, "logps/rejected": -796.005859375, "loss": 0.5049, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9407427310943604, "rewards/margins": 0.7499470114707947, "rewards/rejected": -3.6906895637512207, "step": 747 }, { "epoch": 0.48852967589190954, "grad_norm": 23.63758752296789, "learning_rate": 9.10449812404873e-08, "logits/chosen": -1.5069888830184937, "logits/rejected": -1.537886142730713, "logps/chosen": -781.0473022460938, "logps/rejected": -866.6488037109375, "loss": 0.4737, "rewards/accuracies": 0.71875, "rewards/chosen": -2.995260000228882, "rewards/margins": 0.538629412651062, "rewards/rejected": -3.5338892936706543, "step": 748 }, { "epoch": 0.489182790431872, "grad_norm": 9.601439395523968, "learning_rate": 9.087779041720581e-08, "logits/chosen": -1.5230836868286133, "logits/rejected": -1.4928765296936035, "logps/chosen": -708.7151489257812, "logps/rejected": -755.310302734375, "loss": 0.4822, "rewards/accuracies": 0.84375, "rewards/chosen": -2.70585560798645, "rewards/margins": 0.5378022789955139, "rewards/rejected": -3.2436575889587402, "step": 749 }, { "epoch": 0.48983590497183443, "grad_norm": 52.26590360034194, "learning_rate": 9.071051694794283e-08, "logits/chosen": -1.546759843826294, "logits/rejected": -1.4855560064315796, "logps/chosen": -800.2686767578125, "logps/rejected": -858.6179809570312, "loss": 0.5451, "rewards/accuracies": 0.6875, "rewards/chosen": -3.1868784427642822, "rewards/margins": 0.29365602135658264, "rewards/rejected": -3.480534553527832, "step": 750 }, { "epoch": 0.4904890195117969, "grad_norm": 27.46647480470444, "learning_rate": 9.054316170337872e-08, "logits/chosen": -1.5644291639328003, "logits/rejected": -1.5516630411148071, "logps/chosen": -817.9088745117188, "logps/rejected": -884.1804809570312, "loss": 0.4977, "rewards/accuracies": 0.75, "rewards/chosen": -2.8426513671875, "rewards/margins": 0.5670284032821655, "rewards/rejected": -3.409679412841797, "step": 751 }, { "epoch": 0.4911421340517593, "grad_norm": 22.65417735488772, "learning_rate": 9.037572555461949e-08, "logits/chosen": -1.4849984645843506, "logits/rejected": -1.5145663022994995, "logps/chosen": -797.359619140625, "logps/rejected": -906.737548828125, "loss": 0.4895, "rewards/accuracies": 0.8125, "rewards/chosen": -2.670722007751465, "rewards/margins": 0.6933446526527405, "rewards/rejected": -3.3640666007995605, "step": 752 }, { "epoch": 0.4917952485917218, "grad_norm": 13.411177355069162, "learning_rate": 9.020820937319222e-08, "logits/chosen": -1.5731292963027954, "logits/rejected": -1.5484392642974854, "logps/chosen": -884.296630859375, "logps/rejected": -940.8453979492188, "loss": 0.471, "rewards/accuracies": 0.84375, "rewards/chosen": -3.245163679122925, "rewards/margins": 0.7693843245506287, "rewards/rejected": -4.014548301696777, "step": 753 }, { "epoch": 0.4924483631316842, "grad_norm": 22.100162564155266, "learning_rate": 9.004061403104063e-08, "logits/chosen": -1.5291342735290527, "logits/rejected": -1.541631817817688, "logps/chosen": -894.0836181640625, "logps/rejected": -972.8955688476562, "loss": 0.4913, "rewards/accuracies": 0.8125, "rewards/chosen": -3.270658016204834, "rewards/margins": 0.9292981028556824, "rewards/rejected": -4.19995641708374, "step": 754 }, { "epoch": 0.49310147767164664, "grad_norm": 40.53277025479099, "learning_rate": 8.987294040052048e-08, "logits/chosen": -1.513358235359192, "logits/rejected": -1.4985860586166382, "logps/chosen": -815.3209838867188, "logps/rejected": -845.5489501953125, "loss": 0.5791, "rewards/accuracies": 0.65625, "rewards/chosen": -2.830094337463379, "rewards/margins": 0.6605682373046875, "rewards/rejected": -3.4906625747680664, "step": 755 }, { "epoch": 0.4937545922116091, "grad_norm": 52.36436157875439, "learning_rate": 8.970518935439494e-08, "logits/chosen": -1.5312970876693726, "logits/rejected": -1.536803126335144, "logps/chosen": -841.9302978515625, "logps/rejected": -1007.2651977539062, "loss": 0.52, "rewards/accuracies": 0.71875, "rewards/chosen": -3.154250383377075, "rewards/margins": 0.8153612017631531, "rewards/rejected": -3.9696109294891357, "step": 756 }, { "epoch": 0.49440770675157153, "grad_norm": 10.906915240973083, "learning_rate": 8.953736176583024e-08, "logits/chosen": -1.5253336429595947, "logits/rejected": -1.533002495765686, "logps/chosen": -844.7198486328125, "logps/rejected": -904.4848022460938, "loss": 0.5255, "rewards/accuracies": 0.8125, "rewards/chosen": -2.983484983444214, "rewards/margins": 0.8254218101501465, "rewards/rejected": -3.8089070320129395, "step": 757 }, { "epoch": 0.495060821291534, "grad_norm": 21.62178976119015, "learning_rate": 8.936945850839103e-08, "logits/chosen": -1.5330315828323364, "logits/rejected": -1.5863041877746582, "logps/chosen": -836.3416137695312, "logps/rejected": -919.7655029296875, "loss": 0.5125, "rewards/accuracies": 0.875, "rewards/chosen": -3.1585867404937744, "rewards/margins": 0.9332132935523987, "rewards/rejected": -4.091800212860107, "step": 758 }, { "epoch": 0.4957139358314964, "grad_norm": 59.346179693747, "learning_rate": 8.920148045603571e-08, "logits/chosen": -1.4187573194503784, "logits/rejected": -1.437163233757019, "logps/chosen": -740.33837890625, "logps/rejected": -822.4481811523438, "loss": 0.4519, "rewards/accuracies": 0.875, "rewards/chosen": -2.87211275100708, "rewards/margins": 0.8624040484428406, "rewards/rejected": -3.7345166206359863, "step": 759 }, { "epoch": 0.4963670503714589, "grad_norm": 17.104210267847428, "learning_rate": 8.903342848311213e-08, "logits/chosen": -1.4946515560150146, "logits/rejected": -1.511128306388855, "logps/chosen": -787.0501098632812, "logps/rejected": -832.3441162109375, "loss": 0.5147, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0261549949645996, "rewards/margins": 0.5119923949241638, "rewards/rejected": -3.538147211074829, "step": 760 }, { "epoch": 0.4970201649114213, "grad_norm": 37.01444427857606, "learning_rate": 8.886530346435281e-08, "logits/chosen": -1.5614135265350342, "logits/rejected": -1.5467019081115723, "logps/chosen": -809.6796264648438, "logps/rejected": -850.0182495117188, "loss": 0.5183, "rewards/accuracies": 0.625, "rewards/chosen": -2.892712116241455, "rewards/margins": 0.5590572357177734, "rewards/rejected": -3.4517693519592285, "step": 761 }, { "epoch": 0.4976732794513838, "grad_norm": 20.36864614388382, "learning_rate": 8.869710627487057e-08, "logits/chosen": -1.5446330308914185, "logits/rejected": -1.5651966333389282, "logps/chosen": -765.47412109375, "logps/rejected": -840.524169921875, "loss": 0.5331, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1215081214904785, "rewards/margins": 0.7881425619125366, "rewards/rejected": -3.9096505641937256, "step": 762 }, { "epoch": 0.4983263939913462, "grad_norm": 88.03942880437029, "learning_rate": 8.852883779015377e-08, "logits/chosen": -1.5617398023605347, "logits/rejected": -1.5589573383331299, "logps/chosen": -719.5538940429688, "logps/rejected": -807.90869140625, "loss": 0.5644, "rewards/accuracies": 0.75, "rewards/chosen": -2.8834686279296875, "rewards/margins": 0.5054572820663452, "rewards/rejected": -3.3889260292053223, "step": 763 }, { "epoch": 0.4989795085313087, "grad_norm": 85.24504779473465, "learning_rate": 8.836049888606199e-08, "logits/chosen": -1.5608245134353638, "logits/rejected": -1.5477367639541626, "logps/chosen": -867.1178588867188, "logps/rejected": -1046.8099365234375, "loss": 0.4946, "rewards/accuracies": 0.8125, "rewards/chosen": -3.545870304107666, "rewards/margins": 1.3517121076583862, "rewards/rejected": -4.897582530975342, "step": 764 }, { "epoch": 0.4996326230712711, "grad_norm": 103.8676814765994, "learning_rate": 8.819209043882131e-08, "logits/chosen": -1.5328567028045654, "logits/rejected": -1.4881083965301514, "logps/chosen": -879.6492919921875, "logps/rejected": -995.6163940429688, "loss": 0.5008, "rewards/accuracies": 0.75, "rewards/chosen": -3.1603341102600098, "rewards/margins": 1.162267804145813, "rewards/rejected": -4.322602272033691, "step": 765 }, { "epoch": 0.5002857376112335, "grad_norm": 11.728242486539118, "learning_rate": 8.802361332501978e-08, "logits/chosen": -1.5301557779312134, "logits/rejected": -1.5447125434875488, "logps/chosen": -793.1903076171875, "logps/rejected": -930.641357421875, "loss": 0.4803, "rewards/accuracies": 0.78125, "rewards/chosen": -3.126751184463501, "rewards/margins": 0.9962693452835083, "rewards/rejected": -4.123020648956299, "step": 766 }, { "epoch": 0.500938852151196, "grad_norm": 10.020322944474998, "learning_rate": 8.785506842160285e-08, "logits/chosen": -1.5191065073013306, "logits/rejected": -1.5158594846725464, "logps/chosen": -775.5346069335938, "logps/rejected": -828.55126953125, "loss": 0.4989, "rewards/accuracies": 0.625, "rewards/chosen": -2.9595863819122314, "rewards/margins": 0.6543839573860168, "rewards/rejected": -3.6139702796936035, "step": 767 }, { "epoch": 0.5015919666911585, "grad_norm": 19.19484359451946, "learning_rate": 8.768645660586886e-08, "logits/chosen": -1.5571017265319824, "logits/rejected": -1.599165439605713, "logps/chosen": -870.1515502929688, "logps/rejected": -1006.7691650390625, "loss": 0.4255, "rewards/accuracies": 0.9375, "rewards/chosen": -3.215618848800659, "rewards/margins": 1.278153657913208, "rewards/rejected": -4.493772506713867, "step": 768 }, { "epoch": 0.5022450812311209, "grad_norm": 55.25909288858508, "learning_rate": 8.751777875546442e-08, "logits/chosen": -1.5235897302627563, "logits/rejected": -1.5015480518341064, "logps/chosen": -766.0463256835938, "logps/rejected": -827.17919921875, "loss": 0.5069, "rewards/accuracies": 0.75, "rewards/chosen": -2.9575929641723633, "rewards/margins": 0.6264766454696655, "rewards/rejected": -3.5840699672698975, "step": 769 }, { "epoch": 0.5028981957710833, "grad_norm": 31.7480045035506, "learning_rate": 8.734903574837985e-08, "logits/chosen": -1.4985229969024658, "logits/rejected": -1.4879060983657837, "logps/chosen": -830.8861083984375, "logps/rejected": -901.6783447265625, "loss": 0.5273, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9937520027160645, "rewards/margins": 0.953312337398529, "rewards/rejected": -3.9470643997192383, "step": 770 }, { "epoch": 0.5035513103110458, "grad_norm": 23.133647048008825, "learning_rate": 8.718022846294466e-08, "logits/chosen": -1.4812209606170654, "logits/rejected": -1.4926509857177734, "logps/chosen": -764.713134765625, "logps/rejected": -867.8782348632812, "loss": 0.4615, "rewards/accuracies": 0.875, "rewards/chosen": -2.96504807472229, "rewards/margins": 0.7924860715866089, "rewards/rejected": -3.7575340270996094, "step": 771 }, { "epoch": 0.5042044248510082, "grad_norm": 13.218818964781667, "learning_rate": 8.701135777782291e-08, "logits/chosen": -1.5664947032928467, "logits/rejected": -1.5273160934448242, "logps/chosen": -843.0235595703125, "logps/rejected": -841.0691528320312, "loss": 0.6184, "rewards/accuracies": 0.625, "rewards/chosen": -3.2560126781463623, "rewards/margins": 0.49974071979522705, "rewards/rejected": -3.755753517150879, "step": 772 }, { "epoch": 0.5048575393909707, "grad_norm": 29.96826439077636, "learning_rate": 8.684242457200865e-08, "logits/chosen": -1.4153635501861572, "logits/rejected": -1.4303746223449707, "logps/chosen": -740.8873291015625, "logps/rejected": -893.1712646484375, "loss": 0.4935, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7135632038116455, "rewards/margins": 1.0459972620010376, "rewards/rejected": -3.7595603466033936, "step": 773 }, { "epoch": 0.5055106539309331, "grad_norm": 17.20275448114271, "learning_rate": 8.667342972482136e-08, "logits/chosen": -1.5214056968688965, "logits/rejected": -1.5121796131134033, "logps/chosen": -887.89794921875, "logps/rejected": -939.7318115234375, "loss": 0.5725, "rewards/accuracies": 0.75, "rewards/chosen": -3.212088108062744, "rewards/margins": 0.7043830156326294, "rewards/rejected": -3.916471481323242, "step": 774 }, { "epoch": 0.5061637684708956, "grad_norm": 44.42583289533232, "learning_rate": 8.650437411590141e-08, "logits/chosen": -1.5345828533172607, "logits/rejected": -1.5515172481536865, "logps/chosen": -841.9071044921875, "logps/rejected": -872.3067626953125, "loss": 0.5004, "rewards/accuracies": 0.71875, "rewards/chosen": -3.2682528495788574, "rewards/margins": 0.5741406679153442, "rewards/rejected": -3.842393636703491, "step": 775 }, { "epoch": 0.506816883010858, "grad_norm": 11.651543725419435, "learning_rate": 8.633525862520538e-08, "logits/chosen": -1.5557384490966797, "logits/rejected": -1.5258511304855347, "logps/chosen": -821.5269165039062, "logps/rejected": -849.76611328125, "loss": 0.5619, "rewards/accuracies": 0.84375, "rewards/chosen": -2.8014473915100098, "rewards/margins": 0.6341366171836853, "rewards/rejected": -3.435584306716919, "step": 776 }, { "epoch": 0.5074699975508205, "grad_norm": 46.00900328788502, "learning_rate": 8.616608413300162e-08, "logits/chosen": -1.465898871421814, "logits/rejected": -1.5084877014160156, "logps/chosen": -825.7061767578125, "logps/rejected": -899.7260131835938, "loss": 0.5337, "rewards/accuracies": 0.71875, "rewards/chosen": -3.140413284301758, "rewards/margins": 0.8182295560836792, "rewards/rejected": -3.9586427211761475, "step": 777 }, { "epoch": 0.5081231120907829, "grad_norm": 34.00237370143665, "learning_rate": 8.599685151986555e-08, "logits/chosen": -1.5190675258636475, "logits/rejected": -1.5114282369613647, "logps/chosen": -788.9437255859375, "logps/rejected": -859.0122680664062, "loss": 0.5124, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8215911388397217, "rewards/margins": 0.813235878944397, "rewards/rejected": -3.634827136993408, "step": 778 }, { "epoch": 0.5087762266307454, "grad_norm": 11.243459420945996, "learning_rate": 8.582756166667506e-08, "logits/chosen": -1.523029088973999, "logits/rejected": -1.5339922904968262, "logps/chosen": -776.9263916015625, "logps/rejected": -869.6576538085938, "loss": 0.4985, "rewards/accuracies": 0.6875, "rewards/chosen": -2.9392380714416504, "rewards/margins": 0.5630586743354797, "rewards/rejected": -3.5022971630096436, "step": 779 }, { "epoch": 0.5094293411707078, "grad_norm": 11.246859304420749, "learning_rate": 8.565821545460607e-08, "logits/chosen": -1.5100888013839722, "logits/rejected": -1.5363471508026123, "logps/chosen": -822.7084350585938, "logps/rejected": -870.5260009765625, "loss": 0.4704, "rewards/accuracies": 0.71875, "rewards/chosen": -2.991995334625244, "rewards/margins": 0.8592602610588074, "rewards/rejected": -3.851255416870117, "step": 780 }, { "epoch": 0.5100824557106702, "grad_norm": 22.710318086337125, "learning_rate": 8.548881376512784e-08, "logits/chosen": -1.55587899684906, "logits/rejected": -1.5447403192520142, "logps/chosen": -911.83935546875, "logps/rejected": -1001.2164916992188, "loss": 0.5482, "rewards/accuracies": 0.90625, "rewards/chosen": -3.194542407989502, "rewards/margins": 1.0161552429199219, "rewards/rejected": -4.210697174072266, "step": 781 }, { "epoch": 0.5107355702506327, "grad_norm": 30.493025252100427, "learning_rate": 8.531935747999837e-08, "logits/chosen": -1.5483152866363525, "logits/rejected": -1.5260441303253174, "logps/chosen": -825.1983642578125, "logps/rejected": -968.305419921875, "loss": 0.4986, "rewards/accuracies": 0.6875, "rewards/chosen": -2.9049713611602783, "rewards/margins": 0.961846113204956, "rewards/rejected": -3.8668179512023926, "step": 782 }, { "epoch": 0.5113886847905952, "grad_norm": 11.07008824806609, "learning_rate": 8.514984748125984e-08, "logits/chosen": -1.4909794330596924, "logits/rejected": -1.5248680114746094, "logps/chosen": -831.9634399414062, "logps/rejected": -922.552978515625, "loss": 0.4796, "rewards/accuracies": 0.84375, "rewards/chosen": -2.975536346435547, "rewards/margins": 1.0760196447372437, "rewards/rejected": -4.051555633544922, "step": 783 }, { "epoch": 0.5120417993305576, "grad_norm": 31.858032148945103, "learning_rate": 8.498028465123402e-08, "logits/chosen": -1.4506326913833618, "logits/rejected": -1.4219865798950195, "logps/chosen": -696.5535278320312, "logps/rejected": -853.2267456054688, "loss": 0.4794, "rewards/accuracies": 0.78125, "rewards/chosen": -2.721505641937256, "rewards/margins": 1.1351335048675537, "rewards/rejected": -3.8566393852233887, "step": 784 }, { "epoch": 0.51269491387052, "grad_norm": 67.88266427137975, "learning_rate": 8.48106698725177e-08, "logits/chosen": -1.5079140663146973, "logits/rejected": -1.5019490718841553, "logps/chosen": -861.70458984375, "logps/rejected": -1028.7357177734375, "loss": 0.5223, "rewards/accuracies": 0.75, "rewards/chosen": -3.1499381065368652, "rewards/margins": 1.0946592092514038, "rewards/rejected": -4.244597434997559, "step": 785 }, { "epoch": 0.5133480284104824, "grad_norm": 77.6759127035247, "learning_rate": 8.464100402797803e-08, "logits/chosen": -1.4545526504516602, "logits/rejected": -1.4197547435760498, "logps/chosen": -805.7166137695312, "logps/rejected": -865.64501953125, "loss": 0.5099, "rewards/accuracies": 0.71875, "rewards/chosen": -3.2749907970428467, "rewards/margins": 0.8093310594558716, "rewards/rejected": -4.084321975708008, "step": 786 }, { "epoch": 0.514001142950445, "grad_norm": 16.83703843955555, "learning_rate": 8.4471288000748e-08, "logits/chosen": -1.5386128425598145, "logits/rejected": -1.5408875942230225, "logps/chosen": -842.3564453125, "logps/rejected": -944.2308959960938, "loss": 0.5631, "rewards/accuracies": 0.71875, "rewards/chosen": -3.174443483352661, "rewards/margins": 0.8851109147071838, "rewards/rejected": -4.059554576873779, "step": 787 }, { "epoch": 0.5146542574904074, "grad_norm": 31.817856468339304, "learning_rate": 8.430152267422177e-08, "logits/chosen": -1.5486061573028564, "logits/rejected": -1.53328537940979, "logps/chosen": -797.2845458984375, "logps/rejected": -912.8262329101562, "loss": 0.5719, "rewards/accuracies": 0.8125, "rewards/chosen": -3.162616729736328, "rewards/margins": 0.9437724947929382, "rewards/rejected": -4.106389045715332, "step": 788 }, { "epoch": 0.5153073720303698, "grad_norm": 32.1546133597687, "learning_rate": 8.413170893205015e-08, "logits/chosen": -1.5652754306793213, "logits/rejected": -1.5659431219100952, "logps/chosen": -796.43994140625, "logps/rejected": -850.5557861328125, "loss": 0.5232, "rewards/accuracies": 0.84375, "rewards/chosen": -3.044846773147583, "rewards/margins": 0.7479317784309387, "rewards/rejected": -3.792778730392456, "step": 789 }, { "epoch": 0.5159604865703322, "grad_norm": 11.020829449688339, "learning_rate": 8.396184765813591e-08, "logits/chosen": -1.5715382099151611, "logits/rejected": -1.5400923490524292, "logps/chosen": -811.490234375, "logps/rejected": -932.505615234375, "loss": 0.5092, "rewards/accuracies": 0.75, "rewards/chosen": -3.054403781890869, "rewards/margins": 1.0824800729751587, "rewards/rejected": -4.136883735656738, "step": 790 }, { "epoch": 0.5166136011102948, "grad_norm": 30.888894963547546, "learning_rate": 8.379193973662927e-08, "logits/chosen": -1.5172935724258423, "logits/rejected": -1.5318307876586914, "logps/chosen": -823.394287109375, "logps/rejected": -943.768310546875, "loss": 0.498, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9745614528656006, "rewards/margins": 1.0555309057235718, "rewards/rejected": -4.030092716217041, "step": 791 }, { "epoch": 0.5172667156502572, "grad_norm": 29.8842541290539, "learning_rate": 8.362198605192326e-08, "logits/chosen": -1.557112693786621, "logits/rejected": -1.5465399026870728, "logps/chosen": -826.0409545898438, "logps/rejected": -961.9547119140625, "loss": 0.4962, "rewards/accuracies": 0.71875, "rewards/chosen": -3.1106011867523193, "rewards/margins": 0.760248601436615, "rewards/rejected": -3.870850086212158, "step": 792 }, { "epoch": 0.5179198301902196, "grad_norm": 25.848000115396033, "learning_rate": 8.345198748864909e-08, "logits/chosen": -1.4976153373718262, "logits/rejected": -1.4590017795562744, "logps/chosen": -834.5736083984375, "logps/rejected": -815.7548217773438, "loss": 0.5151, "rewards/accuracies": 0.5625, "rewards/chosen": -3.545506477355957, "rewards/margins": 0.19579878449440002, "rewards/rejected": -3.741305351257324, "step": 793 }, { "epoch": 0.518572944730182, "grad_norm": 22.608186681230375, "learning_rate": 8.328194493167156e-08, "logits/chosen": -1.5127606391906738, "logits/rejected": -1.5453428030014038, "logps/chosen": -874.2019653320312, "logps/rejected": -923.8192749023438, "loss": 0.5177, "rewards/accuracies": 0.78125, "rewards/chosen": -3.3078603744506836, "rewards/margins": 0.7927409410476685, "rewards/rejected": -4.1006011962890625, "step": 794 }, { "epoch": 0.5192260592701445, "grad_norm": 14.489367562787846, "learning_rate": 8.311185926608451e-08, "logits/chosen": -1.50680673122406, "logits/rejected": -1.484675645828247, "logps/chosen": -844.3919067382812, "logps/rejected": -819.8763427734375, "loss": 0.5177, "rewards/accuracies": 0.71875, "rewards/chosen": -3.12322998046875, "rewards/margins": 0.5924826860427856, "rewards/rejected": -3.715712785720825, "step": 795 }, { "epoch": 0.519879173810107, "grad_norm": 11.919265518405389, "learning_rate": 8.29417313772061e-08, "logits/chosen": -1.5627418756484985, "logits/rejected": -1.580378770828247, "logps/chosen": -855.9948120117188, "logps/rejected": -1031.594970703125, "loss": 0.5956, "rewards/accuracies": 0.71875, "rewards/chosen": -3.3382222652435303, "rewards/margins": 0.9997389912605286, "rewards/rejected": -4.337961673736572, "step": 796 }, { "epoch": 0.5205322883500694, "grad_norm": 28.37261621955881, "learning_rate": 8.277156215057434e-08, "logits/chosen": -1.4434901475906372, "logits/rejected": -1.4327919483184814, "logps/chosen": -895.87646484375, "logps/rejected": -958.2759399414062, "loss": 0.4678, "rewards/accuracies": 0.71875, "rewards/chosen": -3.354534864425659, "rewards/margins": 0.7718441486358643, "rewards/rejected": -4.126379013061523, "step": 797 }, { "epoch": 0.5211854028900318, "grad_norm": 22.35879028875356, "learning_rate": 8.260135247194235e-08, "logits/chosen": -1.539190411567688, "logits/rejected": -1.5733530521392822, "logps/chosen": -710.5164794921875, "logps/rejected": -795.2035522460938, "loss": 0.4548, "rewards/accuracies": 0.6875, "rewards/chosen": -2.8853349685668945, "rewards/margins": 0.540171205997467, "rewards/rejected": -3.425506353378296, "step": 798 }, { "epoch": 0.5218385174299943, "grad_norm": 10.507592796277946, "learning_rate": 8.243110322727382e-08, "logits/chosen": -1.5344185829162598, "logits/rejected": -1.5373344421386719, "logps/chosen": -755.857177734375, "logps/rejected": -858.6672973632812, "loss": 0.4848, "rewards/accuracies": 0.875, "rewards/chosen": -2.8476948738098145, "rewards/margins": 0.9375979900360107, "rewards/rejected": -3.785292863845825, "step": 799 }, { "epoch": 0.5224916319699567, "grad_norm": 33.61581825469172, "learning_rate": 8.226081530273843e-08, "logits/chosen": -1.557815670967102, "logits/rejected": -1.561926007270813, "logps/chosen": -820.7285766601562, "logps/rejected": -851.3828125, "loss": 0.511, "rewards/accuracies": 0.75, "rewards/chosen": -3.0852344036102295, "rewards/margins": 0.7549958825111389, "rewards/rejected": -3.8402299880981445, "step": 800 }, { "epoch": 0.5224916319699567, "eval_logits/chosen": -1.527337670326233, "eval_logits/rejected": -1.5160092115402222, "eval_logps/chosen": -815.5145263671875, "eval_logps/rejected": -886.17138671875, "eval_loss": 0.5177087187767029, "eval_rewards/accuracies": 0.7400000095367432, "eval_rewards/chosen": -3.0516912937164307, "eval_rewards/margins": 0.7876277565956116, "eval_rewards/rejected": -3.8393189907073975, "eval_runtime": 296.7047, "eval_samples_per_second": 13.481, "eval_steps_per_second": 0.843, "step": 800 }, { "epoch": 0.5231447465099192, "grad_norm": 29.72721689887477, "learning_rate": 8.209048958470714e-08, "logits/chosen": -1.5397412776947021, "logits/rejected": -1.573317050933838, "logps/chosen": -794.4456176757812, "logps/rejected": -861.4932861328125, "loss": 0.4681, "rewards/accuracies": 0.71875, "rewards/chosen": -3.1979122161865234, "rewards/margins": 0.44129082560539246, "rewards/rejected": -3.6392035484313965, "step": 801 }, { "epoch": 0.5237978610498816, "grad_norm": 24.93121116287881, "learning_rate": 8.192012695974765e-08, "logits/chosen": -1.440004587173462, "logits/rejected": -1.4455904960632324, "logps/chosen": -756.099365234375, "logps/rejected": -812.0906372070312, "loss": 0.4259, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7917063236236572, "rewards/margins": 0.8878428936004639, "rewards/rejected": -3.6795494556427, "step": 802 }, { "epoch": 0.5244509755898441, "grad_norm": 21.486488987183108, "learning_rate": 8.174972831461975e-08, "logits/chosen": -1.4720410108566284, "logits/rejected": -1.4299757480621338, "logps/chosen": -797.6776733398438, "logps/rejected": -876.0466918945312, "loss": 0.5424, "rewards/accuracies": 0.84375, "rewards/chosen": -3.303070545196533, "rewards/margins": 0.6923160552978516, "rewards/rejected": -3.9953863620758057, "step": 803 }, { "epoch": 0.5251040901298065, "grad_norm": 15.956985029082132, "learning_rate": 8.157929453627079e-08, "logits/chosen": -1.528617262840271, "logits/rejected": -1.543229579925537, "logps/chosen": -814.8829345703125, "logps/rejected": -840.0612182617188, "loss": 0.5554, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0760016441345215, "rewards/margins": 0.729838490486145, "rewards/rejected": -3.805840253829956, "step": 804 }, { "epoch": 0.5257572046697689, "grad_norm": 14.955725644267702, "learning_rate": 8.140882651183087e-08, "logits/chosen": -1.4945454597473145, "logits/rejected": -1.50547194480896, "logps/chosen": -847.5591430664062, "logps/rejected": -860.2355346679688, "loss": 0.5914, "rewards/accuracies": 0.71875, "rewards/chosen": -3.3800101280212402, "rewards/margins": 0.4141124486923218, "rewards/rejected": -3.7941229343414307, "step": 805 }, { "epoch": 0.5264103192097314, "grad_norm": 22.99418263109128, "learning_rate": 8.123832512860848e-08, "logits/chosen": -1.5119673013687134, "logits/rejected": -1.5116000175476074, "logps/chosen": -837.5704956054688, "logps/rejected": -927.5317993164062, "loss": 0.5061, "rewards/accuracies": 0.84375, "rewards/chosen": -3.0885889530181885, "rewards/margins": 0.9447399377822876, "rewards/rejected": -4.033329010009766, "step": 806 }, { "epoch": 0.5270634337496939, "grad_norm": 18.701086163571972, "learning_rate": 8.106779127408563e-08, "logits/chosen": -1.4224164485931396, "logits/rejected": -1.4035032987594604, "logps/chosen": -733.4963989257812, "logps/rejected": -846.0479736328125, "loss": 0.5246, "rewards/accuracies": 0.8125, "rewards/chosen": -2.811711072921753, "rewards/margins": 0.7736900448799133, "rewards/rejected": -3.5854010581970215, "step": 807 }, { "epoch": 0.5277165482896563, "grad_norm": 40.22777398303923, "learning_rate": 8.08972258359134e-08, "logits/chosen": -1.497331142425537, "logits/rejected": -1.5132163763046265, "logps/chosen": -791.0665893554688, "logps/rejected": -848.4359130859375, "loss": 0.5209, "rewards/accuracies": 0.8125, "rewards/chosen": -2.804387092590332, "rewards/margins": 0.7790587544441223, "rewards/rejected": -3.5834455490112305, "step": 808 }, { "epoch": 0.5283696628296187, "grad_norm": 28.929510873590274, "learning_rate": 8.07266297019073e-08, "logits/chosen": -1.5387717485427856, "logits/rejected": -1.5651496648788452, "logps/chosen": -821.0679321289062, "logps/rejected": -911.5535278320312, "loss": 0.4894, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0287632942199707, "rewards/margins": 0.7187597155570984, "rewards/rejected": -3.747523307800293, "step": 809 }, { "epoch": 0.5290227773695811, "grad_norm": 45.22369470855717, "learning_rate": 8.055600376004255e-08, "logits/chosen": -1.5283149480819702, "logits/rejected": -1.520777940750122, "logps/chosen": -786.1401977539062, "logps/rejected": -925.03125, "loss": 0.5102, "rewards/accuracies": 0.84375, "rewards/chosen": -2.684399366378784, "rewards/margins": 1.1089178323745728, "rewards/rejected": -3.7933173179626465, "step": 810 }, { "epoch": 0.5296758919095437, "grad_norm": 42.19326253786495, "learning_rate": 8.038534889844956e-08, "logits/chosen": -1.6188210248947144, "logits/rejected": -1.5729517936706543, "logps/chosen": -868.0130004882812, "logps/rejected": -917.4168090820312, "loss": 0.5735, "rewards/accuracies": 0.84375, "rewards/chosen": -3.09293270111084, "rewards/margins": 1.0333658456802368, "rewards/rejected": -4.126298904418945, "step": 811 }, { "epoch": 0.5303290064495061, "grad_norm": 117.37707476367844, "learning_rate": 8.021466600540928e-08, "logits/chosen": -1.5684051513671875, "logits/rejected": -1.5472626686096191, "logps/chosen": -852.613525390625, "logps/rejected": -913.04833984375, "loss": 0.5436, "rewards/accuracies": 0.6875, "rewards/chosen": -3.4583096504211426, "rewards/margins": 0.717620313167572, "rewards/rejected": -4.175930023193359, "step": 812 }, { "epoch": 0.5309821209894685, "grad_norm": 28.12903118620616, "learning_rate": 8.004395596934856e-08, "logits/chosen": -1.4949054718017578, "logits/rejected": -1.4636915922164917, "logps/chosen": -733.1012573242188, "logps/rejected": -767.9586181640625, "loss": 0.5379, "rewards/accuracies": 0.71875, "rewards/chosen": -2.7585432529449463, "rewards/margins": 0.46337318420410156, "rewards/rejected": -3.221916675567627, "step": 813 }, { "epoch": 0.5316352355294309, "grad_norm": 9.221461814471073, "learning_rate": 7.987321967883549e-08, "logits/chosen": -1.6841922998428345, "logits/rejected": -1.6034355163574219, "logps/chosen": -839.085205078125, "logps/rejected": -876.251953125, "loss": 0.5158, "rewards/accuracies": 0.75, "rewards/chosen": -3.193796396255493, "rewards/margins": 0.6729452610015869, "rewards/rejected": -3.866741418838501, "step": 814 }, { "epoch": 0.5322883500693935, "grad_norm": 10.618048036543714, "learning_rate": 7.970245802257487e-08, "logits/chosen": -1.6014914512634277, "logits/rejected": -1.5899266004562378, "logps/chosen": -847.2337646484375, "logps/rejected": -908.861572265625, "loss": 0.5192, "rewards/accuracies": 0.78125, "rewards/chosen": -3.173642158508301, "rewards/margins": 0.662325382232666, "rewards/rejected": -3.835967540740967, "step": 815 }, { "epoch": 0.5329414646093559, "grad_norm": 24.838517311372605, "learning_rate": 7.953167188940353e-08, "logits/chosen": -1.531007170677185, "logits/rejected": -1.5374797582626343, "logps/chosen": -871.73291015625, "logps/rejected": -1004.2645874023438, "loss": 0.4973, "rewards/accuracies": 0.71875, "rewards/chosen": -3.170253276824951, "rewards/margins": 0.9588162899017334, "rewards/rejected": -4.1290693283081055, "step": 816 }, { "epoch": 0.5335945791493183, "grad_norm": 101.58547120360855, "learning_rate": 7.936086216828568e-08, "logits/chosen": -1.5855270624160767, "logits/rejected": -1.600976586341858, "logps/chosen": -784.5303955078125, "logps/rejected": -773.5015258789062, "loss": 0.4903, "rewards/accuracies": 0.6875, "rewards/chosen": -2.953719139099121, "rewards/margins": 0.48683837056159973, "rewards/rejected": -3.4405574798583984, "step": 817 }, { "epoch": 0.5342476936892807, "grad_norm": 39.27500759821802, "learning_rate": 7.919002974830833e-08, "logits/chosen": -1.577518105506897, "logits/rejected": -1.5548105239868164, "logps/chosen": -707.9842529296875, "logps/rejected": -847.3580932617188, "loss": 0.4995, "rewards/accuracies": 0.84375, "rewards/chosen": -2.563530921936035, "rewards/margins": 0.9225165843963623, "rewards/rejected": -3.4860472679138184, "step": 818 }, { "epoch": 0.5349008082292432, "grad_norm": 51.172477176857036, "learning_rate": 7.901917551867663e-08, "logits/chosen": -1.5167884826660156, "logits/rejected": -1.4933404922485352, "logps/chosen": -764.8781127929688, "logps/rejected": -829.58935546875, "loss": 0.4601, "rewards/accuracies": 0.75, "rewards/chosen": -2.603954792022705, "rewards/margins": 0.4926825761795044, "rewards/rejected": -3.09663724899292, "step": 819 }, { "epoch": 0.5355539227692057, "grad_norm": 30.46290960655871, "learning_rate": 7.884830036870922e-08, "logits/chosen": -1.4870680570602417, "logits/rejected": -1.4628260135650635, "logps/chosen": -821.735595703125, "logps/rejected": -956.1165771484375, "loss": 0.484, "rewards/accuracies": 0.84375, "rewards/chosen": -3.0134003162384033, "rewards/margins": 1.0060707330703735, "rewards/rejected": -4.019470691680908, "step": 820 }, { "epoch": 0.5362070373091681, "grad_norm": 13.505022242348229, "learning_rate": 7.867740518783371e-08, "logits/chosen": -1.5747795104980469, "logits/rejected": -1.5648517608642578, "logps/chosen": -851.3766479492188, "logps/rejected": -903.6992797851562, "loss": 0.5012, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9860377311706543, "rewards/margins": 0.8135138750076294, "rewards/rejected": -3.799551486968994, "step": 821 }, { "epoch": 0.5368601518491305, "grad_norm": 19.48923466978227, "learning_rate": 7.85064908655819e-08, "logits/chosen": -1.4971075057983398, "logits/rejected": -1.548054814338684, "logps/chosen": -855.16748046875, "logps/rejected": -904.4014282226562, "loss": 0.4958, "rewards/accuracies": 0.8125, "rewards/chosen": -3.101994514465332, "rewards/margins": 0.8241324424743652, "rewards/rejected": -3.926126480102539, "step": 822 }, { "epoch": 0.537513266389093, "grad_norm": 24.801244541119452, "learning_rate": 7.833555829158527e-08, "logits/chosen": -1.5962388515472412, "logits/rejected": -1.5477280616760254, "logps/chosen": -887.2816772460938, "logps/rejected": -913.8333129882812, "loss": 0.4767, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1752548217773438, "rewards/margins": 0.6703548431396484, "rewards/rejected": -3.845609664916992, "step": 823 }, { "epoch": 0.5381663809290554, "grad_norm": 42.45784716129404, "learning_rate": 7.816460835557028e-08, "logits/chosen": -1.5705194473266602, "logits/rejected": -1.5918525457382202, "logps/chosen": -808.6838989257812, "logps/rejected": -973.3630981445312, "loss": 0.4975, "rewards/accuracies": 0.78125, "rewards/chosen": -3.012310743331909, "rewards/margins": 1.2067666053771973, "rewards/rejected": -4.2190775871276855, "step": 824 }, { "epoch": 0.5388194954690179, "grad_norm": 18.2583142847124, "learning_rate": 7.799364194735377e-08, "logits/chosen": -1.5670945644378662, "logits/rejected": -1.5296969413757324, "logps/chosen": -861.2294311523438, "logps/rejected": -866.001953125, "loss": 0.5391, "rewards/accuracies": 0.625, "rewards/chosen": -3.31349515914917, "rewards/margins": 0.387903094291687, "rewards/rejected": -3.7013983726501465, "step": 825 }, { "epoch": 0.5394726100089803, "grad_norm": 14.894630917984315, "learning_rate": 7.782265995683828e-08, "logits/chosen": -1.539316177368164, "logits/rejected": -1.5315918922424316, "logps/chosen": -808.724609375, "logps/rejected": -842.3665771484375, "loss": 0.4964, "rewards/accuracies": 0.78125, "rewards/chosen": -2.908222198486328, "rewards/margins": 0.6761490106582642, "rewards/rejected": -3.584371328353882, "step": 826 }, { "epoch": 0.5401257245489428, "grad_norm": 46.701075509344925, "learning_rate": 7.765166327400754e-08, "logits/chosen": -1.5789310932159424, "logits/rejected": -1.552915334701538, "logps/chosen": -835.5314331054688, "logps/rejected": -890.4103393554688, "loss": 0.4631, "rewards/accuracies": 0.875, "rewards/chosen": -3.1613926887512207, "rewards/margins": 0.716499924659729, "rewards/rejected": -3.8778927326202393, "step": 827 }, { "epoch": 0.5407788390889052, "grad_norm": 23.68366708316572, "learning_rate": 7.748065278892171e-08, "logits/chosen": -1.5369728803634644, "logits/rejected": -1.5010067224502563, "logps/chosen": -812.9715576171875, "logps/rejected": -846.779052734375, "loss": 0.5664, "rewards/accuracies": 0.65625, "rewards/chosen": -3.117812156677246, "rewards/margins": 0.5145601630210876, "rewards/rejected": -3.6323723793029785, "step": 828 }, { "epoch": 0.5414319536288676, "grad_norm": 9.80288754353676, "learning_rate": 7.730962939171278e-08, "logits/chosen": -1.5014228820800781, "logits/rejected": -1.503250241279602, "logps/chosen": -921.4869384765625, "logps/rejected": -941.43896484375, "loss": 0.4775, "rewards/accuracies": 0.8125, "rewards/chosen": -3.4559803009033203, "rewards/margins": 0.8563462495803833, "rewards/rejected": -4.312326431274414, "step": 829 }, { "epoch": 0.5420850681688301, "grad_norm": 23.08260983849523, "learning_rate": 7.713859397257995e-08, "logits/chosen": -1.5203160047531128, "logits/rejected": -1.4632571935653687, "logps/chosen": -813.74267578125, "logps/rejected": -845.8812255859375, "loss": 0.5687, "rewards/accuracies": 0.75, "rewards/chosen": -3.1719284057617188, "rewards/margins": 0.44786882400512695, "rewards/rejected": -3.6197972297668457, "step": 830 }, { "epoch": 0.5427381827087926, "grad_norm": 86.00587707569419, "learning_rate": 7.696754742178503e-08, "logits/chosen": -1.5021620988845825, "logits/rejected": -1.5242396593093872, "logps/chosen": -817.4739990234375, "logps/rejected": -865.0052490234375, "loss": 0.5095, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8962783813476562, "rewards/margins": 0.689846932888031, "rewards/rejected": -3.586124897003174, "step": 831 }, { "epoch": 0.543391297248755, "grad_norm": 52.62733320349321, "learning_rate": 7.679649062964774e-08, "logits/chosen": -1.567655324935913, "logits/rejected": -1.5549966096878052, "logps/chosen": -735.2625732421875, "logps/rejected": -834.2606811523438, "loss": 0.459, "rewards/accuracies": 0.875, "rewards/chosen": -2.9052071571350098, "rewards/margins": 0.9503459930419922, "rewards/rejected": -3.85555362701416, "step": 832 }, { "epoch": 0.5440444117887174, "grad_norm": 34.035780541177466, "learning_rate": 7.662542448654109e-08, "logits/chosen": -1.5708342790603638, "logits/rejected": -1.511175274848938, "logps/chosen": -821.263671875, "logps/rejected": -975.2786254882812, "loss": 0.5008, "rewards/accuracies": 0.90625, "rewards/chosen": -2.831267833709717, "rewards/margins": 1.1821346282958984, "rewards/rejected": -4.013402462005615, "step": 833 }, { "epoch": 0.5446975263286798, "grad_norm": 12.816837148569553, "learning_rate": 7.645434988288683e-08, "logits/chosen": -1.5968691110610962, "logits/rejected": -1.5883498191833496, "logps/chosen": -778.30712890625, "logps/rejected": -840.7581176757812, "loss": 0.5001, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8528571128845215, "rewards/margins": 0.7486177086830139, "rewards/rejected": -3.6014747619628906, "step": 834 }, { "epoch": 0.5453506408686424, "grad_norm": 22.221139309815047, "learning_rate": 7.628326770915069e-08, "logits/chosen": -1.5044746398925781, "logits/rejected": -1.4766840934753418, "logps/chosen": -839.3434448242188, "logps/rejected": -888.7236328125, "loss": 0.4587, "rewards/accuracies": 0.625, "rewards/chosen": -3.229734182357788, "rewards/margins": 0.8066515922546387, "rewards/rejected": -4.036386013031006, "step": 835 }, { "epoch": 0.5460037554086048, "grad_norm": 21.464697851921866, "learning_rate": 7.611217885583783e-08, "logits/chosen": -1.6017417907714844, "logits/rejected": -1.6247013807296753, "logps/chosen": -887.736328125, "logps/rejected": -987.9991455078125, "loss": 0.5011, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1084957122802734, "rewards/margins": 0.854011595249176, "rewards/rejected": -3.9625072479248047, "step": 836 }, { "epoch": 0.5466568699485672, "grad_norm": 39.79039092333048, "learning_rate": 7.594108421348816e-08, "logits/chosen": -1.4360780715942383, "logits/rejected": -1.4754202365875244, "logps/chosen": -759.8140258789062, "logps/rejected": -813.8535766601562, "loss": 0.5061, "rewards/accuracies": 0.8125, "rewards/chosen": -2.8158435821533203, "rewards/margins": 0.8641850352287292, "rewards/rejected": -3.6800286769866943, "step": 837 }, { "epoch": 0.5473099844885296, "grad_norm": 55.549125631484166, "learning_rate": 7.576998467267174e-08, "logits/chosen": -1.56089448928833, "logits/rejected": -1.581824541091919, "logps/chosen": -909.1053466796875, "logps/rejected": -948.4317016601562, "loss": 0.5306, "rewards/accuracies": 0.78125, "rewards/chosen": -3.213887929916382, "rewards/margins": 0.46870189905166626, "rewards/rejected": -3.6825897693634033, "step": 838 }, { "epoch": 0.5479630990284922, "grad_norm": 55.14261568208239, "learning_rate": 7.559888112398411e-08, "logits/chosen": -1.4533392190933228, "logits/rejected": -1.4627681970596313, "logps/chosen": -726.8400268554688, "logps/rejected": -814.5458374023438, "loss": 0.5163, "rewards/accuracies": 0.8125, "rewards/chosen": -2.764704942703247, "rewards/margins": 0.9564896821975708, "rewards/rejected": -3.7211947441101074, "step": 839 }, { "epoch": 0.5486162135684546, "grad_norm": 30.505010236460816, "learning_rate": 7.542777445804171e-08, "logits/chosen": -1.3663051128387451, "logits/rejected": -1.3810725212097168, "logps/chosen": -795.670654296875, "logps/rejected": -962.4092407226562, "loss": 0.4755, "rewards/accuracies": 0.84375, "rewards/chosen": -3.0239500999450684, "rewards/margins": 1.2967259883880615, "rewards/rejected": -4.320675849914551, "step": 840 }, { "epoch": 0.549269328108417, "grad_norm": 14.300839480022828, "learning_rate": 7.525666556547714e-08, "logits/chosen": -1.6288928985595703, "logits/rejected": -1.5527688264846802, "logps/chosen": -799.6822509765625, "logps/rejected": -940.3196411132812, "loss": 0.494, "rewards/accuracies": 0.78125, "rewards/chosen": -2.9739322662353516, "rewards/margins": 0.9197661876678467, "rewards/rejected": -3.893698215484619, "step": 841 }, { "epoch": 0.5499224426483794, "grad_norm": 88.7947159228681, "learning_rate": 7.508555533693462e-08, "logits/chosen": -1.4860543012619019, "logits/rejected": -1.4642175436019897, "logps/chosen": -732.2152099609375, "logps/rejected": -754.6222534179688, "loss": 0.4724, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6171233654022217, "rewards/margins": 0.7891316413879395, "rewards/rejected": -3.4062552452087402, "step": 842 }, { "epoch": 0.550575557188342, "grad_norm": 69.35311679506285, "learning_rate": 7.49144446630654e-08, "logits/chosen": -1.4915255308151245, "logits/rejected": -1.4560573101043701, "logps/chosen": -874.8602294921875, "logps/rejected": -909.4222412109375, "loss": 0.5181, "rewards/accuracies": 0.78125, "rewards/chosen": -3.3296079635620117, "rewards/margins": 0.6430115699768066, "rewards/rejected": -3.9726200103759766, "step": 843 }, { "epoch": 0.5512286717283044, "grad_norm": 14.122314075620803, "learning_rate": 7.474333443452289e-08, "logits/chosen": -1.5023688077926636, "logits/rejected": -1.4814002513885498, "logps/chosen": -812.0813598632812, "logps/rejected": -892.1846923828125, "loss": 0.4773, "rewards/accuracies": 0.84375, "rewards/chosen": -3.119580030441284, "rewards/margins": 1.089004397392273, "rewards/rejected": -4.208584785461426, "step": 844 }, { "epoch": 0.5518817862682668, "grad_norm": 22.523095660476606, "learning_rate": 7.45722255419583e-08, "logits/chosen": -1.598945140838623, "logits/rejected": -1.5560745000839233, "logps/chosen": -848.6195678710938, "logps/rejected": -891.3744506835938, "loss": 0.5091, "rewards/accuracies": 0.71875, "rewards/chosen": -3.038963556289673, "rewards/margins": 0.6308895349502563, "rewards/rejected": -3.6698527336120605, "step": 845 }, { "epoch": 0.5525349008082292, "grad_norm": 37.5554038949777, "learning_rate": 7.44011188760159e-08, "logits/chosen": -1.593327283859253, "logits/rejected": -1.6003532409667969, "logps/chosen": -984.0642700195312, "logps/rejected": -945.6554565429688, "loss": 0.5136, "rewards/accuracies": 0.6875, "rewards/chosen": -3.7509801387786865, "rewards/margins": 0.44811174273490906, "rewards/rejected": -4.199091911315918, "step": 846 }, { "epoch": 0.5531880153481917, "grad_norm": 16.920523902644987, "learning_rate": 7.423001532732826e-08, "logits/chosen": -1.5089354515075684, "logits/rejected": -1.468766212463379, "logps/chosen": -713.7976684570312, "logps/rejected": -817.9136962890625, "loss": 0.5476, "rewards/accuracies": 0.75, "rewards/chosen": -2.9451818466186523, "rewards/margins": 1.023890495300293, "rewards/rejected": -3.9690723419189453, "step": 847 }, { "epoch": 0.5538411298881541, "grad_norm": 11.301729003307928, "learning_rate": 7.405891578651185e-08, "logits/chosen": -1.5803592205047607, "logits/rejected": -1.5427913665771484, "logps/chosen": -875.8158569335938, "logps/rejected": -872.2986450195312, "loss": 0.5062, "rewards/accuracies": 0.71875, "rewards/chosen": -3.213369369506836, "rewards/margins": 0.4956037104129791, "rewards/rejected": -3.708972930908203, "step": 848 }, { "epoch": 0.5544942444281166, "grad_norm": 74.39833754806011, "learning_rate": 7.388782114416217e-08, "logits/chosen": -1.6001735925674438, "logits/rejected": -1.5752531290054321, "logps/chosen": -795.3203125, "logps/rejected": -860.407470703125, "loss": 0.5166, "rewards/accuracies": 0.90625, "rewards/chosen": -3.002932548522949, "rewards/margins": 0.9377602934837341, "rewards/rejected": -3.9406933784484863, "step": 849 }, { "epoch": 0.555147358968079, "grad_norm": 19.66725027995187, "learning_rate": 7.371673229084931e-08, "logits/chosen": -1.5427309274673462, "logits/rejected": -1.5140000581741333, "logps/chosen": -799.3212890625, "logps/rejected": -892.1961669921875, "loss": 0.5065, "rewards/accuracies": 0.75, "rewards/chosen": -3.130077838897705, "rewards/margins": 1.1122186183929443, "rewards/rejected": -4.24229621887207, "step": 850 }, { "epoch": 0.5558004735080415, "grad_norm": 40.531931496184235, "learning_rate": 7.354565011711317e-08, "logits/chosen": -1.4393893480300903, "logits/rejected": -1.425200343132019, "logps/chosen": -727.331787109375, "logps/rejected": -858.167236328125, "loss": 0.4825, "rewards/accuracies": 0.875, "rewards/chosen": -2.9096834659576416, "rewards/margins": 1.041924238204956, "rewards/rejected": -3.9516077041625977, "step": 851 }, { "epoch": 0.5564535880480039, "grad_norm": 16.984478852782892, "learning_rate": 7.33745755134589e-08, "logits/chosen": -1.5062000751495361, "logits/rejected": -1.53849458694458, "logps/chosen": -802.3837890625, "logps/rejected": -890.3169555664062, "loss": 0.5344, "rewards/accuracies": 0.78125, "rewards/chosen": -3.2534284591674805, "rewards/margins": 0.8640724420547485, "rewards/rejected": -4.1175007820129395, "step": 852 }, { "epoch": 0.5571067025879664, "grad_norm": 35.84235839938917, "learning_rate": 7.320350937035228e-08, "logits/chosen": -1.5342974662780762, "logits/rejected": -1.5023765563964844, "logps/chosen": -835.8595581054688, "logps/rejected": -865.0036010742188, "loss": 0.5255, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0694611072540283, "rewards/margins": 0.7715376615524292, "rewards/rejected": -3.840998411178589, "step": 853 }, { "epoch": 0.5577598171279288, "grad_norm": 10.233512310662372, "learning_rate": 7.303245257821498e-08, "logits/chosen": -1.517195701599121, "logits/rejected": -1.518413782119751, "logps/chosen": -832.9429931640625, "logps/rejected": -876.033447265625, "loss": 0.5145, "rewards/accuracies": 0.8125, "rewards/chosen": -2.8127410411834717, "rewards/margins": 0.624780535697937, "rewards/rejected": -3.437521457672119, "step": 854 }, { "epoch": 0.5584129316678913, "grad_norm": 26.56768549861038, "learning_rate": 7.286140602742005e-08, "logits/chosen": -1.4718785285949707, "logits/rejected": -1.4336018562316895, "logps/chosen": -897.831298828125, "logps/rejected": -885.6130981445312, "loss": 0.5896, "rewards/accuracies": 0.78125, "rewards/chosen": -3.3304998874664307, "rewards/margins": 0.5319950580596924, "rewards/rejected": -3.862494945526123, "step": 855 }, { "epoch": 0.5590660462078537, "grad_norm": 184.22901412620044, "learning_rate": 7.269037060828724e-08, "logits/chosen": -1.501589298248291, "logits/rejected": -1.5195332765579224, "logps/chosen": -806.921875, "logps/rejected": -836.3479614257812, "loss": 0.5501, "rewards/accuracies": 0.71875, "rewards/chosen": -2.9807791709899902, "rewards/margins": 0.5941177606582642, "rewards/rejected": -3.574897050857544, "step": 856 }, { "epoch": 0.5597191607478161, "grad_norm": 67.4144741062901, "learning_rate": 7.25193472110783e-08, "logits/chosen": -1.5734254121780396, "logits/rejected": -1.5101865530014038, "logps/chosen": -823.78759765625, "logps/rejected": -862.5943603515625, "loss": 0.4974, "rewards/accuracies": 0.8125, "rewards/chosen": -3.210695266723633, "rewards/margins": 0.569321870803833, "rewards/rejected": -3.780017375946045, "step": 857 }, { "epoch": 0.5603722752877786, "grad_norm": 18.871171186142504, "learning_rate": 7.234833672599245e-08, "logits/chosen": -1.580259919166565, "logits/rejected": -1.5304930210113525, "logps/chosen": -937.5279541015625, "logps/rejected": -984.5499267578125, "loss": 0.5273, "rewards/accuracies": 0.8125, "rewards/chosen": -3.529520034790039, "rewards/margins": 0.9078921675682068, "rewards/rejected": -4.437412261962891, "step": 858 }, { "epoch": 0.5610253898277411, "grad_norm": 48.26102317175817, "learning_rate": 7.217734004316172e-08, "logits/chosen": -1.5722343921661377, "logits/rejected": -1.5775549411773682, "logps/chosen": -803.7421875, "logps/rejected": -866.3418579101562, "loss": 0.4969, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7525508403778076, "rewards/margins": 0.8512840270996094, "rewards/rejected": -3.603835105895996, "step": 859 }, { "epoch": 0.5616785043677035, "grad_norm": 13.00017996435541, "learning_rate": 7.200635805264625e-08, "logits/chosen": -1.551430583000183, "logits/rejected": -1.4977049827575684, "logps/chosen": -795.7646484375, "logps/rejected": -836.5863037109375, "loss": 0.4979, "rewards/accuracies": 0.71875, "rewards/chosen": -3.18217134475708, "rewards/margins": 0.8377689123153687, "rewards/rejected": -4.019940376281738, "step": 860 }, { "epoch": 0.5623316189076659, "grad_norm": 9.231732491941768, "learning_rate": 7.183539164442973e-08, "logits/chosen": -1.530940055847168, "logits/rejected": -1.5165448188781738, "logps/chosen": -862.6856689453125, "logps/rejected": -992.7938232421875, "loss": 0.455, "rewards/accuracies": 0.875, "rewards/chosen": -3.554490089416504, "rewards/margins": 1.1885462999343872, "rewards/rejected": -4.743036270141602, "step": 861 }, { "epoch": 0.5629847334476283, "grad_norm": 62.61248101231807, "learning_rate": 7.166444170841473e-08, "logits/chosen": -1.4707496166229248, "logits/rejected": -1.4739537239074707, "logps/chosen": -821.8203125, "logps/rejected": -850.6475219726562, "loss": 0.5211, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7928149700164795, "rewards/margins": 0.47822028398513794, "rewards/rejected": -3.2710354328155518, "step": 862 }, { "epoch": 0.5636378479875909, "grad_norm": 109.02226402387393, "learning_rate": 7.149350913441809e-08, "logits/chosen": -1.5233734846115112, "logits/rejected": -1.5370923280715942, "logps/chosen": -793.0894775390625, "logps/rejected": -808.7146606445312, "loss": 0.534, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8652713298797607, "rewards/margins": 0.5744865536689758, "rewards/rejected": -3.43975830078125, "step": 863 }, { "epoch": 0.5642909625275533, "grad_norm": 29.93004571485592, "learning_rate": 7.132259481216628e-08, "logits/chosen": -1.543608546257019, "logits/rejected": -1.496211051940918, "logps/chosen": -812.6435546875, "logps/rejected": -853.930908203125, "loss": 0.4421, "rewards/accuracies": 0.90625, "rewards/chosen": -3.108489990234375, "rewards/margins": 0.8767549991607666, "rewards/rejected": -3.9852447509765625, "step": 864 }, { "epoch": 0.5649440770675157, "grad_norm": 61.12660963375552, "learning_rate": 7.115169963129076e-08, "logits/chosen": -1.5221493244171143, "logits/rejected": -1.4868009090423584, "logps/chosen": -754.1113891601562, "logps/rejected": -894.2318115234375, "loss": 0.475, "rewards/accuracies": 0.65625, "rewards/chosen": -2.7326500415802, "rewards/margins": 0.7210690975189209, "rewards/rejected": -3.453718662261963, "step": 865 }, { "epoch": 0.5655971916074781, "grad_norm": 17.649759346104574, "learning_rate": 7.098082448132339e-08, "logits/chosen": -1.4271429777145386, "logits/rejected": -1.3975777626037598, "logps/chosen": -795.72412109375, "logps/rejected": -917.9866943359375, "loss": 0.5119, "rewards/accuracies": 0.75, "rewards/chosen": -3.1587576866149902, "rewards/margins": 0.951889157295227, "rewards/rejected": -4.110646724700928, "step": 866 }, { "epoch": 0.5662503061474407, "grad_norm": 8.92934245433921, "learning_rate": 7.080997025169167e-08, "logits/chosen": -1.5210411548614502, "logits/rejected": -1.5380655527114868, "logps/chosen": -813.1768798828125, "logps/rejected": -933.7425537109375, "loss": 0.4734, "rewards/accuracies": 0.84375, "rewards/chosen": -3.348355293273926, "rewards/margins": 0.8711065649986267, "rewards/rejected": -4.2194623947143555, "step": 867 }, { "epoch": 0.5669034206874031, "grad_norm": 12.604264013248898, "learning_rate": 7.063913783171431e-08, "logits/chosen": -1.532928466796875, "logits/rejected": -1.5313588380813599, "logps/chosen": -769.3619995117188, "logps/rejected": -937.7196044921875, "loss": 0.4693, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7876639366149902, "rewards/margins": 1.2222365140914917, "rewards/rejected": -4.0099005699157715, "step": 868 }, { "epoch": 0.5675565352273655, "grad_norm": 78.29234208780828, "learning_rate": 7.046832811059646e-08, "logits/chosen": -1.5154472589492798, "logits/rejected": -1.5125453472137451, "logps/chosen": -837.0894165039062, "logps/rejected": -932.6472778320312, "loss": 0.4858, "rewards/accuracies": 0.6875, "rewards/chosen": -2.860597848892212, "rewards/margins": 0.9204597473144531, "rewards/rejected": -3.781057596206665, "step": 869 }, { "epoch": 0.5682096497673279, "grad_norm": 180.9882527133902, "learning_rate": 7.029754197742512e-08, "logits/chosen": -1.5261096954345703, "logits/rejected": -1.5237348079681396, "logps/chosen": -826.7614135742188, "logps/rejected": -1008.6333618164062, "loss": 0.5309, "rewards/accuracies": 0.75, "rewards/chosen": -3.2434701919555664, "rewards/margins": 1.0818681716918945, "rewards/rejected": -4.325338363647461, "step": 870 }, { "epoch": 0.5688627643072904, "grad_norm": 40.32414479490123, "learning_rate": 7.01267803211645e-08, "logits/chosen": -1.5508898496627808, "logits/rejected": -1.527504801750183, "logps/chosen": -790.9599609375, "logps/rejected": -838.9732666015625, "loss": 0.5444, "rewards/accuracies": 0.59375, "rewards/chosen": -3.0933451652526855, "rewards/margins": 0.376648485660553, "rewards/rejected": -3.469994306564331, "step": 871 }, { "epoch": 0.5695158788472529, "grad_norm": 72.83509120194428, "learning_rate": 6.995604403065144e-08, "logits/chosen": -1.5775620937347412, "logits/rejected": -1.5864499807357788, "logps/chosen": -769.3580932617188, "logps/rejected": -876.96484375, "loss": 0.5465, "rewards/accuracies": 0.78125, "rewards/chosen": -3.112041473388672, "rewards/margins": 0.8449998497962952, "rewards/rejected": -3.957040786743164, "step": 872 }, { "epoch": 0.5701689933872153, "grad_norm": 12.905974016118856, "learning_rate": 6.978533399459071e-08, "logits/chosen": -1.5712107419967651, "logits/rejected": -1.5447235107421875, "logps/chosen": -777.7793579101562, "logps/rejected": -840.2425537109375, "loss": 0.5643, "rewards/accuracies": 0.71875, "rewards/chosen": -2.9859039783477783, "rewards/margins": 0.5791632533073425, "rewards/rejected": -3.5650668144226074, "step": 873 }, { "epoch": 0.5708221079271777, "grad_norm": 21.726965969176625, "learning_rate": 6.961465110155043e-08, "logits/chosen": -1.512514352798462, "logits/rejected": -1.4853906631469727, "logps/chosen": -893.4528198242188, "logps/rejected": -1067.001220703125, "loss": 0.4754, "rewards/accuracies": 0.84375, "rewards/chosen": -3.4161694049835205, "rewards/margins": 1.114807367324829, "rewards/rejected": -4.530977249145508, "step": 874 }, { "epoch": 0.5714752224671402, "grad_norm": 22.280036018028355, "learning_rate": 6.944399623995744e-08, "logits/chosen": -1.5252811908721924, "logits/rejected": -1.5037376880645752, "logps/chosen": -850.0037841796875, "logps/rejected": -850.1954345703125, "loss": 0.5284, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4480960369110107, "rewards/margins": 0.41501325368881226, "rewards/rejected": -3.8631091117858887, "step": 875 }, { "epoch": 0.5721283370071026, "grad_norm": 40.53249032061718, "learning_rate": 6.92733702980927e-08, "logits/chosen": -1.5603222846984863, "logits/rejected": -1.5633220672607422, "logps/chosen": -817.9495849609375, "logps/rejected": -1105.2486572265625, "loss": 0.442, "rewards/accuracies": 0.9375, "rewards/chosen": -3.2014811038970947, "rewards/margins": 1.719805359840393, "rewards/rejected": -4.921286106109619, "step": 876 }, { "epoch": 0.5727814515470651, "grad_norm": 18.002814145085477, "learning_rate": 6.910277416408661e-08, "logits/chosen": -1.5658931732177734, "logits/rejected": -1.5071964263916016, "logps/chosen": -770.361083984375, "logps/rejected": -793.378173828125, "loss": 0.4778, "rewards/accuracies": 0.75, "rewards/chosen": -3.1433448791503906, "rewards/margins": 0.5039821863174438, "rewards/rejected": -3.647326946258545, "step": 877 }, { "epoch": 0.5734345660870275, "grad_norm": 12.307347390907383, "learning_rate": 6.89322087259144e-08, "logits/chosen": -1.511487603187561, "logits/rejected": -1.5128693580627441, "logps/chosen": -846.0947875976562, "logps/rejected": -906.3300170898438, "loss": 0.4864, "rewards/accuracies": 0.71875, "rewards/chosen": -3.2089366912841797, "rewards/margins": 0.9323613047599792, "rewards/rejected": -4.141297340393066, "step": 878 }, { "epoch": 0.57408768062699, "grad_norm": 21.452756223686876, "learning_rate": 6.876167487139154e-08, "logits/chosen": -1.526938796043396, "logits/rejected": -1.5194412469863892, "logps/chosen": -1003.674560546875, "logps/rejected": -1024.025390625, "loss": 0.5129, "rewards/accuracies": 0.71875, "rewards/chosen": -3.6137397289276123, "rewards/margins": 0.7001018524169922, "rewards/rejected": -4.313840866088867, "step": 879 }, { "epoch": 0.5747407951669524, "grad_norm": 12.942447833043328, "learning_rate": 6.859117348816912e-08, "logits/chosen": -1.4752848148345947, "logits/rejected": -1.3929071426391602, "logps/chosen": -797.0738525390625, "logps/rejected": -978.4248046875, "loss": 0.5142, "rewards/accuracies": 0.8125, "rewards/chosen": -3.4416940212249756, "rewards/margins": 1.1079531908035278, "rewards/rejected": -4.549646854400635, "step": 880 }, { "epoch": 0.5753939097069148, "grad_norm": 64.69859008439977, "learning_rate": 6.842070546372922e-08, "logits/chosen": -1.5244097709655762, "logits/rejected": -1.486146092414856, "logps/chosen": -969.6875, "logps/rejected": -1005.9173583984375, "loss": 0.6013, "rewards/accuracies": 0.71875, "rewards/chosen": -3.9753427505493164, "rewards/margins": 0.6842749118804932, "rewards/rejected": -4.659617900848389, "step": 881 }, { "epoch": 0.5760470242468773, "grad_norm": 42.79091393495759, "learning_rate": 6.825027168538024e-08, "logits/chosen": -1.4888124465942383, "logits/rejected": -1.4149153232574463, "logps/chosen": -813.0571899414062, "logps/rejected": -825.3543701171875, "loss": 0.574, "rewards/accuracies": 0.8125, "rewards/chosen": -3.368730068206787, "rewards/margins": 0.39747297763824463, "rewards/rejected": -3.766202926635742, "step": 882 }, { "epoch": 0.5767001387868398, "grad_norm": 18.008747424093183, "learning_rate": 6.807987304025236e-08, "logits/chosen": -1.4898674488067627, "logits/rejected": -1.4935599565505981, "logps/chosen": -851.6963500976562, "logps/rejected": -859.6405639648438, "loss": 0.5219, "rewards/accuracies": 0.6875, "rewards/chosen": -3.2997894287109375, "rewards/margins": 0.487488329410553, "rewards/rejected": -3.787277936935425, "step": 883 }, { "epoch": 0.5773532533268022, "grad_norm": 41.196321721092666, "learning_rate": 6.790951041529286e-08, "logits/chosen": -1.5880982875823975, "logits/rejected": -1.5457425117492676, "logps/chosen": -958.2757568359375, "logps/rejected": -986.9423828125, "loss": 0.522, "rewards/accuracies": 0.6875, "rewards/chosen": -3.4477083683013916, "rewards/margins": 0.6854377388954163, "rewards/rejected": -4.133145809173584, "step": 884 }, { "epoch": 0.5780063678667646, "grad_norm": 30.703596533657475, "learning_rate": 6.773918469726156e-08, "logits/chosen": -1.480318546295166, "logits/rejected": -1.4444735050201416, "logps/chosen": -940.3262939453125, "logps/rejected": -1004.0597534179688, "loss": 0.5406, "rewards/accuracies": 0.65625, "rewards/chosen": -3.5821070671081543, "rewards/margins": 0.554044246673584, "rewards/rejected": -4.136151313781738, "step": 885 }, { "epoch": 0.578659482406727, "grad_norm": 126.80730044989116, "learning_rate": 6.756889677272617e-08, "logits/chosen": -1.5217347145080566, "logits/rejected": -1.498375415802002, "logps/chosen": -762.6358642578125, "logps/rejected": -902.398681640625, "loss": 0.5009, "rewards/accuracies": 0.75, "rewards/chosen": -3.058880567550659, "rewards/margins": 0.8846085667610168, "rewards/rejected": -3.943488597869873, "step": 886 }, { "epoch": 0.5793125969466896, "grad_norm": 11.803823871913366, "learning_rate": 6.739864752805765e-08, "logits/chosen": -1.4541584253311157, "logits/rejected": -1.426889181137085, "logps/chosen": -766.4666137695312, "logps/rejected": -813.1023559570312, "loss": 0.4918, "rewards/accuracies": 0.78125, "rewards/chosen": -3.2732667922973633, "rewards/margins": 0.5391014218330383, "rewards/rejected": -3.8123679161071777, "step": 887 }, { "epoch": 0.579965711486652, "grad_norm": 21.000885515886726, "learning_rate": 6.722843784942565e-08, "logits/chosen": -1.4722752571105957, "logits/rejected": -1.467155933380127, "logps/chosen": -837.9805908203125, "logps/rejected": -857.227294921875, "loss": 0.6011, "rewards/accuracies": 0.625, "rewards/chosen": -3.297391653060913, "rewards/margins": 0.3573925197124481, "rewards/rejected": -3.6547842025756836, "step": 888 }, { "epoch": 0.5806188260266144, "grad_norm": 17.365098193739787, "learning_rate": 6.705826862279391e-08, "logits/chosen": -1.4612233638763428, "logits/rejected": -1.4755150079727173, "logps/chosen": -873.2598876953125, "logps/rejected": -963.6185913085938, "loss": 0.5842, "rewards/accuracies": 0.84375, "rewards/chosen": -3.1137642860412598, "rewards/margins": 1.115809679031372, "rewards/rejected": -4.229574203491211, "step": 889 }, { "epoch": 0.5812719405665768, "grad_norm": 13.20416027815365, "learning_rate": 6.688814073391551e-08, "logits/chosen": -1.4795432090759277, "logits/rejected": -1.528482437133789, "logps/chosen": -914.3403930664062, "logps/rejected": -992.7194213867188, "loss": 0.5484, "rewards/accuracies": 0.6875, "rewards/chosen": -3.8117380142211914, "rewards/margins": 0.9581300616264343, "rewards/rejected": -4.769867897033691, "step": 890 }, { "epoch": 0.5819250551065394, "grad_norm": 58.643014274399484, "learning_rate": 6.671805506832844e-08, "logits/chosen": -1.4377716779708862, "logits/rejected": -1.4766364097595215, "logps/chosen": -770.939697265625, "logps/rejected": -905.95458984375, "loss": 0.5286, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8314456939697266, "rewards/margins": 0.9767945408821106, "rewards/rejected": -3.8082404136657715, "step": 891 }, { "epoch": 0.5825781696465018, "grad_norm": 55.6022723858616, "learning_rate": 6.654801251135092e-08, "logits/chosen": -1.6092873811721802, "logits/rejected": -1.6193262338638306, "logps/chosen": -859.6929321289062, "logps/rejected": -937.2235107421875, "loss": 0.5144, "rewards/accuracies": 0.75, "rewards/chosen": -3.6701273918151855, "rewards/margins": 0.6065517067909241, "rewards/rejected": -4.276679039001465, "step": 892 }, { "epoch": 0.5832312841864642, "grad_norm": 21.239795373704855, "learning_rate": 6.637801394807675e-08, "logits/chosen": -1.465186357498169, "logits/rejected": -1.5041096210479736, "logps/chosen": -876.3399047851562, "logps/rejected": -936.5634765625, "loss": 0.479, "rewards/accuracies": 0.875, "rewards/chosen": -3.4567980766296387, "rewards/margins": 0.6494026184082031, "rewards/rejected": -4.106200695037842, "step": 893 }, { "epoch": 0.5838843987264266, "grad_norm": 22.901871352735185, "learning_rate": 6.620806026337073e-08, "logits/chosen": -1.567824363708496, "logits/rejected": -1.5391998291015625, "logps/chosen": -831.55615234375, "logps/rejected": -894.0813598632812, "loss": 0.5542, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1797890663146973, "rewards/margins": 0.6112327575683594, "rewards/rejected": -3.7910218238830566, "step": 894 }, { "epoch": 0.5845375132663891, "grad_norm": 18.99146292432215, "learning_rate": 6.603815234186409e-08, "logits/chosen": -1.5131785869598389, "logits/rejected": -1.4995113611221313, "logps/chosen": -811.837158203125, "logps/rejected": -859.697265625, "loss": 0.4643, "rewards/accuracies": 0.75, "rewards/chosen": -3.3386635780334473, "rewards/margins": 1.0135873556137085, "rewards/rejected": -4.352251052856445, "step": 895 }, { "epoch": 0.5851906278063516, "grad_norm": 85.94411267727696, "learning_rate": 6.586829106794986e-08, "logits/chosen": -1.4760074615478516, "logits/rejected": -1.5129234790802002, "logps/chosen": -827.99560546875, "logps/rejected": -902.9091796875, "loss": 0.55, "rewards/accuracies": 0.78125, "rewards/chosen": -3.300877809524536, "rewards/margins": 0.9479480981826782, "rewards/rejected": -4.248825550079346, "step": 896 }, { "epoch": 0.585843742346314, "grad_norm": 28.761272058111484, "learning_rate": 6.569847732577822e-08, "logits/chosen": -1.51106595993042, "logits/rejected": -1.5085562467575073, "logps/chosen": -797.22216796875, "logps/rejected": -902.5498046875, "loss": 0.4886, "rewards/accuracies": 0.6875, "rewards/chosen": -3.1795077323913574, "rewards/margins": 0.7169556021690369, "rewards/rejected": -3.896463394165039, "step": 897 }, { "epoch": 0.5864968568862764, "grad_norm": 20.15314666563717, "learning_rate": 6.5528711999252e-08, "logits/chosen": -1.5399476289749146, "logits/rejected": -1.527458906173706, "logps/chosen": -862.6876831054688, "logps/rejected": -874.1339111328125, "loss": 0.5053, "rewards/accuracies": 0.71875, "rewards/chosen": -2.810636520385742, "rewards/margins": 0.7659615278244019, "rewards/rejected": -3.5765976905822754, "step": 898 }, { "epoch": 0.5871499714262389, "grad_norm": 12.274080681893953, "learning_rate": 6.535899597202195e-08, "logits/chosen": -1.513730525970459, "logits/rejected": -1.484087586402893, "logps/chosen": -859.1988525390625, "logps/rejected": -1115.99365234375, "loss": 0.429, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3637874126434326, "rewards/margins": 1.6439146995544434, "rewards/rejected": -5.007702350616455, "step": 899 }, { "epoch": 0.5878030859662013, "grad_norm": 46.88003480260202, "learning_rate": 6.518933012748232e-08, "logits/chosen": -1.4574429988861084, "logits/rejected": -1.4671128988265991, "logps/chosen": -768.5726928710938, "logps/rejected": -817.6867065429688, "loss": 0.5007, "rewards/accuracies": 0.65625, "rewards/chosen": -2.511812210083008, "rewards/margins": 0.7634918689727783, "rewards/rejected": -3.2753043174743652, "step": 900 }, { "epoch": 0.5878030859662013, "eval_logits/chosen": -1.5143874883651733, "eval_logits/rejected": -1.500697135925293, "eval_logps/chosen": -819.5907592773438, "eval_logps/rejected": -894.2119750976562, "eval_loss": 0.5088227987289429, "eval_rewards/accuracies": 0.7540000081062317, "eval_rewards/chosen": -3.092453956604004, "eval_rewards/margins": 0.8272719979286194, "eval_rewards/rejected": -3.9197258949279785, "eval_runtime": 300.2638, "eval_samples_per_second": 13.322, "eval_steps_per_second": 0.833, "step": 900 }, { "epoch": 0.5884562005061638, "grad_norm": 15.611510087556148, "learning_rate": 6.5019715348766e-08, "logits/chosen": -1.4778497219085693, "logits/rejected": -1.4426562786102295, "logps/chosen": -764.3760986328125, "logps/rejected": -894.08740234375, "loss": 0.4707, "rewards/accuracies": 0.84375, "rewards/chosen": -2.829637289047241, "rewards/margins": 1.0723354816436768, "rewards/rejected": -3.901973247528076, "step": 901 }, { "epoch": 0.5891093150461262, "grad_norm": 27.04744713063808, "learning_rate": 6.485015251874019e-08, "logits/chosen": -1.4524825811386108, "logits/rejected": -1.4247350692749023, "logps/chosen": -833.126708984375, "logps/rejected": -851.63037109375, "loss": 0.5002, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4548346996307373, "rewards/margins": 0.45528683066368103, "rewards/rejected": -3.910121440887451, "step": 902 }, { "epoch": 0.5897624295860887, "grad_norm": 14.887394055487325, "learning_rate": 6.468064252000163e-08, "logits/chosen": -1.4548847675323486, "logits/rejected": -1.441070318222046, "logps/chosen": -803.3111572265625, "logps/rejected": -800.1891479492188, "loss": 0.5278, "rewards/accuracies": 0.6875, "rewards/chosen": -3.290768623352051, "rewards/margins": 0.5420413613319397, "rewards/rejected": -3.8328099250793457, "step": 903 }, { "epoch": 0.5904155441260511, "grad_norm": 88.18212760967876, "learning_rate": 6.451118623487215e-08, "logits/chosen": -1.4934018850326538, "logits/rejected": -1.491977334022522, "logps/chosen": -763.4550170898438, "logps/rejected": -997.37255859375, "loss": 0.5368, "rewards/accuracies": 0.84375, "rewards/chosen": -2.9802534580230713, "rewards/margins": 1.5265756845474243, "rewards/rejected": -4.506828784942627, "step": 904 }, { "epoch": 0.5910686586660135, "grad_norm": 24.118764808026647, "learning_rate": 6.434178454539393e-08, "logits/chosen": -1.4591268301010132, "logits/rejected": -1.475930094718933, "logps/chosen": -738.4482421875, "logps/rejected": -801.5521850585938, "loss": 0.4642, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7284841537475586, "rewards/margins": 0.7377627491950989, "rewards/rejected": -3.466247081756592, "step": 905 }, { "epoch": 0.591721773205976, "grad_norm": 17.550624193334937, "learning_rate": 6.417243833332495e-08, "logits/chosen": -1.5250346660614014, "logits/rejected": -1.4923107624053955, "logps/chosen": -816.05419921875, "logps/rejected": -943.5857543945312, "loss": 0.5238, "rewards/accuracies": 0.75, "rewards/chosen": -3.181175708770752, "rewards/margins": 1.0177024602890015, "rewards/rejected": -4.198878288269043, "step": 906 }, { "epoch": 0.5923748877459385, "grad_norm": 23.27302229809888, "learning_rate": 6.400314848013446e-08, "logits/chosen": -1.5263398885726929, "logits/rejected": -1.5403982400894165, "logps/chosen": -898.93408203125, "logps/rejected": -945.4384765625, "loss": 0.5122, "rewards/accuracies": 0.78125, "rewards/chosen": -3.269822835922241, "rewards/margins": 0.5661499500274658, "rewards/rejected": -3.835972785949707, "step": 907 }, { "epoch": 0.5930280022859009, "grad_norm": 56.67567956066173, "learning_rate": 6.383391586699837e-08, "logits/chosen": -1.5245555639266968, "logits/rejected": -1.5307707786560059, "logps/chosen": -879.91064453125, "logps/rejected": -969.3151245117188, "loss": 0.4661, "rewards/accuracies": 0.75, "rewards/chosen": -3.0276691913604736, "rewards/margins": 0.8546069264411926, "rewards/rejected": -3.8822762966156006, "step": 908 }, { "epoch": 0.5936811168258633, "grad_norm": 18.262554233951846, "learning_rate": 6.366474137479459e-08, "logits/chosen": -1.5455853939056396, "logits/rejected": -1.428063154220581, "logps/chosen": -790.438720703125, "logps/rejected": -823.03271484375, "loss": 0.5309, "rewards/accuracies": 0.8125, "rewards/chosen": -3.124584674835205, "rewards/margins": 0.8692245483398438, "rewards/rejected": -3.993809223175049, "step": 909 }, { "epoch": 0.5943342313658257, "grad_norm": 37.07411490442863, "learning_rate": 6.349562588409858e-08, "logits/chosen": -1.540130853652954, "logits/rejected": -1.5190839767456055, "logps/chosen": -887.24462890625, "logps/rejected": -943.5260620117188, "loss": 0.5406, "rewards/accuracies": 0.65625, "rewards/chosen": -3.2311315536499023, "rewards/margins": 0.5626290440559387, "rewards/rejected": -3.7937607765197754, "step": 910 }, { "epoch": 0.5949873459057883, "grad_norm": 81.1787368218721, "learning_rate": 6.332657027517865e-08, "logits/chosen": -1.531748652458191, "logits/rejected": -1.566206932067871, "logps/chosen": -858.2910766601562, "logps/rejected": -1005.5297241210938, "loss": 0.5071, "rewards/accuracies": 0.71875, "rewards/chosen": -3.267298698425293, "rewards/margins": 1.0169984102249146, "rewards/rejected": -4.284296989440918, "step": 911 }, { "epoch": 0.5956404604457507, "grad_norm": 32.28904552141215, "learning_rate": 6.315757542799137e-08, "logits/chosen": -1.5486570596694946, "logits/rejected": -1.530015230178833, "logps/chosen": -837.4837036132812, "logps/rejected": -862.83447265625, "loss": 0.5273, "rewards/accuracies": 0.75, "rewards/chosen": -3.196139335632324, "rewards/margins": 0.7048149108886719, "rewards/rejected": -3.900954246520996, "step": 912 }, { "epoch": 0.5962935749857131, "grad_norm": 10.082427550315396, "learning_rate": 6.29886422221771e-08, "logits/chosen": -1.6052241325378418, "logits/rejected": -1.5839667320251465, "logps/chosen": -873.5725708007812, "logps/rejected": -978.7376708984375, "loss": 0.4844, "rewards/accuracies": 0.84375, "rewards/chosen": -3.3677515983581543, "rewards/margins": 1.1688979864120483, "rewards/rejected": -4.536649227142334, "step": 913 }, { "epoch": 0.5969466895256755, "grad_norm": 49.67965132832666, "learning_rate": 6.281977153705534e-08, "logits/chosen": -1.5648092031478882, "logits/rejected": -1.5402863025665283, "logps/chosen": -851.8053588867188, "logps/rejected": -900.38818359375, "loss": 0.5503, "rewards/accuracies": 0.71875, "rewards/chosen": -2.964672327041626, "rewards/margins": 0.8616347312927246, "rewards/rejected": -3.8263068199157715, "step": 914 }, { "epoch": 0.5975998040656381, "grad_norm": 19.092868780473093, "learning_rate": 6.265096425162015e-08, "logits/chosen": -1.4740824699401855, "logits/rejected": -1.4873621463775635, "logps/chosen": -818.6807250976562, "logps/rejected": -898.1356201171875, "loss": 0.5058, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7591428756713867, "rewards/margins": 1.210620641708374, "rewards/rejected": -3.9697635173797607, "step": 915 }, { "epoch": 0.5982529186056005, "grad_norm": 47.5168167772122, "learning_rate": 6.24822212445356e-08, "logits/chosen": -1.4681650400161743, "logits/rejected": -1.4547978639602661, "logps/chosen": -792.4906616210938, "logps/rejected": -850.310546875, "loss": 0.4641, "rewards/accuracies": 0.875, "rewards/chosen": -2.865018367767334, "rewards/margins": 0.6488918662071228, "rewards/rejected": -3.5139102935791016, "step": 916 }, { "epoch": 0.5989060331455629, "grad_norm": 33.12140555034356, "learning_rate": 6.231354339413116e-08, "logits/chosen": -1.4651777744293213, "logits/rejected": -1.473002314567566, "logps/chosen": -830.7955932617188, "logps/rejected": -866.82080078125, "loss": 0.5181, "rewards/accuracies": 0.5625, "rewards/chosen": -3.1264560222625732, "rewards/margins": 0.34810954332351685, "rewards/rejected": -3.4745657444000244, "step": 917 }, { "epoch": 0.5995591476855253, "grad_norm": 15.005549258653641, "learning_rate": 6.214493157839716e-08, "logits/chosen": -1.5664610862731934, "logits/rejected": -1.541873574256897, "logps/chosen": -868.6499633789062, "logps/rejected": -926.1016845703125, "loss": 0.444, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0149686336517334, "rewards/margins": 0.7024274468421936, "rewards/rejected": -3.7173960208892822, "step": 918 }, { "epoch": 0.6002122622254878, "grad_norm": 31.0641454388364, "learning_rate": 6.197638667498022e-08, "logits/chosen": -1.5562750101089478, "logits/rejected": -1.5133049488067627, "logps/chosen": -773.593994140625, "logps/rejected": -830.7118530273438, "loss": 0.5931, "rewards/accuracies": 0.71875, "rewards/chosen": -3.2983880043029785, "rewards/margins": 0.45370548963546753, "rewards/rejected": -3.7520933151245117, "step": 919 }, { "epoch": 0.6008653767654503, "grad_norm": 20.956839284919962, "learning_rate": 6.180790956117867e-08, "logits/chosen": -1.467447280883789, "logits/rejected": -1.4105029106140137, "logps/chosen": -782.5927124023438, "logps/rejected": -818.0179443359375, "loss": 0.508, "rewards/accuracies": 0.65625, "rewards/chosen": -3.0541372299194336, "rewards/margins": 0.4292081892490387, "rewards/rejected": -3.4833457469940186, "step": 920 }, { "epoch": 0.6015184913054127, "grad_norm": 14.668503955298801, "learning_rate": 6.163950111393799e-08, "logits/chosen": -1.5520689487457275, "logits/rejected": -1.5361812114715576, "logps/chosen": -864.1260375976562, "logps/rejected": -910.3584594726562, "loss": 0.5677, "rewards/accuracies": 0.75, "rewards/chosen": -3.183807134628296, "rewards/margins": 1.1038367748260498, "rewards/rejected": -4.287644386291504, "step": 921 }, { "epoch": 0.6021716058453751, "grad_norm": 27.305517777571882, "learning_rate": 6.147116220984622e-08, "logits/chosen": -1.4598233699798584, "logits/rejected": -1.460257649421692, "logps/chosen": -816.2798461914062, "logps/rejected": -911.81591796875, "loss": 0.4793, "rewards/accuracies": 0.71875, "rewards/chosen": -3.3738749027252197, "rewards/margins": 0.7180493474006653, "rewards/rejected": -4.09192419052124, "step": 922 }, { "epoch": 0.6028247203853376, "grad_norm": 41.739674304138276, "learning_rate": 6.130289372512946e-08, "logits/chosen": -1.444016933441162, "logits/rejected": -1.474906325340271, "logps/chosen": -754.188232421875, "logps/rejected": -857.038818359375, "loss": 0.5205, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8269124031066895, "rewards/margins": 0.8509232997894287, "rewards/rejected": -3.677835464477539, "step": 923 }, { "epoch": 0.6034778349253, "grad_norm": 58.02110360239537, "learning_rate": 6.113469653564719e-08, "logits/chosen": -1.4690029621124268, "logits/rejected": -1.4412765502929688, "logps/chosen": -810.4243774414062, "logps/rejected": -840.3269653320312, "loss": 0.5251, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8800265789031982, "rewards/margins": 0.6990075707435608, "rewards/rejected": -3.5790340900421143, "step": 924 }, { "epoch": 0.6041309494652625, "grad_norm": 61.758663895068544, "learning_rate": 6.096657151688788e-08, "logits/chosen": -1.4827535152435303, "logits/rejected": -1.4380667209625244, "logps/chosen": -844.2979736328125, "logps/rejected": -990.4068603515625, "loss": 0.5044, "rewards/accuracies": 0.8125, "rewards/chosen": -3.213895082473755, "rewards/margins": 0.7906748056411743, "rewards/rejected": -4.004570007324219, "step": 925 }, { "epoch": 0.6047840640052249, "grad_norm": 47.91069541881108, "learning_rate": 6.07985195439643e-08, "logits/chosen": -1.5746287107467651, "logits/rejected": -1.5688815116882324, "logps/chosen": -938.1190185546875, "logps/rejected": -984.5784912109375, "loss": 0.4972, "rewards/accuracies": 0.75, "rewards/chosen": -3.6475698947906494, "rewards/margins": 0.6271919012069702, "rewards/rejected": -4.274761199951172, "step": 926 }, { "epoch": 0.6054371785451874, "grad_norm": 12.621896694201023, "learning_rate": 6.063054149160899e-08, "logits/chosen": -1.4833399057388306, "logits/rejected": -1.5237452983856201, "logps/chosen": -763.331298828125, "logps/rejected": -738.412353515625, "loss": 0.6048, "rewards/accuracies": 0.40625, "rewards/chosen": -2.918760299682617, "rewards/margins": 0.033799026161432266, "rewards/rejected": -2.952559232711792, "step": 927 }, { "epoch": 0.6060902930851498, "grad_norm": 48.565935333486564, "learning_rate": 6.046263823416975e-08, "logits/chosen": -1.5271244049072266, "logits/rejected": -1.4947071075439453, "logps/chosen": -845.4649047851562, "logps/rejected": -829.05615234375, "loss": 0.5946, "rewards/accuracies": 0.65625, "rewards/chosen": -3.154777765274048, "rewards/margins": 0.36189785599708557, "rewards/rejected": -3.5166759490966797, "step": 928 }, { "epoch": 0.6067434076251123, "grad_norm": 201.1566716246379, "learning_rate": 6.029481064560507e-08, "logits/chosen": -1.5659098625183105, "logits/rejected": -1.4598219394683838, "logps/chosen": -745.5149536132812, "logps/rejected": -827.2947998046875, "loss": 0.5307, "rewards/accuracies": 0.71875, "rewards/chosen": -3.176211357116699, "rewards/margins": 0.9403245449066162, "rewards/rejected": -4.116535663604736, "step": 929 }, { "epoch": 0.6073965221650747, "grad_norm": 77.8419741018575, "learning_rate": 6.012705959947953e-08, "logits/chosen": -1.5268572568893433, "logits/rejected": -1.5410304069519043, "logps/chosen": -791.3049926757812, "logps/rejected": -944.24365234375, "loss": 0.5016, "rewards/accuracies": 0.65625, "rewards/chosen": -2.995246410369873, "rewards/margins": 0.96639084815979, "rewards/rejected": -3.961637258529663, "step": 930 }, { "epoch": 0.6080496367050372, "grad_norm": 21.356823796166157, "learning_rate": 5.995938596895936e-08, "logits/chosen": -1.5197898149490356, "logits/rejected": -1.5448129177093506, "logps/chosen": -880.7378540039062, "logps/rejected": -929.0924682617188, "loss": 0.4803, "rewards/accuracies": 0.75, "rewards/chosen": -3.1356070041656494, "rewards/margins": 0.8477343320846558, "rewards/rejected": -3.9833414554595947, "step": 931 }, { "epoch": 0.6087027512449996, "grad_norm": 78.1296697326321, "learning_rate": 5.979179062680777e-08, "logits/chosen": -1.5615133047103882, "logits/rejected": -1.5305187702178955, "logps/chosen": -805.41748046875, "logps/rejected": -857.9087524414062, "loss": 0.4887, "rewards/accuracies": 0.59375, "rewards/chosen": -2.8929922580718994, "rewards/margins": 0.7500249743461609, "rewards/rejected": -3.643017292022705, "step": 932 }, { "epoch": 0.609355865784962, "grad_norm": 44.16786903754792, "learning_rate": 5.96242744453805e-08, "logits/chosen": -1.5862808227539062, "logits/rejected": -1.500042200088501, "logps/chosen": -957.4008178710938, "logps/rejected": -1003.3927001953125, "loss": 0.5375, "rewards/accuracies": 0.84375, "rewards/chosen": -3.4804091453552246, "rewards/margins": 0.9446480870246887, "rewards/rejected": -4.425057411193848, "step": 933 }, { "epoch": 0.6100089803249245, "grad_norm": 43.88738627099548, "learning_rate": 5.945683829662129e-08, "logits/chosen": -1.4918217658996582, "logits/rejected": -1.4933407306671143, "logps/chosen": -826.7015380859375, "logps/rejected": -888.3599853515625, "loss": 0.5092, "rewards/accuracies": 0.84375, "rewards/chosen": -2.8977818489074707, "rewards/margins": 0.904629111289978, "rewards/rejected": -3.80241060256958, "step": 934 }, { "epoch": 0.610662094864887, "grad_norm": 84.54075685318386, "learning_rate": 5.928948305205719e-08, "logits/chosen": -1.6410014629364014, "logits/rejected": -1.5713551044464111, "logps/chosen": -862.2045288085938, "logps/rejected": -862.973876953125, "loss": 0.4944, "rewards/accuracies": 0.6875, "rewards/chosen": -3.225311517715454, "rewards/margins": 0.5583637356758118, "rewards/rejected": -3.7836754322052, "step": 935 }, { "epoch": 0.6113152094048494, "grad_norm": 49.202199653752565, "learning_rate": 5.912220958279421e-08, "logits/chosen": -1.525040626525879, "logits/rejected": -1.5112783908843994, "logps/chosen": -898.3567504882812, "logps/rejected": -948.94091796875, "loss": 0.4935, "rewards/accuracies": 0.75, "rewards/chosen": -3.4913763999938965, "rewards/margins": 0.7729541063308716, "rewards/rejected": -4.2643303871154785, "step": 936 }, { "epoch": 0.6119683239448118, "grad_norm": 19.051733763857207, "learning_rate": 5.895501875951271e-08, "logits/chosen": -1.5405813455581665, "logits/rejected": -1.4980101585388184, "logps/chosen": -818.7405395507812, "logps/rejected": -821.7682495117188, "loss": 0.5139, "rewards/accuracies": 0.71875, "rewards/chosen": -2.834625482559204, "rewards/margins": 0.37921154499053955, "rewards/rejected": -3.213836669921875, "step": 937 }, { "epoch": 0.6126214384847742, "grad_norm": 39.98147203055881, "learning_rate": 5.878791145246284e-08, "logits/chosen": -1.552310585975647, "logits/rejected": -1.4916616678237915, "logps/chosen": -828.3511962890625, "logps/rejected": -897.640380859375, "loss": 0.4085, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9902727603912354, "rewards/margins": 0.938800036907196, "rewards/rejected": -3.929072856903076, "step": 938 }, { "epoch": 0.6132745530247368, "grad_norm": 9.665640962792187, "learning_rate": 5.862088853146006e-08, "logits/chosen": -1.483540654182434, "logits/rejected": -1.4673054218292236, "logps/chosen": -868.4718627929688, "logps/rejected": -904.5808715820312, "loss": 0.4925, "rewards/accuracies": 0.6875, "rewards/chosen": -3.237854480743408, "rewards/margins": 0.40152764320373535, "rewards/rejected": -3.6393821239471436, "step": 939 }, { "epoch": 0.6139276675646992, "grad_norm": 29.6041722046408, "learning_rate": 5.8453950865880574e-08, "logits/chosen": -1.5257904529571533, "logits/rejected": -1.5109593868255615, "logps/chosen": -839.3677368164062, "logps/rejected": -890.6753540039062, "loss": 0.4616, "rewards/accuracies": 0.84375, "rewards/chosen": -2.944354295730591, "rewards/margins": 0.7917759418487549, "rewards/rejected": -3.7361302375793457, "step": 940 }, { "epoch": 0.6145807821046616, "grad_norm": 36.1361730170012, "learning_rate": 5.82870993246568e-08, "logits/chosen": -1.575820803642273, "logits/rejected": -1.5428611040115356, "logps/chosen": -984.8616943359375, "logps/rejected": -1002.877685546875, "loss": 0.5118, "rewards/accuracies": 0.78125, "rewards/chosen": -3.8942298889160156, "rewards/margins": 0.8039427995681763, "rewards/rejected": -4.6981730461120605, "step": 941 }, { "epoch": 0.615233896644624, "grad_norm": 65.95471806082536, "learning_rate": 5.812033477627295e-08, "logits/chosen": -1.5245524644851685, "logits/rejected": -1.4878289699554443, "logps/chosen": -797.1603393554688, "logps/rejected": -862.60400390625, "loss": 0.5424, "rewards/accuracies": 0.6875, "rewards/chosen": -3.427908420562744, "rewards/margins": 0.5726900100708008, "rewards/rejected": -4.000597953796387, "step": 942 }, { "epoch": 0.6158870111845864, "grad_norm": 42.35986198202249, "learning_rate": 5.795365808876033e-08, "logits/chosen": -1.4567804336547852, "logits/rejected": -1.4847595691680908, "logps/chosen": -761.0791625976562, "logps/rejected": -862.4808349609375, "loss": 0.4818, "rewards/accuracies": 0.8125, "rewards/chosen": -2.592480421066284, "rewards/margins": 0.8631225228309631, "rewards/rejected": -3.4556026458740234, "step": 943 }, { "epoch": 0.616540125724549, "grad_norm": 116.76247999305836, "learning_rate": 5.778707012969296e-08, "logits/chosen": -1.5003118515014648, "logits/rejected": -1.4539164304733276, "logps/chosen": -782.5404052734375, "logps/rejected": -837.072509765625, "loss": 0.5626, "rewards/accuracies": 0.78125, "rewards/chosen": -3.003209352493286, "rewards/margins": 0.6795889139175415, "rewards/rejected": -3.682798385620117, "step": 944 }, { "epoch": 0.6171932402645114, "grad_norm": 36.38150538725848, "learning_rate": 5.762057176618306e-08, "logits/chosen": -1.5234191417694092, "logits/rejected": -1.5212476253509521, "logps/chosen": -823.0789184570312, "logps/rejected": -1108.3194580078125, "loss": 0.4678, "rewards/accuracies": 0.875, "rewards/chosen": -2.8806896209716797, "rewards/margins": 1.4171735048294067, "rewards/rejected": -4.297863483428955, "step": 945 }, { "epoch": 0.6178463548044738, "grad_norm": 24.621342657442526, "learning_rate": 5.745416386487637e-08, "logits/chosen": -1.4999808073043823, "logits/rejected": -1.5050296783447266, "logps/chosen": -884.189697265625, "logps/rejected": -1065.35546875, "loss": 0.4709, "rewards/accuracies": 0.78125, "rewards/chosen": -3.3289954662323, "rewards/margins": 1.0373826026916504, "rewards/rejected": -4.366378307342529, "step": 946 }, { "epoch": 0.6184994693444362, "grad_norm": 39.9664210404075, "learning_rate": 5.728784729194788e-08, "logits/chosen": -1.6225008964538574, "logits/rejected": -1.5957049131393433, "logps/chosen": -797.822021484375, "logps/rejected": -842.7942504882812, "loss": 0.459, "rewards/accuracies": 0.90625, "rewards/chosen": -3.0013129711151123, "rewards/margins": 0.8668568134307861, "rewards/rejected": -3.8681697845458984, "step": 947 }, { "epoch": 0.6191525838843988, "grad_norm": 18.300722710956293, "learning_rate": 5.712162291309717e-08, "logits/chosen": -1.4809714555740356, "logits/rejected": -1.4627764225006104, "logps/chosen": -780.6005859375, "logps/rejected": -837.5048828125, "loss": 0.4601, "rewards/accuracies": 0.84375, "rewards/chosen": -2.6685848236083984, "rewards/margins": 0.947687029838562, "rewards/rejected": -3.61627197265625, "step": 948 }, { "epoch": 0.6198056984243612, "grad_norm": 48.042619463612205, "learning_rate": 5.695549159354392e-08, "logits/chosen": -1.5010271072387695, "logits/rejected": -1.5268317461013794, "logps/chosen": -862.3072509765625, "logps/rejected": -902.5770263671875, "loss": 0.5344, "rewards/accuracies": 0.65625, "rewards/chosen": -3.204814910888672, "rewards/margins": 0.5735858082771301, "rewards/rejected": -3.7784006595611572, "step": 949 }, { "epoch": 0.6204588129643236, "grad_norm": 12.02401612140606, "learning_rate": 5.678945419802344e-08, "logits/chosen": -1.5606863498687744, "logits/rejected": -1.553232192993164, "logps/chosen": -786.1539916992188, "logps/rejected": -862.3563842773438, "loss": 0.4561, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9571497440338135, "rewards/margins": 0.9905616641044617, "rewards/rejected": -3.94771146774292, "step": 950 }, { "epoch": 0.621111927504286, "grad_norm": 23.86580730911811, "learning_rate": 5.662351159078216e-08, "logits/chosen": -1.552533745765686, "logits/rejected": -1.5205721855163574, "logps/chosen": -958.0302124023438, "logps/rejected": -975.6397094726562, "loss": 0.5031, "rewards/accuracies": 0.59375, "rewards/chosen": -3.6955161094665527, "rewards/margins": 0.5410246253013611, "rewards/rejected": -4.2365403175354, "step": 951 }, { "epoch": 0.6217650420442485, "grad_norm": 135.43034670723952, "learning_rate": 5.645766463557309e-08, "logits/chosen": -1.4750163555145264, "logits/rejected": -1.4967775344848633, "logps/chosen": -814.1315307617188, "logps/rejected": -898.7764282226562, "loss": 0.554, "rewards/accuracies": 0.78125, "rewards/chosen": -3.208841562271118, "rewards/margins": 0.698648989200592, "rewards/rejected": -3.9074904918670654, "step": 952 }, { "epoch": 0.622418156584211, "grad_norm": 60.25491022191491, "learning_rate": 5.629191419565141e-08, "logits/chosen": -1.471923828125, "logits/rejected": -1.5026978254318237, "logps/chosen": -836.269775390625, "logps/rejected": -875.132080078125, "loss": 0.4922, "rewards/accuracies": 0.6875, "rewards/chosen": -3.013134479522705, "rewards/margins": 0.8672950267791748, "rewards/rejected": -3.880429744720459, "step": 953 }, { "epoch": 0.6230712711241734, "grad_norm": 70.1250850771653, "learning_rate": 5.612626113376988e-08, "logits/chosen": -1.415960431098938, "logits/rejected": -1.4024447202682495, "logps/chosen": -861.2704467773438, "logps/rejected": -906.9395751953125, "loss": 0.4583, "rewards/accuracies": 0.84375, "rewards/chosen": -2.9992175102233887, "rewards/margins": 1.1700170040130615, "rewards/rejected": -4.169234752655029, "step": 954 }, { "epoch": 0.6237243856641358, "grad_norm": 118.63943863651916, "learning_rate": 5.596070631217441e-08, "logits/chosen": -1.5227341651916504, "logits/rejected": -1.5267282724380493, "logps/chosen": -795.377197265625, "logps/rejected": -826.0686645507812, "loss": 0.5701, "rewards/accuracies": 0.78125, "rewards/chosen": -2.9778614044189453, "rewards/margins": 0.6003108024597168, "rewards/rejected": -3.578172206878662, "step": 955 }, { "epoch": 0.6243775002040983, "grad_norm": 17.56854621129216, "learning_rate": 5.579525059259957e-08, "logits/chosen": -1.5488358736038208, "logits/rejected": -1.573460340499878, "logps/chosen": -817.08447265625, "logps/rejected": -863.1941528320312, "loss": 0.4524, "rewards/accuracies": 0.71875, "rewards/chosen": -3.050839900970459, "rewards/margins": 0.9858266115188599, "rewards/rejected": -4.0366668701171875, "step": 956 }, { "epoch": 0.6250306147440607, "grad_norm": 69.88285733489641, "learning_rate": 5.562989483626409e-08, "logits/chosen": -1.4820830821990967, "logits/rejected": -1.4609520435333252, "logps/chosen": -727.6483154296875, "logps/rejected": -839.3212280273438, "loss": 0.4287, "rewards/accuracies": 0.875, "rewards/chosen": -2.8100788593292236, "rewards/margins": 1.259989857673645, "rewards/rejected": -4.070068836212158, "step": 957 }, { "epoch": 0.6256837292840232, "grad_norm": 12.018768296537614, "learning_rate": 5.546463990386634e-08, "logits/chosen": -1.5005813837051392, "logits/rejected": -1.522735357284546, "logps/chosen": -787.5521850585938, "logps/rejected": -897.03564453125, "loss": 0.4824, "rewards/accuracies": 0.8125, "rewards/chosen": -2.8902053833007812, "rewards/margins": 1.399174451828003, "rewards/rejected": -4.289380073547363, "step": 958 }, { "epoch": 0.6263368438239856, "grad_norm": 33.17348323808573, "learning_rate": 5.5299486655579924e-08, "logits/chosen": -1.530156135559082, "logits/rejected": -1.5278276205062866, "logps/chosen": -807.750244140625, "logps/rejected": -950.3114013671875, "loss": 0.4757, "rewards/accuracies": 0.84375, "rewards/chosen": -3.519279956817627, "rewards/margins": 1.149764060974121, "rewards/rejected": -4.669044494628906, "step": 959 }, { "epoch": 0.6269899583639481, "grad_norm": 15.26418942385675, "learning_rate": 5.513443595104917e-08, "logits/chosen": -1.4855040311813354, "logits/rejected": -1.495133638381958, "logps/chosen": -800.33837890625, "logps/rejected": -888.4577026367188, "loss": 0.4282, "rewards/accuracies": 0.78125, "rewards/chosen": -2.716895580291748, "rewards/margins": 0.8394394516944885, "rewards/rejected": -3.556334972381592, "step": 960 }, { "epoch": 0.6276430729039105, "grad_norm": 18.55918206412983, "learning_rate": 5.496948864938463e-08, "logits/chosen": -1.564420461654663, "logits/rejected": -1.5435676574707031, "logps/chosen": -925.506591796875, "logps/rejected": -1076.7103271484375, "loss": 0.4707, "rewards/accuracies": 0.875, "rewards/chosen": -3.589116096496582, "rewards/margins": 1.4336352348327637, "rewards/rejected": -5.022751331329346, "step": 961 }, { "epoch": 0.6282961874438729, "grad_norm": 36.21241704409419, "learning_rate": 5.480464560915865e-08, "logits/chosen": -1.457442045211792, "logits/rejected": -1.466660976409912, "logps/chosen": -880.1464233398438, "logps/rejected": -952.699462890625, "loss": 0.5234, "rewards/accuracies": 0.75, "rewards/chosen": -3.218412160873413, "rewards/margins": 0.8761493563652039, "rewards/rejected": -4.094561576843262, "step": 962 }, { "epoch": 0.6289493019838354, "grad_norm": 59.3801136308343, "learning_rate": 5.463990768840088e-08, "logits/chosen": -1.5099765062332153, "logits/rejected": -1.4892768859863281, "logps/chosen": -755.8883056640625, "logps/rejected": -835.78173828125, "loss": 0.5088, "rewards/accuracies": 0.6875, "rewards/chosen": -2.6380348205566406, "rewards/margins": 0.8317041993141174, "rewards/rejected": -3.469738721847534, "step": 963 }, { "epoch": 0.6296024165237979, "grad_norm": 47.301890427275495, "learning_rate": 5.447527574459378e-08, "logits/chosen": -1.522971272468567, "logits/rejected": -1.533935546875, "logps/chosen": -732.6190185546875, "logps/rejected": -835.910888671875, "loss": 0.4433, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6405673027038574, "rewards/margins": 0.8309347033500671, "rewards/rejected": -3.4715018272399902, "step": 964 }, { "epoch": 0.6302555310637603, "grad_norm": 36.636882714332266, "learning_rate": 5.431075063466824e-08, "logits/chosen": -1.485040307044983, "logits/rejected": -1.5213403701782227, "logps/chosen": -858.669921875, "logps/rejected": -968.9814453125, "loss": 0.4807, "rewards/accuracies": 0.90625, "rewards/chosen": -3.340672254562378, "rewards/margins": 0.8855950236320496, "rewards/rejected": -4.226266860961914, "step": 965 }, { "epoch": 0.6309086456037227, "grad_norm": 126.49507570203406, "learning_rate": 5.4146333214998996e-08, "logits/chosen": -1.5613657236099243, "logits/rejected": -1.510394811630249, "logps/chosen": -831.458740234375, "logps/rejected": -846.1495971679688, "loss": 0.5621, "rewards/accuracies": 0.65625, "rewards/chosen": -3.379094362258911, "rewards/margins": 0.5405767560005188, "rewards/rejected": -3.919670820236206, "step": 966 }, { "epoch": 0.6315617601436851, "grad_norm": 34.41276919074804, "learning_rate": 5.39820243414003e-08, "logits/chosen": -1.5569851398468018, "logits/rejected": -1.5476921796798706, "logps/chosen": -832.99169921875, "logps/rejected": -912.7144165039062, "loss": 0.4608, "rewards/accuracies": 0.78125, "rewards/chosen": -3.6668541431427, "rewards/margins": 0.7351149916648865, "rewards/rejected": -4.401968955993652, "step": 967 }, { "epoch": 0.6322148746836477, "grad_norm": 16.388453341999888, "learning_rate": 5.381782486912144e-08, "logits/chosen": -1.5224254131317139, "logits/rejected": -1.48786199092865, "logps/chosen": -816.0333862304688, "logps/rejected": -842.23486328125, "loss": 0.5767, "rewards/accuracies": 0.625, "rewards/chosen": -3.6304800510406494, "rewards/margins": 0.6139175891876221, "rewards/rejected": -4.24439811706543, "step": 968 }, { "epoch": 0.6328679892236101, "grad_norm": 16.605006142190543, "learning_rate": 5.365373565284211e-08, "logits/chosen": -1.5403863191604614, "logits/rejected": -1.54334557056427, "logps/chosen": -795.1408081054688, "logps/rejected": -888.0582275390625, "loss": 0.5094, "rewards/accuracies": 0.71875, "rewards/chosen": -2.9061810970306396, "rewards/margins": 0.6548399925231934, "rewards/rejected": -3.561021089553833, "step": 969 }, { "epoch": 0.6335211037635725, "grad_norm": 86.2008006757627, "learning_rate": 5.348975754666825e-08, "logits/chosen": -1.4919219017028809, "logits/rejected": -1.508528709411621, "logps/chosen": -781.9150390625, "logps/rejected": -914.5775146484375, "loss": 0.5111, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3672971725463867, "rewards/margins": 0.6925301551818848, "rewards/rejected": -4.0598273277282715, "step": 970 }, { "epoch": 0.6341742183035349, "grad_norm": 108.14966269733243, "learning_rate": 5.33258914041274e-08, "logits/chosen": -1.6157301664352417, "logits/rejected": -1.5612612962722778, "logps/chosen": -840.8732299804688, "logps/rejected": -893.8431396484375, "loss": 0.4703, "rewards/accuracies": 0.71875, "rewards/chosen": -3.353294610977173, "rewards/margins": 0.7179979681968689, "rewards/rejected": -4.071292877197266, "step": 971 }, { "epoch": 0.6348273328434975, "grad_norm": 13.7781550358131, "learning_rate": 5.316213807816432e-08, "logits/chosen": -1.5367947816848755, "logits/rejected": -1.4901845455169678, "logps/chosen": -795.0319213867188, "logps/rejected": -865.484375, "loss": 0.4664, "rewards/accuracies": 0.8125, "rewards/chosen": -3.197481155395508, "rewards/margins": 0.9921532869338989, "rewards/rejected": -4.189634323120117, "step": 972 }, { "epoch": 0.6354804473834599, "grad_norm": 22.603804513420233, "learning_rate": 5.299849842113656e-08, "logits/chosen": -1.5428191423416138, "logits/rejected": -1.565401554107666, "logps/chosen": -923.1882934570312, "logps/rejected": -956.2698974609375, "loss": 0.4838, "rewards/accuracies": 0.78125, "rewards/chosen": -3.5026655197143555, "rewards/margins": 0.8109586238861084, "rewards/rejected": -4.313623905181885, "step": 973 }, { "epoch": 0.6361335619234223, "grad_norm": 126.16238690733957, "learning_rate": 5.283497328480998e-08, "logits/chosen": -1.6220266819000244, "logits/rejected": -1.613905906677246, "logps/chosen": -949.4186401367188, "logps/rejected": -993.9503784179688, "loss": 0.5042, "rewards/accuracies": 0.875, "rewards/chosen": -3.326920986175537, "rewards/margins": 0.878463864326477, "rewards/rejected": -4.205385208129883, "step": 974 }, { "epoch": 0.6367866764633847, "grad_norm": 65.27147183382468, "learning_rate": 5.267156352035437e-08, "logits/chosen": -1.5607855319976807, "logits/rejected": -1.5158913135528564, "logps/chosen": -880.0157470703125, "logps/rejected": -966.0748291015625, "loss": 0.4936, "rewards/accuracies": 0.9375, "rewards/chosen": -3.369838237762451, "rewards/margins": 0.9538223147392273, "rewards/rejected": -4.323660850524902, "step": 975 }, { "epoch": 0.6374397910033472, "grad_norm": 22.36577561540512, "learning_rate": 5.250826997833899e-08, "logits/chosen": -1.5453389883041382, "logits/rejected": -1.540825366973877, "logps/chosen": -879.3869018554688, "logps/rejected": -885.604248046875, "loss": 0.5292, "rewards/accuracies": 0.6875, "rewards/chosen": -3.171088218688965, "rewards/margins": 0.3536863625049591, "rewards/rejected": -3.5247745513916016, "step": 976 }, { "epoch": 0.6380929055433097, "grad_norm": 10.578277029115368, "learning_rate": 5.234509350872813e-08, "logits/chosen": -1.5381667613983154, "logits/rejected": -1.5094205141067505, "logps/chosen": -812.7030639648438, "logps/rejected": -927.2329711914062, "loss": 0.5076, "rewards/accuracies": 0.9375, "rewards/chosen": -3.316643476486206, "rewards/margins": 1.0832691192626953, "rewards/rejected": -4.399912357330322, "step": 977 }, { "epoch": 0.6387460200832721, "grad_norm": 117.19097325210925, "learning_rate": 5.218203496087671e-08, "logits/chosen": -1.555480718612671, "logits/rejected": -1.554026484489441, "logps/chosen": -885.7477416992188, "logps/rejected": -906.7445068359375, "loss": 0.6101, "rewards/accuracies": 0.71875, "rewards/chosen": -3.1535983085632324, "rewards/margins": 0.5752413272857666, "rewards/rejected": -3.72883939743042, "step": 978 }, { "epoch": 0.6393991346232345, "grad_norm": 43.06647390428903, "learning_rate": 5.2019095183525886e-08, "logits/chosen": -1.5017975568771362, "logits/rejected": -1.5113730430603027, "logps/chosen": -754.3181762695312, "logps/rejected": -775.8157958984375, "loss": 0.5283, "rewards/accuracies": 0.71875, "rewards/chosen": -3.1152279376983643, "rewards/margins": 0.6389954090118408, "rewards/rejected": -3.754223346710205, "step": 979 }, { "epoch": 0.640052249163197, "grad_norm": 41.64410725373992, "learning_rate": 5.185627502479857e-08, "logits/chosen": -1.5162409543991089, "logits/rejected": -1.5376276969909668, "logps/chosen": -834.6458740234375, "logps/rejected": -990.2511596679688, "loss": 0.5032, "rewards/accuracies": 0.75, "rewards/chosen": -2.955130100250244, "rewards/margins": 1.1836378574371338, "rewards/rejected": -4.138767719268799, "step": 980 }, { "epoch": 0.6407053637031594, "grad_norm": 58.564011363723154, "learning_rate": 5.1693575332195006e-08, "logits/chosen": -1.4699060916900635, "logits/rejected": -1.4639524221420288, "logps/chosen": -755.951416015625, "logps/rejected": -867.30126953125, "loss": 0.5015, "rewards/accuracies": 0.875, "rewards/chosen": -2.9639952182769775, "rewards/margins": 1.0040624141693115, "rewards/rejected": -3.968057632446289, "step": 981 }, { "epoch": 0.6413584782431219, "grad_norm": 87.46285821272616, "learning_rate": 5.153099695258843e-08, "logits/chosen": -1.4716684818267822, "logits/rejected": -1.5059077739715576, "logps/chosen": -923.4231567382812, "logps/rejected": -1027.0228271484375, "loss": 0.4797, "rewards/accuracies": 0.8125, "rewards/chosen": -3.596079111099243, "rewards/margins": 1.1137962341308594, "rewards/rejected": -4.709875583648682, "step": 982 }, { "epoch": 0.6420115927830843, "grad_norm": 34.65500498760471, "learning_rate": 5.1368540732220656e-08, "logits/chosen": -1.5052711963653564, "logits/rejected": -1.525931477546692, "logps/chosen": -801.951171875, "logps/rejected": -935.139404296875, "loss": 0.5685, "rewards/accuracies": 0.71875, "rewards/chosen": -2.9107887744903564, "rewards/margins": 1.193021535873413, "rewards/rejected": -4.1038103103637695, "step": 983 }, { "epoch": 0.6426647073230468, "grad_norm": 15.45197836175366, "learning_rate": 5.1206207516697614e-08, "logits/chosen": -1.5689780712127686, "logits/rejected": -1.4814107418060303, "logps/chosen": -876.8812255859375, "logps/rejected": -1007.7050170898438, "loss": 0.5014, "rewards/accuracies": 0.71875, "rewards/chosen": -3.607487678527832, "rewards/margins": 1.0582144260406494, "rewards/rejected": -4.665701866149902, "step": 984 }, { "epoch": 0.6433178218630092, "grad_norm": 29.42430407328828, "learning_rate": 5.104399815098496e-08, "logits/chosen": -1.5423859357833862, "logits/rejected": -1.5497791767120361, "logps/chosen": -816.120849609375, "logps/rejected": -943.1589965820312, "loss": 0.5109, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1182186603546143, "rewards/margins": 0.9232460856437683, "rewards/rejected": -4.041464805603027, "step": 985 }, { "epoch": 0.6439709364029716, "grad_norm": 18.815807983433164, "learning_rate": 5.088191347940375e-08, "logits/chosen": -1.51715087890625, "logits/rejected": -1.5343945026397705, "logps/chosen": -813.7894287109375, "logps/rejected": -986.6640014648438, "loss": 0.413, "rewards/accuracies": 0.875, "rewards/chosen": -3.2437820434570312, "rewards/margins": 1.1290417909622192, "rewards/rejected": -4.372823238372803, "step": 986 }, { "epoch": 0.6446240509429341, "grad_norm": 25.895229217854762, "learning_rate": 5.071995434562592e-08, "logits/chosen": -1.4702000617980957, "logits/rejected": -1.4505935907363892, "logps/chosen": -854.226318359375, "logps/rejected": -964.643310546875, "loss": 0.4751, "rewards/accuracies": 0.71875, "rewards/chosen": -3.3152987957000732, "rewards/margins": 0.7256244421005249, "rewards/rejected": -4.040923595428467, "step": 987 }, { "epoch": 0.6452771654828966, "grad_norm": 33.724194436827155, "learning_rate": 5.055812159267003e-08, "logits/chosen": -1.5073872804641724, "logits/rejected": -1.4952834844589233, "logps/chosen": -855.9230346679688, "logps/rejected": -1033.7293701171875, "loss": 0.4451, "rewards/accuracies": 0.9375, "rewards/chosen": -3.2582755088806152, "rewards/margins": 1.6962679624557495, "rewards/rejected": -4.9545440673828125, "step": 988 }, { "epoch": 0.645930280022859, "grad_norm": 11.66573176765737, "learning_rate": 5.0396416062896766e-08, "logits/chosen": -1.579730749130249, "logits/rejected": -1.5733931064605713, "logps/chosen": -906.1632080078125, "logps/rejected": -925.17626953125, "loss": 0.5891, "rewards/accuracies": 0.75, "rewards/chosen": -3.705543041229248, "rewards/margins": 0.6829342246055603, "rewards/rejected": -4.388477325439453, "step": 989 }, { "epoch": 0.6465833945628214, "grad_norm": 150.79764751923673, "learning_rate": 5.023483859800463e-08, "logits/chosen": -1.5521610975265503, "logits/rejected": -1.5487536191940308, "logps/chosen": -818.4269409179688, "logps/rejected": -842.7174682617188, "loss": 0.5022, "rewards/accuracies": 0.71875, "rewards/chosen": -3.1259286403656006, "rewards/margins": 0.5289927124977112, "rewards/rejected": -3.654921293258667, "step": 990 }, { "epoch": 0.6472365091027839, "grad_norm": 16.00702278017296, "learning_rate": 5.0073390039025534e-08, "logits/chosen": -1.443716049194336, "logits/rejected": -1.415492057800293, "logps/chosen": -841.180419921875, "logps/rejected": -963.9603271484375, "loss": 0.5006, "rewards/accuracies": 0.65625, "rewards/chosen": -3.2827086448669434, "rewards/margins": 0.9521893262863159, "rewards/rejected": -4.234897613525391, "step": 991 }, { "epoch": 0.6478896236427464, "grad_norm": 65.24190893051149, "learning_rate": 4.991207122632035e-08, "logits/chosen": -1.5591700077056885, "logits/rejected": -1.5191017389297485, "logps/chosen": -942.4129638671875, "logps/rejected": -1009.4859619140625, "loss": 0.5094, "rewards/accuracies": 0.75, "rewards/chosen": -3.8302550315856934, "rewards/margins": 0.7394087314605713, "rewards/rejected": -4.5696635246276855, "step": 992 }, { "epoch": 0.6485427381827088, "grad_norm": 41.277206672220316, "learning_rate": 4.975088299957471e-08, "logits/chosen": -1.557931661605835, "logits/rejected": -1.555905818939209, "logps/chosen": -836.3804931640625, "logps/rejected": -922.718994140625, "loss": 0.4929, "rewards/accuracies": 0.90625, "rewards/chosen": -3.1140992641448975, "rewards/margins": 1.1032991409301758, "rewards/rejected": -4.217398643493652, "step": 993 }, { "epoch": 0.6491958527226712, "grad_norm": 14.725778545114895, "learning_rate": 4.958982619779442e-08, "logits/chosen": -1.5064477920532227, "logits/rejected": -1.521809697151184, "logps/chosen": -736.304931640625, "logps/rejected": -813.41650390625, "loss": 0.4734, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9339964389801025, "rewards/margins": 0.7689220309257507, "rewards/rejected": -3.702918291091919, "step": 994 }, { "epoch": 0.6498489672626336, "grad_norm": 52.91945215629695, "learning_rate": 4.942890165930129e-08, "logits/chosen": -1.5186612606048584, "logits/rejected": -1.5100464820861816, "logps/chosen": -754.244140625, "logps/rejected": -958.1654052734375, "loss": 0.4943, "rewards/accuracies": 0.78125, "rewards/chosen": -2.929853916168213, "rewards/margins": 1.4619183540344238, "rewards/rejected": -4.391772270202637, "step": 995 }, { "epoch": 0.6505020818025962, "grad_norm": 61.7741247551618, "learning_rate": 4.926811022172866e-08, "logits/chosen": -1.5684045553207397, "logits/rejected": -1.464762568473816, "logps/chosen": -862.2124633789062, "logps/rejected": -940.2279052734375, "loss": 0.4754, "rewards/accuracies": 0.78125, "rewards/chosen": -3.588654041290283, "rewards/margins": 1.0604631900787354, "rewards/rejected": -4.649117469787598, "step": 996 }, { "epoch": 0.6511551963425586, "grad_norm": 85.06663516145058, "learning_rate": 4.9107452722017015e-08, "logits/chosen": -1.4936169385910034, "logits/rejected": -1.497806429862976, "logps/chosen": -827.88720703125, "logps/rejected": -843.3629150390625, "loss": 0.5419, "rewards/accuracies": 0.71875, "rewards/chosen": -3.254192352294922, "rewards/margins": 0.3634593188762665, "rewards/rejected": -3.6176517009735107, "step": 997 }, { "epoch": 0.651808310882521, "grad_norm": 26.951955302254902, "learning_rate": 4.894692999640973e-08, "logits/chosen": -1.5051591396331787, "logits/rejected": -1.4692599773406982, "logps/chosen": -797.8718872070312, "logps/rejected": -915.5535888671875, "loss": 0.5042, "rewards/accuracies": 0.78125, "rewards/chosen": -3.2658886909484863, "rewards/margins": 0.9658613801002502, "rewards/rejected": -4.231750011444092, "step": 998 }, { "epoch": 0.6524614254224834, "grad_norm": 55.7608063216077, "learning_rate": 4.8786542880448653e-08, "logits/chosen": -1.537933349609375, "logits/rejected": -1.4756388664245605, "logps/chosen": -817.772705078125, "logps/rejected": -852.2359008789062, "loss": 0.5176, "rewards/accuracies": 0.65625, "rewards/chosen": -3.0438647270202637, "rewards/margins": 0.5471150875091553, "rewards/rejected": -3.590980052947998, "step": 999 }, { "epoch": 0.653114539962446, "grad_norm": 25.848553185603418, "learning_rate": 4.8626292208969733e-08, "logits/chosen": -1.553949236869812, "logits/rejected": -1.5505160093307495, "logps/chosen": -906.37109375, "logps/rejected": -967.929931640625, "loss": 0.485, "rewards/accuracies": 0.75, "rewards/chosen": -3.5798516273498535, "rewards/margins": 0.8147382140159607, "rewards/rejected": -4.394589900970459, "step": 1000 }, { "epoch": 0.653114539962446, "eval_logits/chosen": -1.4997057914733887, "eval_logits/rejected": -1.483424186706543, "eval_logps/chosen": -823.3939819335938, "eval_logps/rejected": -900.8679809570312, "eval_loss": 0.5032714605331421, "eval_rewards/accuracies": 0.7630000114440918, "eval_rewards/chosen": -3.1304855346679688, "eval_rewards/margins": 0.8558005094528198, "eval_rewards/rejected": -3.986285924911499, "eval_runtime": 296.6594, "eval_samples_per_second": 13.483, "eval_steps_per_second": 0.843, "step": 1000 }, { "epoch": 0.6537676545024084, "grad_norm": 30.491811094126753, "learning_rate": 4.846617881609876e-08, "logits/chosen": -1.520873785018921, "logits/rejected": -1.4895522594451904, "logps/chosen": -772.0455322265625, "logps/rejected": -870.1766357421875, "loss": 0.4863, "rewards/accuracies": 0.78125, "rewards/chosen": -2.9721617698669434, "rewards/margins": 0.8731052875518799, "rewards/rejected": -3.845266819000244, "step": 1001 }, { "epoch": 0.6544207690423708, "grad_norm": 40.22401127775807, "learning_rate": 4.8306203535246946e-08, "logits/chosen": -1.5727264881134033, "logits/rejected": -1.464941143989563, "logps/chosen": -808.9293212890625, "logps/rejected": -871.8182373046875, "loss": 0.5355, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1198384761810303, "rewards/margins": 0.9884458780288696, "rewards/rejected": -4.108283996582031, "step": 1002 }, { "epoch": 0.6550738835823332, "grad_norm": 11.806744900302544, "learning_rate": 4.814636719910657e-08, "logits/chosen": -1.5598303079605103, "logits/rejected": -1.474313735961914, "logps/chosen": -862.10400390625, "logps/rejected": -857.1552124023438, "loss": 0.546, "rewards/accuracies": 0.75, "rewards/chosen": -3.641342878341675, "rewards/margins": 0.5948621034622192, "rewards/rejected": -4.236205101013184, "step": 1003 }, { "epoch": 0.6557269981222957, "grad_norm": 15.051255942076827, "learning_rate": 4.798667063964673e-08, "logits/chosen": -1.5384275913238525, "logits/rejected": -1.494019865989685, "logps/chosen": -754.1705932617188, "logps/rejected": -797.1336059570312, "loss": 0.472, "rewards/accuracies": 0.84375, "rewards/chosen": -3.307189702987671, "rewards/margins": 0.5865740180015564, "rewards/rejected": -3.893763542175293, "step": 1004 }, { "epoch": 0.6563801126622582, "grad_norm": 12.800756135654732, "learning_rate": 4.7827114688108985e-08, "logits/chosen": -1.478043556213379, "logits/rejected": -1.4925929307937622, "logps/chosen": -698.2744140625, "logps/rejected": -822.2246704101562, "loss": 0.5131, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6346049308776855, "rewards/margins": 0.8852910399436951, "rewards/rejected": -3.5198960304260254, "step": 1005 }, { "epoch": 0.6570332272022206, "grad_norm": 16.667891699953692, "learning_rate": 4.7667700175002986e-08, "logits/chosen": -1.4476077556610107, "logits/rejected": -1.460363745689392, "logps/chosen": -839.6358032226562, "logps/rejected": -914.096923828125, "loss": 0.4806, "rewards/accuracies": 0.6875, "rewards/chosen": -3.174222469329834, "rewards/margins": 1.0781100988388062, "rewards/rejected": -4.2523322105407715, "step": 1006 }, { "epoch": 0.657686341742183, "grad_norm": 25.715571290386414, "learning_rate": 4.750842793010217e-08, "logits/chosen": -1.5329231023788452, "logits/rejected": -1.5252712965011597, "logps/chosen": -878.42041015625, "logps/rejected": -985.8428955078125, "loss": 0.5129, "rewards/accuracies": 0.65625, "rewards/chosen": -3.7149271965026855, "rewards/margins": 0.733319878578186, "rewards/rejected": -4.448246955871582, "step": 1007 }, { "epoch": 0.6583394562821455, "grad_norm": 104.75961567327492, "learning_rate": 4.7349298782439464e-08, "logits/chosen": -1.488255500793457, "logits/rejected": -1.4739494323730469, "logps/chosen": -775.230224609375, "logps/rejected": -847.8889770507812, "loss": 0.4897, "rewards/accuracies": 0.75, "rewards/chosen": -2.7119245529174805, "rewards/margins": 0.9239524602890015, "rewards/rejected": -3.6358766555786133, "step": 1008 }, { "epoch": 0.6589925708221079, "grad_norm": 52.144252857671894, "learning_rate": 4.719031356030294e-08, "logits/chosen": -1.5857702493667603, "logits/rejected": -1.5610796213150024, "logps/chosen": -816.1323852539062, "logps/rejected": -970.305419921875, "loss": 0.5158, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0045716762542725, "rewards/margins": 1.0432796478271484, "rewards/rejected": -4.0478515625, "step": 1009 }, { "epoch": 0.6596456853620704, "grad_norm": 102.68956882940834, "learning_rate": 4.703147309123156e-08, "logits/chosen": -1.4632647037506104, "logits/rejected": -1.4590816497802734, "logps/chosen": -727.7660522460938, "logps/rejected": -776.1034545898438, "loss": 0.5347, "rewards/accuracies": 0.71875, "rewards/chosen": -2.654884099960327, "rewards/margins": 0.699772298336029, "rewards/rejected": -3.35465669631958, "step": 1010 }, { "epoch": 0.6602987999020328, "grad_norm": 23.05602130979561, "learning_rate": 4.687277820201077e-08, "logits/chosen": -1.5591952800750732, "logits/rejected": -1.5697981119155884, "logps/chosen": -876.64599609375, "logps/rejected": -1013.8555297851562, "loss": 0.4686, "rewards/accuracies": 0.84375, "rewards/chosen": -3.513322353363037, "rewards/margins": 0.9770915508270264, "rewards/rejected": -4.490414142608643, "step": 1011 }, { "epoch": 0.6609519144419953, "grad_norm": 36.01319445097211, "learning_rate": 4.671422971866829e-08, "logits/chosen": -1.5157580375671387, "logits/rejected": -1.4674278497695923, "logps/chosen": -769.1876220703125, "logps/rejected": -776.2877197265625, "loss": 0.5207, "rewards/accuracies": 0.84375, "rewards/chosen": -2.8552205562591553, "rewards/margins": 0.6877336502075195, "rewards/rejected": -3.542954444885254, "step": 1012 }, { "epoch": 0.6616050289819577, "grad_norm": 88.36042423899895, "learning_rate": 4.655582846646977e-08, "logits/chosen": -1.4231505393981934, "logits/rejected": -1.4381476640701294, "logps/chosen": -826.669921875, "logps/rejected": -876.0449829101562, "loss": 0.5261, "rewards/accuracies": 0.84375, "rewards/chosen": -3.297783613204956, "rewards/margins": 1.0941379070281982, "rewards/rejected": -4.391921043395996, "step": 1013 }, { "epoch": 0.6622581435219201, "grad_norm": 36.895557036458406, "learning_rate": 4.6397575269914516e-08, "logits/chosen": -1.5066965818405151, "logits/rejected": -1.4606728553771973, "logps/chosen": -728.8414306640625, "logps/rejected": -851.2317504882812, "loss": 0.4876, "rewards/accuracies": 0.71875, "rewards/chosen": -2.7775468826293945, "rewards/margins": 1.0805087089538574, "rewards/rejected": -3.858055591583252, "step": 1014 }, { "epoch": 0.6629112580618826, "grad_norm": 42.67356514700968, "learning_rate": 4.6239470952731144e-08, "logits/chosen": -1.4574358463287354, "logits/rejected": -1.4584983587265015, "logps/chosen": -798.5891723632812, "logps/rejected": -916.482421875, "loss": 0.5216, "rewards/accuracies": 0.71875, "rewards/chosen": -2.7513608932495117, "rewards/margins": 0.9783158898353577, "rewards/rejected": -3.7296767234802246, "step": 1015 }, { "epoch": 0.6635643726018451, "grad_norm": 103.36971446244533, "learning_rate": 4.608151633787337e-08, "logits/chosen": -1.4695475101470947, "logits/rejected": -1.4544553756713867, "logps/chosen": -803.3701171875, "logps/rejected": -876.03369140625, "loss": 0.4574, "rewards/accuracies": 0.71875, "rewards/chosen": -2.9469823837280273, "rewards/margins": 1.0371326208114624, "rewards/rejected": -3.9841156005859375, "step": 1016 }, { "epoch": 0.6642174871418075, "grad_norm": 53.366420843639595, "learning_rate": 4.5923712247515675e-08, "logits/chosen": -1.498404860496521, "logits/rejected": -1.5131099224090576, "logps/chosen": -852.6907958984375, "logps/rejected": -1058.0159912109375, "loss": 0.4487, "rewards/accuracies": 0.9375, "rewards/chosen": -3.187804698944092, "rewards/margins": 1.617006778717041, "rewards/rejected": -4.804811954498291, "step": 1017 }, { "epoch": 0.6648706016817699, "grad_norm": 37.41870236327263, "learning_rate": 4.5766059503049046e-08, "logits/chosen": -1.485478401184082, "logits/rejected": -1.501274824142456, "logps/chosen": -788.52734375, "logps/rejected": -801.43505859375, "loss": 0.4773, "rewards/accuracies": 0.625, "rewards/chosen": -2.898345470428467, "rewards/margins": 0.5790349245071411, "rewards/rejected": -3.4773800373077393, "step": 1018 }, { "epoch": 0.6655237162217323, "grad_norm": 36.10214381184078, "learning_rate": 4.560855892507671e-08, "logits/chosen": -1.5048822164535522, "logits/rejected": -1.5141572952270508, "logps/chosen": -830.2411499023438, "logps/rejected": -920.7844848632812, "loss": 0.5217, "rewards/accuracies": 0.78125, "rewards/chosen": -3.3689942359924316, "rewards/margins": 1.0679469108581543, "rewards/rejected": -4.436941146850586, "step": 1019 }, { "epoch": 0.6661768307616949, "grad_norm": 45.389879083458595, "learning_rate": 4.5451211333409836e-08, "logits/chosen": -1.4572731256484985, "logits/rejected": -1.4573543071746826, "logps/chosen": -800.939697265625, "logps/rejected": -863.949951171875, "loss": 0.5109, "rewards/accuracies": 0.875, "rewards/chosen": -2.996425151824951, "rewards/margins": 0.8323167562484741, "rewards/rejected": -3.828742027282715, "step": 1020 }, { "epoch": 0.6668299453016573, "grad_norm": 10.553936200597361, "learning_rate": 4.5294017547063234e-08, "logits/chosen": -1.5413297414779663, "logits/rejected": -1.520641803741455, "logps/chosen": -755.6485595703125, "logps/rejected": -840.0621337890625, "loss": 0.4168, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9807937145233154, "rewards/margins": 1.0162277221679688, "rewards/rejected": -3.9970216751098633, "step": 1021 }, { "epoch": 0.6674830598416197, "grad_norm": 10.572170985494784, "learning_rate": 4.513697838425122e-08, "logits/chosen": -1.5927180051803589, "logits/rejected": -1.6134157180786133, "logps/chosen": -868.3167114257812, "logps/rejected": -956.796875, "loss": 0.5045, "rewards/accuracies": 0.9375, "rewards/chosen": -3.4438788890838623, "rewards/margins": 1.0897140502929688, "rewards/rejected": -4.533592700958252, "step": 1022 }, { "epoch": 0.6681361743815821, "grad_norm": 20.81491049314488, "learning_rate": 4.4980094662383206e-08, "logits/chosen": -1.4901061058044434, "logits/rejected": -1.4351348876953125, "logps/chosen": -784.634765625, "logps/rejected": -926.8602294921875, "loss": 0.5169, "rewards/accuracies": 0.75, "rewards/chosen": -3.037071704864502, "rewards/margins": 1.026695966720581, "rewards/rejected": -4.063767433166504, "step": 1023 }, { "epoch": 0.6687892889215447, "grad_norm": 25.45661309370082, "learning_rate": 4.4823367198059555e-08, "logits/chosen": -1.513559103012085, "logits/rejected": -1.5025659799575806, "logps/chosen": -918.1492919921875, "logps/rejected": -963.5728759765625, "loss": 0.4967, "rewards/accuracies": 0.71875, "rewards/chosen": -3.492802381515503, "rewards/margins": 0.798168957233429, "rewards/rejected": -4.290971279144287, "step": 1024 }, { "epoch": 0.6694424034615071, "grad_norm": 63.951136926876586, "learning_rate": 4.466679680706727e-08, "logits/chosen": -1.5798838138580322, "logits/rejected": -1.4765055179595947, "logps/chosen": -794.0283203125, "logps/rejected": -891.6921997070312, "loss": 0.5115, "rewards/accuracies": 0.71875, "rewards/chosen": -2.961698532104492, "rewards/margins": 0.8770202994346619, "rewards/rejected": -3.8387184143066406, "step": 1025 }, { "epoch": 0.6700955180014695, "grad_norm": 57.70779325077905, "learning_rate": 4.4510384304375773e-08, "logits/chosen": -1.4599577188491821, "logits/rejected": -1.5163075923919678, "logps/chosen": -797.9346923828125, "logps/rejected": -910.8660888671875, "loss": 0.5434, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1370911598205566, "rewards/margins": 0.9660441279411316, "rewards/rejected": -4.103135108947754, "step": 1026 }, { "epoch": 0.6707486325414319, "grad_norm": 160.10963139231885, "learning_rate": 4.435413050413264e-08, "logits/chosen": -1.5408748388290405, "logits/rejected": -1.4838809967041016, "logps/chosen": -880.136474609375, "logps/rejected": -953.572509765625, "loss": 0.496, "rewards/accuracies": 0.75, "rewards/chosen": -3.4785830974578857, "rewards/margins": 0.9672165513038635, "rewards/rejected": -4.445799350738525, "step": 1027 }, { "epoch": 0.6714017470813944, "grad_norm": 30.04810670653747, "learning_rate": 4.41980362196594e-08, "logits/chosen": -1.5107932090759277, "logits/rejected": -1.4906303882598877, "logps/chosen": -787.1295166015625, "logps/rejected": -931.1858520507812, "loss": 0.5156, "rewards/accuracies": 0.90625, "rewards/chosen": -2.80479097366333, "rewards/margins": 1.6828497648239136, "rewards/rejected": -4.487640380859375, "step": 1028 }, { "epoch": 0.6720548616213569, "grad_norm": 8.725100728079909, "learning_rate": 4.4042102263447275e-08, "logits/chosen": -1.4393929243087769, "logits/rejected": -1.3799033164978027, "logps/chosen": -751.6057739257812, "logps/rejected": -883.99755859375, "loss": 0.4621, "rewards/accuracies": 0.78125, "rewards/chosen": -3.040332317352295, "rewards/margins": 1.2370002269744873, "rewards/rejected": -4.277332782745361, "step": 1029 }, { "epoch": 0.6727079761613193, "grad_norm": 10.666968801911986, "learning_rate": 4.388632944715296e-08, "logits/chosen": -1.474955677986145, "logits/rejected": -1.4837840795516968, "logps/chosen": -798.3235473632812, "logps/rejected": -894.8926391601562, "loss": 0.4808, "rewards/accuracies": 0.78125, "rewards/chosen": -2.9682440757751465, "rewards/margins": 0.7301187515258789, "rewards/rejected": -3.6983628273010254, "step": 1030 }, { "epoch": 0.6733610907012817, "grad_norm": 60.661967247074934, "learning_rate": 4.37307185815944e-08, "logits/chosen": -1.562424898147583, "logits/rejected": -1.5712769031524658, "logps/chosen": -810.6946411132812, "logps/rejected": -843.1661987304688, "loss": 0.5108, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2374746799468994, "rewards/margins": 0.8579811453819275, "rewards/rejected": -4.095455646514893, "step": 1031 }, { "epoch": 0.6740142052412442, "grad_norm": 165.13796597577232, "learning_rate": 4.3575270476746543e-08, "logits/chosen": -1.4801855087280273, "logits/rejected": -1.5016859769821167, "logps/chosen": -857.4439697265625, "logps/rejected": -974.478515625, "loss": 0.515, "rewards/accuracies": 0.8125, "rewards/chosen": -3.03521990776062, "rewards/margins": 0.830250084400177, "rewards/rejected": -3.8654701709747314, "step": 1032 }, { "epoch": 0.6746673197812066, "grad_norm": 97.99389725505694, "learning_rate": 4.341998594173717e-08, "logits/chosen": -1.4819018840789795, "logits/rejected": -1.4765572547912598, "logps/chosen": -779.4025268554688, "logps/rejected": -837.5120239257812, "loss": 0.4812, "rewards/accuracies": 0.78125, "rewards/chosen": -2.9766929149627686, "rewards/margins": 0.7156229019165039, "rewards/rejected": -3.6923160552978516, "step": 1033 }, { "epoch": 0.6753204343211691, "grad_norm": 24.808395132696045, "learning_rate": 4.326486578484266e-08, "logits/chosen": -1.4551912546157837, "logits/rejected": -1.4724352359771729, "logps/chosen": -780.5784301757812, "logps/rejected": -889.181396484375, "loss": 0.5369, "rewards/accuracies": 0.8125, "rewards/chosen": -2.8798131942749023, "rewards/margins": 0.8098124265670776, "rewards/rejected": -3.6896255016326904, "step": 1034 }, { "epoch": 0.6759735488611315, "grad_norm": 32.75717304070085, "learning_rate": 4.310991081348376e-08, "logits/chosen": -1.4434568881988525, "logits/rejected": -1.4026532173156738, "logps/chosen": -810.0265502929688, "logps/rejected": -864.6695556640625, "loss": 0.5423, "rewards/accuracies": 0.71875, "rewards/chosen": -3.1554508209228516, "rewards/margins": 0.6393953561782837, "rewards/rejected": -3.7948460578918457, "step": 1035 }, { "epoch": 0.676626663401094, "grad_norm": 95.45593023373124, "learning_rate": 4.295512183422145e-08, "logits/chosen": -1.4506603479385376, "logits/rejected": -1.4620624780654907, "logps/chosen": -711.5603637695312, "logps/rejected": -851.3104858398438, "loss": 0.5207, "rewards/accuracies": 0.71875, "rewards/chosen": -2.811810255050659, "rewards/margins": 1.0253384113311768, "rewards/rejected": -3.837148666381836, "step": 1036 }, { "epoch": 0.6772797779410564, "grad_norm": 13.65496722074414, "learning_rate": 4.280049965275261e-08, "logits/chosen": -1.5078392028808594, "logits/rejected": -1.4908549785614014, "logps/chosen": -752.6651000976562, "logps/rejected": -865.7678833007812, "loss": 0.4159, "rewards/accuracies": 0.875, "rewards/chosen": -2.94309663772583, "rewards/margins": 0.8273873329162598, "rewards/rejected": -3.77048397064209, "step": 1037 }, { "epoch": 0.6779328924810188, "grad_norm": 23.66625717077645, "learning_rate": 4.2646045073906e-08, "logits/chosen": -1.4181371927261353, "logits/rejected": -1.4423913955688477, "logps/chosen": -730.9456176757812, "logps/rejected": -790.7864990234375, "loss": 0.4546, "rewards/accuracies": 0.75, "rewards/chosen": -2.9021615982055664, "rewards/margins": 0.7823746800422668, "rewards/rejected": -3.6845362186431885, "step": 1038 }, { "epoch": 0.6785860070209813, "grad_norm": 38.74541697204639, "learning_rate": 4.249175890163797e-08, "logits/chosen": -1.531599521636963, "logits/rejected": -1.5187830924987793, "logps/chosen": -762.074951171875, "logps/rejected": -845.5159912109375, "loss": 0.4991, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8555479049682617, "rewards/margins": 0.8805040121078491, "rewards/rejected": -3.7360520362854004, "step": 1039 }, { "epoch": 0.6792391215609438, "grad_norm": 85.38760955194438, "learning_rate": 4.233764193902828e-08, "logits/chosen": -1.547985553741455, "logits/rejected": -1.5426965951919556, "logps/chosen": -837.4592895507812, "logps/rejected": -898.4917602539062, "loss": 0.4809, "rewards/accuracies": 0.84375, "rewards/chosen": -2.8173162937164307, "rewards/margins": 0.9812331795692444, "rewards/rejected": -3.798549175262451, "step": 1040 }, { "epoch": 0.6798922361009062, "grad_norm": 57.38295376577258, "learning_rate": 4.2183694988275914e-08, "logits/chosen": -1.5143656730651855, "logits/rejected": -1.5319607257843018, "logps/chosen": -795.1925048828125, "logps/rejected": -825.0725708007812, "loss": 0.5237, "rewards/accuracies": 0.75, "rewards/chosen": -3.043870210647583, "rewards/margins": 0.7048807144165039, "rewards/rejected": -3.748751163482666, "step": 1041 }, { "epoch": 0.6805453506408686, "grad_norm": 13.482382576237015, "learning_rate": 4.2029918850694955e-08, "logits/chosen": -1.5160479545593262, "logits/rejected": -1.5341300964355469, "logps/chosen": -728.4697875976562, "logps/rejected": -833.2994384765625, "loss": 0.516, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0110697746276855, "rewards/margins": 0.9559275507926941, "rewards/rejected": -3.9669976234436035, "step": 1042 }, { "epoch": 0.681198465180831, "grad_norm": 13.93984327839821, "learning_rate": 4.1876314326710367e-08, "logits/chosen": -1.4439070224761963, "logits/rejected": -1.4740190505981445, "logps/chosen": -883.0184326171875, "logps/rejected": -914.0108032226562, "loss": 0.5132, "rewards/accuracies": 0.84375, "rewards/chosen": -3.1336121559143066, "rewards/margins": 0.8373432159423828, "rewards/rejected": -3.9709551334381104, "step": 1043 }, { "epoch": 0.6818515797207936, "grad_norm": 93.33586037651152, "learning_rate": 4.172288221585383e-08, "logits/chosen": -1.4956300258636475, "logits/rejected": -1.4777367115020752, "logps/chosen": -819.32958984375, "logps/rejected": -873.03955078125, "loss": 0.5033, "rewards/accuracies": 0.84375, "rewards/chosen": -3.1138126850128174, "rewards/margins": 0.7950230836868286, "rewards/rejected": -3.9088354110717773, "step": 1044 }, { "epoch": 0.682504694260756, "grad_norm": 24.399107508868735, "learning_rate": 4.1569623316759634e-08, "logits/chosen": -1.5415163040161133, "logits/rejected": -1.5112028121948242, "logps/chosen": -853.9760131835938, "logps/rejected": -965.800537109375, "loss": 0.5098, "rewards/accuracies": 0.75, "rewards/chosen": -3.227302312850952, "rewards/margins": 0.9138199687004089, "rewards/rejected": -4.141121864318848, "step": 1045 }, { "epoch": 0.6831578088007184, "grad_norm": 10.737613957729454, "learning_rate": 4.1416538427160414e-08, "logits/chosen": -1.508976936340332, "logits/rejected": -1.4843047857284546, "logps/chosen": -843.1103515625, "logps/rejected": -909.9649658203125, "loss": 0.5126, "rewards/accuracies": 0.71875, "rewards/chosen": -3.028240203857422, "rewards/margins": 0.6666500568389893, "rewards/rejected": -3.694890022277832, "step": 1046 }, { "epoch": 0.6838109233406808, "grad_norm": 13.15409511965096, "learning_rate": 4.126362834388311e-08, "logits/chosen": -1.4586668014526367, "logits/rejected": -1.450480580329895, "logps/chosen": -823.8426513671875, "logps/rejected": -831.0048828125, "loss": 0.5012, "rewards/accuracies": 0.71875, "rewards/chosen": -2.807426691055298, "rewards/margins": 0.6834732890129089, "rewards/rejected": -3.4908998012542725, "step": 1047 }, { "epoch": 0.6844640378806434, "grad_norm": 35.5691033339692, "learning_rate": 4.11108938628448e-08, "logits/chosen": -1.44548499584198, "logits/rejected": -1.465252161026001, "logps/chosen": -912.369140625, "logps/rejected": -958.8450927734375, "loss": 0.4735, "rewards/accuracies": 0.78125, "rewards/chosen": -3.652446746826172, "rewards/margins": 0.8420040607452393, "rewards/rejected": -4.494450569152832, "step": 1048 }, { "epoch": 0.6851171524206058, "grad_norm": 55.79275790082709, "learning_rate": 4.095833577904842e-08, "logits/chosen": -1.5095837116241455, "logits/rejected": -1.4833836555480957, "logps/chosen": -766.8374633789062, "logps/rejected": -825.0868530273438, "loss": 0.4932, "rewards/accuracies": 0.75, "rewards/chosen": -2.7053170204162598, "rewards/margins": 0.8340969085693359, "rewards/rejected": -3.5394136905670166, "step": 1049 }, { "epoch": 0.6857702669605682, "grad_norm": 16.058246806646952, "learning_rate": 4.0805954886578825e-08, "logits/chosen": -1.4918146133422852, "logits/rejected": -1.4603523015975952, "logps/chosen": -789.8798217773438, "logps/rejected": -847.2196655273438, "loss": 0.5385, "rewards/accuracies": 0.59375, "rewards/chosen": -3.3735291957855225, "rewards/margins": 0.4681362509727478, "rewards/rejected": -3.841665744781494, "step": 1050 }, { "epoch": 0.6864233815005306, "grad_norm": 25.38044398676047, "learning_rate": 4.065375197859855e-08, "logits/chosen": -1.5610326528549194, "logits/rejected": -1.465526819229126, "logps/chosen": -896.5455932617188, "logps/rejected": -925.2203369140625, "loss": 0.5014, "rewards/accuracies": 0.8125, "rewards/chosen": -3.476266622543335, "rewards/margins": 0.7058266401290894, "rewards/rejected": -4.182093620300293, "step": 1051 }, { "epoch": 0.6870764960404931, "grad_norm": 27.871824279674087, "learning_rate": 4.0501727847343706e-08, "logits/chosen": -1.5372264385223389, "logits/rejected": -1.521620512008667, "logps/chosen": -966.5, "logps/rejected": -982.9517822265625, "loss": 0.5236, "rewards/accuracies": 0.71875, "rewards/chosen": -3.5799427032470703, "rewards/margins": 0.6284856796264648, "rewards/rejected": -4.208428382873535, "step": 1052 }, { "epoch": 0.6877296105804556, "grad_norm": 58.973884464098276, "learning_rate": 4.034988328411982e-08, "logits/chosen": -1.4340426921844482, "logits/rejected": -1.4606634378433228, "logps/chosen": -855.3406372070312, "logps/rejected": -920.906494140625, "loss": 0.4464, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8592772483825684, "rewards/margins": 0.6489996314048767, "rewards/rejected": -3.5082767009735107, "step": 1053 }, { "epoch": 0.688382725120418, "grad_norm": 21.96563134850709, "learning_rate": 4.0198219079297756e-08, "logits/chosen": -1.4492591619491577, "logits/rejected": -1.4421489238739014, "logps/chosen": -826.43359375, "logps/rejected": -959.9185180664062, "loss": 0.5362, "rewards/accuracies": 0.75, "rewards/chosen": -3.1718485355377197, "rewards/margins": 0.9998074769973755, "rewards/rejected": -4.171655654907227, "step": 1054 }, { "epoch": 0.6890358396603804, "grad_norm": 56.311096113880815, "learning_rate": 4.004673602230961e-08, "logits/chosen": -1.6020833253860474, "logits/rejected": -1.5822880268096924, "logps/chosen": -848.7802124023438, "logps/rejected": -922.5611572265625, "loss": 0.5243, "rewards/accuracies": 0.75, "rewards/chosen": -3.1218650341033936, "rewards/margins": 0.7863477468490601, "rewards/rejected": -3.908212661743164, "step": 1055 }, { "epoch": 0.6896889542003429, "grad_norm": 17.48722643305911, "learning_rate": 3.989543490164453e-08, "logits/chosen": -1.4615482091903687, "logits/rejected": -1.5211315155029297, "logps/chosen": -854.398681640625, "logps/rejected": -965.4825439453125, "loss": 0.4684, "rewards/accuracies": 0.75, "rewards/chosen": -3.055753231048584, "rewards/margins": 0.7217795848846436, "rewards/rejected": -3.7775330543518066, "step": 1056 }, { "epoch": 0.6903420687403053, "grad_norm": 37.57075928693893, "learning_rate": 3.974431650484468e-08, "logits/chosen": -1.4383900165557861, "logits/rejected": -1.3848916292190552, "logps/chosen": -895.6553344726562, "logps/rejected": -919.5377807617188, "loss": 0.536, "rewards/accuracies": 0.53125, "rewards/chosen": -3.639094352722168, "rewards/margins": 0.42604905366897583, "rewards/rejected": -4.06514310836792, "step": 1057 }, { "epoch": 0.6909951832802678, "grad_norm": 26.018606342144913, "learning_rate": 3.95933816185012e-08, "logits/chosen": -1.5629183053970337, "logits/rejected": -1.528983235359192, "logps/chosen": -896.5241088867188, "logps/rejected": -974.3204345703125, "loss": 0.5263, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3990674018859863, "rewards/margins": 0.964748203754425, "rewards/rejected": -4.363815784454346, "step": 1058 }, { "epoch": 0.6916482978202302, "grad_norm": 137.70834817726936, "learning_rate": 3.944263102824996e-08, "logits/chosen": -1.4782737493515015, "logits/rejected": -1.4820241928100586, "logps/chosen": -844.2286987304688, "logps/rejected": -879.0075073242188, "loss": 0.4687, "rewards/accuracies": 0.8125, "rewards/chosen": -3.190678358078003, "rewards/margins": 0.6480005979537964, "rewards/rejected": -3.8386788368225098, "step": 1059 }, { "epoch": 0.6923014123601927, "grad_norm": 135.63517874372047, "learning_rate": 3.9292065518767495e-08, "logits/chosen": -1.4735994338989258, "logits/rejected": -1.461629033088684, "logps/chosen": -815.2098388671875, "logps/rejected": -904.7244873046875, "loss": 0.5408, "rewards/accuracies": 0.78125, "rewards/chosen": -3.002563714981079, "rewards/margins": 0.6686397194862366, "rewards/rejected": -3.671203374862671, "step": 1060 }, { "epoch": 0.6929545269001551, "grad_norm": 12.912624702236881, "learning_rate": 3.914168587376706e-08, "logits/chosen": -1.5482277870178223, "logits/rejected": -1.5398210287094116, "logps/chosen": -831.609130859375, "logps/rejected": -957.0618286132812, "loss": 0.494, "rewards/accuracies": 0.8125, "rewards/chosen": -3.257383108139038, "rewards/margins": 0.9438177347183228, "rewards/rejected": -4.201200485229492, "step": 1061 }, { "epoch": 0.6936076414401176, "grad_norm": 19.51174100176863, "learning_rate": 3.899149287599442e-08, "logits/chosen": -1.4981271028518677, "logits/rejected": -1.442700982093811, "logps/chosen": -779.00927734375, "logps/rejected": -808.9547729492188, "loss": 0.5122, "rewards/accuracies": 0.84375, "rewards/chosen": -2.9186878204345703, "rewards/margins": 0.47457852959632874, "rewards/rejected": -3.3932666778564453, "step": 1062 }, { "epoch": 0.69426075598008, "grad_norm": 71.96791759257684, "learning_rate": 3.884148730722383e-08, "logits/chosen": -1.5923326015472412, "logits/rejected": -1.5967943668365479, "logps/chosen": -838.6897583007812, "logps/rejected": -877.890625, "loss": 0.4842, "rewards/accuracies": 0.6875, "rewards/chosen": -3.1096723079681396, "rewards/margins": 0.708088219165802, "rewards/rejected": -3.817760467529297, "step": 1063 }, { "epoch": 0.6949138705200425, "grad_norm": 16.78816497884493, "learning_rate": 3.8691669948253964e-08, "logits/chosen": -1.4334865808486938, "logits/rejected": -1.4443024396896362, "logps/chosen": -852.86572265625, "logps/rejected": -981.8878173828125, "loss": 0.483, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1702041625976562, "rewards/margins": 1.1130633354187012, "rewards/rejected": -4.283267974853516, "step": 1064 }, { "epoch": 0.6955669850600049, "grad_norm": 19.842008322377158, "learning_rate": 3.85420415789038e-08, "logits/chosen": -1.4435492753982544, "logits/rejected": -1.480208396911621, "logps/chosen": -812.3143920898438, "logps/rejected": -912.236572265625, "loss": 0.475, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2056543827056885, "rewards/margins": 0.799709141254425, "rewards/rejected": -4.005363464355469, "step": 1065 }, { "epoch": 0.6962200995999673, "grad_norm": 17.570135752432474, "learning_rate": 3.839260297800864e-08, "logits/chosen": -1.5509750843048096, "logits/rejected": -1.542686939239502, "logps/chosen": -918.3096923828125, "logps/rejected": -1013.3355102539062, "loss": 0.4643, "rewards/accuracies": 0.71875, "rewards/chosen": -3.234311103820801, "rewards/margins": 1.1861119270324707, "rewards/rejected": -4.42042350769043, "step": 1066 }, { "epoch": 0.6968732141399298, "grad_norm": 13.333457525865867, "learning_rate": 3.824335492341599e-08, "logits/chosen": -1.5408436059951782, "logits/rejected": -1.5043909549713135, "logps/chosen": -823.61865234375, "logps/rejected": -986.2550048828125, "loss": 0.4637, "rewards/accuracies": 0.84375, "rewards/chosen": -2.8370096683502197, "rewards/margins": 1.4054211378097534, "rewards/rejected": -4.242431163787842, "step": 1067 }, { "epoch": 0.6975263286798923, "grad_norm": 19.537814553814222, "learning_rate": 3.8094298191981565e-08, "logits/chosen": -1.4838995933532715, "logits/rejected": -1.483154535293579, "logps/chosen": -757.1932373046875, "logps/rejected": -872.9207763671875, "loss": 0.5317, "rewards/accuracies": 0.96875, "rewards/chosen": -2.696157217025757, "rewards/margins": 1.4016393423080444, "rewards/rejected": -4.09779691696167, "step": 1068 }, { "epoch": 0.6981794432198547, "grad_norm": 9.688042362394432, "learning_rate": 3.794543355956518e-08, "logits/chosen": -1.4866911172866821, "logits/rejected": -1.449821949005127, "logps/chosen": -834.3885498046875, "logps/rejected": -924.32470703125, "loss": 0.4753, "rewards/accuracies": 0.78125, "rewards/chosen": -2.975139856338501, "rewards/margins": 0.8105930685997009, "rewards/rejected": -3.7857329845428467, "step": 1069 }, { "epoch": 0.6988325577598171, "grad_norm": 13.451541283780028, "learning_rate": 3.779676180102678e-08, "logits/chosen": -1.5045047998428345, "logits/rejected": -1.5032069683074951, "logps/chosen": -714.1544189453125, "logps/rejected": -732.0367431640625, "loss": 0.5696, "rewards/accuracies": 0.59375, "rewards/chosen": -2.933642864227295, "rewards/margins": 0.15253925323486328, "rewards/rejected": -3.0861823558807373, "step": 1070 }, { "epoch": 0.6994856722997795, "grad_norm": 12.44342691849441, "learning_rate": 3.76482836902224e-08, "logits/chosen": -1.572385311126709, "logits/rejected": -1.5535383224487305, "logps/chosen": -902.6881713867188, "logps/rejected": -1089.9610595703125, "loss": 0.4715, "rewards/accuracies": 0.75, "rewards/chosen": -3.132718801498413, "rewards/margins": 1.6019270420074463, "rewards/rejected": -4.734645843505859, "step": 1071 }, { "epoch": 0.7001387868397421, "grad_norm": 10.296422466002083, "learning_rate": 3.750000000000002e-08, "logits/chosen": -1.5108206272125244, "logits/rejected": -1.477285623550415, "logps/chosen": -779.607666015625, "logps/rejected": -822.5366821289062, "loss": 0.4943, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6698358058929443, "rewards/margins": 0.7314268946647644, "rewards/rejected": -3.4012625217437744, "step": 1072 }, { "epoch": 0.7007919013797045, "grad_norm": 19.95219987159032, "learning_rate": 3.735191150219571e-08, "logits/chosen": -1.5256969928741455, "logits/rejected": -1.4763067960739136, "logps/chosen": -837.349853515625, "logps/rejected": -901.6431274414062, "loss": 0.5116, "rewards/accuracies": 0.78125, "rewards/chosen": -3.106201648712158, "rewards/margins": 0.7756415009498596, "rewards/rejected": -3.881843328475952, "step": 1073 }, { "epoch": 0.7014450159196669, "grad_norm": 35.78189318471355, "learning_rate": 3.7204018967629534e-08, "logits/chosen": -1.4913254976272583, "logits/rejected": -1.523918628692627, "logps/chosen": -725.7921142578125, "logps/rejected": -901.6088256835938, "loss": 0.496, "rewards/accuracies": 0.75, "rewards/chosen": -2.752901315689087, "rewards/margins": 1.2373815774917603, "rewards/rejected": -3.9902822971343994, "step": 1074 }, { "epoch": 0.7020981304596293, "grad_norm": 67.37326737845098, "learning_rate": 3.7056323166101525e-08, "logits/chosen": -1.5039902925491333, "logits/rejected": -1.489875078201294, "logps/chosen": -836.2699584960938, "logps/rejected": -852.8065185546875, "loss": 0.4636, "rewards/accuracies": 0.78125, "rewards/chosen": -2.870565891265869, "rewards/margins": 0.6211225986480713, "rewards/rejected": -3.4916884899139404, "step": 1075 }, { "epoch": 0.7027512449995919, "grad_norm": 122.82544033335364, "learning_rate": 3.690882486638771e-08, "logits/chosen": -1.5150420665740967, "logits/rejected": -1.4826247692108154, "logps/chosen": -794.5693359375, "logps/rejected": -939.4715576171875, "loss": 0.4775, "rewards/accuracies": 0.65625, "rewards/chosen": -2.9887611865997314, "rewards/margins": 1.1564222574234009, "rewards/rejected": -4.145183563232422, "step": 1076 }, { "epoch": 0.7034043595395543, "grad_norm": 29.770872322440407, "learning_rate": 3.6761524836236085e-08, "logits/chosen": -1.4067631959915161, "logits/rejected": -1.3834965229034424, "logps/chosen": -879.8606567382812, "logps/rejected": -925.6470336914062, "loss": 0.4719, "rewards/accuracies": 0.6875, "rewards/chosen": -3.328319787979126, "rewards/margins": 0.6426284313201904, "rewards/rejected": -3.9709482192993164, "step": 1077 }, { "epoch": 0.7040574740795167, "grad_norm": 32.679662772650815, "learning_rate": 3.6614423842362605e-08, "logits/chosen": -1.4490833282470703, "logits/rejected": -1.4676353931427002, "logps/chosen": -785.7656860351562, "logps/rejected": -959.332763671875, "loss": 0.5489, "rewards/accuracies": 0.75, "rewards/chosen": -3.0860912799835205, "rewards/margins": 0.828029453754425, "rewards/rejected": -3.914120674133301, "step": 1078 }, { "epoch": 0.7047105886194791, "grad_norm": 26.80045244487548, "learning_rate": 3.646752265044725e-08, "logits/chosen": -1.5650575160980225, "logits/rejected": -1.5426418781280518, "logps/chosen": -833.9342041015625, "logps/rejected": -841.90283203125, "loss": 0.4943, "rewards/accuracies": 0.6875, "rewards/chosen": -3.413437604904175, "rewards/margins": 0.4831462800502777, "rewards/rejected": -3.8965840339660645, "step": 1079 }, { "epoch": 0.7053637031594416, "grad_norm": 128.50612923555713, "learning_rate": 3.6320822025129986e-08, "logits/chosen": -1.4550724029541016, "logits/rejected": -1.4530410766601562, "logps/chosen": -917.0040893554688, "logps/rejected": -1088.79248046875, "loss": 0.5566, "rewards/accuracies": 0.75, "rewards/chosen": -3.368748903274536, "rewards/margins": 1.5208771228790283, "rewards/rejected": -4.889625549316406, "step": 1080 }, { "epoch": 0.706016817699404, "grad_norm": 10.851702941973679, "learning_rate": 3.6174322730006816e-08, "logits/chosen": -1.5157017707824707, "logits/rejected": -1.5042166709899902, "logps/chosen": -775.1571044921875, "logps/rejected": -932.432373046875, "loss": 0.4836, "rewards/accuracies": 0.8125, "rewards/chosen": -2.8636438846588135, "rewards/margins": 1.276392936706543, "rewards/rejected": -4.1400370597839355, "step": 1081 }, { "epoch": 0.7066699322393665, "grad_norm": 10.972406234728096, "learning_rate": 3.6028025527625804e-08, "logits/chosen": -1.513984203338623, "logits/rejected": -1.451591968536377, "logps/chosen": -830.9796142578125, "logps/rejected": -924.893310546875, "loss": 0.4536, "rewards/accuracies": 0.75, "rewards/chosen": -3.566514015197754, "rewards/margins": 0.943221390247345, "rewards/rejected": -4.509735584259033, "step": 1082 }, { "epoch": 0.7073230467793289, "grad_norm": 18.986648016914206, "learning_rate": 3.588193117948301e-08, "logits/chosen": -1.4808248281478882, "logits/rejected": -1.5148813724517822, "logps/chosen": -857.4286499023438, "logps/rejected": -901.3809814453125, "loss": 0.5308, "rewards/accuracies": 0.84375, "rewards/chosen": -3.144946575164795, "rewards/margins": 0.7388260960578918, "rewards/rejected": -3.883772611618042, "step": 1083 }, { "epoch": 0.7079761613192914, "grad_norm": 45.59239900144737, "learning_rate": 3.573604044601873e-08, "logits/chosen": -1.4333078861236572, "logits/rejected": -1.4482799768447876, "logps/chosen": -845.7681884765625, "logps/rejected": -967.3292846679688, "loss": 0.4798, "rewards/accuracies": 0.71875, "rewards/chosen": -3.3961985111236572, "rewards/margins": 1.0223309993743896, "rewards/rejected": -4.418529510498047, "step": 1084 }, { "epoch": 0.7086292758592538, "grad_norm": 13.726428268406318, "learning_rate": 3.559035408661334e-08, "logits/chosen": -1.4743175506591797, "logits/rejected": -1.467435598373413, "logps/chosen": -852.9383544921875, "logps/rejected": -932.0983276367188, "loss": 0.4883, "rewards/accuracies": 0.78125, "rewards/chosen": -3.2556493282318115, "rewards/margins": 1.4109907150268555, "rewards/rejected": -4.666640281677246, "step": 1085 }, { "epoch": 0.7092823903992163, "grad_norm": 24.407459386863895, "learning_rate": 3.544487285958346e-08, "logits/chosen": -1.599566102027893, "logits/rejected": -1.5549285411834717, "logps/chosen": -739.4046020507812, "logps/rejected": -724.448486328125, "loss": 0.5617, "rewards/accuracies": 0.71875, "rewards/chosen": -2.502041816711426, "rewards/margins": 0.5174904465675354, "rewards/rejected": -3.0195322036743164, "step": 1086 }, { "epoch": 0.7099355049391787, "grad_norm": 23.319894013252863, "learning_rate": 3.5299597522177944e-08, "logits/chosen": -1.571434497833252, "logits/rejected": -1.5202834606170654, "logps/chosen": -872.5787353515625, "logps/rejected": -882.1563720703125, "loss": 0.5185, "rewards/accuracies": 0.6875, "rewards/chosen": -3.388620138168335, "rewards/margins": 0.7215232253074646, "rewards/rejected": -4.110143184661865, "step": 1087 }, { "epoch": 0.7105886194791412, "grad_norm": 83.13631229824816, "learning_rate": 3.5154528830574e-08, "logits/chosen": -1.5138779878616333, "logits/rejected": -1.5081030130386353, "logps/chosen": -740.7789916992188, "logps/rejected": -852.57421875, "loss": 0.4798, "rewards/accuracies": 0.71875, "rewards/chosen": -2.757505416870117, "rewards/margins": 1.019026756286621, "rewards/rejected": -3.7765321731567383, "step": 1088 }, { "epoch": 0.7112417340191036, "grad_norm": 15.401125482802248, "learning_rate": 3.500966753987317e-08, "logits/chosen": -1.4526937007904053, "logits/rejected": -1.4075860977172852, "logps/chosen": -769.2919311523438, "logps/rejected": -886.1649169921875, "loss": 0.482, "rewards/accuracies": 0.84375, "rewards/chosen": -2.7922770977020264, "rewards/margins": 0.9756352305412292, "rewards/rejected": -3.7679123878479004, "step": 1089 }, { "epoch": 0.711894848559066, "grad_norm": 22.983567681134193, "learning_rate": 3.486501440409748e-08, "logits/chosen": -1.5759086608886719, "logits/rejected": -1.5538822412490845, "logps/chosen": -872.93896484375, "logps/rejected": -946.4339599609375, "loss": 0.4708, "rewards/accuracies": 0.75, "rewards/chosen": -3.2252018451690674, "rewards/margins": 0.8281301856040955, "rewards/rejected": -4.0533318519592285, "step": 1090 }, { "epoch": 0.7125479630990285, "grad_norm": 25.263230007411096, "learning_rate": 3.472057017618547e-08, "logits/chosen": -1.3890550136566162, "logits/rejected": -1.3870556354522705, "logps/chosen": -866.8782958984375, "logps/rejected": -1034.20458984375, "loss": 0.4849, "rewards/accuracies": 0.71875, "rewards/chosen": -3.3459925651550293, "rewards/margins": 1.1013741493225098, "rewards/rejected": -4.447366237640381, "step": 1091 }, { "epoch": 0.713201077638991, "grad_norm": 23.164092211586546, "learning_rate": 3.4576335607988294e-08, "logits/chosen": -1.556492567062378, "logits/rejected": -1.5247762203216553, "logps/chosen": -825.4620971679688, "logps/rejected": -909.668701171875, "loss": 0.452, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7882065773010254, "rewards/margins": 1.1564209461212158, "rewards/rejected": -3.944627523422241, "step": 1092 }, { "epoch": 0.7138541921789534, "grad_norm": 30.491984391282454, "learning_rate": 3.44323114502658e-08, "logits/chosen": -1.4933907985687256, "logits/rejected": -1.4707281589508057, "logps/chosen": -831.043701171875, "logps/rejected": -888.13037109375, "loss": 0.5366, "rewards/accuracies": 0.71875, "rewards/chosen": -3.5473837852478027, "rewards/margins": 0.7864856719970703, "rewards/rejected": -4.333869934082031, "step": 1093 }, { "epoch": 0.7145073067189158, "grad_norm": 26.466125427075724, "learning_rate": 3.42884984526826e-08, "logits/chosen": -1.4837186336517334, "logits/rejected": -1.4462792873382568, "logps/chosen": -932.162353515625, "logps/rejected": -1006.1118774414062, "loss": 0.5344, "rewards/accuracies": 0.8125, "rewards/chosen": -3.4717721939086914, "rewards/margins": 0.9058358073234558, "rewards/rejected": -4.377608299255371, "step": 1094 }, { "epoch": 0.7151604212588782, "grad_norm": 22.114835756620792, "learning_rate": 3.414489736380423e-08, "logits/chosen": -1.5490305423736572, "logits/rejected": -1.528594970703125, "logps/chosen": -765.060791015625, "logps/rejected": -864.4068603515625, "loss": 0.4142, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7475967407226562, "rewards/margins": 1.1565359830856323, "rewards/rejected": -3.904132843017578, "step": 1095 }, { "epoch": 0.7158135357988408, "grad_norm": 13.258696810534204, "learning_rate": 3.400150893109317e-08, "logits/chosen": -1.4212427139282227, "logits/rejected": -1.3869833946228027, "logps/chosen": -761.0309448242188, "logps/rejected": -820.0980224609375, "loss": 0.4784, "rewards/accuracies": 0.84375, "rewards/chosen": -3.5319325923919678, "rewards/margins": 0.7716701626777649, "rewards/rejected": -4.30360221862793, "step": 1096 }, { "epoch": 0.7164666503388032, "grad_norm": 153.0170543549745, "learning_rate": 3.385833390090502e-08, "logits/chosen": -1.4919387102127075, "logits/rejected": -1.4864999055862427, "logps/chosen": -876.7687377929688, "logps/rejected": -904.2010498046875, "loss": 0.544, "rewards/accuracies": 0.65625, "rewards/chosen": -3.105478525161743, "rewards/margins": 0.4873059093952179, "rewards/rejected": -3.5927841663360596, "step": 1097 }, { "epoch": 0.7171197648787656, "grad_norm": 22.73385039619432, "learning_rate": 3.3715373018484606e-08, "logits/chosen": -1.5606061220169067, "logits/rejected": -1.5803030729293823, "logps/chosen": -864.9635009765625, "logps/rejected": -979.0385131835938, "loss": 0.5134, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1329944133758545, "rewards/margins": 1.058984637260437, "rewards/rejected": -4.19197940826416, "step": 1098 }, { "epoch": 0.717772879418728, "grad_norm": 47.2766708425411, "learning_rate": 3.357262702796206e-08, "logits/chosen": -1.5417003631591797, "logits/rejected": -1.5057967901229858, "logps/chosen": -882.0269775390625, "logps/rejected": -921.403076171875, "loss": 0.4704, "rewards/accuracies": 0.71875, "rewards/chosen": -2.9814200401306152, "rewards/margins": 0.637610912322998, "rewards/rejected": -3.619030714035034, "step": 1099 }, { "epoch": 0.7184259939586906, "grad_norm": 17.039889063919627, "learning_rate": 3.343009667234898e-08, "logits/chosen": -1.4841748476028442, "logits/rejected": -1.5299304723739624, "logps/chosen": -809.2041625976562, "logps/rejected": -876.6251831054688, "loss": 0.4307, "rewards/accuracies": 0.90625, "rewards/chosen": -3.4067025184631348, "rewards/margins": 0.9841042757034302, "rewards/rejected": -4.390806674957275, "step": 1100 }, { "epoch": 0.7184259939586906, "eval_logits/chosen": -1.4911024570465088, "eval_logits/rejected": -1.4728410243988037, "eval_logps/chosen": -824.2158813476562, "eval_logps/rejected": -903.2113037109375, "eval_loss": 0.498925119638443, "eval_rewards/accuracies": 0.7609999775886536, "eval_rewards/chosen": -3.1387054920196533, "eval_rewards/margins": 0.8710131645202637, "eval_rewards/rejected": -4.009718418121338, "eval_runtime": 300.1149, "eval_samples_per_second": 13.328, "eval_steps_per_second": 0.833, "step": 1100 }, { "epoch": 0.719079108498653, "grad_norm": 16.11848033889973, "learning_rate": 3.3287782693534566e-08, "logits/chosen": -1.4810887575149536, "logits/rejected": -1.4817496538162231, "logps/chosen": -845.0581665039062, "logps/rejected": -940.9647827148438, "loss": 0.5442, "rewards/accuracies": 0.90625, "rewards/chosen": -3.3775806427001953, "rewards/margins": 0.7508167624473572, "rewards/rejected": -4.128397464752197, "step": 1101 }, { "epoch": 0.7197322230386154, "grad_norm": 68.65354889922948, "learning_rate": 3.3145685832281736e-08, "logits/chosen": -1.5042979717254639, "logits/rejected": -1.523160457611084, "logps/chosen": -804.5962524414062, "logps/rejected": -879.611572265625, "loss": 0.437, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8270790576934814, "rewards/margins": 0.7390732765197754, "rewards/rejected": -3.566152572631836, "step": 1102 }, { "epoch": 0.7203853375785778, "grad_norm": 73.57913188242289, "learning_rate": 3.30038068282233e-08, "logits/chosen": -1.4858269691467285, "logits/rejected": -1.440920352935791, "logps/chosen": -834.6658935546875, "logps/rejected": -878.7103271484375, "loss": 0.4838, "rewards/accuracies": 0.78125, "rewards/chosen": -3.6924118995666504, "rewards/margins": 0.7402853965759277, "rewards/rejected": -4.432697772979736, "step": 1103 }, { "epoch": 0.7210384521185403, "grad_norm": 13.062198655888952, "learning_rate": 3.286214641985807e-08, "logits/chosen": -1.5750617980957031, "logits/rejected": -1.491349458694458, "logps/chosen": -816.414306640625, "logps/rejected": -881.9208984375, "loss": 0.4839, "rewards/accuracies": 0.90625, "rewards/chosen": -3.108093738555908, "rewards/margins": 0.7648839950561523, "rewards/rejected": -3.8729777336120605, "step": 1104 }, { "epoch": 0.7216915666585028, "grad_norm": 14.08459741772447, "learning_rate": 3.272070534454708e-08, "logits/chosen": -1.5048151016235352, "logits/rejected": -1.4695175886154175, "logps/chosen": -810.2633056640625, "logps/rejected": -856.9027099609375, "loss": 0.4868, "rewards/accuracies": 0.84375, "rewards/chosen": -2.639930486679077, "rewards/margins": 0.8205062747001648, "rewards/rejected": -3.4604365825653076, "step": 1105 }, { "epoch": 0.7223446811984652, "grad_norm": 9.792790670749572, "learning_rate": 3.2579484338509616e-08, "logits/chosen": -1.441813349723816, "logits/rejected": -1.4555586576461792, "logps/chosen": -759.58154296875, "logps/rejected": -834.8800659179688, "loss": 0.4439, "rewards/accuracies": 0.71875, "rewards/chosen": -3.1834728717803955, "rewards/margins": 0.8506563901901245, "rewards/rejected": -4.0341291427612305, "step": 1106 }, { "epoch": 0.7229977957384276, "grad_norm": 13.42092316503716, "learning_rate": 3.2438484136819575e-08, "logits/chosen": -1.5256390571594238, "logits/rejected": -1.5091501474380493, "logps/chosen": -882.526123046875, "logps/rejected": -992.293212890625, "loss": 0.4902, "rewards/accuracies": 0.71875, "rewards/chosen": -3.3528499603271484, "rewards/margins": 1.2070109844207764, "rewards/rejected": -4.559861183166504, "step": 1107 }, { "epoch": 0.7236509102783901, "grad_norm": 36.798480052905624, "learning_rate": 3.22977054734015e-08, "logits/chosen": -1.612870693206787, "logits/rejected": -1.6026808023452759, "logps/chosen": -824.7280883789062, "logps/rejected": -989.9205322265625, "loss": 0.5238, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1623008251190186, "rewards/margins": 1.1107250452041626, "rewards/rejected": -4.273025989532471, "step": 1108 }, { "epoch": 0.7243040248183525, "grad_norm": 15.657306421690263, "learning_rate": 3.215714908102678e-08, "logits/chosen": -1.4942344427108765, "logits/rejected": -1.447746753692627, "logps/chosen": -790.576904296875, "logps/rejected": -844.633056640625, "loss": 0.5197, "rewards/accuracies": 0.75, "rewards/chosen": -3.0599842071533203, "rewards/margins": 0.9251794815063477, "rewards/rejected": -3.985163450241089, "step": 1109 }, { "epoch": 0.724957139358315, "grad_norm": 10.52893767352228, "learning_rate": 3.201681569130988e-08, "logits/chosen": -1.4922351837158203, "logits/rejected": -1.4611746072769165, "logps/chosen": -874.3091430664062, "logps/rejected": -943.7155151367188, "loss": 0.4354, "rewards/accuracies": 0.875, "rewards/chosen": -3.2651565074920654, "rewards/margins": 0.8614932894706726, "rewards/rejected": -4.126649379730225, "step": 1110 }, { "epoch": 0.7256102538982774, "grad_norm": 54.5450174448529, "learning_rate": 3.187670603470451e-08, "logits/chosen": -1.474999189376831, "logits/rejected": -1.5109246969223022, "logps/chosen": -883.6320190429688, "logps/rejected": -954.03369140625, "loss": 0.4765, "rewards/accuracies": 0.84375, "rewards/chosen": -3.4104151725769043, "rewards/margins": 0.9843173027038574, "rewards/rejected": -4.394732475280762, "step": 1111 }, { "epoch": 0.7262633684382399, "grad_norm": 59.388463871448565, "learning_rate": 3.173682084049979e-08, "logits/chosen": -1.4505696296691895, "logits/rejected": -1.458186388015747, "logps/chosen": -664.27587890625, "logps/rejected": -750.30078125, "loss": 0.4635, "rewards/accuracies": 0.78125, "rewards/chosen": -2.536130428314209, "rewards/margins": 0.9553031921386719, "rewards/rejected": -3.49143385887146, "step": 1112 }, { "epoch": 0.7269164829782023, "grad_norm": 137.01606102320702, "learning_rate": 3.159716083681652e-08, "logits/chosen": -1.6230467557907104, "logits/rejected": -1.5941567420959473, "logps/chosen": -926.1805419921875, "logps/rejected": -998.4340209960938, "loss": 0.5433, "rewards/accuracies": 0.90625, "rewards/chosen": -3.0903451442718506, "rewards/margins": 1.0845158100128174, "rewards/rejected": -4.174860954284668, "step": 1113 }, { "epoch": 0.7275695975181647, "grad_norm": 29.04184199104772, "learning_rate": 3.1457726750603317e-08, "logits/chosen": -1.512034296989441, "logits/rejected": -1.47452712059021, "logps/chosen": -846.271728515625, "logps/rejected": -941.9117431640625, "loss": 0.4757, "rewards/accuracies": 0.875, "rewards/chosen": -3.4414303302764893, "rewards/margins": 1.0037970542907715, "rewards/rejected": -4.44522762298584, "step": 1114 }, { "epoch": 0.7282227120581272, "grad_norm": 67.67268164595161, "learning_rate": 3.131851930763289e-08, "logits/chosen": -1.51804518699646, "logits/rejected": -1.497375726699829, "logps/chosen": -890.821533203125, "logps/rejected": -1002.13916015625, "loss": 0.4851, "rewards/accuracies": 0.65625, "rewards/chosen": -3.418884515762329, "rewards/margins": 0.7488983869552612, "rewards/rejected": -4.167783260345459, "step": 1115 }, { "epoch": 0.7288758265980897, "grad_norm": 20.304379891765954, "learning_rate": 3.1179539232498276e-08, "logits/chosen": -1.4011988639831543, "logits/rejected": -1.400657296180725, "logps/chosen": -691.755615234375, "logps/rejected": -767.6619873046875, "loss": 0.5219, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7513551712036133, "rewards/margins": 0.778099775314331, "rewards/rejected": -3.5294547080993652, "step": 1116 }, { "epoch": 0.7295289411380521, "grad_norm": 11.565056384039993, "learning_rate": 3.104078724860892e-08, "logits/chosen": -1.6112922430038452, "logits/rejected": -1.5689594745635986, "logps/chosen": -833.8433837890625, "logps/rejected": -893.80615234375, "loss": 0.514, "rewards/accuracies": 0.75, "rewards/chosen": -3.4891977310180664, "rewards/margins": 0.7067195177078247, "rewards/rejected": -4.195917129516602, "step": 1117 }, { "epoch": 0.7301820556780145, "grad_norm": 40.26811950012431, "learning_rate": 3.090226407818714e-08, "logits/chosen": -1.5029704570770264, "logits/rejected": -1.4893207550048828, "logps/chosen": -858.901611328125, "logps/rejected": -917.4168090820312, "loss": 0.5033, "rewards/accuracies": 0.75, "rewards/chosen": -3.3486008644104004, "rewards/margins": 0.908878743648529, "rewards/rejected": -4.257479667663574, "step": 1118 }, { "epoch": 0.730835170217977, "grad_norm": 16.207319176406262, "learning_rate": 3.07639704422642e-08, "logits/chosen": -1.4990010261535645, "logits/rejected": -1.4996980428695679, "logps/chosen": -875.76904296875, "logps/rejected": -1025.61767578125, "loss": 0.5226, "rewards/accuracies": 0.75, "rewards/chosen": -3.398207902908325, "rewards/margins": 1.0922547578811646, "rewards/rejected": -4.490462303161621, "step": 1119 }, { "epoch": 0.7314882847579395, "grad_norm": 33.13246656200166, "learning_rate": 3.06259070606766e-08, "logits/chosen": -1.4612716436386108, "logits/rejected": -1.4551949501037598, "logps/chosen": -897.0416259765625, "logps/rejected": -981.119873046875, "loss": 0.491, "rewards/accuracies": 0.6875, "rewards/chosen": -3.664212465286255, "rewards/margins": 0.6127184629440308, "rewards/rejected": -4.276930809020996, "step": 1120 }, { "epoch": 0.7321413992979019, "grad_norm": 27.787496208253557, "learning_rate": 3.048807465206237e-08, "logits/chosen": -1.4162070751190186, "logits/rejected": -1.4611150026321411, "logps/chosen": -788.558349609375, "logps/rejected": -879.7010498046875, "loss": 0.5043, "rewards/accuracies": 0.8125, "rewards/chosen": -3.006227493286133, "rewards/margins": 1.1710853576660156, "rewards/rejected": -4.177312850952148, "step": 1121 }, { "epoch": 0.7327945138378643, "grad_norm": 31.88982746049559, "learning_rate": 3.035047393385725e-08, "logits/chosen": -1.5180845260620117, "logits/rejected": -1.5133084058761597, "logps/chosen": -882.9698486328125, "logps/rejected": -901.154052734375, "loss": 0.5017, "rewards/accuracies": 0.6875, "rewards/chosen": -3.381559371948242, "rewards/margins": 0.6179601550102234, "rewards/rejected": -3.9995198249816895, "step": 1122 }, { "epoch": 0.7334476283778267, "grad_norm": 14.275386018313844, "learning_rate": 3.021310562229105e-08, "logits/chosen": -1.424015998840332, "logits/rejected": -1.391045331954956, "logps/chosen": -808.16796875, "logps/rejected": -890.8568115234375, "loss": 0.4597, "rewards/accuracies": 0.8125, "rewards/chosen": -3.398470163345337, "rewards/margins": 0.7546467781066895, "rewards/rejected": -4.1531171798706055, "step": 1123 }, { "epoch": 0.7341007429177893, "grad_norm": 31.589588156255736, "learning_rate": 3.0075970432383824e-08, "logits/chosen": -1.4777663946151733, "logits/rejected": -1.4628345966339111, "logps/chosen": -788.1425170898438, "logps/rejected": -1018.20751953125, "loss": 0.4613, "rewards/accuracies": 0.90625, "rewards/chosen": -2.831998348236084, "rewards/margins": 1.624790906906128, "rewards/rejected": -4.456789016723633, "step": 1124 }, { "epoch": 0.7347538574577517, "grad_norm": 23.163673831394075, "learning_rate": 2.993906907794223e-08, "logits/chosen": -1.4283198118209839, "logits/rejected": -1.4135866165161133, "logps/chosen": -848.6114501953125, "logps/rejected": -875.2060546875, "loss": 0.4822, "rewards/accuracies": 0.6875, "rewards/chosen": -3.065795660018921, "rewards/margins": 0.7749052047729492, "rewards/rejected": -3.84070086479187, "step": 1125 }, { "epoch": 0.7354069719977141, "grad_norm": 66.36390765427439, "learning_rate": 2.980240227155578e-08, "logits/chosen": -1.4410760402679443, "logits/rejected": -1.4437801837921143, "logps/chosen": -856.7933959960938, "logps/rejected": -962.9047241210938, "loss": 0.5124, "rewards/accuracies": 0.75, "rewards/chosen": -3.395413875579834, "rewards/margins": 0.9301176071166992, "rewards/rejected": -4.325531959533691, "step": 1126 }, { "epoch": 0.7360600865376765, "grad_norm": 81.45798747264044, "learning_rate": 2.9665970724593113e-08, "logits/chosen": -1.5262151956558228, "logits/rejected": -1.4462474584579468, "logps/chosen": -931.7483520507812, "logps/rejected": -1020.103271484375, "loss": 0.515, "rewards/accuracies": 0.84375, "rewards/chosen": -3.550004720687866, "rewards/margins": 1.2063486576080322, "rewards/rejected": -4.756353378295898, "step": 1127 }, { "epoch": 0.736713201077639, "grad_norm": 35.012242513097036, "learning_rate": 2.9529775147198323e-08, "logits/chosen": -1.5010284185409546, "logits/rejected": -1.511681079864502, "logps/chosen": -828.48583984375, "logps/rejected": -907.3237915039062, "loss": 0.4117, "rewards/accuracies": 0.90625, "rewards/chosen": -3.0704474449157715, "rewards/margins": 1.052125096321106, "rewards/rejected": -4.122572898864746, "step": 1128 }, { "epoch": 0.7373663156176015, "grad_norm": 12.390653146331033, "learning_rate": 2.9393816248287257e-08, "logits/chosen": -1.4855061769485474, "logits/rejected": -1.5361738204956055, "logps/chosen": -775.18359375, "logps/rejected": -859.5416259765625, "loss": 0.4782, "rewards/accuracies": 0.8125, "rewards/chosen": -2.886140823364258, "rewards/margins": 0.9194518327713013, "rewards/rejected": -3.8055927753448486, "step": 1129 }, { "epoch": 0.7380194301575639, "grad_norm": 41.60514101169407, "learning_rate": 2.925809473554382e-08, "logits/chosen": -1.4579628705978394, "logits/rejected": -1.4414145946502686, "logps/chosen": -883.7291870117188, "logps/rejected": -980.4385375976562, "loss": 0.505, "rewards/accuracies": 0.78125, "rewards/chosen": -3.3500232696533203, "rewards/margins": 0.7552958726882935, "rewards/rejected": -4.105319023132324, "step": 1130 }, { "epoch": 0.7386725446975263, "grad_norm": 23.290126695482954, "learning_rate": 2.9122611315416283e-08, "logits/chosen": -1.5425716638565063, "logits/rejected": -1.511645793914795, "logps/chosen": -785.6270141601562, "logps/rejected": -796.7711181640625, "loss": 0.4878, "rewards/accuracies": 0.6875, "rewards/chosen": -2.8701109886169434, "rewards/margins": 0.560005784034729, "rewards/rejected": -3.430117130279541, "step": 1131 }, { "epoch": 0.7393256592374887, "grad_norm": 14.618557708069593, "learning_rate": 2.898736669311361e-08, "logits/chosen": -1.5095089673995972, "logits/rejected": -1.424134373664856, "logps/chosen": -893.7913818359375, "logps/rejected": -921.09912109375, "loss": 0.5096, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3666210174560547, "rewards/margins": 1.207306981086731, "rewards/rejected": -4.573927879333496, "step": 1132 }, { "epoch": 0.7399787737774512, "grad_norm": 25.284769898933742, "learning_rate": 2.8852361572601798e-08, "logits/chosen": -1.4017362594604492, "logits/rejected": -1.4303860664367676, "logps/chosen": -785.92041015625, "logps/rejected": -907.3682861328125, "loss": 0.4776, "rewards/accuracies": 0.71875, "rewards/chosen": -3.0821828842163086, "rewards/margins": 1.0643155574798584, "rewards/rejected": -4.146498680114746, "step": 1133 }, { "epoch": 0.7406318883174137, "grad_norm": 83.88894947610976, "learning_rate": 2.8717596656600207e-08, "logits/chosen": -1.4224882125854492, "logits/rejected": -1.4386307001113892, "logps/chosen": -819.8443603515625, "logps/rejected": -834.15234375, "loss": 0.6099, "rewards/accuracies": 0.75, "rewards/chosen": -3.7083494663238525, "rewards/margins": 0.4391818642616272, "rewards/rejected": -4.147531509399414, "step": 1134 }, { "epoch": 0.7412850028573761, "grad_norm": 86.50212048050254, "learning_rate": 2.8583072646577904e-08, "logits/chosen": -1.5508708953857422, "logits/rejected": -1.5168962478637695, "logps/chosen": -878.55712890625, "logps/rejected": -941.9532470703125, "loss": 0.4686, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2938575744628906, "rewards/margins": 0.9149394631385803, "rewards/rejected": -4.208796977996826, "step": 1135 }, { "epoch": 0.7419381173973385, "grad_norm": 46.81398792643234, "learning_rate": 2.8448790242750002e-08, "logits/chosen": -1.4926443099975586, "logits/rejected": -1.4884159564971924, "logps/chosen": -827.5214233398438, "logps/rejected": -890.3485107421875, "loss": 0.5022, "rewards/accuracies": 0.84375, "rewards/chosen": -3.1017537117004395, "rewards/margins": 0.6141247749328613, "rewards/rejected": -3.715878486633301, "step": 1136 }, { "epoch": 0.742591231937301, "grad_norm": 73.85737704941202, "learning_rate": 2.831475014407402e-08, "logits/chosen": -1.5685663223266602, "logits/rejected": -1.5168462991714478, "logps/chosen": -825.589599609375, "logps/rejected": -906.937744140625, "loss": 0.5298, "rewards/accuracies": 0.8125, "rewards/chosen": -3.318887948989868, "rewards/margins": 1.1284462213516235, "rewards/rejected": -4.447334289550781, "step": 1137 }, { "epoch": 0.7432443464772635, "grad_norm": 86.46807131648339, "learning_rate": 2.8180953048246247e-08, "logits/chosen": -1.4860353469848633, "logits/rejected": -1.468380093574524, "logps/chosen": -782.5503540039062, "logps/rejected": -827.2178344726562, "loss": 0.4897, "rewards/accuracies": 0.75, "rewards/chosen": -2.859600067138672, "rewards/margins": 0.6450478434562683, "rewards/rejected": -3.5046474933624268, "step": 1138 }, { "epoch": 0.7438974610172259, "grad_norm": 84.46407466008054, "learning_rate": 2.8047399651698154e-08, "logits/chosen": -1.4216095209121704, "logits/rejected": -1.4595965147018433, "logps/chosen": -844.6117553710938, "logps/rejected": -825.7047729492188, "loss": 0.4965, "rewards/accuracies": 0.65625, "rewards/chosen": -3.3110671043395996, "rewards/margins": 0.38550424575805664, "rewards/rejected": -3.6965713500976562, "step": 1139 }, { "epoch": 0.7445505755571883, "grad_norm": 55.02374788341525, "learning_rate": 2.791409064959262e-08, "logits/chosen": -1.4333022832870483, "logits/rejected": -1.3619767427444458, "logps/chosen": -795.4210205078125, "logps/rejected": -872.92333984375, "loss": 0.4726, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1887168884277344, "rewards/margins": 0.8975101113319397, "rewards/rejected": -4.086226940155029, "step": 1140 }, { "epoch": 0.7452036900971508, "grad_norm": 87.71895948569228, "learning_rate": 2.7781026735820516e-08, "logits/chosen": -1.5040805339813232, "logits/rejected": -1.47743821144104, "logps/chosen": -840.98583984375, "logps/rejected": -963.7841796875, "loss": 0.4646, "rewards/accuracies": 0.78125, "rewards/chosen": -3.2599844932556152, "rewards/margins": 1.1233594417572021, "rewards/rejected": -4.3833441734313965, "step": 1141 }, { "epoch": 0.7458568046371132, "grad_norm": 19.67396573430194, "learning_rate": 2.7648208602996965e-08, "logits/chosen": -1.5113935470581055, "logits/rejected": -1.4451313018798828, "logps/chosen": -839.17578125, "logps/rejected": -976.3116455078125, "loss": 0.4397, "rewards/accuracies": 0.8125, "rewards/chosen": -3.220781087875366, "rewards/margins": 1.0840184688568115, "rewards/rejected": -4.304799556732178, "step": 1142 }, { "epoch": 0.7465099191770757, "grad_norm": 84.55591225773897, "learning_rate": 2.751563694245776e-08, "logits/chosen": -1.480290412902832, "logits/rejected": -1.5081285238265991, "logps/chosen": -809.2010498046875, "logps/rejected": -877.5315551757812, "loss": 0.4513, "rewards/accuracies": 0.84375, "rewards/chosen": -2.9877707958221436, "rewards/margins": 0.8733527660369873, "rewards/rejected": -3.8611233234405518, "step": 1143 }, { "epoch": 0.7471630337170381, "grad_norm": 95.02644629261052, "learning_rate": 2.7383312444255792e-08, "logits/chosen": -1.5553948879241943, "logits/rejected": -1.5468087196350098, "logps/chosen": -876.2579345703125, "logps/rejected": -1008.3162231445312, "loss": 0.4952, "rewards/accuracies": 0.90625, "rewards/chosen": -3.5550930500030518, "rewards/margins": 1.053817629814148, "rewards/rejected": -4.60891056060791, "step": 1144 }, { "epoch": 0.7478161482570006, "grad_norm": 81.78452279244114, "learning_rate": 2.7251235797157426e-08, "logits/chosen": -1.5420262813568115, "logits/rejected": -1.5177980661392212, "logps/chosen": -839.2437133789062, "logps/rejected": -872.8123779296875, "loss": 0.5823, "rewards/accuracies": 0.71875, "rewards/chosen": -3.291133403778076, "rewards/margins": 0.3555396795272827, "rewards/rejected": -3.6466729640960693, "step": 1145 }, { "epoch": 0.748469262796963, "grad_norm": 18.413788007685703, "learning_rate": 2.7119407688638925e-08, "logits/chosen": -1.4686810970306396, "logits/rejected": -1.434198021888733, "logps/chosen": -887.4147338867188, "logps/rejected": -914.2786865234375, "loss": 0.5467, "rewards/accuracies": 0.84375, "rewards/chosen": -3.379523754119873, "rewards/margins": 0.6430516839027405, "rewards/rejected": -4.022575378417969, "step": 1146 }, { "epoch": 0.7491223773369254, "grad_norm": 25.04209084197724, "learning_rate": 2.6987828804882885e-08, "logits/chosen": -1.473648190498352, "logits/rejected": -1.4673340320587158, "logps/chosen": -755.6442260742188, "logps/rejected": -806.13818359375, "loss": 0.5137, "rewards/accuracies": 0.6875, "rewards/chosen": -3.124025344848633, "rewards/margins": 0.8462156653404236, "rewards/rejected": -3.970241069793701, "step": 1147 }, { "epoch": 0.7497754918768879, "grad_norm": 17.831323248171984, "learning_rate": 2.6856499830774655e-08, "logits/chosen": -1.5108214616775513, "logits/rejected": -1.4946174621582031, "logps/chosen": -833.5274658203125, "logps/rejected": -902.3148193359375, "loss": 0.533, "rewards/accuracies": 0.78125, "rewards/chosen": -3.2335917949676514, "rewards/margins": 1.1803102493286133, "rewards/rejected": -4.413901329040527, "step": 1148 }, { "epoch": 0.7504286064168504, "grad_norm": 55.940688397054416, "learning_rate": 2.6725421449898775e-08, "logits/chosen": -1.5316839218139648, "logits/rejected": -1.4671372175216675, "logps/chosen": -804.1160278320312, "logps/rejected": -807.0818481445312, "loss": 0.4974, "rewards/accuracies": 0.625, "rewards/chosen": -3.2132842540740967, "rewards/margins": 0.4933798611164093, "rewards/rejected": -3.7066640853881836, "step": 1149 }, { "epoch": 0.7510817209568128, "grad_norm": 12.040478011711151, "learning_rate": 2.6594594344535416e-08, "logits/chosen": -1.5430724620819092, "logits/rejected": -1.4982068538665771, "logps/chosen": -903.4131469726562, "logps/rejected": -982.6273193359375, "loss": 0.494, "rewards/accuracies": 0.78125, "rewards/chosen": -3.6371841430664062, "rewards/margins": 0.9791971445083618, "rewards/rejected": -4.616381645202637, "step": 1150 }, { "epoch": 0.7517348354967752, "grad_norm": 17.536012470211805, "learning_rate": 2.646401919565679e-08, "logits/chosen": -1.5547937154769897, "logits/rejected": -1.578386664390564, "logps/chosen": -779.9551391601562, "logps/rejected": -914.7993774414062, "loss": 0.4548, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0140175819396973, "rewards/margins": 1.036094307899475, "rewards/rejected": -4.050111770629883, "step": 1151 }, { "epoch": 0.7523879500367376, "grad_norm": 13.483448697030374, "learning_rate": 2.6333696682923677e-08, "logits/chosen": -1.478268027305603, "logits/rejected": -1.4786677360534668, "logps/chosen": -848.6054077148438, "logps/rejected": -985.2492065429688, "loss": 0.3929, "rewards/accuracies": 0.8125, "rewards/chosen": -3.365396499633789, "rewards/margins": 1.4922524690628052, "rewards/rejected": -4.857649326324463, "step": 1152 }, { "epoch": 0.7530410645767002, "grad_norm": 17.054706803705002, "learning_rate": 2.6203627484681862e-08, "logits/chosen": -1.5397868156433105, "logits/rejected": -1.4064992666244507, "logps/chosen": -779.4160766601562, "logps/rejected": -861.3985595703125, "loss": 0.5078, "rewards/accuracies": 0.84375, "rewards/chosen": -3.2905077934265137, "rewards/margins": 1.0052298307418823, "rewards/rejected": -4.295737266540527, "step": 1153 }, { "epoch": 0.7536941791166626, "grad_norm": 32.79446433639373, "learning_rate": 2.6073812277958565e-08, "logits/chosen": -1.4748960733413696, "logits/rejected": -1.4349013566970825, "logps/chosen": -851.8651123046875, "logps/rejected": -946.66357421875, "loss": 0.4066, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2334001064300537, "rewards/margins": 0.994164228439331, "rewards/rejected": -4.227564334869385, "step": 1154 }, { "epoch": 0.754347293656625, "grad_norm": 67.74543674863597, "learning_rate": 2.5944251738458985e-08, "logits/chosen": -1.4562938213348389, "logits/rejected": -1.4436331987380981, "logps/chosen": -778.5364990234375, "logps/rejected": -855.3143920898438, "loss": 0.4975, "rewards/accuracies": 0.8125, "rewards/chosen": -3.058043956756592, "rewards/margins": 0.7909223437309265, "rewards/rejected": -3.848966360092163, "step": 1155 }, { "epoch": 0.7550004081965874, "grad_norm": 120.90210970935338, "learning_rate": 2.58149465405627e-08, "logits/chosen": -1.5155216455459595, "logits/rejected": -1.5171725749969482, "logps/chosen": -874.121826171875, "logps/rejected": -1065.99609375, "loss": 0.5047, "rewards/accuracies": 0.875, "rewards/chosen": -3.261012554168701, "rewards/margins": 1.1070443391799927, "rewards/rejected": -4.368056774139404, "step": 1156 }, { "epoch": 0.75565352273655, "grad_norm": 22.64305491156398, "learning_rate": 2.5685897357320236e-08, "logits/chosen": -1.5339815616607666, "logits/rejected": -1.534791350364685, "logps/chosen": -835.3990478515625, "logps/rejected": -900.058349609375, "loss": 0.4601, "rewards/accuracies": 0.75, "rewards/chosen": -3.2667551040649414, "rewards/margins": 0.9697099328041077, "rewards/rejected": -4.236464977264404, "step": 1157 }, { "epoch": 0.7563066372765124, "grad_norm": 12.582513483094319, "learning_rate": 2.555710486044951e-08, "logits/chosen": -1.5386102199554443, "logits/rejected": -1.504981279373169, "logps/chosen": -791.6339721679688, "logps/rejected": -953.6394653320312, "loss": 0.4794, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0311837196350098, "rewards/margins": 1.1218692064285278, "rewards/rejected": -4.153052806854248, "step": 1158 }, { "epoch": 0.7569597518164748, "grad_norm": 12.043718502977505, "learning_rate": 2.542856972033237e-08, "logits/chosen": -1.5250686407089233, "logits/rejected": -1.502375602722168, "logps/chosen": -846.478515625, "logps/rejected": -939.0611572265625, "loss": 0.4625, "rewards/accuracies": 0.90625, "rewards/chosen": -3.351508378982544, "rewards/margins": 1.0537912845611572, "rewards/rejected": -4.405299186706543, "step": 1159 }, { "epoch": 0.7576128663564372, "grad_norm": 38.38806781229567, "learning_rate": 2.5300292606011058e-08, "logits/chosen": -1.334637999534607, "logits/rejected": -1.3652501106262207, "logps/chosen": -769.8375854492188, "logps/rejected": -865.3778076171875, "loss": 0.4443, "rewards/accuracies": 0.84375, "rewards/chosen": -2.9790191650390625, "rewards/margins": 0.9127359986305237, "rewards/rejected": -3.8917548656463623, "step": 1160 }, { "epoch": 0.7582659808963997, "grad_norm": 67.81176562059466, "learning_rate": 2.5172274185184795e-08, "logits/chosen": -1.5681705474853516, "logits/rejected": -1.5657100677490234, "logps/chosen": -860.4470825195312, "logps/rejected": -908.8406982421875, "loss": 0.5013, "rewards/accuracies": 0.8125, "rewards/chosen": -3.02535080909729, "rewards/margins": 0.7398912906646729, "rewards/rejected": -3.765242099761963, "step": 1161 }, { "epoch": 0.7589190954363622, "grad_norm": 18.366197171597655, "learning_rate": 2.504451512420624e-08, "logits/chosen": -1.439697265625, "logits/rejected": -1.4873735904693604, "logps/chosen": -856.3161010742188, "logps/rejected": -1009.01708984375, "loss": 0.4974, "rewards/accuracies": 0.78125, "rewards/chosen": -3.3951454162597656, "rewards/margins": 1.1207799911499023, "rewards/rejected": -4.515925407409668, "step": 1162 }, { "epoch": 0.7595722099763246, "grad_norm": 14.372222255787399, "learning_rate": 2.491701608807807e-08, "logits/chosen": -1.467217206954956, "logits/rejected": -1.4398670196533203, "logps/chosen": -864.7401123046875, "logps/rejected": -916.0717163085938, "loss": 0.5183, "rewards/accuracies": 0.90625, "rewards/chosen": -3.4642720222473145, "rewards/margins": 0.79485023021698, "rewards/rejected": -4.259122848510742, "step": 1163 }, { "epoch": 0.760225324516287, "grad_norm": 51.64803091036702, "learning_rate": 2.478977774044948e-08, "logits/chosen": -1.4573464393615723, "logits/rejected": -1.450842261314392, "logps/chosen": -743.4122314453125, "logps/rejected": -863.0440063476562, "loss": 0.4459, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0900349617004395, "rewards/margins": 0.9152263402938843, "rewards/rejected": -4.005261421203613, "step": 1164 }, { "epoch": 0.7608784390562495, "grad_norm": 22.157747485258298, "learning_rate": 2.466280074361277e-08, "logits/chosen": -1.5302906036376953, "logits/rejected": -1.5034409761428833, "logps/chosen": -872.1870727539062, "logps/rejected": -945.05859375, "loss": 0.5473, "rewards/accuracies": 0.71875, "rewards/chosen": -4.0566864013671875, "rewards/margins": 0.5899438858032227, "rewards/rejected": -4.646629810333252, "step": 1165 }, { "epoch": 0.7615315535962119, "grad_norm": 61.748046100373145, "learning_rate": 2.4536085758499845e-08, "logits/chosen": -1.4410693645477295, "logits/rejected": -1.4090842008590698, "logps/chosen": -860.2272338867188, "logps/rejected": -942.9466552734375, "loss": 0.5124, "rewards/accuracies": 0.84375, "rewards/chosen": -3.1852986812591553, "rewards/margins": 1.0137269496917725, "rewards/rejected": -4.199025630950928, "step": 1166 }, { "epoch": 0.7621846681361744, "grad_norm": 111.18614790178633, "learning_rate": 2.4409633444678828e-08, "logits/chosen": -1.5321191549301147, "logits/rejected": -1.5227891206741333, "logps/chosen": -827.511474609375, "logps/rejected": -978.70458984375, "loss": 0.493, "rewards/accuracies": 0.875, "rewards/chosen": -3.226242780685425, "rewards/margins": 1.047170877456665, "rewards/rejected": -4.27341365814209, "step": 1167 }, { "epoch": 0.7628377826761368, "grad_norm": 39.173609596636474, "learning_rate": 2.42834444603506e-08, "logits/chosen": -1.4213345050811768, "logits/rejected": -1.4618169069290161, "logps/chosen": -748.9151000976562, "logps/rejected": -865.19775390625, "loss": 0.5424, "rewards/accuracies": 0.875, "rewards/chosen": -3.1750364303588867, "rewards/margins": 0.8123236298561096, "rewards/rejected": -3.9873600006103516, "step": 1168 }, { "epoch": 0.7634908972160993, "grad_norm": 100.75678767672636, "learning_rate": 2.4157519462345373e-08, "logits/chosen": -1.494844675064087, "logits/rejected": -1.3755334615707397, "logps/chosen": -878.364990234375, "logps/rejected": -937.23193359375, "loss": 0.507, "rewards/accuracies": 0.6875, "rewards/chosen": -3.4742939472198486, "rewards/margins": 0.7716997861862183, "rewards/rejected": -4.245993614196777, "step": 1169 }, { "epoch": 0.7641440117560617, "grad_norm": 18.303452723748897, "learning_rate": 2.4031859106119267e-08, "logits/chosen": -1.4939923286437988, "logits/rejected": -1.4547199010849, "logps/chosen": -811.0922241210938, "logps/rejected": -918.572021484375, "loss": 0.5138, "rewards/accuracies": 0.875, "rewards/chosen": -3.3449172973632812, "rewards/margins": 1.024104356765747, "rewards/rejected": -4.369021415710449, "step": 1170 }, { "epoch": 0.7647971262960241, "grad_norm": 67.44858894155142, "learning_rate": 2.3906464045750928e-08, "logits/chosen": -1.5134905576705933, "logits/rejected": -1.4933390617370605, "logps/chosen": -844.356689453125, "logps/rejected": -925.1680908203125, "loss": 0.4776, "rewards/accuracies": 0.8125, "rewards/chosen": -3.6389784812927246, "rewards/margins": 0.7109842300415039, "rewards/rejected": -4.34996223449707, "step": 1171 }, { "epoch": 0.7654502408359866, "grad_norm": 60.64995050523796, "learning_rate": 2.3781334933938094e-08, "logits/chosen": -1.5297952890396118, "logits/rejected": -1.520561695098877, "logps/chosen": -983.57958984375, "logps/rejected": -1131.6011962890625, "loss": 0.4451, "rewards/accuracies": 0.78125, "rewards/chosen": -3.437025785446167, "rewards/margins": 1.1520684957504272, "rewards/rejected": -4.589094161987305, "step": 1172 }, { "epoch": 0.7661033553759491, "grad_norm": 61.990214860015534, "learning_rate": 2.3656472421994215e-08, "logits/chosen": -1.447062373161316, "logits/rejected": -1.447553038597107, "logps/chosen": -779.4403076171875, "logps/rejected": -957.9571533203125, "loss": 0.5032, "rewards/accuracies": 0.71875, "rewards/chosen": -3.1557390689849854, "rewards/margins": 1.0444045066833496, "rewards/rejected": -4.200143814086914, "step": 1173 }, { "epoch": 0.7667564699159115, "grad_norm": 16.149576622820327, "learning_rate": 2.3531877159844986e-08, "logits/chosen": -1.5438363552093506, "logits/rejected": -1.5356152057647705, "logps/chosen": -883.9613647460938, "logps/rejected": -928.40087890625, "loss": 0.5555, "rewards/accuracies": 0.71875, "rewards/chosen": -3.240826368331909, "rewards/margins": 0.7265506386756897, "rewards/rejected": -3.967376947402954, "step": 1174 }, { "epoch": 0.7674095844558739, "grad_norm": 12.855340027433474, "learning_rate": 2.3407549796025138e-08, "logits/chosen": -1.444447636604309, "logits/rejected": -1.4474413394927979, "logps/chosen": -902.9122924804688, "logps/rejected": -1022.8636474609375, "loss": 0.4878, "rewards/accuracies": 0.8125, "rewards/chosen": -3.586104393005371, "rewards/margins": 1.1032967567443848, "rewards/rejected": -4.689400672912598, "step": 1175 }, { "epoch": 0.7680626989958363, "grad_norm": 44.85596625562831, "learning_rate": 2.3283490977674887e-08, "logits/chosen": -1.47477126121521, "logits/rejected": -1.473871111869812, "logps/chosen": -713.3693237304688, "logps/rejected": -823.8683471679688, "loss": 0.4626, "rewards/accuracies": 0.75, "rewards/chosen": -3.159616708755493, "rewards/margins": 0.8683343529701233, "rewards/rejected": -4.027950763702393, "step": 1176 }, { "epoch": 0.7687158135357989, "grad_norm": 10.37144025860678, "learning_rate": 2.3159701350536645e-08, "logits/chosen": -1.4041333198547363, "logits/rejected": -1.3923746347427368, "logps/chosen": -832.6809692382812, "logps/rejected": -847.46923828125, "loss": 0.4712, "rewards/accuracies": 0.71875, "rewards/chosen": -3.0015857219696045, "rewards/margins": 0.7252290844917297, "rewards/rejected": -3.7268147468566895, "step": 1177 }, { "epoch": 0.7693689280757613, "grad_norm": 14.75625387529424, "learning_rate": 2.3036181558951672e-08, "logits/chosen": -1.5356097221374512, "logits/rejected": -1.5658423900604248, "logps/chosen": -829.6064453125, "logps/rejected": -899.4547729492188, "loss": 0.4993, "rewards/accuracies": 0.8125, "rewards/chosen": -2.934075117111206, "rewards/margins": 1.1105401515960693, "rewards/rejected": -4.044615268707275, "step": 1178 }, { "epoch": 0.7700220426157237, "grad_norm": 11.129241460176539, "learning_rate": 2.2912932245856683e-08, "logits/chosen": -1.5327390432357788, "logits/rejected": -1.4865920543670654, "logps/chosen": -914.9544067382812, "logps/rejected": -1039.690185546875, "loss": 0.431, "rewards/accuracies": 0.90625, "rewards/chosen": -3.7165744304656982, "rewards/margins": 1.1967370510101318, "rewards/rejected": -4.913311958312988, "step": 1179 }, { "epoch": 0.7706751571556861, "grad_norm": 27.071377513200186, "learning_rate": 2.278995405278051e-08, "logits/chosen": -1.490029215812683, "logits/rejected": -1.4955387115478516, "logps/chosen": -788.3419799804688, "logps/rejected": -908.0980224609375, "loss": 0.4685, "rewards/accuracies": 0.8125, "rewards/chosen": -2.865569829940796, "rewards/margins": 0.890224814414978, "rewards/rejected": -3.7557942867279053, "step": 1180 }, { "epoch": 0.7713282716956487, "grad_norm": 68.89217637019948, "learning_rate": 2.266724761984077e-08, "logits/chosen": -1.5287120342254639, "logits/rejected": -1.5607788562774658, "logps/chosen": -864.7396240234375, "logps/rejected": -897.0245971679688, "loss": 0.5442, "rewards/accuracies": 0.71875, "rewards/chosen": -3.2756667137145996, "rewards/margins": 0.8783707618713379, "rewards/rejected": -4.1540374755859375, "step": 1181 }, { "epoch": 0.7719813862356111, "grad_norm": 23.549000554913878, "learning_rate": 2.2544813585740552e-08, "logits/chosen": -1.5588055849075317, "logits/rejected": -1.566838264465332, "logps/chosen": -907.4659423828125, "logps/rejected": -1011.3638916015625, "loss": 0.4863, "rewards/accuracies": 0.6875, "rewards/chosen": -3.550849437713623, "rewards/margins": 0.8633126616477966, "rewards/rejected": -4.414162635803223, "step": 1182 }, { "epoch": 0.7726345007755735, "grad_norm": 73.6862565187378, "learning_rate": 2.242265258776505e-08, "logits/chosen": -1.4739999771118164, "logits/rejected": -1.4539254903793335, "logps/chosen": -828.4271850585938, "logps/rejected": -1103.9161376953125, "loss": 0.4844, "rewards/accuracies": 0.71875, "rewards/chosen": -2.812699317932129, "rewards/margins": 1.4099655151367188, "rewards/rejected": -4.222664833068848, "step": 1183 }, { "epoch": 0.7732876153155359, "grad_norm": 28.50045032866641, "learning_rate": 2.2300765261778312e-08, "logits/chosen": -1.4642502069473267, "logits/rejected": -1.466383457183838, "logps/chosen": -875.912841796875, "logps/rejected": -906.390380859375, "loss": 0.5413, "rewards/accuracies": 0.6875, "rewards/chosen": -3.574983596801758, "rewards/margins": 0.3633178770542145, "rewards/rejected": -3.9383015632629395, "step": 1184 }, { "epoch": 0.7739407298554984, "grad_norm": 13.495729369988036, "learning_rate": 2.2179152242219803e-08, "logits/chosen": -1.5543307065963745, "logits/rejected": -1.5425894260406494, "logps/chosen": -870.7875366210938, "logps/rejected": -942.445556640625, "loss": 0.489, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3177812099456787, "rewards/margins": 0.8684400320053101, "rewards/rejected": -4.186221599578857, "step": 1185 }, { "epoch": 0.7745938443954609, "grad_norm": 121.88098996369544, "learning_rate": 2.205781416210126e-08, "logits/chosen": -1.3570061922073364, "logits/rejected": -1.3827579021453857, "logps/chosen": -873.6181030273438, "logps/rejected": -917.5654296875, "loss": 0.4861, "rewards/accuracies": 0.78125, "rewards/chosen": -3.150343894958496, "rewards/margins": 1.0735583305358887, "rewards/rejected": -4.223902702331543, "step": 1186 }, { "epoch": 0.7752469589354233, "grad_norm": 42.77475056245068, "learning_rate": 2.1936751653003312e-08, "logits/chosen": -1.525205135345459, "logits/rejected": -1.5034763813018799, "logps/chosen": -855.9130249023438, "logps/rejected": -936.5135498046875, "loss": 0.4463, "rewards/accuracies": 0.84375, "rewards/chosen": -3.344566822052002, "rewards/margins": 0.8852987289428711, "rewards/rejected": -4.229865550994873, "step": 1187 }, { "epoch": 0.7759000734753857, "grad_norm": 22.279778341803244, "learning_rate": 2.181596534507219e-08, "logits/chosen": -1.5698206424713135, "logits/rejected": -1.5208396911621094, "logps/chosen": -858.2536010742188, "logps/rejected": -927.1165771484375, "loss": 0.4899, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2629714012145996, "rewards/margins": 0.9225921630859375, "rewards/rejected": -4.185564041137695, "step": 1188 }, { "epoch": 0.7765531880153482, "grad_norm": 79.29971170882058, "learning_rate": 2.169545586701647e-08, "logits/chosen": -1.6312425136566162, "logits/rejected": -1.5835908651351929, "logps/chosen": -894.0716552734375, "logps/rejected": -900.3629760742188, "loss": 0.4934, "rewards/accuracies": 0.6875, "rewards/chosen": -3.885010242462158, "rewards/margins": 0.54335618019104, "rewards/rejected": -4.428366661071777, "step": 1189 }, { "epoch": 0.7772063025553106, "grad_norm": 50.17561559841045, "learning_rate": 2.157522384610379e-08, "logits/chosen": -1.4537131786346436, "logits/rejected": -1.4958133697509766, "logps/chosen": -851.051513671875, "logps/rejected": -874.2980346679688, "loss": 0.5582, "rewards/accuracies": 0.75, "rewards/chosen": -3.4431140422821045, "rewards/margins": 0.47253862023353577, "rewards/rejected": -3.9156525135040283, "step": 1190 }, { "epoch": 0.7778594170952731, "grad_norm": 16.156948341176815, "learning_rate": 2.1455269908157583e-08, "logits/chosen": -1.574291467666626, "logits/rejected": -1.532688856124878, "logps/chosen": -930.2337646484375, "logps/rejected": -1081.06103515625, "loss": 0.4525, "rewards/accuracies": 0.78125, "rewards/chosen": -3.6473445892333984, "rewards/margins": 1.3627281188964844, "rewards/rejected": -5.010072708129883, "step": 1191 }, { "epoch": 0.7785125316352355, "grad_norm": 18.980973183816936, "learning_rate": 2.133559467755383e-08, "logits/chosen": -1.6023908853530884, "logits/rejected": -1.603822946548462, "logps/chosen": -828.7919311523438, "logps/rejected": -913.5147705078125, "loss": 0.4728, "rewards/accuracies": 0.8125, "rewards/chosen": -2.939816951751709, "rewards/margins": 0.9978110790252686, "rewards/rejected": -3.9376275539398193, "step": 1192 }, { "epoch": 0.779165646175198, "grad_norm": 22.58657566222872, "learning_rate": 2.12161987772178e-08, "logits/chosen": -1.5595180988311768, "logits/rejected": -1.5140559673309326, "logps/chosen": -898.070068359375, "logps/rejected": -931.6812744140625, "loss": 0.5008, "rewards/accuracies": 0.6875, "rewards/chosen": -3.8756892681121826, "rewards/margins": 0.6614542603492737, "rewards/rejected": -4.537143230438232, "step": 1193 }, { "epoch": 0.7798187607151604, "grad_norm": 14.841539963082138, "learning_rate": 2.1097082828620823e-08, "logits/chosen": -1.5292487144470215, "logits/rejected": -1.4865946769714355, "logps/chosen": -779.00244140625, "logps/rejected": -803.3477172851562, "loss": 0.5534, "rewards/accuracies": 0.71875, "rewards/chosen": -2.951836347579956, "rewards/margins": 0.46454131603240967, "rewards/rejected": -3.4163777828216553, "step": 1194 }, { "epoch": 0.7804718752551228, "grad_norm": 70.51401475102385, "learning_rate": 2.0978247451777027e-08, "logits/chosen": -1.4845449924468994, "logits/rejected": -1.4704723358154297, "logps/chosen": -838.1801147460938, "logps/rejected": -939.9219970703125, "loss": 0.4543, "rewards/accuracies": 0.78125, "rewards/chosen": -3.091367244720459, "rewards/margins": 0.8508425354957581, "rewards/rejected": -3.9422097206115723, "step": 1195 }, { "epoch": 0.7811249897950853, "grad_norm": 39.49476961487304, "learning_rate": 2.0859693265240133e-08, "logits/chosen": -1.4192631244659424, "logits/rejected": -1.4639571905136108, "logps/chosen": -819.2432861328125, "logps/rejected": -913.455078125, "loss": 0.4721, "rewards/accuracies": 0.78125, "rewards/chosen": -3.472623109817505, "rewards/margins": 0.8139183521270752, "rewards/rejected": -4.28654146194458, "step": 1196 }, { "epoch": 0.7817781043350478, "grad_norm": 38.127552444665085, "learning_rate": 2.0741420886100226e-08, "logits/chosen": -1.4315974712371826, "logits/rejected": -1.3939564228057861, "logps/chosen": -748.397705078125, "logps/rejected": -789.2747802734375, "loss": 0.609, "rewards/accuracies": 0.625, "rewards/chosen": -3.52626895904541, "rewards/margins": 0.5112132430076599, "rewards/rejected": -4.037482261657715, "step": 1197 }, { "epoch": 0.7824312188750102, "grad_norm": 92.0980308502603, "learning_rate": 2.0623430929980555e-08, "logits/chosen": -1.485008955001831, "logits/rejected": -1.4899412393569946, "logps/chosen": -1039.5450439453125, "logps/rejected": -1029.2069091796875, "loss": 0.5313, "rewards/accuracies": 0.6875, "rewards/chosen": -3.7229716777801514, "rewards/margins": 0.5886335372924805, "rewards/rejected": -4.311605453491211, "step": 1198 }, { "epoch": 0.7830843334149726, "grad_norm": 23.40754516414652, "learning_rate": 2.0505724011034305e-08, "logits/chosen": -1.5619934797286987, "logits/rejected": -1.5442955493927002, "logps/chosen": -920.8180541992188, "logps/rejected": -1025.9613037109375, "loss": 0.4786, "rewards/accuracies": 0.75, "rewards/chosen": -3.7618021965026855, "rewards/margins": 0.9860894680023193, "rewards/rejected": -4.747891426086426, "step": 1199 }, { "epoch": 0.783737447954935, "grad_norm": 71.55959913562558, "learning_rate": 2.0388300741941447e-08, "logits/chosen": -1.4612364768981934, "logits/rejected": -1.426287293434143, "logps/chosen": -804.0591430664062, "logps/rejected": -901.520751953125, "loss": 0.5403, "rewards/accuracies": 0.8125, "rewards/chosen": -2.961287260055542, "rewards/margins": 0.8116233944892883, "rewards/rejected": -3.7729105949401855, "step": 1200 }, { "epoch": 0.783737447954935, "eval_logits/chosen": -1.4822365045547485, "eval_logits/rejected": -1.464072823524475, "eval_logps/chosen": -844.524169921875, "eval_logps/rejected": -927.9747314453125, "eval_loss": 0.49636971950531006, "eval_rewards/accuracies": 0.7620000243186951, "eval_rewards/chosen": -3.3417880535125732, "eval_rewards/margins": 0.9155648350715637, "eval_rewards/rejected": -4.257352828979492, "eval_runtime": 296.4203, "eval_samples_per_second": 13.494, "eval_steps_per_second": 0.843, "step": 1200 }, { "epoch": 0.7843905624948976, "grad_norm": 38.14391865815223, "learning_rate": 2.027116173390549e-08, "logits/chosen": -1.4551496505737305, "logits/rejected": -1.446352243423462, "logps/chosen": -821.078369140625, "logps/rejected": -918.0855712890625, "loss": 0.4739, "rewards/accuracies": 0.78125, "rewards/chosen": -3.5896596908569336, "rewards/margins": 0.7162079215049744, "rewards/rejected": -4.305867671966553, "step": 1201 }, { "epoch": 0.78504367703486, "grad_norm": 59.161675430462715, "learning_rate": 2.015430759665032e-08, "logits/chosen": -1.5572847127914429, "logits/rejected": -1.5632448196411133, "logps/chosen": -812.02294921875, "logps/rejected": -977.6295776367188, "loss": 0.4483, "rewards/accuracies": 0.8125, "rewards/chosen": -3.5253820419311523, "rewards/margins": 1.2742270231246948, "rewards/rejected": -4.799609184265137, "step": 1202 }, { "epoch": 0.7856967915748224, "grad_norm": 96.67396687187629, "learning_rate": 2.003773893841706e-08, "logits/chosen": -1.5836585760116577, "logits/rejected": -1.5667706727981567, "logps/chosen": -812.0299682617188, "logps/rejected": -925.529052734375, "loss": 0.4787, "rewards/accuracies": 0.875, "rewards/chosen": -2.8535349369049072, "rewards/margins": 1.1299189329147339, "rewards/rejected": -3.9834539890289307, "step": 1203 }, { "epoch": 0.7863499061147848, "grad_norm": 10.591507916072793, "learning_rate": 1.9921456365960856e-08, "logits/chosen": -1.4587793350219727, "logits/rejected": -1.4299750328063965, "logps/chosen": -911.44384765625, "logps/rejected": -1020.269287109375, "loss": 0.5223, "rewards/accuracies": 0.6875, "rewards/chosen": -3.7610325813293457, "rewards/margins": 0.6058591604232788, "rewards/rejected": -4.366891860961914, "step": 1204 }, { "epoch": 0.7870030206547474, "grad_norm": 92.25556529129581, "learning_rate": 1.980546048454776e-08, "logits/chosen": -1.4399176836013794, "logits/rejected": -1.4096533060073853, "logps/chosen": -771.2178955078125, "logps/rejected": -856.9338989257812, "loss": 0.5283, "rewards/accuracies": 0.78125, "rewards/chosen": -2.911550521850586, "rewards/margins": 0.6725373268127441, "rewards/rejected": -3.584087610244751, "step": 1205 }, { "epoch": 0.7876561351947098, "grad_norm": 50.596463509645794, "learning_rate": 1.9689751897951532e-08, "logits/chosen": -1.4841548204421997, "logits/rejected": -1.4760531187057495, "logps/chosen": -828.1544799804688, "logps/rejected": -881.6762084960938, "loss": 0.5132, "rewards/accuracies": 0.8125, "rewards/chosen": -3.460951328277588, "rewards/margins": 0.6589277982711792, "rewards/rejected": -4.119879245758057, "step": 1206 }, { "epoch": 0.7883092497346722, "grad_norm": 16.96643780611066, "learning_rate": 1.9574331208450577e-08, "logits/chosen": -1.4726507663726807, "logits/rejected": -1.5015829801559448, "logps/chosen": -860.7503051757812, "logps/rejected": -979.796142578125, "loss": 0.4893, "rewards/accuracies": 0.6875, "rewards/chosen": -3.2518157958984375, "rewards/margins": 0.9525402784347534, "rewards/rejected": -4.2043561935424805, "step": 1207 }, { "epoch": 0.7889623642746346, "grad_norm": 17.37151480950942, "learning_rate": 1.9459199016824668e-08, "logits/chosen": -1.4686522483825684, "logits/rejected": -1.4412596225738525, "logps/chosen": -841.184326171875, "logps/rejected": -1021.4664306640625, "loss": 0.4921, "rewards/accuracies": 0.84375, "rewards/chosen": -3.811629295349121, "rewards/margins": 1.1526672840118408, "rewards/rejected": -4.964296340942383, "step": 1208 }, { "epoch": 0.7896154788145971, "grad_norm": 15.472923440337157, "learning_rate": 1.9344355922351986e-08, "logits/chosen": -1.4205299615859985, "logits/rejected": -1.4428074359893799, "logps/chosen": -806.4320678710938, "logps/rejected": -891.0250854492188, "loss": 0.4432, "rewards/accuracies": 0.84375, "rewards/chosen": -2.923144578933716, "rewards/margins": 1.4412071704864502, "rewards/rejected": -4.364351272583008, "step": 1209 }, { "epoch": 0.7902685933545596, "grad_norm": 67.06746939382018, "learning_rate": 1.922980252280589e-08, "logits/chosen": -1.4646856784820557, "logits/rejected": -1.4149240255355835, "logps/chosen": -835.4559936523438, "logps/rejected": -962.0414428710938, "loss": 0.4964, "rewards/accuracies": 0.8125, "rewards/chosen": -3.448225498199463, "rewards/margins": 0.8466680645942688, "rewards/rejected": -4.294894218444824, "step": 1210 }, { "epoch": 0.790921707894522, "grad_norm": 11.275402565937414, "learning_rate": 1.9115539414451864e-08, "logits/chosen": -1.4719829559326172, "logits/rejected": -1.5199016332626343, "logps/chosen": -804.529541015625, "logps/rejected": -926.13623046875, "loss": 0.4676, "rewards/accuracies": 0.65625, "rewards/chosen": -3.2839932441711426, "rewards/margins": 1.0498989820480347, "rewards/rejected": -4.333892345428467, "step": 1211 }, { "epoch": 0.7915748224344844, "grad_norm": 27.08202842365883, "learning_rate": 1.9001567192044367e-08, "logits/chosen": -1.4914042949676514, "logits/rejected": -1.438066840171814, "logps/chosen": -764.7699584960938, "logps/rejected": -883.679443359375, "loss": 0.4568, "rewards/accuracies": 0.875, "rewards/chosen": -2.755364418029785, "rewards/margins": 0.9105137586593628, "rewards/rejected": -3.6658782958984375, "step": 1212 }, { "epoch": 0.7922279369744469, "grad_norm": 19.72267692530407, "learning_rate": 1.888788644882376e-08, "logits/chosen": -1.5127348899841309, "logits/rejected": -1.491123080253601, "logps/chosen": -755.7037963867188, "logps/rejected": -753.5874633789062, "loss": 0.507, "rewards/accuracies": 0.75, "rewards/chosen": -3.000194549560547, "rewards/margins": 0.5762927532196045, "rewards/rejected": -3.5764873027801514, "step": 1213 }, { "epoch": 0.7928810515144094, "grad_norm": 87.91830418107274, "learning_rate": 1.8774497776513222e-08, "logits/chosen": -1.4988164901733398, "logits/rejected": -1.5042667388916016, "logps/chosen": -846.1057739257812, "logps/rejected": -972.7584228515625, "loss": 0.4886, "rewards/accuracies": 0.65625, "rewards/chosen": -3.0713298320770264, "rewards/margins": 0.8871229290962219, "rewards/rejected": -3.9584529399871826, "step": 1214 }, { "epoch": 0.7935341660543718, "grad_norm": 40.368754316154686, "learning_rate": 1.8661401765315665e-08, "logits/chosen": -1.430102825164795, "logits/rejected": -1.4247267246246338, "logps/chosen": -770.588623046875, "logps/rejected": -842.4027099609375, "loss": 0.4727, "rewards/accuracies": 0.75, "rewards/chosen": -3.137661933898926, "rewards/margins": 1.112958550453186, "rewards/rejected": -4.2506208419799805, "step": 1215 }, { "epoch": 0.7941872805943342, "grad_norm": 80.49162691211416, "learning_rate": 1.8548599003910666e-08, "logits/chosen": -1.4262512922286987, "logits/rejected": -1.4448204040527344, "logps/chosen": -881.8290405273438, "logps/rejected": -979.1732788085938, "loss": 0.4524, "rewards/accuracies": 0.90625, "rewards/chosen": -3.9086101055145264, "rewards/margins": 1.3396742343902588, "rewards/rejected": -5.248284339904785, "step": 1216 }, { "epoch": 0.7948403951342967, "grad_norm": 16.198495509148298, "learning_rate": 1.843609007945138e-08, "logits/chosen": -1.4633352756500244, "logits/rejected": -1.462742805480957, "logps/chosen": -818.7781372070312, "logps/rejected": -850.6954956054688, "loss": 0.5104, "rewards/accuracies": 0.75, "rewards/chosen": -3.175035238265991, "rewards/margins": 0.5016341805458069, "rewards/rejected": -3.676669120788574, "step": 1217 }, { "epoch": 0.7954935096742591, "grad_norm": 30.16649596643903, "learning_rate": 1.832387557756151e-08, "logits/chosen": -1.4935240745544434, "logits/rejected": -1.4943130016326904, "logps/chosen": -749.455078125, "logps/rejected": -783.4890747070312, "loss": 0.5097, "rewards/accuracies": 0.5625, "rewards/chosen": -3.018079996109009, "rewards/margins": 0.39417386054992676, "rewards/rejected": -3.4122540950775146, "step": 1218 }, { "epoch": 0.7961466242142216, "grad_norm": 63.990103383436264, "learning_rate": 1.82119560823323e-08, "logits/chosen": -1.5611546039581299, "logits/rejected": -1.5044180154800415, "logps/chosen": -828.6058349609375, "logps/rejected": -971.0333862304688, "loss": 0.5007, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2003324031829834, "rewards/margins": 1.3617459535598755, "rewards/rejected": -4.56207799911499, "step": 1219 }, { "epoch": 0.796799738754184, "grad_norm": 54.853667872883626, "learning_rate": 1.8100332176319338e-08, "logits/chosen": -1.4820096492767334, "logits/rejected": -1.5054271221160889, "logps/chosen": -863.5888061523438, "logps/rejected": -936.4130249023438, "loss": 0.5115, "rewards/accuracies": 0.90625, "rewards/chosen": -3.6043381690979004, "rewards/margins": 0.8906245827674866, "rewards/rejected": -4.494962692260742, "step": 1220 }, { "epoch": 0.7974528532941465, "grad_norm": 14.995625717970094, "learning_rate": 1.798900444053972e-08, "logits/chosen": -1.5272332429885864, "logits/rejected": -1.516082763671875, "logps/chosen": -958.9482421875, "logps/rejected": -1058.3154296875, "loss": 0.4977, "rewards/accuracies": 0.8125, "rewards/chosen": -3.6495161056518555, "rewards/margins": 0.7711564898490906, "rewards/rejected": -4.420672416687012, "step": 1221 }, { "epoch": 0.7981059678341089, "grad_norm": 17.49568930582209, "learning_rate": 1.7877973454468918e-08, "logits/chosen": -1.5633349418640137, "logits/rejected": -1.5596158504486084, "logps/chosen": -835.0476684570312, "logps/rejected": -886.090087890625, "loss": 0.4917, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4766788482666016, "rewards/margins": 0.8626784682273865, "rewards/rejected": -4.339357376098633, "step": 1222 }, { "epoch": 0.7987590823740713, "grad_norm": 52.96090556956152, "learning_rate": 1.7767239796037765e-08, "logits/chosen": -1.4490398168563843, "logits/rejected": -1.443628191947937, "logps/chosen": -802.57373046875, "logps/rejected": -889.3080444335938, "loss": 0.4679, "rewards/accuracies": 0.75, "rewards/chosen": -3.239607334136963, "rewards/margins": 1.018710970878601, "rewards/rejected": -4.258317947387695, "step": 1223 }, { "epoch": 0.7994121969140338, "grad_norm": 150.8059871681939, "learning_rate": 1.7656804041629487e-08, "logits/chosen": -1.5590651035308838, "logits/rejected": -1.5880131721496582, "logps/chosen": -928.2868041992188, "logps/rejected": -929.5325317382812, "loss": 0.527, "rewards/accuracies": 0.59375, "rewards/chosen": -3.8474342823028564, "rewards/margins": 0.5773695111274719, "rewards/rejected": -4.424803733825684, "step": 1224 }, { "epoch": 0.8000653114539963, "grad_norm": 35.966169524301534, "learning_rate": 1.7546666766076656e-08, "logits/chosen": -1.449623465538025, "logits/rejected": -1.5213428735733032, "logps/chosen": -757.2681884765625, "logps/rejected": -924.322509765625, "loss": 0.4752, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0022165775299072, "rewards/margins": 0.9503800868988037, "rewards/rejected": -3.952596664428711, "step": 1225 }, { "epoch": 0.8007184259939587, "grad_norm": 56.42831654190206, "learning_rate": 1.743682854265825e-08, "logits/chosen": -1.4219781160354614, "logits/rejected": -1.466795802116394, "logps/chosen": -840.7289428710938, "logps/rejected": -965.5621337890625, "loss": 0.4994, "rewards/accuracies": 0.8125, "rewards/chosen": -3.4919745922088623, "rewards/margins": 0.8413702249526978, "rewards/rejected": -4.333344459533691, "step": 1226 }, { "epoch": 0.8013715405339211, "grad_norm": 73.37249453736422, "learning_rate": 1.732728994309661e-08, "logits/chosen": -1.5576728582382202, "logits/rejected": -1.5214743614196777, "logps/chosen": -763.7567749023438, "logps/rejected": -810.2583618164062, "loss": 0.4673, "rewards/accuracies": 0.875, "rewards/chosen": -2.8903627395629883, "rewards/margins": 0.7463845610618591, "rewards/rejected": -3.636747360229492, "step": 1227 }, { "epoch": 0.8020246550738835, "grad_norm": 61.87692158071728, "learning_rate": 1.7218051537554536e-08, "logits/chosen": -1.5665578842163086, "logits/rejected": -1.549164891242981, "logps/chosen": -867.8675537109375, "logps/rejected": -983.275634765625, "loss": 0.4786, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3755083084106445, "rewards/margins": 0.9794182777404785, "rewards/rejected": -4.354927062988281, "step": 1228 }, { "epoch": 0.8026777696138461, "grad_norm": 45.704835797476456, "learning_rate": 1.7109113894632233e-08, "logits/chosen": -1.4501968622207642, "logits/rejected": -1.470442771911621, "logps/chosen": -766.7777099609375, "logps/rejected": -880.814453125, "loss": 0.5157, "rewards/accuracies": 0.71875, "rewards/chosen": -3.088731288909912, "rewards/margins": 0.6824835538864136, "rewards/rejected": -3.7712154388427734, "step": 1229 }, { "epoch": 0.8033308841538085, "grad_norm": 12.041002229288496, "learning_rate": 1.700047758136443e-08, "logits/chosen": -1.4775810241699219, "logits/rejected": -1.4621860980987549, "logps/chosen": -792.953857421875, "logps/rejected": -907.03759765625, "loss": 0.4903, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9522554874420166, "rewards/margins": 1.064509391784668, "rewards/rejected": -4.016765117645264, "step": 1230 }, { "epoch": 0.8039839986937709, "grad_norm": 63.50098008198583, "learning_rate": 1.689214316321739e-08, "logits/chosen": -1.6399027109146118, "logits/rejected": -1.5657185316085815, "logps/chosen": -886.7124633789062, "logps/rejected": -979.0396728515625, "loss": 0.5165, "rewards/accuracies": 0.65625, "rewards/chosen": -3.5052478313446045, "rewards/margins": 1.1115096807479858, "rewards/rejected": -4.616757869720459, "step": 1231 }, { "epoch": 0.8046371132337333, "grad_norm": 34.66004736818009, "learning_rate": 1.678411120408595e-08, "logits/chosen": -1.4924274682998657, "logits/rejected": -1.4871619939804077, "logps/chosen": -876.4576416015625, "logps/rejected": -956.4109497070312, "loss": 0.4692, "rewards/accuracies": 0.84375, "rewards/chosen": -3.6071510314941406, "rewards/margins": 1.06039297580719, "rewards/rejected": -4.667544364929199, "step": 1232 }, { "epoch": 0.8052902277736959, "grad_norm": 43.61205378322202, "learning_rate": 1.6676382266290647e-08, "logits/chosen": -1.4874005317687988, "logits/rejected": -1.5195080041885376, "logps/chosen": -837.166015625, "logps/rejected": -892.5218505859375, "loss": 0.5412, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4187517166137695, "rewards/margins": 0.5736494660377502, "rewards/rejected": -3.992400884628296, "step": 1233 }, { "epoch": 0.8059433423136583, "grad_norm": 98.81263517998161, "learning_rate": 1.6568956910574712e-08, "logits/chosen": -1.530608892440796, "logits/rejected": -1.5279160737991333, "logps/chosen": -837.9874877929688, "logps/rejected": -845.3988647460938, "loss": 0.5812, "rewards/accuracies": 0.65625, "rewards/chosen": -3.741583824157715, "rewards/margins": 0.040613558143377304, "rewards/rejected": -3.7821972370147705, "step": 1234 }, { "epoch": 0.8065964568536207, "grad_norm": 78.86885176654913, "learning_rate": 1.6461835696101227e-08, "logits/chosen": -1.4850387573242188, "logits/rejected": -1.4923676252365112, "logps/chosen": -871.99658203125, "logps/rejected": -949.8641357421875, "loss": 0.4868, "rewards/accuracies": 0.75, "rewards/chosen": -3.7873427867889404, "rewards/margins": 0.7236688733100891, "rewards/rejected": -4.511011600494385, "step": 1235 }, { "epoch": 0.8072495713935831, "grad_norm": 35.00941246017448, "learning_rate": 1.6355019180450148e-08, "logits/chosen": -1.555824637413025, "logits/rejected": -1.4870938062667847, "logps/chosen": -837.19140625, "logps/rejected": -921.0797729492188, "loss": 0.4621, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3375821113586426, "rewards/margins": 0.9209874272346497, "rewards/rejected": -4.258569717407227, "step": 1236 }, { "epoch": 0.8079026859335456, "grad_norm": 90.40912979516071, "learning_rate": 1.6248507919615452e-08, "logits/chosen": -1.4929189682006836, "logits/rejected": -1.5122658014297485, "logps/chosen": -916.9654541015625, "logps/rejected": -1060.645751953125, "loss": 0.42, "rewards/accuracies": 0.71875, "rewards/chosen": -3.5597081184387207, "rewards/margins": 1.2362518310546875, "rewards/rejected": -4.795959949493408, "step": 1237 }, { "epoch": 0.8085558004735081, "grad_norm": 14.457346050782789, "learning_rate": 1.6142302468002227e-08, "logits/chosen": -1.4068070650100708, "logits/rejected": -1.399592638015747, "logps/chosen": -771.3148193359375, "logps/rejected": -843.7901611328125, "loss": 0.5132, "rewards/accuracies": 0.78125, "rewards/chosen": -3.3284435272216797, "rewards/margins": 0.8365193605422974, "rewards/rejected": -4.164963245391846, "step": 1238 }, { "epoch": 0.8092089150134705, "grad_norm": 37.50589751809788, "learning_rate": 1.603640337842377e-08, "logits/chosen": -1.5218846797943115, "logits/rejected": -1.5328527688980103, "logps/chosen": -876.4766235351562, "logps/rejected": -1026.4345703125, "loss": 0.4891, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1716041564941406, "rewards/margins": 0.8838239312171936, "rewards/rejected": -4.0554280281066895, "step": 1239 }, { "epoch": 0.8098620295534329, "grad_norm": 39.42416181441758, "learning_rate": 1.5930811202098737e-08, "logits/chosen": -1.451581597328186, "logits/rejected": -1.4357130527496338, "logps/chosen": -736.4873046875, "logps/rejected": -854.6323852539062, "loss": 0.5309, "rewards/accuracies": 0.6875, "rewards/chosen": -3.231121778488159, "rewards/margins": 0.7173824310302734, "rewards/rejected": -3.9485039710998535, "step": 1240 }, { "epoch": 0.8105151440933954, "grad_norm": 15.457594936787636, "learning_rate": 1.5825526488648268e-08, "logits/chosen": -1.5039798021316528, "logits/rejected": -1.4966380596160889, "logps/chosen": -838.3843994140625, "logps/rejected": -906.8178100585938, "loss": 0.5552, "rewards/accuracies": 0.8125, "rewards/chosen": -3.452669620513916, "rewards/margins": 0.6774091124534607, "rewards/rejected": -4.1300787925720215, "step": 1241 }, { "epoch": 0.8111682586333578, "grad_norm": 30.649527114852525, "learning_rate": 1.572054978609306e-08, "logits/chosen": -1.4918582439422607, "logits/rejected": -1.454628586769104, "logps/chosen": -912.1357421875, "logps/rejected": -949.4915161132812, "loss": 0.5541, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9472150802612305, "rewards/margins": 0.8809834122657776, "rewards/rejected": -4.828197956085205, "step": 1242 }, { "epoch": 0.8118213731733203, "grad_norm": 15.819803195866424, "learning_rate": 1.5615881640850653e-08, "logits/chosen": -1.4457621574401855, "logits/rejected": -1.3891899585723877, "logps/chosen": -806.92626953125, "logps/rejected": -855.1336669921875, "loss": 0.4604, "rewards/accuracies": 0.75, "rewards/chosen": -3.198312282562256, "rewards/margins": 1.005974531173706, "rewards/rejected": -4.204287052154541, "step": 1243 }, { "epoch": 0.8124744877132827, "grad_norm": 20.662834289028424, "learning_rate": 1.551152259773245e-08, "logits/chosen": -1.4745066165924072, "logits/rejected": -1.4477360248565674, "logps/chosen": -888.340576171875, "logps/rejected": -906.5769653320312, "loss": 0.5284, "rewards/accuracies": 0.75, "rewards/chosen": -3.483821392059326, "rewards/margins": 0.7100880146026611, "rewards/rejected": -4.193909168243408, "step": 1244 }, { "epoch": 0.8131276022532452, "grad_norm": 40.934726173262696, "learning_rate": 1.5407473199940978e-08, "logits/chosen": -1.5310475826263428, "logits/rejected": -1.5288856029510498, "logps/chosen": -761.62451171875, "logps/rejected": -932.880126953125, "loss": 0.4466, "rewards/accuracies": 0.84375, "rewards/chosen": -3.145474433898926, "rewards/margins": 1.0549681186676025, "rewards/rejected": -4.200442790985107, "step": 1245 }, { "epoch": 0.8137807167932076, "grad_norm": 59.614420768209214, "learning_rate": 1.5303733989066992e-08, "logits/chosen": -1.5001609325408936, "logits/rejected": -1.4439678192138672, "logps/chosen": -862.7713623046875, "logps/rejected": -916.8041381835938, "loss": 0.5735, "rewards/accuracies": 0.6875, "rewards/chosen": -3.4643964767456055, "rewards/margins": 0.6968106627464294, "rewards/rejected": -4.16120719909668, "step": 1246 }, { "epoch": 0.81443383133317, "grad_norm": 99.7769624540917, "learning_rate": 1.5200305505086678e-08, "logits/chosen": -1.451883316040039, "logits/rejected": -1.4290595054626465, "logps/chosen": -822.1856689453125, "logps/rejected": -901.4767456054688, "loss": 0.547, "rewards/accuracies": 0.6875, "rewards/chosen": -3.326678514480591, "rewards/margins": 0.7827971577644348, "rewards/rejected": -4.109475612640381, "step": 1247 }, { "epoch": 0.8150869458731325, "grad_norm": 35.074610634798226, "learning_rate": 1.509718828635887e-08, "logits/chosen": -1.4157919883728027, "logits/rejected": -1.437688946723938, "logps/chosen": -747.04638671875, "logps/rejected": -875.550537109375, "loss": 0.4635, "rewards/accuracies": 0.78125, "rewards/chosen": -3.111088991165161, "rewards/margins": 0.8460332155227661, "rewards/rejected": -3.9571218490600586, "step": 1248 }, { "epoch": 0.815740060413095, "grad_norm": 31.080617936177795, "learning_rate": 1.4994382869622212e-08, "logits/chosen": -1.5038199424743652, "logits/rejected": -1.5081907510757446, "logps/chosen": -842.7953491210938, "logps/rejected": -893.8030395507812, "loss": 0.5325, "rewards/accuracies": 0.65625, "rewards/chosen": -3.6291537284851074, "rewards/margins": 0.5055683851242065, "rewards/rejected": -4.134722709655762, "step": 1249 }, { "epoch": 0.8163931749530574, "grad_norm": 72.9364075543982, "learning_rate": 1.4891889789992385e-08, "logits/chosen": -1.4778282642364502, "logits/rejected": -1.4801677465438843, "logps/chosen": -810.5230712890625, "logps/rejected": -921.8294067382812, "loss": 0.504, "rewards/accuracies": 0.84375, "rewards/chosen": -3.3633217811584473, "rewards/margins": 1.1871894598007202, "rewards/rejected": -4.550510883331299, "step": 1250 }, { "epoch": 0.8170462894930198, "grad_norm": 25.634012106471616, "learning_rate": 1.4789709580959304e-08, "logits/chosen": -1.4490480422973633, "logits/rejected": -1.4597145318984985, "logps/chosen": -787.5851440429688, "logps/rejected": -899.1139526367188, "loss": 0.5296, "rewards/accuracies": 0.6875, "rewards/chosen": -3.343380928039551, "rewards/margins": 0.7966138124465942, "rewards/rejected": -4.139995098114014, "step": 1251 }, { "epoch": 0.8176994040329822, "grad_norm": 13.077479982264203, "learning_rate": 1.4687842774384366e-08, "logits/chosen": -1.4068443775177002, "logits/rejected": -1.4268113374710083, "logps/chosen": -732.1669921875, "logps/rejected": -751.791259765625, "loss": 0.5786, "rewards/accuracies": 0.65625, "rewards/chosen": -3.2558491230010986, "rewards/margins": 0.2685055732727051, "rewards/rejected": -3.5243544578552246, "step": 1252 }, { "epoch": 0.8183525185729448, "grad_norm": 96.37113725713898, "learning_rate": 1.4586289900497672e-08, "logits/chosen": -1.5462439060211182, "logits/rejected": -1.519241452217102, "logps/chosen": -875.8187866210938, "logps/rejected": -946.7884521484375, "loss": 0.5182, "rewards/accuracies": 0.78125, "rewards/chosen": -3.811178207397461, "rewards/margins": 1.0562658309936523, "rewards/rejected": -4.867443561553955, "step": 1253 }, { "epoch": 0.8190056331129072, "grad_norm": 47.14222768313448, "learning_rate": 1.4485051487895208e-08, "logits/chosen": -1.6328545808792114, "logits/rejected": -1.5723670721054077, "logps/chosen": -878.3783569335938, "logps/rejected": -1000.5723876953125, "loss": 0.4553, "rewards/accuracies": 0.8125, "rewards/chosen": -3.133354902267456, "rewards/margins": 1.4496155977249146, "rewards/rejected": -4.58297061920166, "step": 1254 }, { "epoch": 0.8196587476528696, "grad_norm": 77.13494040128981, "learning_rate": 1.4384128063536215e-08, "logits/chosen": -1.5092742443084717, "logits/rejected": -1.4484039545059204, "logps/chosen": -795.540283203125, "logps/rejected": -922.9327392578125, "loss": 0.5052, "rewards/accuracies": 0.6875, "rewards/chosen": -3.0639872550964355, "rewards/margins": 0.9547647833824158, "rewards/rejected": -4.018751621246338, "step": 1255 }, { "epoch": 0.820311862192832, "grad_norm": 27.794467326470485, "learning_rate": 1.4283520152740358e-08, "logits/chosen": -1.527742624282837, "logits/rejected": -1.550565481185913, "logps/chosen": -901.008056640625, "logps/rejected": -977.6214599609375, "loss": 0.443, "rewards/accuracies": 0.8125, "rewards/chosen": -3.6517744064331055, "rewards/margins": 0.9718442559242249, "rewards/rejected": -4.6236186027526855, "step": 1256 }, { "epoch": 0.8209649767327946, "grad_norm": 48.23537245171485, "learning_rate": 1.4183228279184986e-08, "logits/chosen": -1.4986786842346191, "logits/rejected": -1.505009651184082, "logps/chosen": -780.6826171875, "logps/rejected": -851.0025024414062, "loss": 0.4116, "rewards/accuracies": 0.875, "rewards/chosen": -3.2454466819763184, "rewards/margins": 0.8366904258728027, "rewards/rejected": -4.082137107849121, "step": 1257 }, { "epoch": 0.821618091272757, "grad_norm": 25.124612077309788, "learning_rate": 1.4083252964902476e-08, "logits/chosen": -1.4318965673446655, "logits/rejected": -1.4011112451553345, "logps/chosen": -767.1181640625, "logps/rejected": -834.265380859375, "loss": 0.4421, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2802295684814453, "rewards/margins": 0.9027811884880066, "rewards/rejected": -4.183011054992676, "step": 1258 }, { "epoch": 0.8222712058127194, "grad_norm": 28.873103251905075, "learning_rate": 1.3983594730277437e-08, "logits/chosen": -1.504234790802002, "logits/rejected": -1.4638817310333252, "logps/chosen": -840.501220703125, "logps/rejected": -915.8950805664062, "loss": 0.5238, "rewards/accuracies": 0.875, "rewards/chosen": -3.346611976623535, "rewards/margins": 1.1296738386154175, "rewards/rejected": -4.4762864112854, "step": 1259 }, { "epoch": 0.8229243203526818, "grad_norm": 92.65936407387743, "learning_rate": 1.388425409404406e-08, "logits/chosen": -1.3982869386672974, "logits/rejected": -1.4056661128997803, "logps/chosen": -882.9879150390625, "logps/rejected": -926.9945068359375, "loss": 0.5427, "rewards/accuracies": 0.71875, "rewards/chosen": -3.3799350261688232, "rewards/margins": 0.531534731388092, "rewards/rejected": -3.9114699363708496, "step": 1260 }, { "epoch": 0.8235774348926443, "grad_norm": 21.267825506734145, "learning_rate": 1.378523157328338e-08, "logits/chosen": -1.463804006576538, "logits/rejected": -1.4482476711273193, "logps/chosen": -843.3424682617188, "logps/rejected": -922.429931640625, "loss": 0.4699, "rewards/accuracies": 0.78125, "rewards/chosen": -3.516786813735962, "rewards/margins": 0.4834230840206146, "rewards/rejected": -4.000209808349609, "step": 1261 }, { "epoch": 0.8242305494326068, "grad_norm": 44.39704770318735, "learning_rate": 1.3686527683420598e-08, "logits/chosen": -1.5227603912353516, "logits/rejected": -1.4986686706542969, "logps/chosen": -880.2915649414062, "logps/rejected": -956.4801025390625, "loss": 0.4738, "rewards/accuracies": 0.90625, "rewards/chosen": -3.749066114425659, "rewards/margins": 0.984779417514801, "rewards/rejected": -4.7338457107543945, "step": 1262 }, { "epoch": 0.8248836639725692, "grad_norm": 40.800361223632876, "learning_rate": 1.3588142938222421e-08, "logits/chosen": -1.5330525636672974, "logits/rejected": -1.5526435375213623, "logps/chosen": -796.693603515625, "logps/rejected": -926.6709594726562, "loss": 0.4648, "rewards/accuracies": 0.78125, "rewards/chosen": -2.9938881397247314, "rewards/margins": 1.054705262184143, "rewards/rejected": -4.048593521118164, "step": 1263 }, { "epoch": 0.8255367785125316, "grad_norm": 16.08173399476035, "learning_rate": 1.3490077849794333e-08, "logits/chosen": -1.5841691493988037, "logits/rejected": -1.5560128688812256, "logps/chosen": -929.83056640625, "logps/rejected": -988.2870483398438, "loss": 0.4453, "rewards/accuracies": 0.84375, "rewards/chosen": -3.5736300945281982, "rewards/margins": 0.8916316032409668, "rewards/rejected": -4.465262413024902, "step": 1264 }, { "epoch": 0.8261898930524941, "grad_norm": 18.753303938177545, "learning_rate": 1.3392332928577994e-08, "logits/chosen": -1.5194913148880005, "logits/rejected": -1.525078535079956, "logps/chosen": -871.6900024414062, "logps/rejected": -965.0679931640625, "loss": 0.4307, "rewards/accuracies": 0.6875, "rewards/chosen": -3.447103977203369, "rewards/margins": 0.7160016298294067, "rewards/rejected": -4.163105487823486, "step": 1265 }, { "epoch": 0.8268430075924565, "grad_norm": 37.23704455866974, "learning_rate": 1.3294908683348535e-08, "logits/chosen": -1.4927794933319092, "logits/rejected": -1.459194302558899, "logps/chosen": -876.373046875, "logps/rejected": -885.762939453125, "loss": 0.5455, "rewards/accuracies": 0.75, "rewards/chosen": -3.8211159706115723, "rewards/margins": 0.5798470973968506, "rewards/rejected": -4.400962829589844, "step": 1266 }, { "epoch": 0.827496122132419, "grad_norm": 41.88538792702506, "learning_rate": 1.3197805621211925e-08, "logits/chosen": -1.4792529344558716, "logits/rejected": -1.4594306945800781, "logps/chosen": -812.187744140625, "logps/rejected": -929.7012939453125, "loss": 0.475, "rewards/accuracies": 0.875, "rewards/chosen": -3.358401298522949, "rewards/margins": 0.9242434501647949, "rewards/rejected": -4.282644748687744, "step": 1267 }, { "epoch": 0.8281492366723814, "grad_norm": 143.83646829408002, "learning_rate": 1.3101024247602339e-08, "logits/chosen": -1.3687011003494263, "logits/rejected": -1.416597843170166, "logps/chosen": -873.91796875, "logps/rejected": -925.439453125, "loss": 0.501, "rewards/accuracies": 0.5625, "rewards/chosen": -3.801042318344116, "rewards/margins": 0.3380916118621826, "rewards/rejected": -4.139133930206299, "step": 1268 }, { "epoch": 0.8288023512123439, "grad_norm": 12.16391760621639, "learning_rate": 1.3004565066279519e-08, "logits/chosen": -1.5240532159805298, "logits/rejected": -1.5018377304077148, "logps/chosen": -785.153076171875, "logps/rejected": -848.844970703125, "loss": 0.5177, "rewards/accuracies": 0.75, "rewards/chosen": -3.3579812049865723, "rewards/margins": 0.8355432748794556, "rewards/rejected": -4.193524360656738, "step": 1269 }, { "epoch": 0.8294554657523063, "grad_norm": 12.97712151020145, "learning_rate": 1.2908428579326159e-08, "logits/chosen": -1.483147144317627, "logits/rejected": -1.440002202987671, "logps/chosen": -855.696044921875, "logps/rejected": -937.5626831054688, "loss": 0.513, "rewards/accuracies": 0.8125, "rewards/chosen": -3.419983148574829, "rewards/margins": 1.2121243476867676, "rewards/rejected": -4.632107734680176, "step": 1270 }, { "epoch": 0.8301085802922687, "grad_norm": 61.10324755914619, "learning_rate": 1.2812615287145276e-08, "logits/chosen": -1.4333446025848389, "logits/rejected": -1.4426469802856445, "logps/chosen": -779.0205078125, "logps/rejected": -882.3414306640625, "loss": 0.5547, "rewards/accuracies": 0.71875, "rewards/chosen": -3.4126522541046143, "rewards/margins": 0.8153400421142578, "rewards/rejected": -4.227993011474609, "step": 1271 }, { "epoch": 0.8307616948322312, "grad_norm": 49.65538467582004, "learning_rate": 1.2717125688457627e-08, "logits/chosen": -1.4278748035430908, "logits/rejected": -1.4869357347488403, "logps/chosen": -847.4019165039062, "logps/rejected": -1110.864501953125, "loss": 0.5348, "rewards/accuracies": 0.71875, "rewards/chosen": -3.525916576385498, "rewards/margins": 1.2023837566375732, "rewards/rejected": -4.728300094604492, "step": 1272 }, { "epoch": 0.8314148093721937, "grad_norm": 21.25568560854086, "learning_rate": 1.2621960280299093e-08, "logits/chosen": -1.498213529586792, "logits/rejected": -1.5190093517303467, "logps/chosen": -889.1369018554688, "logps/rejected": -935.4756469726562, "loss": 0.5172, "rewards/accuracies": 0.6875, "rewards/chosen": -3.5845720767974854, "rewards/margins": 1.0147137641906738, "rewards/rejected": -4.599286079406738, "step": 1273 }, { "epoch": 0.8320679239121561, "grad_norm": 20.186165187940464, "learning_rate": 1.252711955801811e-08, "logits/chosen": -1.4972944259643555, "logits/rejected": -1.515435814857483, "logps/chosen": -775.0640869140625, "logps/rejected": -830.6438598632812, "loss": 0.4944, "rewards/accuracies": 0.71875, "rewards/chosen": -2.7457919120788574, "rewards/margins": 0.6729423999786377, "rewards/rejected": -3.418734073638916, "step": 1274 }, { "epoch": 0.8327210384521185, "grad_norm": 80.31553260847888, "learning_rate": 1.2432604015273082e-08, "logits/chosen": -1.5685982704162598, "logits/rejected": -1.5332074165344238, "logps/chosen": -850.53076171875, "logps/rejected": -1026.168701171875, "loss": 0.4492, "rewards/accuracies": 0.78125, "rewards/chosen": -3.52174973487854, "rewards/margins": 1.3801482915878296, "rewards/rejected": -4.901898384094238, "step": 1275 }, { "epoch": 0.833374152992081, "grad_norm": 44.67438081658325, "learning_rate": 1.2338414144029779e-08, "logits/chosen": -1.4650599956512451, "logits/rejected": -1.5019781589508057, "logps/chosen": -831.616455078125, "logps/rejected": -865.5159912109375, "loss": 0.5538, "rewards/accuracies": 0.75, "rewards/chosen": -3.3090555667877197, "rewards/margins": 0.5248807668685913, "rewards/rejected": -3.8339362144470215, "step": 1276 }, { "epoch": 0.8340272675320435, "grad_norm": 39.396696943484436, "learning_rate": 1.2244550434558842e-08, "logits/chosen": -1.461850643157959, "logits/rejected": -1.453994631767273, "logps/chosen": -843.3770751953125, "logps/rejected": -1038.049072265625, "loss": 0.5168, "rewards/accuracies": 0.84375, "rewards/chosen": -3.4972283840179443, "rewards/margins": 1.6629681587219238, "rewards/rejected": -5.160196781158447, "step": 1277 }, { "epoch": 0.8346803820720059, "grad_norm": 65.0379831567537, "learning_rate": 1.2151013375433202e-08, "logits/chosen": -1.553591012954712, "logits/rejected": -1.507093906402588, "logps/chosen": -870.8557739257812, "logps/rejected": -965.6985473632812, "loss": 0.4459, "rewards/accuracies": 0.84375, "rewards/chosen": -3.6408274173736572, "rewards/margins": 1.1605440378189087, "rewards/rejected": -4.801371097564697, "step": 1278 }, { "epoch": 0.8353334966119683, "grad_norm": 30.676217173011974, "learning_rate": 1.2057803453525502e-08, "logits/chosen": -1.468326449394226, "logits/rejected": -1.4737112522125244, "logps/chosen": -782.1486206054688, "logps/rejected": -842.2639770507812, "loss": 0.5042, "rewards/accuracies": 0.78125, "rewards/chosen": -3.191077709197998, "rewards/margins": 0.6720733046531677, "rewards/rejected": -3.8631508350372314, "step": 1279 }, { "epoch": 0.8359866111519307, "grad_norm": 107.16302969733803, "learning_rate": 1.1964921154005631e-08, "logits/chosen": -1.4318448305130005, "logits/rejected": -1.4438923597335815, "logps/chosen": -870.4271850585938, "logps/rejected": -939.97900390625, "loss": 0.5041, "rewards/accuracies": 0.75, "rewards/chosen": -3.3240041732788086, "rewards/margins": 0.9650790095329285, "rewards/rejected": -4.289083003997803, "step": 1280 }, { "epoch": 0.8366397256918933, "grad_norm": 37.456288698932234, "learning_rate": 1.187236696033812e-08, "logits/chosen": -1.5122158527374268, "logits/rejected": -1.454342007637024, "logps/chosen": -791.8545532226562, "logps/rejected": -874.0135498046875, "loss": 0.4348, "rewards/accuracies": 0.78125, "rewards/chosen": -2.786623239517212, "rewards/margins": 0.9403093457221985, "rewards/rejected": -3.7269325256347656, "step": 1281 }, { "epoch": 0.8372928402318557, "grad_norm": 32.26978309244302, "learning_rate": 1.1780141354279698e-08, "logits/chosen": -1.4852337837219238, "logits/rejected": -1.4611694812774658, "logps/chosen": -899.8671875, "logps/rejected": -902.857666015625, "loss": 0.4584, "rewards/accuracies": 0.75, "rewards/chosen": -3.5996108055114746, "rewards/margins": 0.4853794276714325, "rewards/rejected": -4.08499002456665, "step": 1282 }, { "epoch": 0.8379459547718181, "grad_norm": 67.20483755586449, "learning_rate": 1.1688244815876735e-08, "logits/chosen": -1.4753847122192383, "logits/rejected": -1.495114803314209, "logps/chosen": -775.127685546875, "logps/rejected": -854.59423828125, "loss": 0.4918, "rewards/accuracies": 0.78125, "rewards/chosen": -3.08735728263855, "rewards/margins": 0.8931238055229187, "rewards/rejected": -3.9804811477661133, "step": 1283 }, { "epoch": 0.8385990693117805, "grad_norm": 31.10069114339872, "learning_rate": 1.1596677823462769e-08, "logits/chosen": -1.4978846311569214, "logits/rejected": -1.5116081237792969, "logps/chosen": -835.6173095703125, "logps/rejected": -892.9727783203125, "loss": 0.4633, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4220669269561768, "rewards/margins": 1.0128189325332642, "rewards/rejected": -4.4348859786987305, "step": 1284 }, { "epoch": 0.839252183851743, "grad_norm": 95.67340364123025, "learning_rate": 1.1505440853655996e-08, "logits/chosen": -1.580156922340393, "logits/rejected": -1.564608097076416, "logps/chosen": -828.8040771484375, "logps/rejected": -858.2422485351562, "loss": 0.5397, "rewards/accuracies": 0.65625, "rewards/chosen": -3.2406132221221924, "rewards/margins": 0.5403940677642822, "rewards/rejected": -3.7810075283050537, "step": 1285 }, { "epoch": 0.8399052983917055, "grad_norm": 20.610401240104963, "learning_rate": 1.1414534381356817e-08, "logits/chosen": -1.457862377166748, "logits/rejected": -1.4158681631088257, "logps/chosen": -854.0062255859375, "logps/rejected": -980.3665771484375, "loss": 0.5507, "rewards/accuracies": 0.78125, "rewards/chosen": -3.427743911743164, "rewards/margins": 1.1576130390167236, "rewards/rejected": -4.585356712341309, "step": 1286 }, { "epoch": 0.8405584129316679, "grad_norm": 15.387698511738025, "learning_rate": 1.132395887974536e-08, "logits/chosen": -1.512953758239746, "logits/rejected": -1.4666166305541992, "logps/chosen": -873.6098022460938, "logps/rejected": -931.2805786132812, "loss": 0.5066, "rewards/accuracies": 0.8125, "rewards/chosen": -3.396209478378296, "rewards/margins": 0.8550810813903809, "rewards/rejected": -4.251290798187256, "step": 1287 }, { "epoch": 0.8412115274716303, "grad_norm": 17.455285669432005, "learning_rate": 1.123371482027895e-08, "logits/chosen": -1.4253538846969604, "logits/rejected": -1.4188988208770752, "logps/chosen": -823.5361328125, "logps/rejected": -855.1928100585938, "loss": 0.4807, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0535998344421387, "rewards/margins": 1.1148144006729126, "rewards/rejected": -4.16841459274292, "step": 1288 }, { "epoch": 0.8418646420115928, "grad_norm": 95.33465590024296, "learning_rate": 1.1143802672689772e-08, "logits/chosen": -1.5128263235092163, "logits/rejected": -1.5237712860107422, "logps/chosen": -804.3637084960938, "logps/rejected": -936.9696044921875, "loss": 0.4619, "rewards/accuracies": 0.71875, "rewards/chosen": -3.063973903656006, "rewards/margins": 0.989101231098175, "rewards/rejected": -4.053074836730957, "step": 1289 }, { "epoch": 0.8425177565515553, "grad_norm": 18.893079688911747, "learning_rate": 1.1054222904982346e-08, "logits/chosen": -1.4855834245681763, "logits/rejected": -1.4457110166549683, "logps/chosen": -862.0806274414062, "logps/rejected": -923.0704956054688, "loss": 0.4819, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3044419288635254, "rewards/margins": 1.0387816429138184, "rewards/rejected": -4.343223571777344, "step": 1290 }, { "epoch": 0.8431708710915177, "grad_norm": 36.3373091887223, "learning_rate": 1.0964975983431116e-08, "logits/chosen": -1.5048320293426514, "logits/rejected": -1.5244179964065552, "logps/chosen": -848.87548828125, "logps/rejected": -917.09619140625, "loss": 0.4807, "rewards/accuracies": 0.6875, "rewards/chosen": -3.414567708969116, "rewards/margins": 0.6657711267471313, "rewards/rejected": -4.080338954925537, "step": 1291 }, { "epoch": 0.8438239856314801, "grad_norm": 41.15768005703854, "learning_rate": 1.0876062372578e-08, "logits/chosen": -1.4884114265441895, "logits/rejected": -1.495098352432251, "logps/chosen": -854.0869750976562, "logps/rejected": -878.1396484375, "loss": 0.5553, "rewards/accuracies": 0.625, "rewards/chosen": -3.5928542613983154, "rewards/margins": 0.4925929307937622, "rewards/rejected": -4.085447311401367, "step": 1292 }, { "epoch": 0.8444771001714426, "grad_norm": 36.42635614805045, "learning_rate": 1.0787482535230022e-08, "logits/chosen": -1.4687424898147583, "logits/rejected": -1.4143062829971313, "logps/chosen": -813.6422729492188, "logps/rejected": -855.9901123046875, "loss": 0.516, "rewards/accuracies": 0.75, "rewards/chosen": -3.0875415802001953, "rewards/margins": 0.5753628015518188, "rewards/rejected": -3.6629042625427246, "step": 1293 }, { "epoch": 0.845130214711405, "grad_norm": 29.828612800376433, "learning_rate": 1.0699236932456835e-08, "logits/chosen": -1.6260933876037598, "logits/rejected": -1.5973844528198242, "logps/chosen": -855.070068359375, "logps/rejected": -896.7317504882812, "loss": 0.5569, "rewards/accuracies": 0.75, "rewards/chosen": -3.386779546737671, "rewards/margins": 0.6484910249710083, "rewards/rejected": -4.035270690917969, "step": 1294 }, { "epoch": 0.8457833292513675, "grad_norm": 42.20579198570261, "learning_rate": 1.0611326023588388e-08, "logits/chosen": -1.4956504106521606, "logits/rejected": -1.4156677722930908, "logps/chosen": -786.1782836914062, "logps/rejected": -816.1480102539062, "loss": 0.563, "rewards/accuracies": 0.6875, "rewards/chosen": -3.0679595470428467, "rewards/margins": 0.5685796141624451, "rewards/rejected": -3.6365387439727783, "step": 1295 }, { "epoch": 0.8464364437913299, "grad_norm": 13.728489619296084, "learning_rate": 1.0523750266212483e-08, "logits/chosen": -1.5465428829193115, "logits/rejected": -1.500390887260437, "logps/chosen": -882.738525390625, "logps/rejected": -909.25634765625, "loss": 0.5011, "rewards/accuracies": 0.65625, "rewards/chosen": -3.744199752807617, "rewards/margins": 0.5797795653343201, "rewards/rejected": -4.323978900909424, "step": 1296 }, { "epoch": 0.8470895583312924, "grad_norm": 16.665414724980934, "learning_rate": 1.0436510116172425e-08, "logits/chosen": -1.4301414489746094, "logits/rejected": -1.4165202379226685, "logps/chosen": -852.8707275390625, "logps/rejected": -919.6453857421875, "loss": 0.5169, "rewards/accuracies": 0.625, "rewards/chosen": -3.4321398735046387, "rewards/margins": 0.7153012156486511, "rewards/rejected": -4.1474409103393555, "step": 1297 }, { "epoch": 0.8477426728712548, "grad_norm": 51.89482440292027, "learning_rate": 1.0349606027564633e-08, "logits/chosen": -1.4815446138381958, "logits/rejected": -1.4437528848648071, "logps/chosen": -808.208984375, "logps/rejected": -885.4605712890625, "loss": 0.4939, "rewards/accuracies": 0.75, "rewards/chosen": -2.99233078956604, "rewards/margins": 0.8016055822372437, "rewards/rejected": -3.7939364910125732, "step": 1298 }, { "epoch": 0.8483957874112172, "grad_norm": 47.61880788832745, "learning_rate": 1.0263038452736292e-08, "logits/chosen": -1.4789763689041138, "logits/rejected": -1.4989715814590454, "logps/chosen": -751.89453125, "logps/rejected": -971.8363647460938, "loss": 0.4495, "rewards/accuracies": 0.875, "rewards/chosen": -3.0481152534484863, "rewards/margins": 1.4892245531082153, "rewards/rejected": -4.53734016418457, "step": 1299 }, { "epoch": 0.8490489019511797, "grad_norm": 12.835741715040568, "learning_rate": 1.0176807842282977e-08, "logits/chosen": -1.4237325191497803, "logits/rejected": -1.4008545875549316, "logps/chosen": -822.1392211914062, "logps/rejected": -825.3770751953125, "loss": 0.5182, "rewards/accuracies": 0.75, "rewards/chosen": -3.1958327293395996, "rewards/margins": 0.45696282386779785, "rewards/rejected": -3.6527957916259766, "step": 1300 }, { "epoch": 0.8490489019511797, "eval_logits/chosen": -1.4787728786468506, "eval_logits/rejected": -1.4600926637649536, "eval_logps/chosen": -842.8944702148438, "eval_logps/rejected": -926.5396118164062, "eval_loss": 0.49515822529792786, "eval_rewards/accuracies": 0.7599999904632568, "eval_rewards/chosen": -3.3254919052124023, "eval_rewards/margins": 0.9175096750259399, "eval_rewards/rejected": -4.243001461029053, "eval_runtime": 296.4556, "eval_samples_per_second": 13.493, "eval_steps_per_second": 0.843, "step": 1300 }, { "epoch": 0.8497020164911422, "grad_norm": 66.58487284404727, "learning_rate": 1.009091464504633e-08, "logits/chosen": -1.4258389472961426, "logits/rejected": -1.3811410665512085, "logps/chosen": -866.520751953125, "logps/rejected": -932.500244140625, "loss": 0.5058, "rewards/accuracies": 0.75, "rewards/chosen": -3.8197102546691895, "rewards/margins": 0.9209631085395813, "rewards/rejected": -4.740673542022705, "step": 1301 }, { "epoch": 0.8503551310311046, "grad_norm": 15.745580341420917, "learning_rate": 1.0005359308111702e-08, "logits/chosen": -1.535689353942871, "logits/rejected": -1.5087151527404785, "logps/chosen": -806.9986572265625, "logps/rejected": -910.8426513671875, "loss": 0.4717, "rewards/accuracies": 0.71875, "rewards/chosen": -3.248060941696167, "rewards/margins": 1.305046558380127, "rewards/rejected": -4.553107738494873, "step": 1302 }, { "epoch": 0.851008245571067, "grad_norm": 62.332572039035284, "learning_rate": 9.920142276805852e-09, "logits/chosen": -1.465049386024475, "logits/rejected": -1.4151432514190674, "logps/chosen": -927.4882202148438, "logps/rejected": -980.1682739257812, "loss": 0.5446, "rewards/accuracies": 0.75, "rewards/chosen": -3.472818374633789, "rewards/margins": 0.7706753611564636, "rewards/rejected": -4.243494033813477, "step": 1303 }, { "epoch": 0.8516613601110294, "grad_norm": 33.565372393934965, "learning_rate": 9.835263994694587e-09, "logits/chosen": -1.46824049949646, "logits/rejected": -1.4706170558929443, "logps/chosen": -960.3832397460938, "logps/rejected": -1062.023681640625, "loss": 0.4433, "rewards/accuracies": 0.90625, "rewards/chosen": -3.9557857513427734, "rewards/margins": 0.9899990558624268, "rewards/rejected": -4.945785045623779, "step": 1304 }, { "epoch": 0.852314474650992, "grad_norm": 25.81469581412769, "learning_rate": 9.750724903580503e-09, "logits/chosen": -1.5085800886154175, "logits/rejected": -1.5068165063858032, "logps/chosen": -845.734130859375, "logps/rejected": -906.9862060546875, "loss": 0.4746, "rewards/accuracies": 0.6875, "rewards/chosen": -3.1564383506774902, "rewards/margins": 0.7602788209915161, "rewards/rejected": -3.916717052459717, "step": 1305 }, { "epoch": 0.8529675891909544, "grad_norm": 17.54341908103785, "learning_rate": 9.666525443500666e-09, "logits/chosen": -1.5109413862228394, "logits/rejected": -1.397308349609375, "logps/chosen": -848.3529052734375, "logps/rejected": -978.5413818359375, "loss": 0.5116, "rewards/accuracies": 0.65625, "rewards/chosen": -3.651090145111084, "rewards/margins": 1.1326894760131836, "rewards/rejected": -4.783779144287109, "step": 1306 }, { "epoch": 0.8536207037309168, "grad_norm": 17.25055458777116, "learning_rate": 9.582666052724305e-09, "logits/chosen": -1.5393823385238647, "logits/rejected": -1.5182414054870605, "logps/chosen": -882.0748901367188, "logps/rejected": -946.6273803710938, "loss": 0.5355, "rewards/accuracies": 0.75, "rewards/chosen": -3.4100325107574463, "rewards/margins": 0.5562119483947754, "rewards/rejected": -3.9662444591522217, "step": 1307 }, { "epoch": 0.8542738182708792, "grad_norm": 16.192143577334363, "learning_rate": 9.499147167750541e-09, "logits/chosen": -1.425965428352356, "logits/rejected": -1.4162306785583496, "logps/chosen": -811.1448364257812, "logps/rejected": -926.71435546875, "loss": 0.4845, "rewards/accuracies": 0.8125, "rewards/chosen": -3.059163808822632, "rewards/margins": 0.9071324467658997, "rewards/rejected": -3.9662961959838867, "step": 1308 }, { "epoch": 0.8549269328108418, "grad_norm": 74.71761901774033, "learning_rate": 9.415969223306133e-09, "logits/chosen": -1.4104845523834229, "logits/rejected": -1.4388418197631836, "logps/chosen": -902.40380859375, "logps/rejected": -968.486083984375, "loss": 0.5375, "rewards/accuracies": 0.625, "rewards/chosen": -3.3599321842193604, "rewards/margins": 0.6312100291252136, "rewards/rejected": -3.9911422729492188, "step": 1309 }, { "epoch": 0.8555800473508042, "grad_norm": 33.85232737585008, "learning_rate": 9.333132652343193e-09, "logits/chosen": -1.5950632095336914, "logits/rejected": -1.5360954999923706, "logps/chosen": -822.3123779296875, "logps/rejected": -889.439697265625, "loss": 0.4481, "rewards/accuracies": 0.71875, "rewards/chosen": -3.34323787689209, "rewards/margins": 1.0119768381118774, "rewards/rejected": -4.355214595794678, "step": 1310 }, { "epoch": 0.8562331618907666, "grad_norm": 64.61699538785297, "learning_rate": 9.250637886036913e-09, "logits/chosen": -1.4856970310211182, "logits/rejected": -1.4961824417114258, "logps/chosen": -820.3836059570312, "logps/rejected": -874.966796875, "loss": 0.4821, "rewards/accuracies": 0.75, "rewards/chosen": -2.9959676265716553, "rewards/margins": 0.6477092504501343, "rewards/rejected": -3.6436767578125, "step": 1311 }, { "epoch": 0.856886276430729, "grad_norm": 49.59060889243429, "learning_rate": 9.16848535378339e-09, "logits/chosen": -1.4910832643508911, "logits/rejected": -1.4679546356201172, "logps/chosen": -820.6103515625, "logps/rejected": -934.1729125976562, "loss": 0.4813, "rewards/accuracies": 0.78125, "rewards/chosen": -3.2271547317504883, "rewards/margins": 1.107942819595337, "rewards/rejected": -4.335097312927246, "step": 1312 }, { "epoch": 0.8575393909706915, "grad_norm": 67.84816938025705, "learning_rate": 9.086675483197323e-09, "logits/chosen": -1.4484561681747437, "logits/rejected": -1.424304485321045, "logps/chosen": -858.7589111328125, "logps/rejected": -946.8106079101562, "loss": 0.4282, "rewards/accuracies": 0.8125, "rewards/chosen": -3.125239849090576, "rewards/margins": 0.7515519857406616, "rewards/rejected": -3.876791477203369, "step": 1313 }, { "epoch": 0.858192505510654, "grad_norm": 25.662991507947012, "learning_rate": 9.005208700109817e-09, "logits/chosen": -1.4877333641052246, "logits/rejected": -1.4774301052093506, "logps/chosen": -765.2407836914062, "logps/rejected": -906.0118408203125, "loss": 0.4055, "rewards/accuracies": 0.875, "rewards/chosen": -3.1628265380859375, "rewards/margins": 1.3297454118728638, "rewards/rejected": -4.492571830749512, "step": 1314 }, { "epoch": 0.8588456200506164, "grad_norm": 127.58031473226492, "learning_rate": 8.924085428566163e-09, "logits/chosen": -1.4710652828216553, "logits/rejected": -1.4731792211532593, "logps/chosen": -753.9927978515625, "logps/rejected": -920.06005859375, "loss": 0.5066, "rewards/accuracies": 0.71875, "rewards/chosen": -3.049560070037842, "rewards/margins": 0.948535680770874, "rewards/rejected": -3.998095750808716, "step": 1315 }, { "epoch": 0.8594987345905788, "grad_norm": 45.1109159351009, "learning_rate": 8.843306090823632e-09, "logits/chosen": -1.45258367061615, "logits/rejected": -1.4658076763153076, "logps/chosen": -828.3963012695312, "logps/rejected": -949.1826171875, "loss": 0.4856, "rewards/accuracies": 0.75, "rewards/chosen": -3.294034481048584, "rewards/margins": 0.9175934791564941, "rewards/rejected": -4.211627960205078, "step": 1316 }, { "epoch": 0.8601518491305413, "grad_norm": 29.986901265876828, "learning_rate": 8.762871107349267e-09, "logits/chosen": -1.511345386505127, "logits/rejected": -1.4521244764328003, "logps/chosen": -778.9940795898438, "logps/rejected": -893.1011962890625, "loss": 0.5278, "rewards/accuracies": 0.71875, "rewards/chosen": -3.323246479034424, "rewards/margins": 1.0293022394180298, "rewards/rejected": -4.352548599243164, "step": 1317 }, { "epoch": 0.8608049636705037, "grad_norm": 11.35851659183948, "learning_rate": 8.682780896817716e-09, "logits/chosen": -1.3951612710952759, "logits/rejected": -1.4687902927398682, "logps/chosen": -848.2562866210938, "logps/rejected": -946.0987548828125, "loss": 0.5036, "rewards/accuracies": 0.8125, "rewards/chosen": -3.4302592277526855, "rewards/margins": 0.6691716909408569, "rewards/rejected": -4.099431037902832, "step": 1318 }, { "epoch": 0.8614580782104662, "grad_norm": 11.967451850970171, "learning_rate": 8.603035876109013e-09, "logits/chosen": -1.5124762058258057, "logits/rejected": -1.4754977226257324, "logps/chosen": -814.0156860351562, "logps/rejected": -796.652587890625, "loss": 0.4813, "rewards/accuracies": 0.625, "rewards/chosen": -3.291874647140503, "rewards/margins": 0.5844862461090088, "rewards/rejected": -3.8763608932495117, "step": 1319 }, { "epoch": 0.8621111927504286, "grad_norm": 120.18322878609834, "learning_rate": 8.523636460306463e-09, "logits/chosen": -1.4285402297973633, "logits/rejected": -1.4294164180755615, "logps/chosen": -782.0220336914062, "logps/rejected": -898.1871948242188, "loss": 0.5255, "rewards/accuracies": 0.6875, "rewards/chosen": -3.4391157627105713, "rewards/margins": 0.8806977868080139, "rewards/rejected": -4.3198137283325195, "step": 1320 }, { "epoch": 0.862764307290391, "grad_norm": 48.61379884717116, "learning_rate": 8.444583062694439e-09, "logits/chosen": -1.3637633323669434, "logits/rejected": -1.3878041505813599, "logps/chosen": -757.3119506835938, "logps/rejected": -860.0999145507812, "loss": 0.5276, "rewards/accuracies": 0.6875, "rewards/chosen": -3.0247325897216797, "rewards/margins": 0.7837187051773071, "rewards/rejected": -3.8084514141082764, "step": 1321 }, { "epoch": 0.8634174218303535, "grad_norm": 11.712512451580766, "learning_rate": 8.365876094756228e-09, "logits/chosen": -1.5599374771118164, "logits/rejected": -1.5587773323059082, "logps/chosen": -911.1922607421875, "logps/rejected": -981.4324340820312, "loss": 0.5147, "rewards/accuracies": 0.875, "rewards/chosen": -3.283015251159668, "rewards/margins": 0.8019892573356628, "rewards/rejected": -4.085004806518555, "step": 1322 }, { "epoch": 0.864070536370316, "grad_norm": 131.68861126525354, "learning_rate": 8.287515966171928e-09, "logits/chosen": -1.5320029258728027, "logits/rejected": -1.516080379486084, "logps/chosen": -924.517822265625, "logps/rejected": -953.4563598632812, "loss": 0.5502, "rewards/accuracies": 0.78125, "rewards/chosen": -3.6483347415924072, "rewards/margins": 0.7624077796936035, "rewards/rejected": -4.41074275970459, "step": 1323 }, { "epoch": 0.8647236509102784, "grad_norm": 24.356458817864755, "learning_rate": 8.209503084816285e-09, "logits/chosen": -1.546379566192627, "logits/rejected": -1.4577432870864868, "logps/chosen": -800.8156127929688, "logps/rejected": -880.213134765625, "loss": 0.4574, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0747878551483154, "rewards/margins": 1.0365558862686157, "rewards/rejected": -4.1113433837890625, "step": 1324 }, { "epoch": 0.8653767654502408, "grad_norm": 33.56562789954909, "learning_rate": 8.131837856756585e-09, "logits/chosen": -1.4879792928695679, "logits/rejected": -1.4731531143188477, "logps/chosen": -843.258056640625, "logps/rejected": -948.2938232421875, "loss": 0.5075, "rewards/accuracies": 0.8125, "rewards/chosen": -3.485196828842163, "rewards/margins": 1.1023627519607544, "rewards/rejected": -4.587559223175049, "step": 1325 }, { "epoch": 0.8660298799902033, "grad_norm": 60.746333244964944, "learning_rate": 8.054520686250512e-09, "logits/chosen": -1.4594736099243164, "logits/rejected": -1.4537848234176636, "logps/chosen": -705.6865844726562, "logps/rejected": -833.6557006835938, "loss": 0.4751, "rewards/accuracies": 0.84375, "rewards/chosen": -2.902322769165039, "rewards/margins": 0.7870765924453735, "rewards/rejected": -3.689399242401123, "step": 1326 }, { "epoch": 0.8666829945301657, "grad_norm": 27.41375334113206, "learning_rate": 7.977551975744088e-09, "logits/chosen": -1.5716170072555542, "logits/rejected": -1.5631660223007202, "logps/chosen": -939.8978271484375, "logps/rejected": -984.3545532226562, "loss": 0.5208, "rewards/accuracies": 0.6875, "rewards/chosen": -4.000885963439941, "rewards/margins": 0.5963870286941528, "rewards/rejected": -4.597273349761963, "step": 1327 }, { "epoch": 0.8673361090701281, "grad_norm": 16.93864295624459, "learning_rate": 7.900932125869545e-09, "logits/chosen": -1.4590840339660645, "logits/rejected": -1.4381649494171143, "logps/chosen": -835.8486938476562, "logps/rejected": -1007.2196655273438, "loss": 0.5122, "rewards/accuracies": 0.75, "rewards/chosen": -3.375314712524414, "rewards/margins": 1.4108258485794067, "rewards/rejected": -4.786140441894531, "step": 1328 }, { "epoch": 0.8679892236100906, "grad_norm": 28.716003483649484, "learning_rate": 7.824661535443247e-09, "logits/chosen": -1.6202168464660645, "logits/rejected": -1.5609275102615356, "logps/chosen": -914.7964477539062, "logps/rejected": -1054.38525390625, "loss": 0.5092, "rewards/accuracies": 0.8125, "rewards/chosen": -3.545833110809326, "rewards/margins": 1.0506269931793213, "rewards/rejected": -4.596460342407227, "step": 1329 }, { "epoch": 0.8686423381500531, "grad_norm": 41.01865127499047, "learning_rate": 7.748740601463622e-09, "logits/chosen": -1.48884117603302, "logits/rejected": -1.5338010787963867, "logps/chosen": -731.7664794921875, "logps/rejected": -812.95458984375, "loss": 0.4394, "rewards/accuracies": 0.625, "rewards/chosen": -3.1326169967651367, "rewards/margins": 0.877723217010498, "rewards/rejected": -4.010340213775635, "step": 1330 }, { "epoch": 0.8692954526900155, "grad_norm": 64.11269025670589, "learning_rate": 7.673169719109091e-09, "logits/chosen": -1.4994860887527466, "logits/rejected": -1.4728403091430664, "logps/chosen": -873.9459228515625, "logps/rejected": -884.364501953125, "loss": 0.5138, "rewards/accuracies": 0.65625, "rewards/chosen": -3.492875337600708, "rewards/margins": 0.4393343925476074, "rewards/rejected": -3.9322092533111572, "step": 1331 }, { "epoch": 0.8699485672299779, "grad_norm": 38.41559411716, "learning_rate": 7.597949281736019e-09, "logits/chosen": -1.5339343547821045, "logits/rejected": -1.4958739280700684, "logps/chosen": -808.7354125976562, "logps/rejected": -932.888916015625, "loss": 0.5561, "rewards/accuracies": 0.6875, "rewards/chosen": -3.077152967453003, "rewards/margins": 1.2154319286346436, "rewards/rejected": -4.2925848960876465, "step": 1332 }, { "epoch": 0.8706016817699403, "grad_norm": 12.205155892018915, "learning_rate": 7.523079680876613e-09, "logits/chosen": -1.416054368019104, "logits/rejected": -1.377394676208496, "logps/chosen": -824.678955078125, "logps/rejected": -965.9581909179688, "loss": 0.4893, "rewards/accuracies": 0.78125, "rewards/chosen": -4.192705154418945, "rewards/margins": 0.9551449418067932, "rewards/rejected": -5.147850036621094, "step": 1333 }, { "epoch": 0.8712547963099029, "grad_norm": 57.99000568713857, "learning_rate": 7.448561306236989e-09, "logits/chosen": -1.4369691610336304, "logits/rejected": -1.4728755950927734, "logps/chosen": -964.21435546875, "logps/rejected": -1086.426513671875, "loss": 0.4682, "rewards/accuracies": 0.8125, "rewards/chosen": -3.6197071075439453, "rewards/margins": 1.294512391090393, "rewards/rejected": -4.914219379425049, "step": 1334 }, { "epoch": 0.8719079108498653, "grad_norm": 20.044699015378878, "learning_rate": 7.374394545695062e-09, "logits/chosen": -1.4569401741027832, "logits/rejected": -1.4874026775360107, "logps/chosen": -895.0442504882812, "logps/rejected": -896.1160888671875, "loss": 0.4917, "rewards/accuracies": 0.6875, "rewards/chosen": -3.06990385055542, "rewards/margins": 0.44757652282714844, "rewards/rejected": -3.5174806118011475, "step": 1335 }, { "epoch": 0.8725610253898277, "grad_norm": 37.40096323479501, "learning_rate": 7.300579785298516e-09, "logits/chosen": -1.5550789833068848, "logits/rejected": -1.561784029006958, "logps/chosen": -865.9757690429688, "logps/rejected": -1000.7755126953125, "loss": 0.461, "rewards/accuracies": 0.84375, "rewards/chosen": -3.3865833282470703, "rewards/margins": 0.9827724695205688, "rewards/rejected": -4.36935567855835, "step": 1336 }, { "epoch": 0.8732141399297901, "grad_norm": 10.447772657085043, "learning_rate": 7.227117409262912e-09, "logits/chosen": -1.477044701576233, "logits/rejected": -1.4742741584777832, "logps/chosen": -804.1036987304688, "logps/rejected": -945.7398681640625, "loss": 0.4617, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0770020484924316, "rewards/margins": 1.022882342338562, "rewards/rejected": -4.099884033203125, "step": 1337 }, { "epoch": 0.8738672544697527, "grad_norm": 16.999701561801512, "learning_rate": 7.154007799969517e-09, "logits/chosen": -1.5495110750198364, "logits/rejected": -1.5202927589416504, "logps/chosen": -835.4287109375, "logps/rejected": -846.221435546875, "loss": 0.4887, "rewards/accuracies": 0.8125, "rewards/chosen": -3.041454315185547, "rewards/margins": 0.6531744003295898, "rewards/rejected": -3.6946287155151367, "step": 1338 }, { "epoch": 0.8745203690097151, "grad_norm": 93.85006471838479, "learning_rate": 7.081251337963442e-09, "logits/chosen": -1.581192970275879, "logits/rejected": -1.5974817276000977, "logps/chosen": -931.1429443359375, "logps/rejected": -1022.624755859375, "loss": 0.4658, "rewards/accuracies": 0.875, "rewards/chosen": -3.4839534759521484, "rewards/margins": 1.0064407587051392, "rewards/rejected": -4.490394115447998, "step": 1339 }, { "epoch": 0.8751734835496775, "grad_norm": 57.219568967053746, "learning_rate": 7.008848401951622e-09, "logits/chosen": -1.449042558670044, "logits/rejected": -1.4050977230072021, "logps/chosen": -781.509521484375, "logps/rejected": -854.7266845703125, "loss": 0.5102, "rewards/accuracies": 0.75, "rewards/chosen": -2.965616226196289, "rewards/margins": 0.8889732956886292, "rewards/rejected": -3.8545899391174316, "step": 1340 }, { "epoch": 0.8758265980896399, "grad_norm": 73.1771564404403, "learning_rate": 6.9367993688008195e-09, "logits/chosen": -1.4334369897842407, "logits/rejected": -1.4349690675735474, "logps/chosen": -838.9798583984375, "logps/rejected": -894.887939453125, "loss": 0.4708, "rewards/accuracies": 0.78125, "rewards/chosen": -3.614448070526123, "rewards/margins": 0.6810629367828369, "rewards/rejected": -4.295510768890381, "step": 1341 }, { "epoch": 0.8764797126296024, "grad_norm": 88.67111576311824, "learning_rate": 6.865104613535719e-09, "logits/chosen": -1.5720555782318115, "logits/rejected": -1.556181788444519, "logps/chosen": -1001.600830078125, "logps/rejected": -1061.5048828125, "loss": 0.5376, "rewards/accuracies": 0.71875, "rewards/chosen": -3.814486503601074, "rewards/margins": 0.7989178895950317, "rewards/rejected": -4.613404273986816, "step": 1342 }, { "epoch": 0.8771328271695649, "grad_norm": 107.66602305396016, "learning_rate": 6.7937645093369076e-09, "logits/chosen": -1.5547233819961548, "logits/rejected": -1.5719074010849, "logps/chosen": -969.04296875, "logps/rejected": -1082.557373046875, "loss": 0.4579, "rewards/accuracies": 0.75, "rewards/chosen": -3.629171371459961, "rewards/margins": 1.1535873413085938, "rewards/rejected": -4.782759189605713, "step": 1343 }, { "epoch": 0.8777859417095273, "grad_norm": 12.704339641522967, "learning_rate": 6.722779427539007e-09, "logits/chosen": -1.5757780075073242, "logits/rejected": -1.5685677528381348, "logps/chosen": -892.2096557617188, "logps/rejected": -1070.7259521484375, "loss": 0.5011, "rewards/accuracies": 0.78125, "rewards/chosen": -3.5382652282714844, "rewards/margins": 1.3167110681533813, "rewards/rejected": -4.854976654052734, "step": 1344 }, { "epoch": 0.8784390562494897, "grad_norm": 74.90456047914228, "learning_rate": 6.6521497376286425e-09, "logits/chosen": -1.4474490880966187, "logits/rejected": -1.393038272857666, "logps/chosen": -863.8945922851562, "logps/rejected": -985.6024169921875, "loss": 0.525, "rewards/accuracies": 0.6875, "rewards/chosen": -3.4975996017456055, "rewards/margins": 1.0218359231948853, "rewards/rejected": -4.519434928894043, "step": 1345 }, { "epoch": 0.8790921707894522, "grad_norm": 16.34267655213826, "learning_rate": 6.581875807242643e-09, "logits/chosen": -1.4592047929763794, "logits/rejected": -1.4397943019866943, "logps/chosen": -822.5889892578125, "logps/rejected": -898.7081909179688, "loss": 0.4558, "rewards/accuracies": 0.84375, "rewards/chosen": -2.987348794937134, "rewards/margins": 0.8248506784439087, "rewards/rejected": -3.812199115753174, "step": 1346 }, { "epoch": 0.8797452853294147, "grad_norm": 16.0782479464605, "learning_rate": 6.51195800216601e-09, "logits/chosen": -1.5585129261016846, "logits/rejected": -1.5897541046142578, "logps/chosen": -826.855224609375, "logps/rejected": -982.5360717773438, "loss": 0.4817, "rewards/accuracies": 0.90625, "rewards/chosen": -2.9745357036590576, "rewards/margins": 0.9419158697128296, "rewards/rejected": -3.9164514541625977, "step": 1347 }, { "epoch": 0.8803983998693771, "grad_norm": 15.944369178445228, "learning_rate": 6.442396686330104e-09, "logits/chosen": -1.5734515190124512, "logits/rejected": -1.5492303371429443, "logps/chosen": -877.200927734375, "logps/rejected": -935.7286987304688, "loss": 0.4639, "rewards/accuracies": 0.6875, "rewards/chosen": -3.669853925704956, "rewards/margins": 0.9167975187301636, "rewards/rejected": -4.58665132522583, "step": 1348 }, { "epoch": 0.8810515144093395, "grad_norm": 12.702778495003988, "learning_rate": 6.373192221810694e-09, "logits/chosen": -1.5214407444000244, "logits/rejected": -1.5045905113220215, "logps/chosen": -798.3970947265625, "logps/rejected": -847.7827758789062, "loss": 0.4624, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3049206733703613, "rewards/margins": 0.854720950126648, "rewards/rejected": -4.159641742706299, "step": 1349 }, { "epoch": 0.881704628949302, "grad_norm": 100.6971315662948, "learning_rate": 6.304344968826094e-09, "logits/chosen": -1.4444749355316162, "logits/rejected": -1.461186170578003, "logps/chosen": -823.1656494140625, "logps/rejected": -854.8773193359375, "loss": 0.4764, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2248125076293945, "rewards/margins": 0.8384161591529846, "rewards/rejected": -4.063228607177734, "step": 1350 }, { "epoch": 0.8823577434892644, "grad_norm": 28.966983505349088, "learning_rate": 6.235855285735289e-09, "logits/chosen": -1.373712182044983, "logits/rejected": -1.3527984619140625, "logps/chosen": -833.9160766601562, "logps/rejected": -975.32080078125, "loss": 0.4452, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4645469188690186, "rewards/margins": 1.1589038372039795, "rewards/rejected": -4.623451232910156, "step": 1351 }, { "epoch": 0.8830108580292269, "grad_norm": 11.232677637593673, "learning_rate": 6.167723529036051e-09, "logits/chosen": -1.4311108589172363, "logits/rejected": -1.4204224348068237, "logps/chosen": -745.5303955078125, "logps/rejected": -826.6415405273438, "loss": 0.556, "rewards/accuracies": 0.6875, "rewards/chosen": -2.9358832836151123, "rewards/margins": 0.5993090271949768, "rewards/rejected": -3.5351924896240234, "step": 1352 }, { "epoch": 0.8836639725691893, "grad_norm": 52.83410951930048, "learning_rate": 6.099950053363109e-09, "logits/chosen": -1.5700377225875854, "logits/rejected": -1.5207599401474, "logps/chosen": -897.0535888671875, "logps/rejected": -950.018798828125, "loss": 0.4828, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4924488067626953, "rewards/margins": 0.8256914019584656, "rewards/rejected": -4.318140506744385, "step": 1353 }, { "epoch": 0.8843170871091518, "grad_norm": 48.439495319236386, "learning_rate": 6.032535211486303e-09, "logits/chosen": -1.5001033544540405, "logits/rejected": -1.4997605085372925, "logps/chosen": -765.0709838867188, "logps/rejected": -830.47900390625, "loss": 0.4423, "rewards/accuracies": 0.71875, "rewards/chosen": -3.0950000286102295, "rewards/margins": 0.8747947216033936, "rewards/rejected": -3.969794273376465, "step": 1354 }, { "epoch": 0.8849702016491142, "grad_norm": 56.98461717362285, "learning_rate": 5.965479354308739e-09, "logits/chosen": -1.4557162523269653, "logits/rejected": -1.4349863529205322, "logps/chosen": -910.533935546875, "logps/rejected": -992.22998046875, "loss": 0.5308, "rewards/accuracies": 0.8125, "rewards/chosen": -3.4078073501586914, "rewards/margins": 0.8652389049530029, "rewards/rejected": -4.273046016693115, "step": 1355 }, { "epoch": 0.8856233161890766, "grad_norm": 24.54186584954266, "learning_rate": 5.898782830864909e-09, "logits/chosen": -1.51798415184021, "logits/rejected": -1.564520001411438, "logps/chosen": -840.780029296875, "logps/rejected": -975.0224609375, "loss": 0.4677, "rewards/accuracies": 0.6875, "rewards/chosen": -3.289569139480591, "rewards/margins": 1.0400049686431885, "rewards/rejected": -4.329574108123779, "step": 1356 }, { "epoch": 0.886276430729039, "grad_norm": 104.63221608787276, "learning_rate": 5.832445988318996e-09, "logits/chosen": -1.552783489227295, "logits/rejected": -1.462197184562683, "logps/chosen": -885.2568969726562, "logps/rejected": -968.8541259765625, "loss": 0.5129, "rewards/accuracies": 0.8125, "rewards/chosen": -3.7370312213897705, "rewards/margins": 1.4136296510696411, "rewards/rejected": -5.150660514831543, "step": 1357 }, { "epoch": 0.8869295452690016, "grad_norm": 45.185665578640275, "learning_rate": 5.766469171962943e-09, "logits/chosen": -1.5153913497924805, "logits/rejected": -1.4714546203613281, "logps/chosen": -927.0983276367188, "logps/rejected": -1037.369140625, "loss": 0.4428, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4304184913635254, "rewards/margins": 0.805862307548523, "rewards/rejected": -4.23628044128418, "step": 1358 }, { "epoch": 0.887582659808964, "grad_norm": 79.59548680251643, "learning_rate": 5.7008527252147525e-09, "logits/chosen": -1.5614638328552246, "logits/rejected": -1.5693732500076294, "logps/chosen": -825.9854736328125, "logps/rejected": -924.3582763671875, "loss": 0.4688, "rewards/accuracies": 0.75, "rewards/chosen": -3.293821096420288, "rewards/margins": 1.0692007541656494, "rewards/rejected": -4.363021373748779, "step": 1359 }, { "epoch": 0.8882357743489264, "grad_norm": 17.890553860709666, "learning_rate": 5.635596989616628e-09, "logits/chosen": -1.3857258558273315, "logits/rejected": -1.3223706483840942, "logps/chosen": -842.4962768554688, "logps/rejected": -924.7880859375, "loss": 0.5206, "rewards/accuracies": 0.71875, "rewards/chosen": -3.7687184810638428, "rewards/margins": 0.9322965145111084, "rewards/rejected": -4.701014518737793, "step": 1360 }, { "epoch": 0.8888888888888888, "grad_norm": 86.3211811567686, "learning_rate": 5.570702304833225e-09, "logits/chosen": -1.4879322052001953, "logits/rejected": -1.5010827779769897, "logps/chosen": -828.9422607421875, "logps/rejected": -911.2158203125, "loss": 0.482, "rewards/accuracies": 0.71875, "rewards/chosen": -3.6344053745269775, "rewards/margins": 0.5890753865242004, "rewards/rejected": -4.223480701446533, "step": 1361 }, { "epoch": 0.8895420034288514, "grad_norm": 44.45040636623173, "learning_rate": 5.5061690086498995e-09, "logits/chosen": -1.60628342628479, "logits/rejected": -1.5504629611968994, "logps/chosen": -918.0870971679688, "logps/rejected": -938.4238891601562, "loss": 0.5785, "rewards/accuracies": 0.6875, "rewards/chosen": -3.70293927192688, "rewards/margins": 0.504538357257843, "rewards/rejected": -4.207477569580078, "step": 1362 }, { "epoch": 0.8901951179688138, "grad_norm": 59.14938077433194, "learning_rate": 5.441997436970908e-09, "logits/chosen": -1.5273542404174805, "logits/rejected": -1.4696133136749268, "logps/chosen": -905.05615234375, "logps/rejected": -1024.7708740234375, "loss": 0.5164, "rewards/accuracies": 0.78125, "rewards/chosen": -3.6890876293182373, "rewards/margins": 0.9259529709815979, "rewards/rejected": -4.6150407791137695, "step": 1363 }, { "epoch": 0.8908482325087762, "grad_norm": 20.847094913779035, "learning_rate": 5.3781879238177175e-09, "logits/chosen": -1.4799872636795044, "logits/rejected": -1.4763946533203125, "logps/chosen": -818.2520751953125, "logps/rejected": -900.87353515625, "loss": 0.5318, "rewards/accuracies": 0.75, "rewards/chosen": -3.162236213684082, "rewards/margins": 0.6138532757759094, "rewards/rejected": -3.776089668273926, "step": 1364 }, { "epoch": 0.8915013470487386, "grad_norm": 42.086051676513534, "learning_rate": 5.314740801327189e-09, "logits/chosen": -1.5711402893066406, "logits/rejected": -1.5064854621887207, "logps/chosen": -800.728515625, "logps/rejected": -949.0936889648438, "loss": 0.4662, "rewards/accuracies": 0.96875, "rewards/chosen": -3.1598963737487793, "rewards/margins": 1.2613778114318848, "rewards/rejected": -4.421274662017822, "step": 1365 }, { "epoch": 0.8921544615887012, "grad_norm": 36.43558502369589, "learning_rate": 5.251656399749948e-09, "logits/chosen": -1.5465190410614014, "logits/rejected": -1.5165586471557617, "logps/chosen": -931.0474853515625, "logps/rejected": -1152.05126953125, "loss": 0.4131, "rewards/accuracies": 0.90625, "rewards/chosen": -3.7468013763427734, "rewards/margins": 1.6344844102859497, "rewards/rejected": -5.381285667419434, "step": 1366 }, { "epoch": 0.8928075761286636, "grad_norm": 18.023822032353266, "learning_rate": 5.1889350474485425e-09, "logits/chosen": -1.4559390544891357, "logits/rejected": -1.4189265966415405, "logps/chosen": -787.0001831054688, "logps/rejected": -834.5303344726562, "loss": 0.4982, "rewards/accuracies": 0.75, "rewards/chosen": -2.818913698196411, "rewards/margins": 0.7834856510162354, "rewards/rejected": -3.6023998260498047, "step": 1367 }, { "epoch": 0.893460690668626, "grad_norm": 51.00256674463102, "learning_rate": 5.126577070895851e-09, "logits/chosen": -1.4489341974258423, "logits/rejected": -1.4130151271820068, "logps/chosen": -933.874267578125, "logps/rejected": -1004.5009765625, "loss": 0.5661, "rewards/accuracies": 0.78125, "rewards/chosen": -3.800851345062256, "rewards/margins": 1.1616989374160767, "rewards/rejected": -4.962550163269043, "step": 1368 }, { "epoch": 0.8941138052085884, "grad_norm": 28.042878667934062, "learning_rate": 5.064582794673322e-09, "logits/chosen": -1.5686874389648438, "logits/rejected": -1.5284960269927979, "logps/chosen": -913.930908203125, "logps/rejected": -1030.9652099609375, "loss": 0.5008, "rewards/accuracies": 0.71875, "rewards/chosen": -4.0387115478515625, "rewards/margins": 0.8935470581054688, "rewards/rejected": -4.9322590827941895, "step": 1369 }, { "epoch": 0.8947669197485509, "grad_norm": 10.695407778573784, "learning_rate": 5.002952541469296e-09, "logits/chosen": -1.4896258115768433, "logits/rejected": -1.4801172018051147, "logps/chosen": -879.490966796875, "logps/rejected": -959.708251953125, "loss": 0.4915, "rewards/accuracies": 0.78125, "rewards/chosen": -3.568885564804077, "rewards/margins": 0.942472517490387, "rewards/rejected": -4.51135778427124, "step": 1370 }, { "epoch": 0.8954200342885134, "grad_norm": 31.529008960795142, "learning_rate": 4.941686632077316e-09, "logits/chosen": -1.4891539812088013, "logits/rejected": -1.4347100257873535, "logps/chosen": -856.7740478515625, "logps/rejected": -881.070068359375, "loss": 0.4438, "rewards/accuracies": 0.71875, "rewards/chosen": -3.407636880874634, "rewards/margins": 0.6649507284164429, "rewards/rejected": -4.072587490081787, "step": 1371 }, { "epoch": 0.8960731488284758, "grad_norm": 63.6933646160359, "learning_rate": 4.880785385394481e-09, "logits/chosen": -1.5834522247314453, "logits/rejected": -1.5488102436065674, "logps/chosen": -841.972900390625, "logps/rejected": -880.1702880859375, "loss": 0.457, "rewards/accuracies": 0.875, "rewards/chosen": -2.852036714553833, "rewards/margins": 1.1045963764190674, "rewards/rejected": -3.956632614135742, "step": 1372 }, { "epoch": 0.8967262633684382, "grad_norm": 69.78064834856058, "learning_rate": 4.820249118419753e-09, "logits/chosen": -1.4307173490524292, "logits/rejected": -1.4592258930206299, "logps/chosen": -751.7630615234375, "logps/rejected": -990.3554077148438, "loss": 0.4792, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6149396896362305, "rewards/margins": 1.3603920936584473, "rewards/rejected": -3.9753315448760986, "step": 1373 }, { "epoch": 0.8973793779084007, "grad_norm": 10.250335849217539, "learning_rate": 4.760078146252369e-09, "logits/chosen": -1.5606201887130737, "logits/rejected": -1.5576801300048828, "logps/chosen": -913.7922973632812, "logps/rejected": -964.5801391601562, "loss": 0.4753, "rewards/accuracies": 0.75, "rewards/chosen": -4.296699523925781, "rewards/margins": 0.7379961013793945, "rewards/rejected": -5.034695625305176, "step": 1374 }, { "epoch": 0.8980324924483631, "grad_norm": 20.39345965692167, "learning_rate": 4.7002727820901145e-09, "logits/chosen": -1.45150887966156, "logits/rejected": -1.4612077474594116, "logps/chosen": -808.03466796875, "logps/rejected": -928.4208984375, "loss": 0.4667, "rewards/accuracies": 0.6875, "rewards/chosen": -3.283160448074341, "rewards/margins": 1.0168074369430542, "rewards/rejected": -4.2999677658081055, "step": 1375 }, { "epoch": 0.8986856069883256, "grad_norm": 10.800412600192082, "learning_rate": 4.640833337227754e-09, "logits/chosen": -1.5039968490600586, "logits/rejected": -1.4786089658737183, "logps/chosen": -886.8804931640625, "logps/rejected": -959.9457397460938, "loss": 0.4385, "rewards/accuracies": 0.75, "rewards/chosen": -3.3790817260742188, "rewards/margins": 1.2774152755737305, "rewards/rejected": -4.656497001647949, "step": 1376 }, { "epoch": 0.899338721528288, "grad_norm": 103.40843027693559, "learning_rate": 4.581760121055392e-09, "logits/chosen": -1.5100741386413574, "logits/rejected": -1.435150146484375, "logps/chosen": -883.2802124023438, "logps/rejected": -883.0489501953125, "loss": 0.54, "rewards/accuracies": 0.6875, "rewards/chosen": -3.2741122245788574, "rewards/margins": 0.6780447959899902, "rewards/rejected": -3.9521570205688477, "step": 1377 }, { "epoch": 0.8999918360682505, "grad_norm": 48.575124614734456, "learning_rate": 4.523053441056876e-09, "logits/chosen": -1.5189893245697021, "logits/rejected": -1.5679491758346558, "logps/chosen": -803.8499755859375, "logps/rejected": -894.542236328125, "loss": 0.4704, "rewards/accuracies": 0.75, "rewards/chosen": -3.50295352935791, "rewards/margins": 0.8048915266990662, "rewards/rejected": -4.307845592498779, "step": 1378 }, { "epoch": 0.9006449506082129, "grad_norm": 10.423098002326993, "learning_rate": 4.4647136028081536e-09, "logits/chosen": -1.5686522722244263, "logits/rejected": -1.5366755723953247, "logps/chosen": -841.1636352539062, "logps/rejected": -963.812255859375, "loss": 0.4655, "rewards/accuracies": 0.75, "rewards/chosen": -3.380739688873291, "rewards/margins": 1.0111637115478516, "rewards/rejected": -4.391903400421143, "step": 1379 }, { "epoch": 0.9012980651481753, "grad_norm": 103.35704441058242, "learning_rate": 4.4067409099757505e-09, "logits/chosen": -1.4515259265899658, "logits/rejected": -1.421863317489624, "logps/chosen": -816.9014282226562, "logps/rejected": -830.080810546875, "loss": 0.5235, "rewards/accuracies": 0.59375, "rewards/chosen": -3.6674067974090576, "rewards/margins": 0.3336777687072754, "rewards/rejected": -4.001084327697754, "step": 1380 }, { "epoch": 0.9019511796881378, "grad_norm": 17.484965241800026, "learning_rate": 4.349135664315137e-09, "logits/chosen": -1.5792274475097656, "logits/rejected": -1.5765178203582764, "logps/chosen": -947.952392578125, "logps/rejected": -1038.84326171875, "loss": 0.4834, "rewards/accuracies": 0.84375, "rewards/chosen": -3.7614831924438477, "rewards/margins": 0.8391762971878052, "rewards/rejected": -4.6006598472595215, "step": 1381 }, { "epoch": 0.9026042942281003, "grad_norm": 29.173220260017832, "learning_rate": 4.291898165669155e-09, "logits/chosen": -1.5122828483581543, "logits/rejected": -1.5348204374313354, "logps/chosen": -920.7978515625, "logps/rejected": -1017.3441162109375, "loss": 0.446, "rewards/accuracies": 0.6875, "rewards/chosen": -3.638453960418701, "rewards/margins": 0.9234171509742737, "rewards/rejected": -4.56187105178833, "step": 1382 }, { "epoch": 0.9032574087680627, "grad_norm": 20.850486826793972, "learning_rate": 4.235028711966512e-09, "logits/chosen": -1.5382293462753296, "logits/rejected": -1.5173406600952148, "logps/chosen": -957.9508056640625, "logps/rejected": -1045.537353515625, "loss": 0.5194, "rewards/accuracies": 0.78125, "rewards/chosen": -3.8855299949645996, "rewards/margins": 1.027876615524292, "rewards/rejected": -4.9134063720703125, "step": 1383 }, { "epoch": 0.9039105233080251, "grad_norm": 34.38670135334981, "learning_rate": 4.178527599220164e-09, "logits/chosen": -1.420579195022583, "logits/rejected": -1.3815526962280273, "logps/chosen": -813.62255859375, "logps/rejected": -969.841552734375, "loss": 0.5321, "rewards/accuracies": 0.6875, "rewards/chosen": -3.1666247844696045, "rewards/margins": 1.1781651973724365, "rewards/rejected": -4.344789981842041, "step": 1384 }, { "epoch": 0.9045636378479875, "grad_norm": 68.615502539619, "learning_rate": 4.122395121525807e-09, "logits/chosen": -1.4724527597427368, "logits/rejected": -1.4684042930603027, "logps/chosen": -836.29638671875, "logps/rejected": -971.5530395507812, "loss": 0.5204, "rewards/accuracies": 0.8125, "rewards/chosen": -3.4262635707855225, "rewards/margins": 0.8687800765037537, "rewards/rejected": -4.2950439453125, "step": 1385 }, { "epoch": 0.9052167523879501, "grad_norm": 50.58217070416385, "learning_rate": 4.0666315710603585e-09, "logits/chosen": -1.4584506750106812, "logits/rejected": -1.438408613204956, "logps/chosen": -815.32373046875, "logps/rejected": -983.1445922851562, "loss": 0.4398, "rewards/accuracies": 0.84375, "rewards/chosen": -3.262063503265381, "rewards/margins": 1.1712214946746826, "rewards/rejected": -4.433284759521484, "step": 1386 }, { "epoch": 0.9058698669279125, "grad_norm": 37.43119604963389, "learning_rate": 4.011237238080412e-09, "logits/chosen": -1.47329843044281, "logits/rejected": -1.5162948369979858, "logps/chosen": -834.7694091796875, "logps/rejected": -891.7135620117188, "loss": 0.4209, "rewards/accuracies": 0.75, "rewards/chosen": -3.565274238586426, "rewards/margins": 0.6577186584472656, "rewards/rejected": -4.222992897033691, "step": 1387 }, { "epoch": 0.9065229814678749, "grad_norm": 20.4350578675936, "learning_rate": 3.956212410920731e-09, "logits/chosen": -1.5497958660125732, "logits/rejected": -1.5459709167480469, "logps/chosen": -957.7774047851562, "logps/rejected": -1025.171142578125, "loss": 0.4836, "rewards/accuracies": 0.78125, "rewards/chosen": -3.694035053253174, "rewards/margins": 1.2108312845230103, "rewards/rejected": -4.9048662185668945, "step": 1388 }, { "epoch": 0.9071760960078373, "grad_norm": 124.44180200304133, "learning_rate": 3.90155737599277e-09, "logits/chosen": -1.532940149307251, "logits/rejected": -1.521911859512329, "logps/chosen": -958.362060546875, "logps/rejected": -1038.1849365234375, "loss": 0.4972, "rewards/accuracies": 0.71875, "rewards/chosen": -3.649043321609497, "rewards/margins": 1.0646413564682007, "rewards/rejected": -4.713685035705566, "step": 1389 }, { "epoch": 0.9078292105477999, "grad_norm": 21.63423361819083, "learning_rate": 3.847272417783129e-09, "logits/chosen": -1.5962800979614258, "logits/rejected": -1.5604323148727417, "logps/chosen": -924.5335083007812, "logps/rejected": -991.5548095703125, "loss": 0.4995, "rewards/accuracies": 0.65625, "rewards/chosen": -3.826894760131836, "rewards/margins": 0.9313570261001587, "rewards/rejected": -4.758251190185547, "step": 1390 }, { "epoch": 0.9084823250877623, "grad_norm": 33.36489378101839, "learning_rate": 3.793357818852141e-09, "logits/chosen": -1.5311145782470703, "logits/rejected": -1.5007251501083374, "logps/chosen": -857.7115478515625, "logps/rejected": -936.7415161132812, "loss": 0.5142, "rewards/accuracies": 0.84375, "rewards/chosen": -3.454068660736084, "rewards/margins": 1.1231262683868408, "rewards/rejected": -4.577195167541504, "step": 1391 }, { "epoch": 0.9091354396277247, "grad_norm": 23.390479098008697, "learning_rate": 3.739813859832383e-09, "logits/chosen": -1.5484957695007324, "logits/rejected": -1.5225454568862915, "logps/chosen": -945.0482788085938, "logps/rejected": -1007.5425415039062, "loss": 0.5321, "rewards/accuracies": 0.75, "rewards/chosen": -3.9478251934051514, "rewards/margins": 0.5874274969100952, "rewards/rejected": -4.535252571105957, "step": 1392 }, { "epoch": 0.9097885541676871, "grad_norm": 27.975290638594313, "learning_rate": 3.686640819427164e-09, "logits/chosen": -1.4847687482833862, "logits/rejected": -1.4688597917556763, "logps/chosen": -885.1260986328125, "logps/rejected": -946.1005859375, "loss": 0.5318, "rewards/accuracies": 0.71875, "rewards/chosen": -3.731746196746826, "rewards/margins": 0.6578457355499268, "rewards/rejected": -4.389591693878174, "step": 1393 }, { "epoch": 0.9104416687076496, "grad_norm": 11.396992947468927, "learning_rate": 3.633838974409148e-09, "logits/chosen": -1.4794695377349854, "logits/rejected": -1.458986759185791, "logps/chosen": -792.629150390625, "logps/rejected": -829.4083251953125, "loss": 0.5528, "rewards/accuracies": 0.78125, "rewards/chosen": -3.2658491134643555, "rewards/margins": 0.6557501554489136, "rewards/rejected": -3.9215991497039795, "step": 1394 }, { "epoch": 0.9110947832476121, "grad_norm": 51.767743124843925, "learning_rate": 3.5814085996188516e-09, "logits/chosen": -1.5730397701263428, "logits/rejected": -1.5797572135925293, "logps/chosen": -915.05859375, "logps/rejected": -1105.7381591796875, "loss": 0.4246, "rewards/accuracies": 0.6875, "rewards/chosen": -3.560077667236328, "rewards/margins": 1.278112769126892, "rewards/rejected": -4.83819055557251, "step": 1395 }, { "epoch": 0.9117478977875745, "grad_norm": 45.396998730278426, "learning_rate": 3.529349967963263e-09, "logits/chosen": -1.4627869129180908, "logits/rejected": -1.473419427871704, "logps/chosen": -947.2510986328125, "logps/rejected": -1030.341064453125, "loss": 0.4942, "rewards/accuracies": 0.8125, "rewards/chosen": -3.280148506164551, "rewards/margins": 0.8479899764060974, "rewards/rejected": -4.128138065338135, "step": 1396 }, { "epoch": 0.9124010123275369, "grad_norm": 15.614778720558112, "learning_rate": 3.477663350414378e-09, "logits/chosen": -1.5414352416992188, "logits/rejected": -1.560785174369812, "logps/chosen": -816.3881225585938, "logps/rejected": -960.979736328125, "loss": 0.4485, "rewards/accuracies": 0.90625, "rewards/chosen": -3.140190601348877, "rewards/margins": 1.348671555519104, "rewards/rejected": -4.488862037658691, "step": 1397 }, { "epoch": 0.9130541268674994, "grad_norm": 73.82027329116843, "learning_rate": 3.426349016007815e-09, "logits/chosen": -1.4851999282836914, "logits/rejected": -1.4966939687728882, "logps/chosen": -769.3817749023438, "logps/rejected": -939.4908447265625, "loss": 0.4833, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9074413776397705, "rewards/margins": 1.105202555656433, "rewards/rejected": -4.012643337249756, "step": 1398 }, { "epoch": 0.9137072414074618, "grad_norm": 42.06804571860955, "learning_rate": 3.3754072318414346e-09, "logits/chosen": -1.4935294389724731, "logits/rejected": -1.4656096696853638, "logps/chosen": -913.9616088867188, "logps/rejected": -963.2996826171875, "loss": 0.5134, "rewards/accuracies": 0.84375, "rewards/chosen": -4.082716464996338, "rewards/margins": 1.030174970626831, "rewards/rejected": -5.112891674041748, "step": 1399 }, { "epoch": 0.9143603559474243, "grad_norm": 52.8878293286426, "learning_rate": 3.3248382630738813e-09, "logits/chosen": -1.5851317644119263, "logits/rejected": -1.5376235246658325, "logps/chosen": -955.0020141601562, "logps/rejected": -968.5732421875, "loss": 0.5165, "rewards/accuracies": 0.65625, "rewards/chosen": -4.052890777587891, "rewards/margins": 0.612097442150116, "rewards/rejected": -4.664988040924072, "step": 1400 }, { "epoch": 0.9143603559474243, "eval_logits/chosen": -1.479893684387207, "eval_logits/rejected": -1.4609832763671875, "eval_logps/chosen": -843.42822265625, "eval_logps/rejected": -927.4912719726562, "eval_loss": 0.4942656457424164, "eval_rewards/accuracies": 0.7599999904632568, "eval_rewards/chosen": -3.330828905105591, "eval_rewards/margins": 0.9216902256011963, "eval_rewards/rejected": -4.252519130706787, "eval_runtime": 300.2025, "eval_samples_per_second": 13.324, "eval_steps_per_second": 0.833, "step": 1400 }, { "epoch": 0.9150134704873867, "grad_norm": 43.39561827227798, "learning_rate": 3.2746423729232945e-09, "logits/chosen": -1.4578529596328735, "logits/rejected": -1.4696025848388672, "logps/chosen": -826.8597412109375, "logps/rejected": -900.2799682617188, "loss": 0.4753, "rewards/accuracies": 0.78125, "rewards/chosen": -3.285310983657837, "rewards/margins": 0.8875605463981628, "rewards/rejected": -4.172871112823486, "step": 1401 }, { "epoch": 0.9156665850273492, "grad_norm": 12.907015811194501, "learning_rate": 3.224819822665842e-09, "logits/chosen": -1.4285743236541748, "logits/rejected": -1.4410088062286377, "logps/chosen": -723.3681640625, "logps/rejected": -917.68701171875, "loss": 0.4513, "rewards/accuracies": 0.78125, "rewards/chosen": -2.843869209289551, "rewards/margins": 1.6680275201797485, "rewards/rejected": -4.511897087097168, "step": 1402 }, { "epoch": 0.9163196995673116, "grad_norm": 14.776408723830784, "learning_rate": 3.1753708716344364e-09, "logits/chosen": -1.5420118570327759, "logits/rejected": -1.4822347164154053, "logps/chosen": -804.4373779296875, "logps/rejected": -927.0358276367188, "loss": 0.4767, "rewards/accuracies": 0.8125, "rewards/chosen": -3.7062926292419434, "rewards/margins": 0.9591672420501709, "rewards/rejected": -4.665459632873535, "step": 1403 }, { "epoch": 0.916972814107274, "grad_norm": 110.87083222991181, "learning_rate": 3.1262957772173637e-09, "logits/chosen": -1.4802658557891846, "logits/rejected": -1.4908185005187988, "logps/chosen": -890.1760864257812, "logps/rejected": -956.2567138671875, "loss": 0.4505, "rewards/accuracies": 0.875, "rewards/chosen": -3.3808512687683105, "rewards/margins": 0.817065417766571, "rewards/rejected": -4.1979169845581055, "step": 1404 }, { "epoch": 0.9176259286472365, "grad_norm": 61.96498893829653, "learning_rate": 3.0775947948569162e-09, "logits/chosen": -1.4553033113479614, "logits/rejected": -1.469690203666687, "logps/chosen": -904.5153198242188, "logps/rejected": -954.6373901367188, "loss": 0.4687, "rewards/accuracies": 0.75, "rewards/chosen": -3.5189156532287598, "rewards/margins": 0.6943820118904114, "rewards/rejected": -4.2132978439331055, "step": 1405 }, { "epoch": 0.918279043187199, "grad_norm": 67.39643533256593, "learning_rate": 3.0292681780481027e-09, "logits/chosen": -1.587602972984314, "logits/rejected": -1.5586885213851929, "logps/chosen": -860.71923828125, "logps/rejected": -963.667724609375, "loss": 0.4579, "rewards/accuracies": 0.875, "rewards/chosen": -3.3475418090820312, "rewards/margins": 0.9892367124557495, "rewards/rejected": -4.336778163909912, "step": 1406 }, { "epoch": 0.9189321577271614, "grad_norm": 21.82848056849987, "learning_rate": 2.981316178337298e-09, "logits/chosen": -1.4134702682495117, "logits/rejected": -1.4143006801605225, "logps/chosen": -781.74853515625, "logps/rejected": -905.278564453125, "loss": 0.4413, "rewards/accuracies": 0.90625, "rewards/chosen": -3.0943939685821533, "rewards/margins": 1.127016305923462, "rewards/rejected": -4.221409797668457, "step": 1407 }, { "epoch": 0.9195852722671238, "grad_norm": 50.76090513680746, "learning_rate": 2.933739045320946e-09, "logits/chosen": -1.5723644495010376, "logits/rejected": -1.4760441780090332, "logps/chosen": -810.4500122070312, "logps/rejected": -890.5816040039062, "loss": 0.4265, "rewards/accuracies": 0.78125, "rewards/chosen": -3.274219512939453, "rewards/margins": 1.089596152305603, "rewards/rejected": -4.363815784454346, "step": 1408 }, { "epoch": 0.9202383868070863, "grad_norm": 35.65848530691593, "learning_rate": 2.886537026644259e-09, "logits/chosen": -1.4684292078018188, "logits/rejected": -1.492079257965088, "logps/chosen": -921.974853515625, "logps/rejected": -1068.1021728515625, "loss": 0.4578, "rewards/accuracies": 0.84375, "rewards/chosen": -3.9635848999023438, "rewards/margins": 1.2408589124679565, "rewards/rejected": -5.20444393157959, "step": 1409 }, { "epoch": 0.9208915013470488, "grad_norm": 13.02579519124362, "learning_rate": 2.8397103679999535e-09, "logits/chosen": -1.4359222650527954, "logits/rejected": -1.4592533111572266, "logps/chosen": -927.1942749023438, "logps/rejected": -1130.98779296875, "loss": 0.5307, "rewards/accuracies": 0.78125, "rewards/chosen": -3.3110556602478027, "rewards/margins": 1.212038278579712, "rewards/rejected": -4.5230937004089355, "step": 1410 }, { "epoch": 0.9215446158870112, "grad_norm": 71.1836961214141, "learning_rate": 2.7932593131269085e-09, "logits/chosen": -1.506251573562622, "logits/rejected": -1.4403364658355713, "logps/chosen": -857.0916748046875, "logps/rejected": -886.8201293945312, "loss": 0.5038, "rewards/accuracies": 0.75, "rewards/chosen": -3.356765031814575, "rewards/margins": 0.6197909712791443, "rewards/rejected": -3.9765563011169434, "step": 1411 }, { "epoch": 0.9221977304269736, "grad_norm": 13.552404267546486, "learning_rate": 2.747184103808975e-09, "logits/chosen": -1.531203269958496, "logits/rejected": -1.497283935546875, "logps/chosen": -899.0496215820312, "logps/rejected": -952.4412841796875, "loss": 0.4867, "rewards/accuracies": 0.84375, "rewards/chosen": -3.493135452270508, "rewards/margins": 0.9362788796424866, "rewards/rejected": -4.429414749145508, "step": 1412 }, { "epoch": 0.922850844966936, "grad_norm": 83.47511406736928, "learning_rate": 2.7014849798736526e-09, "logits/chosen": -1.4521548748016357, "logits/rejected": -1.4682101011276245, "logps/chosen": -852.2609252929688, "logps/rejected": -949.7713012695312, "loss": 0.4316, "rewards/accuracies": 0.875, "rewards/chosen": -3.3543503284454346, "rewards/margins": 1.411728858947754, "rewards/rejected": -4.766078948974609, "step": 1413 }, { "epoch": 0.9235039595068986, "grad_norm": 20.341630193627967, "learning_rate": 2.6561621791908654e-09, "logits/chosen": -1.53240168094635, "logits/rejected": -1.5243420600891113, "logps/chosen": -909.6138916015625, "logps/rejected": -951.72900390625, "loss": 0.4342, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3367176055908203, "rewards/margins": 0.8963368535041809, "rewards/rejected": -4.233054161071777, "step": 1414 }, { "epoch": 0.924157074046861, "grad_norm": 41.165407344672616, "learning_rate": 2.6112159376717456e-09, "logits/chosen": -1.4888275861740112, "logits/rejected": -1.4489132165908813, "logps/chosen": -770.2021484375, "logps/rejected": -767.581787109375, "loss": 0.5119, "rewards/accuracies": 0.75, "rewards/chosen": -3.1834659576416016, "rewards/margins": 0.5162588357925415, "rewards/rejected": -3.6997246742248535, "step": 1415 }, { "epoch": 0.9248101885868234, "grad_norm": 49.5990277647984, "learning_rate": 2.5666464892673768e-09, "logits/chosen": -1.4363664388656616, "logits/rejected": -1.4629590511322021, "logps/chosen": -779.8847045898438, "logps/rejected": -841.2040405273438, "loss": 0.4964, "rewards/accuracies": 0.75, "rewards/chosen": -2.621016502380371, "rewards/margins": 0.6873450875282288, "rewards/rejected": -3.308361530303955, "step": 1416 }, { "epoch": 0.9254633031267858, "grad_norm": 80.94852955816721, "learning_rate": 2.5224540659675692e-09, "logits/chosen": -1.5635671615600586, "logits/rejected": -1.5525977611541748, "logps/chosen": -759.9940185546875, "logps/rejected": -936.1673583984375, "loss": 0.4763, "rewards/accuracies": 0.90625, "rewards/chosen": -3.061068058013916, "rewards/margins": 1.1235350370407104, "rewards/rejected": -4.184603214263916, "step": 1417 }, { "epoch": 0.9261164176667483, "grad_norm": 42.64229099155968, "learning_rate": 2.4786388977997034e-09, "logits/chosen": -1.5770373344421387, "logits/rejected": -1.4894176721572876, "logps/chosen": -840.8875122070312, "logps/rejected": -915.895263671875, "loss": 0.5005, "rewards/accuracies": 0.6875, "rewards/chosen": -3.297794818878174, "rewards/margins": 0.8870930671691895, "rewards/rejected": -4.184887886047363, "step": 1418 }, { "epoch": 0.9267695322067108, "grad_norm": 95.47316675894163, "learning_rate": 2.435201212827456e-09, "logits/chosen": -1.5091971158981323, "logits/rejected": -1.4532575607299805, "logps/chosen": -874.2222900390625, "logps/rejected": -985.8672485351562, "loss": 0.4799, "rewards/accuracies": 0.5625, "rewards/chosen": -3.3658037185668945, "rewards/margins": 0.7014881372451782, "rewards/rejected": -4.067291736602783, "step": 1419 }, { "epoch": 0.9274226467466732, "grad_norm": 22.60308983441237, "learning_rate": 2.3921412371496834e-09, "logits/chosen": -1.481071949005127, "logits/rejected": -1.403544545173645, "logps/chosen": -835.0723266601562, "logps/rejected": -833.1524047851562, "loss": 0.5539, "rewards/accuracies": 0.75, "rewards/chosen": -3.328864812850952, "rewards/margins": 0.49991360306739807, "rewards/rejected": -3.8287787437438965, "step": 1420 }, { "epoch": 0.9280757612866356, "grad_norm": 22.71188518146946, "learning_rate": 2.349459194899198e-09, "logits/chosen": -1.4905717372894287, "logits/rejected": -1.4012264013290405, "logps/chosen": -864.1593017578125, "logps/rejected": -974.7191162109375, "loss": 0.4975, "rewards/accuracies": 0.71875, "rewards/chosen": -3.465050220489502, "rewards/margins": 0.9912995100021362, "rewards/rejected": -4.4563493728637695, "step": 1421 }, { "epoch": 0.9287288758265981, "grad_norm": 38.38087407026408, "learning_rate": 2.307155308241643e-09, "logits/chosen": -1.51276695728302, "logits/rejected": -1.5155949592590332, "logps/chosen": -840.83056640625, "logps/rejected": -936.4772338867188, "loss": 0.4841, "rewards/accuracies": 0.84375, "rewards/chosen": -3.4179623126983643, "rewards/margins": 1.073073387145996, "rewards/rejected": -4.4910359382629395, "step": 1422 }, { "epoch": 0.9293819903665606, "grad_norm": 31.60230447821753, "learning_rate": 2.2652297973742963e-09, "logits/chosen": -1.5305612087249756, "logits/rejected": -1.509539008140564, "logps/chosen": -770.224609375, "logps/rejected": -883.810302734375, "loss": 0.5138, "rewards/accuracies": 0.75, "rewards/chosen": -3.264803409576416, "rewards/margins": 1.1701805591583252, "rewards/rejected": -4.434983730316162, "step": 1423 }, { "epoch": 0.930035104906523, "grad_norm": 77.90811255353113, "learning_rate": 2.2236828805249184e-09, "logits/chosen": -1.5154668092727661, "logits/rejected": -1.516597867012024, "logps/chosen": -771.7640380859375, "logps/rejected": -854.826904296875, "loss": 0.5166, "rewards/accuracies": 0.875, "rewards/chosen": -2.9361448287963867, "rewards/margins": 0.7485728859901428, "rewards/rejected": -3.6847176551818848, "step": 1424 }, { "epoch": 0.9306882194464854, "grad_norm": 37.785751655050305, "learning_rate": 2.1825147739506805e-09, "logits/chosen": -1.4908900260925293, "logits/rejected": -1.5062459707260132, "logps/chosen": -884.8120727539062, "logps/rejected": -929.1033935546875, "loss": 0.562, "rewards/accuracies": 0.625, "rewards/chosen": -3.515050172805786, "rewards/margins": 0.5184265375137329, "rewards/rejected": -4.03347635269165, "step": 1425 }, { "epoch": 0.9313413339864479, "grad_norm": 15.861485447329448, "learning_rate": 2.141725691936963e-09, "logits/chosen": -1.4910266399383545, "logits/rejected": -1.4862860441207886, "logps/chosen": -826.7039794921875, "logps/rejected": -980.1998291015625, "loss": 0.486, "rewards/accuracies": 0.75, "rewards/chosen": -3.137228012084961, "rewards/margins": 1.146528720855713, "rewards/rejected": -4.283757209777832, "step": 1426 }, { "epoch": 0.9319944485264103, "grad_norm": 28.488624408307338, "learning_rate": 2.1013158467963004e-09, "logits/chosen": -1.6420342922210693, "logits/rejected": -1.6163609027862549, "logps/chosen": -924.2805786132812, "logps/rejected": -944.080810546875, "loss": 0.5047, "rewards/accuracies": 0.75, "rewards/chosen": -3.657701015472412, "rewards/margins": 0.6087216138839722, "rewards/rejected": -4.266422748565674, "step": 1427 }, { "epoch": 0.9326475630663728, "grad_norm": 10.967689522386022, "learning_rate": 2.0612854488672227e-09, "logits/chosen": -1.4555834531784058, "logits/rejected": -1.4464821815490723, "logps/chosen": -761.0975952148438, "logps/rejected": -887.48046875, "loss": 0.448, "rewards/accuracies": 0.875, "rewards/chosen": -3.0129241943359375, "rewards/margins": 1.1502059698104858, "rewards/rejected": -4.163130760192871, "step": 1428 }, { "epoch": 0.9333006776063352, "grad_norm": 14.555394411736843, "learning_rate": 2.0216347065132144e-09, "logits/chosen": -1.4503599405288696, "logits/rejected": -1.418792724609375, "logps/chosen": -837.8484497070312, "logps/rejected": -937.6596069335938, "loss": 0.5286, "rewards/accuracies": 0.71875, "rewards/chosen": -3.435154914855957, "rewards/margins": 0.9483442902565002, "rewards/rejected": -4.3834991455078125, "step": 1429 }, { "epoch": 0.9339537921462977, "grad_norm": 65.87984391899451, "learning_rate": 1.982363826121583e-09, "logits/chosen": -1.5405863523483276, "logits/rejected": -1.545666217803955, "logps/chosen": -946.0952758789062, "logps/rejected": -1030.0819091796875, "loss": 0.477, "rewards/accuracies": 0.78125, "rewards/chosen": -3.5466394424438477, "rewards/margins": 0.8139285445213318, "rewards/rejected": -4.360568046569824, "step": 1430 }, { "epoch": 0.9346069066862601, "grad_norm": 18.205479221571412, "learning_rate": 1.943473012102409e-09, "logits/chosen": -1.531268835067749, "logits/rejected": -1.5460495948791504, "logps/chosen": -898.9242553710938, "logps/rejected": -946.4170532226562, "loss": 0.54, "rewards/accuracies": 0.78125, "rewards/chosen": -3.561772346496582, "rewards/margins": 0.8415082693099976, "rewards/rejected": -4.403279781341553, "step": 1431 }, { "epoch": 0.9352600212262225, "grad_norm": 22.739713888659217, "learning_rate": 1.9049624668874886e-09, "logits/chosen": -1.4721064567565918, "logits/rejected": -1.457444667816162, "logps/chosen": -727.5782470703125, "logps/rejected": -881.88720703125, "loss": 0.432, "rewards/accuracies": 0.78125, "rewards/chosen": -2.979851007461548, "rewards/margins": 1.0425056219100952, "rewards/rejected": -4.0223565101623535, "step": 1432 }, { "epoch": 0.935913135766185, "grad_norm": 23.34819005960656, "learning_rate": 1.866832390929243e-09, "logits/chosen": -1.4648447036743164, "logits/rejected": -1.4732989072799683, "logps/chosen": -815.2117309570312, "logps/rejected": -898.7406005859375, "loss": 0.541, "rewards/accuracies": 0.75, "rewards/chosen": -3.4008235931396484, "rewards/margins": 0.6835126280784607, "rewards/rejected": -4.084336280822754, "step": 1433 }, { "epoch": 0.9365662503061475, "grad_norm": 63.46180789960193, "learning_rate": 1.8290829826997367e-09, "logits/chosen": -1.559678554534912, "logits/rejected": -1.5437560081481934, "logps/chosen": -902.6143188476562, "logps/rejected": -933.3184204101562, "loss": 0.5227, "rewards/accuracies": 0.6875, "rewards/chosen": -3.4241020679473877, "rewards/margins": 0.513259768486023, "rewards/rejected": -3.9373619556427, "step": 1434 }, { "epoch": 0.9372193648461099, "grad_norm": 34.479724315346566, "learning_rate": 1.7917144386895926e-09, "logits/chosen": -1.5433356761932373, "logits/rejected": -1.4930343627929688, "logps/chosen": -839.4082641601562, "logps/rejected": -884.831298828125, "loss": 0.5361, "rewards/accuracies": 0.71875, "rewards/chosen": -3.436253547668457, "rewards/margins": 0.7238208651542664, "rewards/rejected": -4.160074234008789, "step": 1435 }, { "epoch": 0.9378724793860723, "grad_norm": 15.898555518439984, "learning_rate": 1.7547269534069626e-09, "logits/chosen": -1.4363042116165161, "logits/rejected": -1.426371455192566, "logps/chosen": -834.6913452148438, "logps/rejected": -999.5419921875, "loss": 0.4245, "rewards/accuracies": 0.875, "rewards/chosen": -3.229515552520752, "rewards/margins": 1.1570442914962769, "rewards/rejected": -4.386559963226318, "step": 1436 }, { "epoch": 0.9385255939260347, "grad_norm": 24.923739035573195, "learning_rate": 1.7181207193765756e-09, "logits/chosen": -1.5305100679397583, "logits/rejected": -1.5034910440444946, "logps/chosen": -884.4599609375, "logps/rejected": -919.95947265625, "loss": 0.4775, "rewards/accuracies": 0.875, "rewards/chosen": -3.1505179405212402, "rewards/margins": 0.9173092842102051, "rewards/rejected": -4.067827224731445, "step": 1437 }, { "epoch": 0.9391787084659973, "grad_norm": 22.943505162467492, "learning_rate": 1.681895927138674e-09, "logits/chosen": -1.532247543334961, "logits/rejected": -1.5096689462661743, "logps/chosen": -860.7628173828125, "logps/rejected": -1032.169189453125, "loss": 0.4488, "rewards/accuracies": 0.90625, "rewards/chosen": -3.4541678428649902, "rewards/margins": 1.4435040950775146, "rewards/rejected": -4.897672176361084, "step": 1438 }, { "epoch": 0.9398318230059597, "grad_norm": 43.92276660865622, "learning_rate": 1.646052765248046e-09, "logits/chosen": -1.5007368326187134, "logits/rejected": -1.539098858833313, "logps/chosen": -893.9174194335938, "logps/rejected": -938.9563598632812, "loss": 0.5518, "rewards/accuracies": 0.6875, "rewards/chosen": -4.333798885345459, "rewards/margins": 0.5584127902984619, "rewards/rejected": -4.8922119140625, "step": 1439 }, { "epoch": 0.9404849375459221, "grad_norm": 34.468585588337255, "learning_rate": 1.6105914202730608e-09, "logits/chosen": -1.4849194288253784, "logits/rejected": -1.4576518535614014, "logps/chosen": -796.8408813476562, "logps/rejected": -966.2708129882812, "loss": 0.4925, "rewards/accuracies": 0.71875, "rewards/chosen": -3.009169578552246, "rewards/margins": 1.379059076309204, "rewards/rejected": -4.388228893280029, "step": 1440 }, { "epoch": 0.9411380520858845, "grad_norm": 40.61462323177837, "learning_rate": 1.5755120767946607e-09, "logits/chosen": -1.4398796558380127, "logits/rejected": -1.3817028999328613, "logps/chosen": -766.0177612304688, "logps/rejected": -804.5883178710938, "loss": 0.4724, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0304973125457764, "rewards/margins": 0.6275922656059265, "rewards/rejected": -3.6580896377563477, "step": 1441 }, { "epoch": 0.941791166625847, "grad_norm": 10.887896926258218, "learning_rate": 1.5408149174054446e-09, "logits/chosen": -1.4392205476760864, "logits/rejected": -1.4316151142120361, "logps/chosen": -780.572509765625, "logps/rejected": -929.8662109375, "loss": 0.4883, "rewards/accuracies": 0.875, "rewards/chosen": -2.8742194175720215, "rewards/margins": 1.0152424573898315, "rewards/rejected": -3.8894619941711426, "step": 1442 }, { "epoch": 0.9424442811658095, "grad_norm": 11.226257970745792, "learning_rate": 1.506500122708662e-09, "logits/chosen": -1.4671475887298584, "logits/rejected": -1.4885404109954834, "logps/chosen": -872.2683715820312, "logps/rejected": -914.68115234375, "loss": 0.5137, "rewards/accuracies": 0.6875, "rewards/chosen": -3.3775594234466553, "rewards/margins": 0.5714359283447266, "rewards/rejected": -3.9489948749542236, "step": 1443 }, { "epoch": 0.9430973957057719, "grad_norm": 75.39569486082955, "learning_rate": 1.4725678713173207e-09, "logits/chosen": -1.5088709592819214, "logits/rejected": -1.502449870109558, "logps/chosen": -910.4465942382812, "logps/rejected": -961.0289306640625, "loss": 0.527, "rewards/accuracies": 0.59375, "rewards/chosen": -3.666200637817383, "rewards/margins": 0.5112284421920776, "rewards/rejected": -4.177428722381592, "step": 1444 }, { "epoch": 0.9437505102457343, "grad_norm": 12.36325740105416, "learning_rate": 1.4390183398532457e-09, "logits/chosen": -1.5747441053390503, "logits/rejected": -1.5668046474456787, "logps/chosen": -940.868408203125, "logps/rejected": -1135.773681640625, "loss": 0.4886, "rewards/accuracies": 0.78125, "rewards/chosen": -3.498914957046509, "rewards/margins": 1.4520134925842285, "rewards/rejected": -4.950928688049316, "step": 1445 }, { "epoch": 0.9444036247856968, "grad_norm": 53.08556208724623, "learning_rate": 1.405851702946148e-09, "logits/chosen": -1.4660958051681519, "logits/rejected": -1.4496102333068848, "logps/chosen": -885.9743041992188, "logps/rejected": -923.9781494140625, "loss": 0.5484, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4384634494781494, "rewards/margins": 0.7487789392471313, "rewards/rejected": -4.18724250793457, "step": 1446 }, { "epoch": 0.9450567393256593, "grad_norm": 10.230443690320527, "learning_rate": 1.3730681332327242e-09, "logits/chosen": -1.409921646118164, "logits/rejected": -1.4098153114318848, "logps/chosen": -854.5587158203125, "logps/rejected": -979.5097045898438, "loss": 0.4908, "rewards/accuracies": 0.71875, "rewards/chosen": -3.432528018951416, "rewards/margins": 1.0839788913726807, "rewards/rejected": -4.516506671905518, "step": 1447 }, { "epoch": 0.9457098538656217, "grad_norm": 51.690528713979994, "learning_rate": 1.3406678013557492e-09, "logits/chosen": -1.4445513486862183, "logits/rejected": -1.489593267440796, "logps/chosen": -868.4315185546875, "logps/rejected": -968.2102661132812, "loss": 0.431, "rewards/accuracies": 0.875, "rewards/chosen": -3.0471696853637695, "rewards/margins": 0.9745462536811829, "rewards/rejected": -4.021716117858887, "step": 1448 }, { "epoch": 0.9463629684055841, "grad_norm": 66.01296376492611, "learning_rate": 1.3086508759631936e-09, "logits/chosen": -1.609882116317749, "logits/rejected": -1.605045199394226, "logps/chosen": -855.3104248046875, "logps/rejected": -889.3307495117188, "loss": 0.543, "rewards/accuracies": 0.65625, "rewards/chosen": -3.2741992473602295, "rewards/margins": 0.46837118268013, "rewards/rejected": -3.742570161819458, "step": 1449 }, { "epoch": 0.9470160829455466, "grad_norm": 69.83844230333594, "learning_rate": 1.2770175237073661e-09, "logits/chosen": -1.5078953504562378, "logits/rejected": -1.4897831678390503, "logps/chosen": -891.066650390625, "logps/rejected": -890.9603271484375, "loss": 0.5112, "rewards/accuracies": 0.625, "rewards/chosen": -3.392768383026123, "rewards/margins": 0.624529242515564, "rewards/rejected": -4.017297267913818, "step": 1450 }, { "epoch": 0.947669197485509, "grad_norm": 96.35598691320328, "learning_rate": 1.2457679092440054e-09, "logits/chosen": -1.5438551902770996, "logits/rejected": -1.5170389413833618, "logps/chosen": -747.1302490234375, "logps/rejected": -902.2919311523438, "loss": 0.4724, "rewards/accuracies": 0.90625, "rewards/chosen": -2.6579556465148926, "rewards/margins": 1.488484501838684, "rewards/rejected": -4.146440505981445, "step": 1451 }, { "epoch": 0.9483223120254715, "grad_norm": 10.598167900496241, "learning_rate": 1.2149021952314654e-09, "logits/chosen": -1.587843418121338, "logits/rejected": -1.5127121210098267, "logps/chosen": -762.05615234375, "logps/rejected": -796.598388671875, "loss": 0.5009, "rewards/accuracies": 0.75, "rewards/chosen": -3.141162157058716, "rewards/margins": 0.5582625865936279, "rewards/rejected": -3.699424982070923, "step": 1452 }, { "epoch": 0.9489754265654339, "grad_norm": 37.95658977591617, "learning_rate": 1.1844205423298142e-09, "logits/chosen": -1.5430374145507812, "logits/rejected": -1.494556188583374, "logps/chosen": -900.53173828125, "logps/rejected": -893.8412475585938, "loss": 0.4861, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4113199710845947, "rewards/margins": 0.7247394323348999, "rewards/rejected": -4.136059284210205, "step": 1453 }, { "epoch": 0.9496285411053964, "grad_norm": 37.85534269742698, "learning_rate": 1.15432310920007e-09, "logits/chosen": -1.5332300662994385, "logits/rejected": -1.5049644708633423, "logps/chosen": -879.8773803710938, "logps/rejected": -943.8563842773438, "loss": 0.4722, "rewards/accuracies": 0.84375, "rewards/chosen": -3.507415771484375, "rewards/margins": 0.8981890678405762, "rewards/rejected": -4.405604839324951, "step": 1454 }, { "epoch": 0.9502816556453588, "grad_norm": 13.249961940362768, "learning_rate": 1.1246100525033165e-09, "logits/chosen": -1.51246976852417, "logits/rejected": -1.4882327318191528, "logps/chosen": -900.8251342773438, "logps/rejected": -959.567626953125, "loss": 0.4925, "rewards/accuracies": 0.8125, "rewards/chosen": -3.6211113929748535, "rewards/margins": 0.7765340805053711, "rewards/rejected": -4.397645473480225, "step": 1455 }, { "epoch": 0.9509347701853212, "grad_norm": 78.0554509757586, "learning_rate": 1.0952815268999049e-09, "logits/chosen": -1.4893440008163452, "logits/rejected": -1.458596110343933, "logps/chosen": -775.9324951171875, "logps/rejected": -817.9571533203125, "loss": 0.4805, "rewards/accuracies": 0.75, "rewards/chosen": -3.140364646911621, "rewards/margins": 0.7064178586006165, "rewards/rejected": -3.846782684326172, "step": 1456 }, { "epoch": 0.9515878847252837, "grad_norm": 112.04084469499804, "learning_rate": 1.0663376850486628e-09, "logits/chosen": -1.5067682266235352, "logits/rejected": -1.438143253326416, "logps/chosen": -828.5684814453125, "logps/rejected": -927.2904052734375, "loss": 0.5531, "rewards/accuracies": 0.6875, "rewards/chosen": -3.495786666870117, "rewards/margins": 0.701140820980072, "rewards/rejected": -4.196927547454834, "step": 1457 }, { "epoch": 0.9522409992652462, "grad_norm": 39.94282013087609, "learning_rate": 1.0377786776060854e-09, "logits/chosen": -1.5286598205566406, "logits/rejected": -1.4984127283096313, "logps/chosen": -771.9295654296875, "logps/rejected": -820.18310546875, "loss": 0.4538, "rewards/accuracies": 0.78125, "rewards/chosen": -2.990509033203125, "rewards/margins": 1.0369949340820312, "rewards/rejected": -4.027503967285156, "step": 1458 }, { "epoch": 0.9528941138052086, "grad_norm": 93.88679149047655, "learning_rate": 1.0096046532255376e-09, "logits/chosen": -1.6264631748199463, "logits/rejected": -1.5820178985595703, "logps/chosen": -873.897705078125, "logps/rejected": -970.2108154296875, "loss": 0.5254, "rewards/accuracies": 0.84375, "rewards/chosen": -3.5638842582702637, "rewards/margins": 0.973100483417511, "rewards/rejected": -4.536985397338867, "step": 1459 }, { "epoch": 0.953547228345171, "grad_norm": 19.02700301018552, "learning_rate": 9.818157585565284e-10, "logits/chosen": -1.4352535009384155, "logits/rejected": -1.3992193937301636, "logps/chosen": -827.4281616210938, "logps/rejected": -891.892822265625, "loss": 0.49, "rewards/accuracies": 0.6875, "rewards/chosen": -3.2847142219543457, "rewards/margins": 0.696830153465271, "rewards/rejected": -3.9815444946289062, "step": 1460 }, { "epoch": 0.9542003428851334, "grad_norm": 12.308757033298024, "learning_rate": 9.544121382438875e-10, "logits/chosen": -1.5002676248550415, "logits/rejected": -1.46719491481781, "logps/chosen": -895.8308715820312, "logps/rejected": -926.5982666015625, "loss": 0.4543, "rewards/accuracies": 0.78125, "rewards/chosen": -3.5023105144500732, "rewards/margins": 0.7259926795959473, "rewards/rejected": -4.2283034324646, "step": 1461 }, { "epoch": 0.954853457425096, "grad_norm": 18.453922423435763, "learning_rate": 9.273939349270565e-10, "logits/chosen": -1.5010855197906494, "logits/rejected": -1.3877067565917969, "logps/chosen": -768.5657348632812, "logps/rejected": -797.06396484375, "loss": 0.5329, "rewards/accuracies": 0.6875, "rewards/chosen": -2.9298336505889893, "rewards/margins": 0.8176116943359375, "rewards/rejected": -3.7474451065063477, "step": 1462 }, { "epoch": 0.9555065719650584, "grad_norm": 51.301955382401765, "learning_rate": 9.00761289239324e-10, "logits/chosen": -1.5010892152786255, "logits/rejected": -1.4880714416503906, "logps/chosen": -826.286865234375, "logps/rejected": -911.9449462890625, "loss": 0.4457, "rewards/accuracies": 0.84375, "rewards/chosen": -3.053290843963623, "rewards/margins": 0.8817964792251587, "rewards/rejected": -3.9350876808166504, "step": 1463 }, { "epoch": 0.9561596865050208, "grad_norm": 25.443543945201426, "learning_rate": 8.74514339807117e-10, "logits/chosen": -1.4686068296432495, "logits/rejected": -1.4487457275390625, "logps/chosen": -860.71484375, "logps/rejected": -960.4424438476562, "loss": 0.5242, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8519248962402344, "rewards/margins": 1.1036741733551025, "rewards/rejected": -4.955598831176758, "step": 1464 }, { "epoch": 0.9568128010449832, "grad_norm": 47.38366581435745, "learning_rate": 8.48653223249235e-10, "logits/chosen": -1.4724491834640503, "logits/rejected": -1.4838087558746338, "logps/chosen": -881.0520629882812, "logps/rejected": -929.6651611328125, "loss": 0.4467, "rewards/accuracies": 0.65625, "rewards/chosen": -3.963942289352417, "rewards/margins": 0.7111070156097412, "rewards/rejected": -4.675049304962158, "step": 1465 }, { "epoch": 0.9574659155849458, "grad_norm": 22.495822791256995, "learning_rate": 8.23178074176184e-10, "logits/chosen": -1.5290329456329346, "logits/rejected": -1.5657262802124023, "logps/chosen": -871.360107421875, "logps/rejected": -918.6497802734375, "loss": 0.4997, "rewards/accuracies": 0.875, "rewards/chosen": -3.4585037231445312, "rewards/margins": 0.9077770113945007, "rewards/rejected": -4.3662800788879395, "step": 1466 }, { "epoch": 0.9581190301249082, "grad_norm": 56.19391082571053, "learning_rate": 7.980890251894606e-10, "logits/chosen": -1.5316890478134155, "logits/rejected": -1.5544103384017944, "logps/chosen": -880.1229858398438, "logps/rejected": -1003.7587890625, "loss": 0.4847, "rewards/accuracies": 0.90625, "rewards/chosen": -3.2340664863586426, "rewards/margins": 1.333704948425293, "rewards/rejected": -4.567770957946777, "step": 1467 }, { "epoch": 0.9587721446648706, "grad_norm": 19.370534224099423, "learning_rate": 7.733862068808522e-10, "logits/chosen": -1.5931593179702759, "logits/rejected": -1.595098614692688, "logps/chosen": -917.0612182617188, "logps/rejected": -948.163818359375, "loss": 0.5525, "rewards/accuracies": 0.71875, "rewards/chosen": -3.9437098503112793, "rewards/margins": 0.40092504024505615, "rewards/rejected": -4.344634532928467, "step": 1468 }, { "epoch": 0.959425259204833, "grad_norm": 51.95592093037611, "learning_rate": 7.490697478317709e-10, "logits/chosen": -1.4602024555206299, "logits/rejected": -1.472777247428894, "logps/chosen": -886.6076049804688, "logps/rejected": -977.1819458007812, "loss": 0.4909, "rewards/accuracies": 0.78125, "rewards/chosen": -3.663419246673584, "rewards/margins": 0.9881623983383179, "rewards/rejected": -4.651581764221191, "step": 1469 }, { "epoch": 0.9600783737447955, "grad_norm": 33.610490066674124, "learning_rate": 7.251397746125709e-10, "logits/chosen": -1.5568439960479736, "logits/rejected": -1.548944354057312, "logps/chosen": -813.5231323242188, "logps/rejected": -983.3826904296875, "loss": 0.5045, "rewards/accuracies": 0.75, "rewards/chosen": -3.4104156494140625, "rewards/margins": 1.39304518699646, "rewards/rejected": -4.803461074829102, "step": 1470 }, { "epoch": 0.960731488284758, "grad_norm": 32.807228872430244, "learning_rate": 7.01596411781899e-10, "logits/chosen": -1.5096122026443481, "logits/rejected": -1.489628791809082, "logps/chosen": -898.908447265625, "logps/rejected": -961.82958984375, "loss": 0.4871, "rewards/accuracies": 0.75, "rewards/chosen": -3.5367379188537598, "rewards/margins": 1.0929946899414062, "rewards/rejected": -4.629732608795166, "step": 1471 }, { "epoch": 0.9613846028247204, "grad_norm": 12.404429113630796, "learning_rate": 6.784397818860532e-10, "logits/chosen": -1.4223408699035645, "logits/rejected": -1.4489169120788574, "logps/chosen": -749.7882690429688, "logps/rejected": -879.7198486328125, "loss": 0.478, "rewards/accuracies": 0.875, "rewards/chosen": -3.0195603370666504, "rewards/margins": 1.003127098083496, "rewards/rejected": -4.022687911987305, "step": 1472 }, { "epoch": 0.9620377173646828, "grad_norm": 25.259975174041397, "learning_rate": 6.556700054583253e-10, "logits/chosen": -1.4970064163208008, "logits/rejected": -1.4991717338562012, "logps/chosen": -767.5467529296875, "logps/rejected": -903.8729248046875, "loss": 0.5629, "rewards/accuracies": 0.75, "rewards/chosen": -3.0890581607818604, "rewards/margins": 0.8758949041366577, "rewards/rejected": -3.9649529457092285, "step": 1473 }, { "epoch": 0.9626908319046453, "grad_norm": 56.02069174460643, "learning_rate": 6.332872010183843e-10, "logits/chosen": -1.5272411108016968, "logits/rejected": -1.53700590133667, "logps/chosen": -874.013427734375, "logps/rejected": -928.57373046875, "loss": 0.4722, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1264383792877197, "rewards/margins": 0.8865586519241333, "rewards/rejected": -4.012997150421143, "step": 1474 }, { "epoch": 0.9633439464446077, "grad_norm": 12.235784698068322, "learning_rate": 6.112914850716771e-10, "logits/chosen": -1.4167723655700684, "logits/rejected": -1.4353446960449219, "logps/chosen": -847.5712890625, "logps/rejected": -927.8067626953125, "loss": 0.4272, "rewards/accuracies": 0.84375, "rewards/chosen": -3.4825477600097656, "rewards/margins": 0.6743806004524231, "rewards/rejected": -4.156928539276123, "step": 1475 }, { "epoch": 0.9639970609845702, "grad_norm": 53.93753110085255, "learning_rate": 5.896829721087709e-10, "logits/chosen": -1.4274340867996216, "logits/rejected": -1.419229507446289, "logps/chosen": -805.4702758789062, "logps/rejected": -839.0209350585938, "loss": 0.4951, "rewards/accuracies": 0.71875, "rewards/chosen": -3.2356109619140625, "rewards/margins": 0.7077401280403137, "rewards/rejected": -3.9433510303497314, "step": 1476 }, { "epoch": 0.9646501755245326, "grad_norm": 14.406009332648742, "learning_rate": 5.684617746048198e-10, "logits/chosen": -1.5064961910247803, "logits/rejected": -1.4822742938995361, "logps/chosen": -826.1743774414062, "logps/rejected": -864.1240234375, "loss": 0.4387, "rewards/accuracies": 0.8125, "rewards/chosen": -3.075737476348877, "rewards/margins": 0.7606046199798584, "rewards/rejected": -3.8363420963287354, "step": 1477 }, { "epoch": 0.9653032900644951, "grad_norm": 38.670095331691506, "learning_rate": 5.476280030189406e-10, "logits/chosen": -1.5506258010864258, "logits/rejected": -1.5424368381500244, "logps/chosen": -908.591064453125, "logps/rejected": -1038.738525390625, "loss": 0.5371, "rewards/accuracies": 0.71875, "rewards/chosen": -3.491032838821411, "rewards/margins": 1.2838757038116455, "rewards/rejected": -4.774909019470215, "step": 1478 }, { "epoch": 0.9659564046044575, "grad_norm": 25.905815918474016, "learning_rate": 5.271817657936467e-10, "logits/chosen": -1.4018712043762207, "logits/rejected": -1.3280322551727295, "logps/chosen": -733.0325317382812, "logps/rejected": -867.226806640625, "loss": 0.4825, "rewards/accuracies": 0.75, "rewards/chosen": -3.2157974243164062, "rewards/margins": 0.9778196811676025, "rewards/rejected": -4.19361686706543, "step": 1479 }, { "epoch": 0.96660951914442, "grad_norm": 21.34054547875047, "learning_rate": 5.071231693542732e-10, "logits/chosen": -1.5019912719726562, "logits/rejected": -1.4946105480194092, "logps/chosen": -867.0049438476562, "logps/rejected": -912.6876220703125, "loss": 0.4603, "rewards/accuracies": 0.65625, "rewards/chosen": -3.469493865966797, "rewards/margins": 0.9318346977233887, "rewards/rejected": -4.4013285636901855, "step": 1480 }, { "epoch": 0.9672626336843824, "grad_norm": 53.27235117213442, "learning_rate": 4.874523181084611e-10, "logits/chosen": -1.520746111869812, "logits/rejected": -1.5099103450775146, "logps/chosen": -950.77880859375, "logps/rejected": -1004.3994750976562, "loss": 0.4787, "rewards/accuracies": 0.75, "rewards/chosen": -3.89591908454895, "rewards/margins": 0.5920171141624451, "rewards/rejected": -4.487936019897461, "step": 1481 }, { "epoch": 0.9679157482243449, "grad_norm": 28.111966813993863, "learning_rate": 4.681693144455656e-10, "logits/chosen": -1.5613229274749756, "logits/rejected": -1.4852428436279297, "logps/chosen": -881.259765625, "logps/rejected": -977.8818969726562, "loss": 0.5095, "rewards/accuracies": 0.78125, "rewards/chosen": -3.6972122192382812, "rewards/margins": 0.9360721111297607, "rewards/rejected": -4.633284568786621, "step": 1482 }, { "epoch": 0.9685688627643073, "grad_norm": 54.78566289216356, "learning_rate": 4.4927425873614867e-10, "logits/chosen": -1.5294121503829956, "logits/rejected": -1.4791772365570068, "logps/chosen": -838.6341552734375, "logps/rejected": -868.4017333984375, "loss": 0.4892, "rewards/accuracies": 0.78125, "rewards/chosen": -3.21793794631958, "rewards/margins": 0.8229467272758484, "rewards/rejected": -4.040884971618652, "step": 1483 }, { "epoch": 0.9692219773042697, "grad_norm": 18.897385067422782, "learning_rate": 4.30767249331454e-10, "logits/chosen": -1.5810683965682983, "logits/rejected": -1.5569019317626953, "logps/chosen": -889.7955322265625, "logps/rejected": -911.5818481445312, "loss": 0.504, "rewards/accuracies": 0.78125, "rewards/chosen": -3.5015814304351807, "rewards/margins": 0.5960092544555664, "rewards/rejected": -4.097590923309326, "step": 1484 }, { "epoch": 0.9698750918442322, "grad_norm": 11.548963014295419, "learning_rate": 4.1264838256289126e-10, "logits/chosen": -1.5425231456756592, "logits/rejected": -1.5341880321502686, "logps/chosen": -866.2848510742188, "logps/rejected": -1056.494384765625, "loss": 0.4792, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4112586975097656, "rewards/margins": 1.2582924365997314, "rewards/rejected": -4.669550895690918, "step": 1485 }, { "epoch": 0.9705282063841947, "grad_norm": 45.7932753999572, "learning_rate": 3.9491775274153595e-10, "logits/chosen": -1.4852381944656372, "logits/rejected": -1.4904857873916626, "logps/chosen": -817.4542846679688, "logps/rejected": -949.4779663085938, "loss": 0.466, "rewards/accuracies": 0.78125, "rewards/chosen": -3.714642286300659, "rewards/margins": 0.9591977000236511, "rewards/rejected": -4.673839569091797, "step": 1486 }, { "epoch": 0.9711813209241571, "grad_norm": 21.73552603940438, "learning_rate": 3.7757545215764686e-10, "logits/chosen": -1.483793020248413, "logits/rejected": -1.5184385776519775, "logps/chosen": -764.6434936523438, "logps/rejected": -1080.6500244140625, "loss": 0.487, "rewards/accuracies": 0.78125, "rewards/chosen": -3.000844717025757, "rewards/margins": 2.042339563369751, "rewards/rejected": -5.043184280395508, "step": 1487 }, { "epoch": 0.9718344354641195, "grad_norm": 30.8551493809766, "learning_rate": 3.606215710801663e-10, "logits/chosen": -1.5189553499221802, "logits/rejected": -1.555790662765503, "logps/chosen": -820.7545166015625, "logps/rejected": -871.0463256835938, "loss": 0.5234, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4103848934173584, "rewards/margins": 0.8298892974853516, "rewards/rejected": -4.240273475646973, "step": 1488 }, { "epoch": 0.9724875500040819, "grad_norm": 47.632353902452365, "learning_rate": 3.440561977562789e-10, "logits/chosen": -1.4855304956436157, "logits/rejected": -1.4552634954452515, "logps/chosen": -765.5675048828125, "logps/rejected": -875.1714477539062, "loss": 0.4912, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0855441093444824, "rewards/margins": 0.9937740564346313, "rewards/rejected": -4.079318046569824, "step": 1489 }, { "epoch": 0.9731406645440445, "grad_norm": 59.60831943859033, "learning_rate": 3.278794184109118e-10, "logits/chosen": -1.5224008560180664, "logits/rejected": -1.5374361276626587, "logps/chosen": -921.4551391601562, "logps/rejected": -966.5579223632812, "loss": 0.4683, "rewards/accuracies": 0.71875, "rewards/chosen": -3.87973690032959, "rewards/margins": 0.5896239280700684, "rewards/rejected": -4.469360828399658, "step": 1490 }, { "epoch": 0.9737937790840069, "grad_norm": 40.26002383284324, "learning_rate": 3.1209131724633517e-10, "logits/chosen": -1.5184483528137207, "logits/rejected": -1.5036489963531494, "logps/chosen": -814.7634887695312, "logps/rejected": -889.21923828125, "loss": 0.5193, "rewards/accuracies": 0.75, "rewards/chosen": -3.4046013355255127, "rewards/margins": 0.740337610244751, "rewards/rejected": -4.144938945770264, "step": 1491 }, { "epoch": 0.9744468936239693, "grad_norm": 34.218594481949225, "learning_rate": 2.9669197644168755e-10, "logits/chosen": -1.5170872211456299, "logits/rejected": -1.5203460454940796, "logps/chosen": -835.5225219726562, "logps/rejected": -900.4053955078125, "loss": 0.4703, "rewards/accuracies": 0.78125, "rewards/chosen": -3.076322078704834, "rewards/margins": 0.8840709328651428, "rewards/rejected": -3.9603934288024902, "step": 1492 }, { "epoch": 0.9751000081639317, "grad_norm": 10.70959520154608, "learning_rate": 2.8168147615254265e-10, "logits/chosen": -1.4827511310577393, "logits/rejected": -1.463191270828247, "logps/chosen": -825.8433837890625, "logps/rejected": -895.0802612304688, "loss": 0.433, "rewards/accuracies": 0.6875, "rewards/chosen": -3.2821357250213623, "rewards/margins": 0.6120182871818542, "rewards/rejected": -3.8941545486450195, "step": 1493 }, { "epoch": 0.9757531227038942, "grad_norm": 14.011776786991527, "learning_rate": 2.6705989451054343e-10, "logits/chosen": -1.3761827945709229, "logits/rejected": -1.3667436838150024, "logps/chosen": -856.0826416015625, "logps/rejected": -964.3638305664062, "loss": 0.527, "rewards/accuracies": 0.875, "rewards/chosen": -3.3973817825317383, "rewards/margins": 0.8620023727416992, "rewards/rejected": -4.2593841552734375, "step": 1494 }, { "epoch": 0.9764062372438567, "grad_norm": 56.388774885579274, "learning_rate": 2.528273076229187e-10, "logits/chosen": -1.4972938299179077, "logits/rejected": -1.4492802619934082, "logps/chosen": -858.5281982421875, "logps/rejected": -913.3388671875, "loss": 0.5209, "rewards/accuracies": 0.75, "rewards/chosen": -3.4028518199920654, "rewards/margins": 0.6677621006965637, "rewards/rejected": -4.070613861083984, "step": 1495 }, { "epoch": 0.9770593517838191, "grad_norm": 14.034593283767544, "learning_rate": 2.389837895721586e-10, "logits/chosen": -1.5359089374542236, "logits/rejected": -1.5055701732635498, "logps/chosen": -792.88037109375, "logps/rejected": -861.2803955078125, "loss": 0.4586, "rewards/accuracies": 0.65625, "rewards/chosen": -3.479644775390625, "rewards/margins": 0.4910427927970886, "rewards/rejected": -3.9706876277923584, "step": 1496 }, { "epoch": 0.9777124663237815, "grad_norm": 57.03647156257448, "learning_rate": 2.255294124155982e-10, "logits/chosen": -1.5697842836380005, "logits/rejected": -1.507396936416626, "logps/chosen": -896.1490478515625, "logps/rejected": -974.0924682617188, "loss": 0.5092, "rewards/accuracies": 0.875, "rewards/chosen": -3.4873480796813965, "rewards/margins": 1.329317331314087, "rewards/rejected": -4.8166656494140625, "step": 1497 }, { "epoch": 0.978365580863744, "grad_norm": 46.79910545953451, "learning_rate": 2.124642461850179e-10, "logits/chosen": -1.5201869010925293, "logits/rejected": -1.4976688623428345, "logps/chosen": -853.4832153320312, "logps/rejected": -928.146728515625, "loss": 0.4592, "rewards/accuracies": 0.8125, "rewards/chosen": -3.6100432872772217, "rewards/margins": 0.6564720273017883, "rewards/rejected": -4.266514778137207, "step": 1498 }, { "epoch": 0.9790186954037065, "grad_norm": 23.03773550817736, "learning_rate": 1.997883588863436e-10, "logits/chosen": -1.5425734519958496, "logits/rejected": -1.5184190273284912, "logps/chosen": -913.212890625, "logps/rejected": -1053.3978271484375, "loss": 0.4411, "rewards/accuracies": 0.71875, "rewards/chosen": -3.4705982208251953, "rewards/margins": 0.9973933696746826, "rewards/rejected": -4.467991352081299, "step": 1499 }, { "epoch": 0.9796718099436689, "grad_norm": 81.36896110575177, "learning_rate": 1.875018164992137e-10, "logits/chosen": -1.534759283065796, "logits/rejected": -1.4785943031311035, "logps/chosen": -828.40234375, "logps/rejected": -832.9805908203125, "loss": 0.5192, "rewards/accuracies": 0.625, "rewards/chosen": -3.459455728530884, "rewards/margins": 0.5882536172866821, "rewards/rejected": -4.0477094650268555, "step": 1500 }, { "epoch": 0.9796718099436689, "eval_logits/chosen": -1.4782989025115967, "eval_logits/rejected": -1.4591182470321655, "eval_logps/chosen": -844.1143798828125, "eval_logps/rejected": -928.2655029296875, "eval_loss": 0.49420246481895447, "eval_rewards/accuracies": 0.7620000243186951, "eval_rewards/chosen": -3.33768892288208, "eval_rewards/margins": 0.9225709438323975, "eval_rewards/rejected": -4.260260105133057, "eval_runtime": 296.4379, "eval_samples_per_second": 13.494, "eval_steps_per_second": 0.843, "step": 1500 }, { "epoch": 0.9803249244836313, "grad_norm": 26.21653471442728, "learning_rate": 1.7560468297669606e-10, "logits/chosen": -1.5513707399368286, "logits/rejected": -1.5569634437561035, "logps/chosen": -814.3359375, "logps/rejected": -854.103759765625, "loss": 0.4669, "rewards/accuracies": 0.78125, "rewards/chosen": -3.062058925628662, "rewards/margins": 0.5742703676223755, "rewards/rejected": -3.6363296508789062, "step": 1501 }, { "epoch": 0.9809780390235938, "grad_norm": 12.418015543507641, "learning_rate": 1.640970202449382e-10, "logits/chosen": -1.4305357933044434, "logits/rejected": -1.4416850805282593, "logps/chosen": -852.3341064453125, "logps/rejected": -901.50537109375, "loss": 0.4805, "rewards/accuracies": 0.71875, "rewards/chosen": -3.029092311859131, "rewards/margins": 0.7821911573410034, "rewards/rejected": -3.811283588409424, "step": 1502 }, { "epoch": 0.9816311535635562, "grad_norm": 35.91459788005618, "learning_rate": 1.52978888202826e-10, "logits/chosen": -1.497631311416626, "logits/rejected": -1.5434260368347168, "logps/chosen": -787.7675170898438, "logps/rejected": -954.6630249023438, "loss": 0.4877, "rewards/accuracies": 0.90625, "rewards/chosen": -3.3874967098236084, "rewards/margins": 1.1716786623001099, "rewards/rejected": -4.55917501449585, "step": 1503 }, { "epoch": 0.9822842681035187, "grad_norm": 15.545739037319393, "learning_rate": 1.4225034472169217e-10, "logits/chosen": -1.479112982749939, "logits/rejected": -1.4773443937301636, "logps/chosen": -777.0709228515625, "logps/rejected": -973.9321899414062, "loss": 0.4295, "rewards/accuracies": 0.84375, "rewards/chosen": -3.061960458755493, "rewards/margins": 1.352473258972168, "rewards/rejected": -4.414434432983398, "step": 1504 }, { "epoch": 0.9829373826434811, "grad_norm": 10.049630463778348, "learning_rate": 1.3191144564502488e-10, "logits/chosen": -1.5108180046081543, "logits/rejected": -1.474786400794983, "logps/chosen": -910.9762573242188, "logps/rejected": -990.8765869140625, "loss": 0.5196, "rewards/accuracies": 0.6875, "rewards/chosen": -3.5121772289276123, "rewards/margins": 0.993119478225708, "rewards/rejected": -4.5052971839904785, "step": 1505 }, { "epoch": 0.9835904971834436, "grad_norm": 31.720266818268886, "learning_rate": 1.2196224478814297e-10, "logits/chosen": -1.4539670944213867, "logits/rejected": -1.4328685998916626, "logps/chosen": -865.9363403320312, "logps/rejected": -964.106201171875, "loss": 0.483, "rewards/accuracies": 0.84375, "rewards/chosen": -3.5068650245666504, "rewards/margins": 0.9375267028808594, "rewards/rejected": -4.44439172744751, "step": 1506 }, { "epoch": 0.984243611723406, "grad_norm": 45.556241819502084, "learning_rate": 1.1240279393793795e-10, "logits/chosen": -1.5094014406204224, "logits/rejected": -1.4748339653015137, "logps/chosen": -926.0755004882812, "logps/rejected": -976.0172729492188, "loss": 0.4813, "rewards/accuracies": 0.875, "rewards/chosen": -3.7970991134643555, "rewards/margins": 0.9570468068122864, "rewards/rejected": -4.754145622253418, "step": 1507 }, { "epoch": 0.9848967262633684, "grad_norm": 40.10867343247857, "learning_rate": 1.0323314285260731e-10, "logits/chosen": -1.54648756980896, "logits/rejected": -1.495444893836975, "logps/chosen": -783.347900390625, "logps/rejected": -871.718505859375, "loss": 0.4933, "rewards/accuracies": 0.75, "rewards/chosen": -3.104883909225464, "rewards/margins": 1.1070107221603394, "rewards/rejected": -4.211894512176514, "step": 1508 }, { "epoch": 0.9855498408033309, "grad_norm": 32.74695628540036, "learning_rate": 9.445333926139665e-11, "logits/chosen": -1.5436725616455078, "logits/rejected": -1.5149645805358887, "logps/chosen": -904.5325317382812, "logps/rejected": -958.0336303710938, "loss": 0.4634, "rewards/accuracies": 0.8125, "rewards/chosen": -3.427320957183838, "rewards/margins": 0.8428292274475098, "rewards/rejected": -4.270150661468506, "step": 1509 }, { "epoch": 0.9862029553432933, "grad_norm": 50.87038917169162, "learning_rate": 8.606342886432472e-11, "logits/chosen": -1.5588243007659912, "logits/rejected": -1.532243251800537, "logps/chosen": -860.1405029296875, "logps/rejected": -904.2451171875, "loss": 0.4499, "rewards/accuracies": 0.75, "rewards/chosen": -3.0775833129882812, "rewards/margins": 0.7431819438934326, "rewards/rejected": -3.8207650184631348, "step": 1510 }, { "epoch": 0.9868560698832558, "grad_norm": 29.345234968333465, "learning_rate": 7.806345533197534e-11, "logits/chosen": -1.5143674612045288, "logits/rejected": -1.483849287033081, "logps/chosen": -851.5826416015625, "logps/rejected": -954.4253540039062, "loss": 0.5093, "rewards/accuracies": 0.8125, "rewards/chosen": -3.5056495666503906, "rewards/margins": 0.8994309902191162, "rewards/rejected": -4.405080795288086, "step": 1511 }, { "epoch": 0.9875091844232182, "grad_norm": 22.39474777978432, "learning_rate": 7.045346030526423e-11, "logits/chosen": -1.5250229835510254, "logits/rejected": -1.4954551458358765, "logps/chosen": -931.7249145507812, "logps/rejected": -1065.670166015625, "loss": 0.4391, "rewards/accuracies": 0.84375, "rewards/chosen": -3.5478012561798096, "rewards/margins": 1.2276500463485718, "rewards/rejected": -4.77545166015625, "step": 1512 }, { "epoch": 0.9881622989631806, "grad_norm": 68.548727751151, "learning_rate": 6.323348339521418e-11, "logits/chosen": -1.5496234893798828, "logits/rejected": -1.5093997716903687, "logps/chosen": -938.7208251953125, "logps/rejected": -940.5839233398438, "loss": 0.5334, "rewards/accuracies": 0.6875, "rewards/chosen": -3.8004541397094727, "rewards/margins": 0.6491421461105347, "rewards/rejected": -4.449596405029297, "step": 1513 }, { "epoch": 0.9888154135031431, "grad_norm": 15.003945051271025, "learning_rate": 5.640356218274689e-11, "logits/chosen": -1.4270832538604736, "logits/rejected": -1.4271320104599, "logps/chosen": -785.04052734375, "logps/rejected": -928.6375732421875, "loss": 0.5223, "rewards/accuracies": 0.6875, "rewards/chosen": -3.0938949584960938, "rewards/margins": 0.9844279885292053, "rewards/rejected": -4.078322887420654, "step": 1514 }, { "epoch": 0.9894685280431056, "grad_norm": 52.68067104199576, "learning_rate": 4.996373221849981e-11, "logits/chosen": -1.4737398624420166, "logits/rejected": -1.478607177734375, "logps/chosen": -859.705810546875, "logps/rejected": -980.18701171875, "loss": 0.4386, "rewards/accuracies": 0.875, "rewards/chosen": -3.2958946228027344, "rewards/margins": 1.2142889499664307, "rewards/rejected": -4.510182857513428, "step": 1515 }, { "epoch": 0.990121642583068, "grad_norm": 41.105753672966415, "learning_rate": 4.391402702263458e-11, "logits/chosen": -1.5442856550216675, "logits/rejected": -1.5355677604675293, "logps/chosen": -766.7518920898438, "logps/rejected": -807.2686157226562, "loss": 0.4765, "rewards/accuracies": 0.71875, "rewards/chosen": -3.052950620651245, "rewards/margins": 0.6595571041107178, "rewards/rejected": -3.712507724761963, "step": 1516 }, { "epoch": 0.9907747571230304, "grad_norm": 68.18799629848323, "learning_rate": 3.82544780846622e-11, "logits/chosen": -1.4942461252212524, "logits/rejected": -1.4979506731033325, "logps/chosen": -900.58935546875, "logps/rejected": -1016.7969970703125, "loss": 0.4941, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9103312492370605, "rewards/margins": 0.9849377870559692, "rewards/rejected": -4.89526891708374, "step": 1517 }, { "epoch": 0.9914278716629928, "grad_norm": 20.36699675427797, "learning_rate": 3.2985114863276484e-11, "logits/chosen": -1.4821314811706543, "logits/rejected": -1.4812616109848022, "logps/chosen": -811.2876586914062, "logps/rejected": -998.925048828125, "loss": 0.5106, "rewards/accuracies": 0.75, "rewards/chosen": -3.557675838470459, "rewards/margins": 1.2705349922180176, "rewards/rejected": -4.828211307525635, "step": 1518 }, { "epoch": 0.9920809862029554, "grad_norm": 58.88999010926919, "learning_rate": 2.810596478619587e-11, "logits/chosen": -1.5028977394104004, "logits/rejected": -1.4703741073608398, "logps/chosen": -901.3570556640625, "logps/rejected": -913.9461059570312, "loss": 0.5199, "rewards/accuracies": 0.625, "rewards/chosen": -3.426830291748047, "rewards/margins": 0.6236191391944885, "rewards/rejected": -4.050449371337891, "step": 1519 }, { "epoch": 0.9927341007429178, "grad_norm": 34.40888479605642, "learning_rate": 2.3617053250046815e-11, "logits/chosen": -1.5230963230133057, "logits/rejected": -1.4947023391723633, "logps/chosen": -770.20849609375, "logps/rejected": -841.6630859375, "loss": 0.4742, "rewards/accuracies": 0.75, "rewards/chosen": -3.0242300033569336, "rewards/margins": 0.8307550549507141, "rewards/rejected": -3.854984760284424, "step": 1520 }, { "epoch": 0.9933872152828802, "grad_norm": 22.3379376658494, "learning_rate": 1.951840362018897e-11, "logits/chosen": -1.5373893976211548, "logits/rejected": -1.5419368743896484, "logps/chosen": -797.622314453125, "logps/rejected": -868.178955078125, "loss": 0.4944, "rewards/accuracies": 0.71875, "rewards/chosen": -3.065598249435425, "rewards/margins": 0.9640765190124512, "rewards/rejected": -4.029674530029297, "step": 1521 }, { "epoch": 0.9940403298228426, "grad_norm": 37.37852136541512, "learning_rate": 1.5810037230648554e-11, "logits/chosen": -1.4990514516830444, "logits/rejected": -1.498454213142395, "logps/chosen": -753.627685546875, "logps/rejected": -842.1553344726562, "loss": 0.4797, "rewards/accuracies": 0.84375, "rewards/chosen": -2.89300537109375, "rewards/margins": 0.8533806204795837, "rewards/rejected": -3.7463865280151367, "step": 1522 }, { "epoch": 0.9946934443628052, "grad_norm": 35.34109312088976, "learning_rate": 1.2491973383951803e-11, "logits/chosen": -1.433802843093872, "logits/rejected": -1.4264992475509644, "logps/chosen": -843.60986328125, "logps/rejected": -899.9165649414062, "loss": 0.5295, "rewards/accuracies": 0.6875, "rewards/chosen": -3.5381946563720703, "rewards/margins": 0.6915282011032104, "rewards/rejected": -4.229722499847412, "step": 1523 }, { "epoch": 0.9953465589027676, "grad_norm": 25.436468352463912, "learning_rate": 9.564229351050056e-12, "logits/chosen": -1.4901500940322876, "logits/rejected": -1.4961150884628296, "logps/chosen": -803.0394287109375, "logps/rejected": -855.164794921875, "loss": 0.5505, "rewards/accuracies": 0.78125, "rewards/chosen": -2.957271099090576, "rewards/margins": 0.79988694190979, "rewards/rejected": -3.757158041000366, "step": 1524 }, { "epoch": 0.99599967344273, "grad_norm": 10.73432201222575, "learning_rate": 7.0268203712448015e-12, "logits/chosen": -1.5934062004089355, "logits/rejected": -1.544229507446289, "logps/chosen": -856.8956298828125, "logps/rejected": -1123.9691162109375, "loss": 0.4846, "rewards/accuracies": 0.84375, "rewards/chosen": -3.3569083213806152, "rewards/margins": 2.04354190826416, "rewards/rejected": -5.400450706481934, "step": 1525 }, { "epoch": 0.9966527879826924, "grad_norm": 87.38096016034348, "learning_rate": 4.879759652079429e-12, "logits/chosen": -1.467991828918457, "logits/rejected": -1.427109718322754, "logps/chosen": -805.0746459960938, "logps/rejected": -918.366455078125, "loss": 0.5265, "rewards/accuracies": 0.78125, "rewards/chosen": -3.3931772708892822, "rewards/margins": 0.822685956954956, "rewards/rejected": -4.215863227844238, "step": 1526 }, { "epoch": 0.9973059025226549, "grad_norm": 24.30638520435916, "learning_rate": 3.123058369280951e-12, "logits/chosen": -1.4472224712371826, "logits/rejected": -1.4677808284759521, "logps/chosen": -832.0319213867188, "logps/rejected": -963.2366333007812, "loss": 0.5076, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1619575023651123, "rewards/margins": 0.8776848912239075, "rewards/rejected": -4.039642333984375, "step": 1527 }, { "epoch": 0.9979590170626174, "grad_norm": 17.20772991475794, "learning_rate": 1.756725666710035e-12, "logits/chosen": -1.4970341920852661, "logits/rejected": -1.4706482887268066, "logps/chosen": -862.6663818359375, "logps/rejected": -1025.07177734375, "loss": 0.4429, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1549482345581055, "rewards/margins": 1.3957924842834473, "rewards/rejected": -4.550740718841553, "step": 1528 }, { "epoch": 0.9986121316025798, "grad_norm": 17.972242666702652, "learning_rate": 7.80768656319375e-13, "logits/chosen": -1.409681797027588, "logits/rejected": -1.3731991052627563, "logps/chosen": -763.9820556640625, "logps/rejected": -802.5194702148438, "loss": 0.4251, "rewards/accuracies": 0.75, "rewards/chosen": -3.1270015239715576, "rewards/margins": 0.6430358290672302, "rewards/rejected": -3.7700371742248535, "step": 1529 }, { "epoch": 0.9992652461425422, "grad_norm": 30.620744387516318, "learning_rate": 1.9519241807874897e-13, "logits/chosen": -1.5083037614822388, "logits/rejected": -1.4722343683242798, "logps/chosen": -971.5647583007812, "logps/rejected": -995.6734619140625, "loss": 0.4784, "rewards/accuracies": 0.71875, "rewards/chosen": -4.024622440338135, "rewards/margins": 0.6361601948738098, "rewards/rejected": -4.660782814025879, "step": 1530 }, { "epoch": 0.9999183606825047, "grad_norm": 25.960604541088838, "learning_rate": 0.0, "logits/chosen": -1.506800651550293, "logits/rejected": -1.5056391954421997, "logps/chosen": -892.1127319335938, "logps/rejected": -1093.7352294921875, "loss": 0.4507, "rewards/accuracies": 0.84375, "rewards/chosen": -3.275479793548584, "rewards/margins": 1.1585028171539307, "rewards/rejected": -4.433982849121094, "step": 1531 }, { "epoch": 0.9999183606825047, "step": 1531, "total_flos": 0.0, "train_loss": 0.5524635882372952, "train_runtime": 40329.8241, "train_samples_per_second": 4.859, "train_steps_per_second": 0.038 } ], "logging_steps": 1, "max_steps": 1531, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }