{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 355, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 2.158629831791895, "learning_rate": 1.3888888888888887e-08, "logits/chosen": -2.804708957672119, "logits/rejected": -2.8150453567504883, "logps/chosen": -217.97438049316406, "logps/rejected": -216.58865356445312, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 8.21578821692664, "learning_rate": 1.3888888888888888e-07, "logits/chosen": -2.8844423294067383, "logits/rejected": -2.799159526824951, "logps/chosen": -366.7507629394531, "logps/rejected": -275.4356384277344, "loss": 0.6932, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": -0.00016000178584363312, "rewards/margins": -0.0003212409501429647, "rewards/margins_max": 0.002417487557977438, "rewards/margins_min": -0.004127700813114643, "rewards/margins_std": 0.0029805246740579605, "rewards/rejected": 0.00016123917885124683, "step": 10 }, { "epoch": 0.06, "grad_norm": 2.0515925664507124, "learning_rate": 2.7777777777777776e-07, "logits/chosen": -2.74739670753479, "logits/rejected": -2.6935513019561768, "logps/chosen": -329.11138916015625, "logps/rejected": -216.7494659423828, "loss": 0.6929, "rewards/accuracies": 0.625, "rewards/chosen": 0.00035773837589658797, "rewards/margins": 0.00041456689359620214, "rewards/margins_max": 0.0031081512570381165, "rewards/margins_min": -0.002314414829015732, "rewards/margins_std": 0.0023837233893573284, "rewards/rejected": -5.682848859578371e-05, "step": 20 }, { "epoch": 0.08, "grad_norm": 2.310838256933408, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.835951328277588, "logits/rejected": -2.7546262741088867, "logps/chosen": -329.10211181640625, "logps/rejected": -233.05508422851562, "loss": 0.6917, "rewards/accuracies": 0.75, "rewards/chosen": 0.0018310332670807838, "rewards/margins": 0.002559047192335129, "rewards/margins_max": 0.006475468166172504, "rewards/margins_min": -0.0006590075790882111, "rewards/margins_std": 0.0032629654742777348, "rewards/rejected": -0.0007280135178007185, "step": 30 }, { "epoch": 0.11, "grad_norm": 1.9718465521886885, "learning_rate": 4.998060489154965e-07, "logits/chosen": -2.8140110969543457, "logits/rejected": -2.76088285446167, "logps/chosen": -285.4794006347656, "logps/rejected": -227.7167205810547, "loss": 0.6905, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.0035794698633253574, "rewards/margins": 0.0046408590860664845, "rewards/margins_max": 0.011065036058425903, "rewards/margins_min": -0.0008336328901350498, "rewards/margins_std": 0.005423419643193483, "rewards/rejected": -0.0010613898048177361, "step": 40 }, { "epoch": 0.14, "grad_norm": 2.3429157043740894, "learning_rate": 4.976275538042932e-07, "logits/chosen": -2.813694477081299, "logits/rejected": -2.7310705184936523, "logps/chosen": -317.00640869140625, "logps/rejected": -234.43209838867188, "loss": 0.688, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.007789776660501957, "rewards/margins": 0.010368594899773598, "rewards/margins_max": 0.021328028291463852, "rewards/margins_min": 0.0012870692880824208, "rewards/margins_std": 0.009302936494350433, "rewards/rejected": -0.00257881754077971, "step": 50 }, { "epoch": 0.17, "grad_norm": 2.326595034093221, "learning_rate": 4.930493069997119e-07, "logits/chosen": -2.7512717247009277, "logits/rejected": -2.7030184268951416, "logps/chosen": -343.24273681640625, "logps/rejected": -264.2438049316406, "loss": 0.6845, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.015377024188637733, "rewards/margins": 0.01811446249485016, "rewards/margins_max": 0.03716810420155525, "rewards/margins_min": 0.003869078354910016, "rewards/margins_std": 0.01510803122073412, "rewards/rejected": -0.0027374387718737125, "step": 60 }, { "epoch": 0.2, "grad_norm": 1.802371962995753, "learning_rate": 4.861156761634013e-07, "logits/chosen": -2.8008124828338623, "logits/rejected": -2.7141239643096924, "logps/chosen": -360.14227294921875, "logps/rejected": -237.1912841796875, "loss": 0.6809, "rewards/accuracies": 0.9375, "rewards/chosen": 0.02196549065411091, "rewards/margins": 0.026430394500494003, "rewards/margins_max": 0.05226575583219528, "rewards/margins_min": 0.005380354821681976, "rewards/margins_std": 0.02180148847401142, "rewards/rejected": -0.0044649080373346806, "step": 70 }, { "epoch": 0.23, "grad_norm": 2.0226092318731266, "learning_rate": 4.768938549177392e-07, "logits/chosen": -2.842362403869629, "logits/rejected": -2.778277635574341, "logps/chosen": -329.4476318359375, "logps/rejected": -288.3177795410156, "loss": 0.6774, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.02400829829275608, "rewards/margins": 0.03225512057542801, "rewards/margins_max": 0.06589716672897339, "rewards/margins_min": 0.006357196718454361, "rewards/margins_std": 0.027716059237718582, "rewards/rejected": -0.008246822282671928, "step": 80 }, { "epoch": 0.25, "grad_norm": 2.439721586727848, "learning_rate": 4.654732116743193e-07, "logits/chosen": -2.7840921878814697, "logits/rejected": -2.700878620147705, "logps/chosen": -336.05194091796875, "logps/rejected": -200.1630096435547, "loss": 0.672, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.029071927070617676, "rewards/margins": 0.04093674570322037, "rewards/margins_max": 0.08617201447486877, "rewards/margins_min": 0.006826425436884165, "rewards/margins_std": 0.0362737737596035, "rewards/rejected": -0.011864816769957542, "step": 90 }, { "epoch": 0.28, "grad_norm": 2.2225465127306605, "learning_rate": 4.519644235671752e-07, "logits/chosen": -2.8582470417022705, "logits/rejected": -2.7655489444732666, "logps/chosen": -342.58416748046875, "logps/rejected": -265.08441162109375, "loss": 0.6666, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.037609733641147614, "rewards/margins": 0.050220172852277756, "rewards/margins_max": 0.10150803625583649, "rewards/margins_min": 0.007549063768237829, "rewards/margins_std": 0.0440022274851799, "rewards/rejected": -0.01261043269187212, "step": 100 }, { "epoch": 0.28, "eval_logits/chosen": -2.7978174686431885, "eval_logits/rejected": -2.7595677375793457, "eval_logps/chosen": -285.2066650390625, "eval_logps/rejected": -259.86334228515625, "eval_loss": 0.6906961798667908, "eval_rewards/accuracies": 0.578000009059906, "eval_rewards/chosen": -0.00613220501691103, "eval_rewards/margins": 0.006711836438626051, "eval_rewards/margins_max": 0.04891812801361084, "eval_rewards/margins_min": -0.02950645610690117, "eval_rewards/margins_std": 0.025911005213856697, "eval_rewards/rejected": -0.012844040989875793, "eval_runtime": 428.4446, "eval_samples_per_second": 4.668, "eval_steps_per_second": 0.292, "step": 100 }, { "epoch": 0.31, "grad_norm": 2.4760394100510057, "learning_rate": 4.364984038837727e-07, "logits/chosen": -2.8690743446350098, "logits/rejected": -2.7577908039093018, "logps/chosen": -385.70233154296875, "logps/rejected": -288.461669921875, "loss": 0.6591, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0566016249358654, "rewards/margins": 0.06779664754867554, "rewards/margins_max": 0.13572999835014343, "rewards/margins_min": 0.010938728228211403, "rewards/margins_std": 0.05764765292406082, "rewards/rejected": -0.011195014230906963, "step": 110 }, { "epoch": 0.34, "grad_norm": 2.035920276115207, "learning_rate": 4.1922503338800447e-07, "logits/chosen": -2.8610854148864746, "logits/rejected": -2.7858219146728516, "logps/chosen": -387.9818115234375, "logps/rejected": -267.68585205078125, "loss": 0.657, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.06763629615306854, "rewards/margins": 0.07911892235279083, "rewards/margins_max": 0.16764040291309357, "rewards/margins_min": 0.013401249423623085, "rewards/margins_std": 0.07113669812679291, "rewards/rejected": -0.01148262806236744, "step": 120 }, { "epoch": 0.37, "grad_norm": 2.010676971608138, "learning_rate": 4.003117078299021e-07, "logits/chosen": -2.818753957748413, "logits/rejected": -2.741856098175049, "logps/chosen": -396.28985595703125, "logps/rejected": -302.45050048828125, "loss": 0.6454, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.08936750888824463, "rewards/margins": 0.10461701452732086, "rewards/margins_max": 0.20179173350334167, "rewards/margins_min": 0.02413741685450077, "rewards/margins_std": 0.08073713630437851, "rewards/rejected": -0.015249502845108509, "step": 130 }, { "epoch": 0.39, "grad_norm": 1.7425219980216828, "learning_rate": 3.799417157181075e-07, "logits/chosen": -2.7920029163360596, "logits/rejected": -2.7359843254089355, "logps/chosen": -364.29058837890625, "logps/rejected": -272.58355712890625, "loss": 0.6467, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.08406248688697815, "rewards/margins": 0.10730169713497162, "rewards/margins_max": 0.22186696529388428, "rewards/margins_min": 0.012349050492048264, "rewards/margins_std": 0.09653683751821518, "rewards/rejected": -0.02323923259973526, "step": 140 }, { "epoch": 0.42, "grad_norm": 2.0933384277297957, "learning_rate": 3.583124620760659e-07, "logits/chosen": -2.825629711151123, "logits/rejected": -2.7282826900482178, "logps/chosen": -315.4014892578125, "logps/rejected": -216.2842254638672, "loss": 0.6435, "rewards/accuracies": 0.9375, "rewards/chosen": 0.07449642568826675, "rewards/margins": 0.09953755140304565, "rewards/margins_max": 0.21898682415485382, "rewards/margins_min": 0.014027351513504982, "rewards/margins_std": 0.09459034353494644, "rewards/rejected": -0.0250411219894886, "step": 150 }, { "epoch": 0.45, "grad_norm": 1.769669348441601, "learning_rate": 3.356335553954679e-07, "logits/chosen": -2.74135684967041, "logits/rejected": -2.6822197437286377, "logps/chosen": -335.69464111328125, "logps/rejected": -237.88046264648438, "loss": 0.6336, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.09816019237041473, "rewards/margins": 0.1330377608537674, "rewards/margins_max": 0.2625694274902344, "rewards/margins_min": 0.02169904112815857, "rewards/margins_std": 0.1116378903388977, "rewards/rejected": -0.03487757220864296, "step": 160 }, { "epoch": 0.48, "grad_norm": 1.8260362870579057, "learning_rate": 3.121247763262235e-07, "logits/chosen": -2.8216443061828613, "logits/rejected": -2.7401599884033203, "logps/chosen": -364.33587646484375, "logps/rejected": -299.15887451171875, "loss": 0.635, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.11069444566965103, "rewards/margins": 0.1395573914051056, "rewards/margins_max": 0.2983313202857971, "rewards/margins_min": 0.007289635483175516, "rewards/margins_std": 0.13639435172080994, "rewards/rejected": -0.02886294387280941, "step": 170 }, { "epoch": 0.51, "grad_norm": 2.082827875491237, "learning_rate": 2.880139477883347e-07, "logits/chosen": -2.789100408554077, "logits/rejected": -2.700629949569702, "logps/chosen": -339.28125, "logps/rejected": -296.9674377441406, "loss": 0.6302, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.08692200481891632, "rewards/margins": 0.11842750012874603, "rewards/margins_max": 0.23567883670330048, "rewards/margins_min": 0.011810391210019588, "rewards/margins_std": 0.10012297332286835, "rewards/rejected": -0.03150549530982971, "step": 180 }, { "epoch": 0.54, "grad_norm": 2.575609586836664, "learning_rate": 2.635347271463544e-07, "logits/chosen": -2.787972927093506, "logits/rejected": -2.6533846855163574, "logps/chosen": -349.08880615234375, "logps/rejected": -242.5450897216797, "loss": 0.6257, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.10282168537378311, "rewards/margins": 0.14908696711063385, "rewards/margins_max": 0.28802552819252014, "rewards/margins_min": 0.025781046599149704, "rewards/margins_std": 0.1190432757139206, "rewards/rejected": -0.04626528546214104, "step": 190 }, { "epoch": 0.56, "grad_norm": 2.049618345880406, "learning_rate": 2.3892434184240534e-07, "logits/chosen": -2.857001543045044, "logits/rejected": -2.7506966590881348, "logps/chosen": -387.255126953125, "logps/rejected": -270.194091796875, "loss": 0.6251, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.11739673465490341, "rewards/margins": 0.1590125858783722, "rewards/margins_max": 0.32226094603538513, "rewards/margins_min": 0.03523118048906326, "rewards/margins_std": 0.12896928191184998, "rewards/rejected": -0.04161586984992027, "step": 200 }, { "epoch": 0.56, "eval_logits/chosen": -2.769979953765869, "eval_logits/rejected": -2.7318813800811768, "eval_logps/chosen": -288.19482421875, "eval_logps/rejected": -263.79888916015625, "eval_loss": 0.6876310110092163, "eval_rewards/accuracies": 0.5870000123977661, "eval_rewards/chosen": -0.03601397946476936, "eval_rewards/margins": 0.016185704618692398, "eval_rewards/margins_max": 0.11952462792396545, "eval_rewards/margins_min": -0.07521206140518188, "eval_rewards/margins_std": 0.0641048476099968, "eval_rewards/rejected": -0.05219968408346176, "eval_runtime": 427.8872, "eval_samples_per_second": 4.674, "eval_steps_per_second": 0.292, "step": 200 }, { "epoch": 0.59, "grad_norm": 1.9845466839870691, "learning_rate": 2.1442129043167873e-07, "logits/chosen": -2.751984119415283, "logits/rejected": -2.6815638542175293, "logps/chosen": -344.3485412597656, "logps/rejected": -262.97393798828125, "loss": 0.6188, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.10976312309503555, "rewards/margins": 0.16779468953609467, "rewards/margins_max": 0.34073972702026367, "rewards/margins_min": 0.03692127764225006, "rewards/margins_std": 0.14643600583076477, "rewards/rejected": -0.05803157761693001, "step": 210 }, { "epoch": 0.62, "grad_norm": 2.0373527988549145, "learning_rate": 1.9026303129961048e-07, "logits/chosen": -2.8502397537231445, "logits/rejected": -2.7268834114074707, "logps/chosen": -393.9187927246094, "logps/rejected": -280.2196960449219, "loss": 0.6142, "rewards/accuracies": 0.9375, "rewards/chosen": 0.13092514872550964, "rewards/margins": 0.18590331077575684, "rewards/margins_max": 0.3502216637134552, "rewards/margins_min": 0.03089449368417263, "rewards/margins_std": 0.14906269311904907, "rewards/rejected": -0.0549781434237957, "step": 220 }, { "epoch": 0.65, "grad_norm": 2.1828985591845402, "learning_rate": 1.6668368145931396e-07, "logits/chosen": -2.875049114227295, "logits/rejected": -2.744711399078369, "logps/chosen": -390.4495849609375, "logps/rejected": -268.98565673828125, "loss": 0.6067, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.12363851070404053, "rewards/margins": 0.17946995794773102, "rewards/margins_max": 0.34318283200263977, "rewards/margins_min": 0.036444298923015594, "rewards/margins_std": 0.13844837248325348, "rewards/rejected": -0.05583144351840019, "step": 230 }, { "epoch": 0.68, "grad_norm": 1.8193698783653034, "learning_rate": 1.4391174773015834e-07, "logits/chosen": -2.802640199661255, "logits/rejected": -2.71109938621521, "logps/chosen": -333.38397216796875, "logps/rejected": -289.92462158203125, "loss": 0.6224, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.09798085689544678, "rewards/margins": 0.14519774913787842, "rewards/margins_max": 0.293338418006897, "rewards/margins_min": 0.01530275959521532, "rewards/margins_std": 0.12239019572734833, "rewards/rejected": -0.047216884791851044, "step": 240 }, { "epoch": 0.7, "grad_norm": 1.9709337022102749, "learning_rate": 1.2216791228457775e-07, "logits/chosen": -2.7975411415100098, "logits/rejected": -2.6804046630859375, "logps/chosen": -351.70257568359375, "logps/rejected": -260.0617370605469, "loss": 0.6084, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.12191393226385117, "rewards/margins": 0.1914171278476715, "rewards/margins_max": 0.36713144183158875, "rewards/margins_min": 0.05136305093765259, "rewards/margins_std": 0.142560213804245, "rewards/rejected": -0.06950321048498154, "step": 250 }, { "epoch": 0.73, "grad_norm": 1.752772587188972, "learning_rate": 1.0166289402331391e-07, "logits/chosen": -2.8487606048583984, "logits/rejected": -2.737738847732544, "logps/chosen": -345.0237731933594, "logps/rejected": -265.47198486328125, "loss": 0.6074, "rewards/accuracies": 0.9375, "rewards/chosen": 0.11421672999858856, "rewards/margins": 0.17133933305740356, "rewards/margins_max": 0.3663348853588104, "rewards/margins_min": 0.02202555350959301, "rewards/margins_std": 0.15875253081321716, "rewards/rejected": -0.057122599333524704, "step": 260 }, { "epoch": 0.76, "grad_norm": 2.056956615608033, "learning_rate": 8.259540650444734e-08, "logits/chosen": -2.8006067276000977, "logits/rejected": -2.7100348472595215, "logps/chosen": -365.325927734375, "logps/rejected": -270.2814636230469, "loss": 0.6098, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.1257423460483551, "rewards/margins": 0.20250901579856873, "rewards/margins_max": 0.3718946874141693, "rewards/margins_min": 0.04439568892121315, "rewards/margins_std": 0.1491011530160904, "rewards/rejected": -0.07676666229963303, "step": 270 }, { "epoch": 0.79, "grad_norm": 1.9417182779069821, "learning_rate": 6.515023221586721e-08, "logits/chosen": -2.7494287490844727, "logits/rejected": -2.7017343044281006, "logps/chosen": -320.38360595703125, "logps/rejected": -279.5456848144531, "loss": 0.6125, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.10271165519952774, "rewards/margins": 0.16274654865264893, "rewards/margins_max": 0.3613172173500061, "rewards/margins_min": 0.03712720423936844, "rewards/margins_std": 0.14939478039741516, "rewards/rejected": -0.060034893453121185, "step": 280 }, { "epoch": 0.82, "grad_norm": 2.159880856830845, "learning_rate": 4.949643185335287e-08, "logits/chosen": -2.7616562843322754, "logits/rejected": -2.6814732551574707, "logps/chosen": -331.0811462402344, "logps/rejected": -272.906982421875, "loss": 0.6168, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0991094559431076, "rewards/margins": 0.16621682047843933, "rewards/margins_max": 0.34492212533950806, "rewards/margins_min": 0.02063518390059471, "rewards/margins_std": 0.14825591444969177, "rewards/rejected": -0.06710737198591232, "step": 290 }, { "epoch": 0.85, "grad_norm": 2.247267229121733, "learning_rate": 3.578570595810274e-08, "logits/chosen": -2.805422306060791, "logits/rejected": -2.7308857440948486, "logps/chosen": -351.537109375, "logps/rejected": -296.57861328125, "loss": 0.6029, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12097059190273285, "rewards/margins": 0.19219207763671875, "rewards/margins_max": 0.3688820004463196, "rewards/margins_min": 0.05189325660467148, "rewards/margins_std": 0.14433594048023224, "rewards/rejected": -0.07122147083282471, "step": 300 }, { "epoch": 0.85, "eval_logits/chosen": -2.757275342941284, "eval_logits/rejected": -2.7190775871276855, "eval_logps/chosen": -289.97894287109375, "eval_logps/rejected": -266.02178955078125, "eval_loss": 0.6861926913261414, "eval_rewards/accuracies": 0.5870000123977661, "eval_rewards/chosen": -0.05385516211390495, "eval_rewards/margins": 0.020573224872350693, "eval_rewards/margins_max": 0.14790384471416473, "eval_rewards/margins_min": -0.09322728216648102, "eval_rewards/margins_std": 0.079450324177742, "eval_rewards/rejected": -0.07442838698625565, "eval_runtime": 427.9454, "eval_samples_per_second": 4.673, "eval_steps_per_second": 0.292, "step": 300 }, { "epoch": 0.87, "grad_norm": 1.8974226134430692, "learning_rate": 2.415092479103503e-08, "logits/chosen": -2.840935230255127, "logits/rejected": -2.709672212600708, "logps/chosen": -345.2643737792969, "logps/rejected": -222.6641082763672, "loss": 0.6093, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.1101798266172409, "rewards/margins": 0.1844477355480194, "rewards/margins_max": 0.3727789521217346, "rewards/margins_min": 0.046926215291023254, "rewards/margins_std": 0.1535920351743698, "rewards/rejected": -0.0742679089307785, "step": 310 }, { "epoch": 0.9, "grad_norm": 1.7695491697233519, "learning_rate": 1.4704840690808656e-08, "logits/chosen": -2.796245813369751, "logits/rejected": -2.7119815349578857, "logps/chosen": -339.24664306640625, "logps/rejected": -268.58984375, "loss": 0.6037, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.1139553040266037, "rewards/margins": 0.17954252660274506, "rewards/margins_max": 0.37754225730895996, "rewards/margins_min": 0.0253077894449234, "rewards/margins_std": 0.16321782767772675, "rewards/rejected": -0.06558724492788315, "step": 320 }, { "epoch": 0.93, "grad_norm": 2.0645664151522136, "learning_rate": 7.538995394063995e-09, "logits/chosen": -2.8658013343811035, "logits/rejected": -2.760768175125122, "logps/chosen": -386.96258544921875, "logps/rejected": -275.399658203125, "loss": 0.6073, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.13314954936504364, "rewards/margins": 0.20348164439201355, "rewards/margins_max": 0.3888625502586365, "rewards/margins_min": 0.05053550750017166, "rewards/margins_std": 0.15889115631580353, "rewards/rejected": -0.0703321173787117, "step": 330 }, { "epoch": 0.96, "grad_norm": 2.0081843762801617, "learning_rate": 2.7228329070159705e-09, "logits/chosen": -2.7621803283691406, "logits/rejected": -2.6747400760650635, "logps/chosen": -334.4164123535156, "logps/rejected": -258.71417236328125, "loss": 0.607, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.10603541135787964, "rewards/margins": 0.17292913794517517, "rewards/margins_max": 0.366424024105072, "rewards/margins_min": 0.027338892221450806, "rewards/margins_std": 0.15261869132518768, "rewards/rejected": -0.06689374148845673, "step": 340 }, { "epoch": 0.99, "grad_norm": 3.396556816184061, "learning_rate": 3.0302652553296226e-10, "logits/chosen": -2.754178285598755, "logits/rejected": -2.6804542541503906, "logps/chosen": -348.5409851074219, "logps/rejected": -294.7231750488281, "loss": 0.6046, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.10722661018371582, "rewards/margins": 0.18621695041656494, "rewards/margins_max": 0.36825960874557495, "rewards/margins_min": 0.045198000967502594, "rewards/margins_std": 0.14424237608909607, "rewards/rejected": -0.07899035513401031, "step": 350 }, { "epoch": 1.0, "step": 355, "total_flos": 0.0, "train_loss": 0.6394854995566355, "train_runtime": 4022.9516, "train_samples_per_second": 1.411, "train_steps_per_second": 0.088 } ], "logging_steps": 10, "max_steps": 355, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }