{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 1, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0625, "grad_norm": 20.46257781982422, "learning_rate": 2.5e-05, "loss": 0.9078, "step": 1 }, { "epoch": 0.0625, "eval_accuracy": 0.496, "eval_loss": 0.8677021265029907, "eval_runtime": 5.3085, "eval_samples_per_second": 47.094, "eval_steps_per_second": 1.507, "step": 1 }, { "epoch": 0.125, "grad_norm": 13.105232238769531, "learning_rate": 5e-05, "loss": 0.8122, "step": 2 }, { "epoch": 0.125, "eval_accuracy": 0.496, "eval_loss": 0.8238774538040161, "eval_runtime": 5.3062, "eval_samples_per_second": 47.115, "eval_steps_per_second": 1.508, "step": 2 }, { "epoch": 0.1875, "grad_norm": 20.202281951904297, "learning_rate": 4.968354430379747e-05, "loss": 0.9022, "step": 3 }, { "epoch": 0.1875, "eval_accuracy": 0.496, "eval_loss": 0.7353798747062683, "eval_runtime": 5.3102, "eval_samples_per_second": 47.079, "eval_steps_per_second": 1.507, "step": 3 }, { "epoch": 0.25, "grad_norm": 9.847990989685059, "learning_rate": 4.936708860759494e-05, "loss": 0.7986, "step": 4 }, { "epoch": 0.25, "eval_accuracy": 0.504, "eval_loss": 0.6991503834724426, "eval_runtime": 5.3099, "eval_samples_per_second": 47.082, "eval_steps_per_second": 1.507, "step": 4 }, { "epoch": 0.3125, "grad_norm": 8.949254989624023, "learning_rate": 4.905063291139241e-05, "loss": 0.7573, "step": 5 }, { "epoch": 0.3125, "eval_accuracy": 0.524, "eval_loss": 0.7064882516860962, "eval_runtime": 5.3093, "eval_samples_per_second": 47.088, "eval_steps_per_second": 1.507, "step": 5 }, { "epoch": 0.375, "grad_norm": 8.430787086486816, "learning_rate": 4.8734177215189874e-05, "loss": 0.7791, "step": 6 }, { "epoch": 0.375, "eval_accuracy": 0.516, "eval_loss": 0.7329081892967224, "eval_runtime": 5.3099, "eval_samples_per_second": 47.082, "eval_steps_per_second": 1.507, "step": 6 }, { "epoch": 0.4375, "grad_norm": 12.53791618347168, "learning_rate": 4.8417721518987346e-05, "loss": 0.7908, "step": 7 }, { "epoch": 0.4375, "eval_accuracy": 0.512, "eval_loss": 0.7393457293510437, "eval_runtime": 5.3089, "eval_samples_per_second": 47.091, "eval_steps_per_second": 1.507, "step": 7 }, { "epoch": 0.5, "grad_norm": 9.000187873840332, "learning_rate": 4.810126582278481e-05, "loss": 0.6994, "step": 8 }, { "epoch": 0.5, "eval_accuracy": 0.508, "eval_loss": 0.7413183450698853, "eval_runtime": 5.3109, "eval_samples_per_second": 47.073, "eval_steps_per_second": 1.506, "step": 8 }, { "epoch": 0.5625, "grad_norm": 8.807014465332031, "learning_rate": 4.778481012658228e-05, "loss": 0.7374, "step": 9 }, { "epoch": 0.5625, "eval_accuracy": 0.512, "eval_loss": 0.7299355268478394, "eval_runtime": 5.3168, "eval_samples_per_second": 47.021, "eval_steps_per_second": 1.505, "step": 9 }, { "epoch": 0.625, "grad_norm": 13.030564308166504, "learning_rate": 4.7468354430379746e-05, "loss": 0.7256, "step": 10 }, { "epoch": 0.625, "eval_accuracy": 0.52, "eval_loss": 0.7148457169532776, "eval_runtime": 5.3079, "eval_samples_per_second": 47.1, "eval_steps_per_second": 1.507, "step": 10 }, { "epoch": 0.6875, "grad_norm": 4.144247531890869, "learning_rate": 4.715189873417722e-05, "loss": 0.7446, "step": 11 }, { "epoch": 0.6875, "eval_accuracy": 0.52, "eval_loss": 0.7011504173278809, "eval_runtime": 5.3153, "eval_samples_per_second": 47.034, "eval_steps_per_second": 1.505, "step": 11 }, { "epoch": 0.75, "grad_norm": 2.742114782333374, "learning_rate": 4.683544303797468e-05, "loss": 0.7096, "step": 12 }, { "epoch": 0.75, "eval_accuracy": 0.536, "eval_loss": 0.692496120929718, "eval_runtime": 5.3161, "eval_samples_per_second": 47.027, "eval_steps_per_second": 1.505, "step": 12 }, { "epoch": 0.8125, "grad_norm": 2.824199914932251, "learning_rate": 4.6518987341772154e-05, "loss": 0.7714, "step": 13 }, { "epoch": 0.8125, "eval_accuracy": 0.524, "eval_loss": 0.6915624737739563, "eval_runtime": 5.2625, "eval_samples_per_second": 47.506, "eval_steps_per_second": 1.52, "step": 13 }, { "epoch": 0.875, "grad_norm": 2.4258909225463867, "learning_rate": 4.6202531645569625e-05, "loss": 0.7283, "step": 14 }, { "epoch": 0.875, "eval_accuracy": 0.508, "eval_loss": 0.6929062604904175, "eval_runtime": 5.3144, "eval_samples_per_second": 47.042, "eval_steps_per_second": 1.505, "step": 14 }, { "epoch": 0.9375, "grad_norm": 8.152018547058105, "learning_rate": 4.588607594936709e-05, "loss": 0.7151, "step": 15 }, { "epoch": 0.9375, "eval_accuracy": 0.516, "eval_loss": 0.6922968626022339, "eval_runtime": 5.2611, "eval_samples_per_second": 47.519, "eval_steps_per_second": 1.521, "step": 15 }, { "epoch": 1.0, "grad_norm": 2.9275994300842285, "learning_rate": 4.556962025316456e-05, "loss": 0.7188, "step": 16 }, { "epoch": 1.0, "eval_accuracy": 0.524, "eval_loss": 0.6940605640411377, "eval_runtime": 5.3139, "eval_samples_per_second": 47.046, "eval_steps_per_second": 1.505, "step": 16 }, { "epoch": 1.0625, "grad_norm": 4.6839919090271, "learning_rate": 4.525316455696203e-05, "loss": 0.7019, "step": 17 }, { "epoch": 1.0625, "eval_accuracy": 0.532, "eval_loss": 0.6920918226242065, "eval_runtime": 5.3169, "eval_samples_per_second": 47.02, "eval_steps_per_second": 1.505, "step": 17 }, { "epoch": 1.125, "grad_norm": 1.5658074617385864, "learning_rate": 4.49367088607595e-05, "loss": 0.6642, "step": 18 }, { "epoch": 1.125, "eval_accuracy": 0.528, "eval_loss": 0.6912792921066284, "eval_runtime": 5.2622, "eval_samples_per_second": 47.509, "eval_steps_per_second": 1.52, "step": 18 }, { "epoch": 1.1875, "grad_norm": 4.528787612915039, "learning_rate": 4.462025316455696e-05, "loss": 0.662, "step": 19 }, { "epoch": 1.1875, "eval_accuracy": 0.52, "eval_loss": 0.6918222904205322, "eval_runtime": 5.3182, "eval_samples_per_second": 47.008, "eval_steps_per_second": 1.504, "step": 19 }, { "epoch": 1.25, "grad_norm": 12.500655174255371, "learning_rate": 4.430379746835443e-05, "loss": 0.701, "step": 20 }, { "epoch": 1.25, "eval_accuracy": 0.528, "eval_loss": 0.6922558546066284, "eval_runtime": 5.3168, "eval_samples_per_second": 47.021, "eval_steps_per_second": 1.505, "step": 20 }, { "epoch": 1.3125, "grad_norm": 5.84976053237915, "learning_rate": 4.3987341772151904e-05, "loss": 0.7243, "step": 21 }, { "epoch": 1.3125, "eval_accuracy": 0.528, "eval_loss": 0.6901601552963257, "eval_runtime": 5.3209, "eval_samples_per_second": 46.984, "eval_steps_per_second": 1.503, "step": 21 }, { "epoch": 1.375, "grad_norm": 8.601661682128906, "learning_rate": 4.367088607594937e-05, "loss": 0.6922, "step": 22 }, { "epoch": 1.375, "eval_accuracy": 0.508, "eval_loss": 0.6896132826805115, "eval_runtime": 5.3101, "eval_samples_per_second": 47.08, "eval_steps_per_second": 1.507, "step": 22 }, { "epoch": 1.4375, "grad_norm": 4.005645751953125, "learning_rate": 4.3354430379746834e-05, "loss": 0.7231, "step": 23 }, { "epoch": 1.4375, "eval_accuracy": 0.524, "eval_loss": 0.6889511942863464, "eval_runtime": 5.3165, "eval_samples_per_second": 47.023, "eval_steps_per_second": 1.505, "step": 23 }, { "epoch": 1.5, "grad_norm": 5.095814228057861, "learning_rate": 4.3037974683544305e-05, "loss": 0.707, "step": 24 }, { "epoch": 1.5, "eval_accuracy": 0.528, "eval_loss": 0.6890156269073486, "eval_runtime": 5.3198, "eval_samples_per_second": 46.995, "eval_steps_per_second": 1.504, "step": 24 }, { "epoch": 1.5625, "grad_norm": 3.86997127532959, "learning_rate": 4.2721518987341776e-05, "loss": 0.7092, "step": 25 }, { "epoch": 1.5625, "eval_accuracy": 0.528, "eval_loss": 0.6895156502723694, "eval_runtime": 5.3123, "eval_samples_per_second": 47.061, "eval_steps_per_second": 1.506, "step": 25 }, { "epoch": 1.625, "grad_norm": 4.612833023071289, "learning_rate": 4.240506329113924e-05, "loss": 0.6849, "step": 26 }, { "epoch": 1.625, "eval_accuracy": 0.532, "eval_loss": 0.6930058598518372, "eval_runtime": 5.3108, "eval_samples_per_second": 47.074, "eval_steps_per_second": 1.506, "step": 26 }, { "epoch": 1.6875, "grad_norm": 10.45712947845459, "learning_rate": 4.208860759493671e-05, "loss": 0.7547, "step": 27 }, { "epoch": 1.6875, "eval_accuracy": 0.552, "eval_loss": 0.6948456764221191, "eval_runtime": 5.3113, "eval_samples_per_second": 47.07, "eval_steps_per_second": 1.506, "step": 27 }, { "epoch": 1.75, "grad_norm": 3.3531408309936523, "learning_rate": 4.177215189873418e-05, "loss": 0.6861, "step": 28 }, { "epoch": 1.75, "eval_accuracy": 0.536, "eval_loss": 0.6975019574165344, "eval_runtime": 5.3121, "eval_samples_per_second": 47.062, "eval_steps_per_second": 1.506, "step": 28 }, { "epoch": 1.8125, "grad_norm": 3.751863479614258, "learning_rate": 4.145569620253165e-05, "loss": 0.6843, "step": 29 }, { "epoch": 1.8125, "eval_accuracy": 0.52, "eval_loss": 0.7005517482757568, "eval_runtime": 5.3141, "eval_samples_per_second": 47.045, "eval_steps_per_second": 1.505, "step": 29 }, { "epoch": 1.875, "grad_norm": 11.031332969665527, "learning_rate": 4.113924050632912e-05, "loss": 0.6785, "step": 30 }, { "epoch": 1.875, "eval_accuracy": 0.524, "eval_loss": 0.7026211023330688, "eval_runtime": 5.3166, "eval_samples_per_second": 47.023, "eval_steps_per_second": 1.505, "step": 30 }, { "epoch": 1.9375, "grad_norm": 10.911657333374023, "learning_rate": 4.0822784810126584e-05, "loss": 0.7188, "step": 31 }, { "epoch": 1.9375, "eval_accuracy": 0.516, "eval_loss": 0.7035732269287109, "eval_runtime": 5.3153, "eval_samples_per_second": 47.034, "eval_steps_per_second": 1.505, "step": 31 }, { "epoch": 2.0, "grad_norm": 2.079655408859253, "learning_rate": 4.050632911392405e-05, "loss": 0.6547, "step": 32 }, { "epoch": 2.0, "eval_accuracy": 0.524, "eval_loss": 0.699148416519165, "eval_runtime": 5.263, "eval_samples_per_second": 47.502, "eval_steps_per_second": 1.52, "step": 32 }, { "epoch": 2.0625, "grad_norm": 5.025875568389893, "learning_rate": 4.018987341772152e-05, "loss": 0.6927, "step": 33 }, { "epoch": 2.0625, "eval_accuracy": 0.528, "eval_loss": 0.6973759531974792, "eval_runtime": 5.3068, "eval_samples_per_second": 47.109, "eval_steps_per_second": 1.507, "step": 33 }, { "epoch": 2.125, "grad_norm": 4.838070392608643, "learning_rate": 3.987341772151899e-05, "loss": 0.6951, "step": 34 }, { "epoch": 2.125, "eval_accuracy": 0.548, "eval_loss": 0.6930713057518005, "eval_runtime": 5.2597, "eval_samples_per_second": 47.531, "eval_steps_per_second": 1.521, "step": 34 }, { "epoch": 2.1875, "grad_norm": 4.793885707855225, "learning_rate": 3.9556962025316456e-05, "loss": 0.6579, "step": 35 }, { "epoch": 2.1875, "eval_accuracy": 0.544, "eval_loss": 0.6935703158378601, "eval_runtime": 5.308, "eval_samples_per_second": 47.098, "eval_steps_per_second": 1.507, "step": 35 }, { "epoch": 2.25, "grad_norm": 12.430251121520996, "learning_rate": 3.924050632911392e-05, "loss": 0.6801, "step": 36 }, { "epoch": 2.25, "eval_accuracy": 0.56, "eval_loss": 0.692376971244812, "eval_runtime": 5.3069, "eval_samples_per_second": 47.108, "eval_steps_per_second": 1.507, "step": 36 }, { "epoch": 2.3125, "grad_norm": 13.888100624084473, "learning_rate": 3.89240506329114e-05, "loss": 0.7173, "step": 37 }, { "epoch": 2.3125, "eval_accuracy": 0.564, "eval_loss": 0.6904882788658142, "eval_runtime": 5.308, "eval_samples_per_second": 47.099, "eval_steps_per_second": 1.507, "step": 37 }, { "epoch": 2.375, "grad_norm": 10.471617698669434, "learning_rate": 3.8607594936708864e-05, "loss": 0.6623, "step": 38 }, { "epoch": 2.375, "eval_accuracy": 0.536, "eval_loss": 0.6898769736289978, "eval_runtime": 5.2604, "eval_samples_per_second": 47.525, "eval_steps_per_second": 1.521, "step": 38 }, { "epoch": 2.4375, "grad_norm": 1.6034440994262695, "learning_rate": 3.829113924050633e-05, "loss": 0.6557, "step": 39 }, { "epoch": 2.4375, "eval_accuracy": 0.524, "eval_loss": 0.6899453401565552, "eval_runtime": 5.3091, "eval_samples_per_second": 47.089, "eval_steps_per_second": 1.507, "step": 39 }, { "epoch": 2.5, "grad_norm": 5.970409393310547, "learning_rate": 3.79746835443038e-05, "loss": 0.6564, "step": 40 }, { "epoch": 2.5, "eval_accuracy": 0.548, "eval_loss": 0.6897187232971191, "eval_runtime": 5.3126, "eval_samples_per_second": 47.058, "eval_steps_per_second": 1.506, "step": 40 }, { "epoch": 2.5625, "grad_norm": 2.122326374053955, "learning_rate": 3.765822784810127e-05, "loss": 0.6905, "step": 41 }, { "epoch": 2.5625, "eval_accuracy": 0.516, "eval_loss": 0.6902675628662109, "eval_runtime": 5.2618, "eval_samples_per_second": 47.512, "eval_steps_per_second": 1.52, "step": 41 }, { "epoch": 2.625, "grad_norm": 5.0628461837768555, "learning_rate": 3.7341772151898736e-05, "loss": 0.6681, "step": 42 }, { "epoch": 2.625, "eval_accuracy": 0.516, "eval_loss": 0.688769519329071, "eval_runtime": 5.3111, "eval_samples_per_second": 47.071, "eval_steps_per_second": 1.506, "step": 42 }, { "epoch": 2.6875, "grad_norm": 4.445148944854736, "learning_rate": 3.70253164556962e-05, "loss": 0.7124, "step": 43 }, { "epoch": 2.6875, "eval_accuracy": 0.504, "eval_loss": 0.68701171875, "eval_runtime": 5.3106, "eval_samples_per_second": 47.076, "eval_steps_per_second": 1.506, "step": 43 }, { "epoch": 2.75, "grad_norm": 2.6510183811187744, "learning_rate": 3.670886075949367e-05, "loss": 0.6739, "step": 44 }, { "epoch": 2.75, "eval_accuracy": 0.496, "eval_loss": 0.6874648332595825, "eval_runtime": 5.3122, "eval_samples_per_second": 47.061, "eval_steps_per_second": 1.506, "step": 44 }, { "epoch": 2.8125, "grad_norm": 9.51041316986084, "learning_rate": 3.639240506329114e-05, "loss": 0.7424, "step": 45 }, { "epoch": 2.8125, "eval_accuracy": 0.496, "eval_loss": 0.6860937476158142, "eval_runtime": 5.3085, "eval_samples_per_second": 47.094, "eval_steps_per_second": 1.507, "step": 45 }, { "epoch": 2.875, "grad_norm": 11.795768737792969, "learning_rate": 3.607594936708861e-05, "loss": 0.6765, "step": 46 }, { "epoch": 2.875, "eval_accuracy": 0.524, "eval_loss": 0.6877519488334656, "eval_runtime": 5.3088, "eval_samples_per_second": 47.092, "eval_steps_per_second": 1.507, "step": 46 }, { "epoch": 2.9375, "grad_norm": 3.7762112617492676, "learning_rate": 3.575949367088608e-05, "loss": 0.6783, "step": 47 }, { "epoch": 2.9375, "eval_accuracy": 0.524, "eval_loss": 0.6865390539169312, "eval_runtime": 5.3045, "eval_samples_per_second": 47.13, "eval_steps_per_second": 1.508, "step": 47 }, { "epoch": 3.0, "grad_norm": 5.639859676361084, "learning_rate": 3.5443037974683544e-05, "loss": 0.6502, "step": 48 }, { "epoch": 3.0, "eval_accuracy": 0.528, "eval_loss": 0.6866562366485596, "eval_runtime": 5.3119, "eval_samples_per_second": 47.064, "eval_steps_per_second": 1.506, "step": 48 }, { "epoch": 3.0625, "grad_norm": 2.553826093673706, "learning_rate": 3.5126582278481015e-05, "loss": 0.6147, "step": 49 }, { "epoch": 3.0625, "eval_accuracy": 0.536, "eval_loss": 0.6869843602180481, "eval_runtime": 5.3099, "eval_samples_per_second": 47.082, "eval_steps_per_second": 1.507, "step": 49 }, { "epoch": 3.125, "grad_norm": 7.775162696838379, "learning_rate": 3.4810126582278487e-05, "loss": 0.664, "step": 50 }, { "epoch": 3.125, "eval_accuracy": 0.556, "eval_loss": 0.6859511733055115, "eval_runtime": 5.307, "eval_samples_per_second": 47.108, "eval_steps_per_second": 1.507, "step": 50 }, { "epoch": 3.1875, "grad_norm": 11.011063575744629, "learning_rate": 3.449367088607595e-05, "loss": 0.7327, "step": 51 }, { "epoch": 3.1875, "eval_accuracy": 0.52, "eval_loss": 0.6846230626106262, "eval_runtime": 5.3104, "eval_samples_per_second": 47.078, "eval_steps_per_second": 1.506, "step": 51 }, { "epoch": 3.25, "grad_norm": 5.553410053253174, "learning_rate": 3.4177215189873416e-05, "loss": 0.646, "step": 52 }, { "epoch": 3.25, "eval_accuracy": 0.548, "eval_loss": 0.6841738224029541, "eval_runtime": 5.3067, "eval_samples_per_second": 47.11, "eval_steps_per_second": 1.508, "step": 52 }, { "epoch": 3.3125, "grad_norm": 2.593555212020874, "learning_rate": 3.386075949367089e-05, "loss": 0.6843, "step": 53 }, { "epoch": 3.3125, "eval_accuracy": 0.56, "eval_loss": 0.6838808655738831, "eval_runtime": 5.259, "eval_samples_per_second": 47.537, "eval_steps_per_second": 1.521, "step": 53 }, { "epoch": 3.375, "grad_norm": 4.265442848205566, "learning_rate": 3.354430379746836e-05, "loss": 0.6446, "step": 54 }, { "epoch": 3.375, "eval_accuracy": 0.568, "eval_loss": 0.6847597360610962, "eval_runtime": 5.3077, "eval_samples_per_second": 47.102, "eval_steps_per_second": 1.507, "step": 54 }, { "epoch": 3.4375, "grad_norm": 4.211143493652344, "learning_rate": 3.322784810126582e-05, "loss": 0.621, "step": 55 }, { "epoch": 3.4375, "eval_accuracy": 0.56, "eval_loss": 0.6858515739440918, "eval_runtime": 5.3077, "eval_samples_per_second": 47.102, "eval_steps_per_second": 1.507, "step": 55 }, { "epoch": 3.5, "grad_norm": 6.034246444702148, "learning_rate": 3.291139240506329e-05, "loss": 0.6932, "step": 56 }, { "epoch": 3.5, "eval_accuracy": 0.572, "eval_loss": 0.685128927230835, "eval_runtime": 5.259, "eval_samples_per_second": 47.538, "eval_steps_per_second": 1.521, "step": 56 }, { "epoch": 3.5625, "grad_norm": 3.592977285385132, "learning_rate": 3.2594936708860766e-05, "loss": 0.6561, "step": 57 }, { "epoch": 3.5625, "eval_accuracy": 0.56, "eval_loss": 0.6853691339492798, "eval_runtime": 5.3095, "eval_samples_per_second": 47.085, "eval_steps_per_second": 1.507, "step": 57 }, { "epoch": 3.625, "grad_norm": 15.00008487701416, "learning_rate": 3.227848101265823e-05, "loss": 0.7426, "step": 58 }, { "epoch": 3.625, "eval_accuracy": 0.564, "eval_loss": 0.6856933832168579, "eval_runtime": 5.3139, "eval_samples_per_second": 47.046, "eval_steps_per_second": 1.505, "step": 58 }, { "epoch": 3.6875, "grad_norm": 1.8786113262176514, "learning_rate": 3.1962025316455695e-05, "loss": 0.6519, "step": 59 }, { "epoch": 3.6875, "eval_accuracy": 0.564, "eval_loss": 0.6856015920639038, "eval_runtime": 5.3109, "eval_samples_per_second": 47.073, "eval_steps_per_second": 1.506, "step": 59 }, { "epoch": 3.75, "grad_norm": 10.02760124206543, "learning_rate": 3.1645569620253167e-05, "loss": 0.7199, "step": 60 }, { "epoch": 3.75, "eval_accuracy": 0.564, "eval_loss": 0.685232400894165, "eval_runtime": 5.3095, "eval_samples_per_second": 47.086, "eval_steps_per_second": 1.507, "step": 60 }, { "epoch": 3.8125, "grad_norm": 7.662132263183594, "learning_rate": 3.132911392405064e-05, "loss": 0.6768, "step": 61 }, { "epoch": 3.8125, "eval_accuracy": 0.568, "eval_loss": 0.6829081773757935, "eval_runtime": 5.3144, "eval_samples_per_second": 47.042, "eval_steps_per_second": 1.505, "step": 61 }, { "epoch": 3.875, "grad_norm": 3.759078025817871, "learning_rate": 3.10126582278481e-05, "loss": 0.672, "step": 62 }, { "epoch": 3.875, "eval_accuracy": 0.548, "eval_loss": 0.6834414005279541, "eval_runtime": 5.3148, "eval_samples_per_second": 47.038, "eval_steps_per_second": 1.505, "step": 62 }, { "epoch": 3.9375, "grad_norm": 2.5582971572875977, "learning_rate": 3.0696202531645574e-05, "loss": 0.6285, "step": 63 }, { "epoch": 3.9375, "eval_accuracy": 0.544, "eval_loss": 0.6857627034187317, "eval_runtime": 5.311, "eval_samples_per_second": 47.072, "eval_steps_per_second": 1.506, "step": 63 }, { "epoch": 4.0, "grad_norm": 2.1489081382751465, "learning_rate": 3.0379746835443042e-05, "loss": 0.7185, "step": 64 }, { "epoch": 4.0, "eval_accuracy": 0.564, "eval_loss": 0.6834043264389038, "eval_runtime": 5.3171, "eval_samples_per_second": 47.018, "eval_steps_per_second": 1.505, "step": 64 }, { "epoch": 4.0625, "grad_norm": 11.183042526245117, "learning_rate": 3.0063291139240506e-05, "loss": 0.6931, "step": 65 }, { "epoch": 4.0625, "eval_accuracy": 0.564, "eval_loss": 0.6835390329360962, "eval_runtime": 5.3212, "eval_samples_per_second": 46.982, "eval_steps_per_second": 1.503, "step": 65 }, { "epoch": 4.125, "grad_norm": 3.0586631298065186, "learning_rate": 2.9746835443037974e-05, "loss": 0.6612, "step": 66 }, { "epoch": 4.125, "eval_accuracy": 0.568, "eval_loss": 0.6841933727264404, "eval_runtime": 5.3148, "eval_samples_per_second": 47.038, "eval_steps_per_second": 1.505, "step": 66 }, { "epoch": 4.1875, "grad_norm": 4.501911163330078, "learning_rate": 2.9430379746835446e-05, "loss": 0.6831, "step": 67 }, { "epoch": 4.1875, "eval_accuracy": 0.58, "eval_loss": 0.6846015453338623, "eval_runtime": 5.3149, "eval_samples_per_second": 47.037, "eval_steps_per_second": 1.505, "step": 67 }, { "epoch": 4.25, "grad_norm": 1.5323090553283691, "learning_rate": 2.9113924050632914e-05, "loss": 0.6068, "step": 68 }, { "epoch": 4.25, "eval_accuracy": 0.572, "eval_loss": 0.6852548718452454, "eval_runtime": 5.3093, "eval_samples_per_second": 47.088, "eval_steps_per_second": 1.507, "step": 68 }, { "epoch": 4.3125, "grad_norm": 2.318204164505005, "learning_rate": 2.879746835443038e-05, "loss": 0.6319, "step": 69 }, { "epoch": 4.3125, "eval_accuracy": 0.576, "eval_loss": 0.685799777507782, "eval_runtime": 5.3146, "eval_samples_per_second": 47.04, "eval_steps_per_second": 1.505, "step": 69 }, { "epoch": 4.375, "grad_norm": 4.164924621582031, "learning_rate": 2.848101265822785e-05, "loss": 0.6175, "step": 70 }, { "epoch": 4.375, "eval_accuracy": 0.56, "eval_loss": 0.6858603358268738, "eval_runtime": 5.3132, "eval_samples_per_second": 47.053, "eval_steps_per_second": 1.506, "step": 70 }, { "epoch": 4.4375, "grad_norm": 6.288677215576172, "learning_rate": 2.8164556962025318e-05, "loss": 0.7104, "step": 71 }, { "epoch": 4.4375, "eval_accuracy": 0.556, "eval_loss": 0.686227560043335, "eval_runtime": 5.315, "eval_samples_per_second": 47.037, "eval_steps_per_second": 1.505, "step": 71 }, { "epoch": 4.5, "grad_norm": 11.5812406539917, "learning_rate": 2.7848101265822786e-05, "loss": 0.6557, "step": 72 }, { "epoch": 4.5, "eval_accuracy": 0.584, "eval_loss": 0.6848037242889404, "eval_runtime": 5.3108, "eval_samples_per_second": 47.074, "eval_steps_per_second": 1.506, "step": 72 }, { "epoch": 4.5625, "grad_norm": 2.7816808223724365, "learning_rate": 2.7531645569620257e-05, "loss": 0.7062, "step": 73 }, { "epoch": 4.5625, "eval_accuracy": 0.536, "eval_loss": 0.6861132979393005, "eval_runtime": 5.3155, "eval_samples_per_second": 47.033, "eval_steps_per_second": 1.505, "step": 73 }, { "epoch": 4.625, "grad_norm": 2.379595994949341, "learning_rate": 2.7215189873417722e-05, "loss": 0.6942, "step": 74 }, { "epoch": 4.625, "eval_accuracy": 0.536, "eval_loss": 0.688970685005188, "eval_runtime": 5.3161, "eval_samples_per_second": 47.027, "eval_steps_per_second": 1.505, "step": 74 }, { "epoch": 4.6875, "grad_norm": 4.3716254234313965, "learning_rate": 2.689873417721519e-05, "loss": 0.6781, "step": 75 }, { "epoch": 4.6875, "eval_accuracy": 0.54, "eval_loss": 0.6908857226371765, "eval_runtime": 5.2606, "eval_samples_per_second": 47.523, "eval_steps_per_second": 1.521, "step": 75 }, { "epoch": 4.75, "grad_norm": 5.97125768661499, "learning_rate": 2.6582278481012658e-05, "loss": 0.6524, "step": 76 }, { "epoch": 4.75, "eval_accuracy": 0.552, "eval_loss": 0.6924248337745667, "eval_runtime": 5.2685, "eval_samples_per_second": 47.452, "eval_steps_per_second": 1.518, "step": 76 }, { "epoch": 4.8125, "grad_norm": 6.963293075561523, "learning_rate": 2.626582278481013e-05, "loss": 0.6492, "step": 77 }, { "epoch": 4.8125, "eval_accuracy": 0.528, "eval_loss": 0.6949306726455688, "eval_runtime": 5.2662, "eval_samples_per_second": 47.472, "eval_steps_per_second": 1.519, "step": 77 }, { "epoch": 4.875, "grad_norm": 3.0112223625183105, "learning_rate": 2.5949367088607597e-05, "loss": 0.6908, "step": 78 }, { "epoch": 4.875, "eval_accuracy": 0.536, "eval_loss": 0.6964213848114014, "eval_runtime": 5.3233, "eval_samples_per_second": 46.963, "eval_steps_per_second": 1.503, "step": 78 }, { "epoch": 4.9375, "grad_norm": 4.341198921203613, "learning_rate": 2.5632911392405062e-05, "loss": 0.6404, "step": 79 }, { "epoch": 4.9375, "eval_accuracy": 0.532, "eval_loss": 0.6959052681922913, "eval_runtime": 5.3169, "eval_samples_per_second": 47.02, "eval_steps_per_second": 1.505, "step": 79 }, { "epoch": 5.0, "grad_norm": 3.803114414215088, "learning_rate": 2.5316455696202533e-05, "loss": 0.7171, "step": 80 }, { "epoch": 5.0, "eval_accuracy": 0.532, "eval_loss": 0.6955664157867432, "eval_runtime": 5.3111, "eval_samples_per_second": 47.071, "eval_steps_per_second": 1.506, "step": 80 }, { "epoch": 5.0625, "grad_norm": 5.514766216278076, "learning_rate": 2.5e-05, "loss": 0.7032, "step": 81 }, { "epoch": 5.0625, "eval_accuracy": 0.564, "eval_loss": 0.6955312490463257, "eval_runtime": 5.3195, "eval_samples_per_second": 46.997, "eval_steps_per_second": 1.504, "step": 81 }, { "epoch": 5.125, "grad_norm": 3.385983943939209, "learning_rate": 2.468354430379747e-05, "loss": 0.5897, "step": 82 }, { "epoch": 5.125, "eval_accuracy": 0.568, "eval_loss": 0.6949701905250549, "eval_runtime": 5.3225, "eval_samples_per_second": 46.97, "eval_steps_per_second": 1.503, "step": 82 }, { "epoch": 5.1875, "grad_norm": 3.0686802864074707, "learning_rate": 2.4367088607594937e-05, "loss": 0.5982, "step": 83 }, { "epoch": 5.1875, "eval_accuracy": 0.544, "eval_loss": 0.692812979221344, "eval_runtime": 5.3141, "eval_samples_per_second": 47.044, "eval_steps_per_second": 1.505, "step": 83 }, { "epoch": 5.25, "grad_norm": 2.1283679008483887, "learning_rate": 2.4050632911392405e-05, "loss": 0.6012, "step": 84 }, { "epoch": 5.25, "eval_accuracy": 0.52, "eval_loss": 0.6914409399032593, "eval_runtime": 5.3123, "eval_samples_per_second": 47.061, "eval_steps_per_second": 1.506, "step": 84 }, { "epoch": 5.3125, "grad_norm": 7.913130760192871, "learning_rate": 2.3734177215189873e-05, "loss": 0.5944, "step": 85 }, { "epoch": 5.3125, "eval_accuracy": 0.552, "eval_loss": 0.6904281973838806, "eval_runtime": 5.3133, "eval_samples_per_second": 47.052, "eval_steps_per_second": 1.506, "step": 85 }, { "epoch": 5.375, "grad_norm": 5.7936506271362305, "learning_rate": 2.341772151898734e-05, "loss": 0.6313, "step": 86 }, { "epoch": 5.375, "eval_accuracy": 0.548, "eval_loss": 0.6896288990974426, "eval_runtime": 5.3153, "eval_samples_per_second": 47.034, "eval_steps_per_second": 1.505, "step": 86 }, { "epoch": 5.4375, "grad_norm": 2.888784885406494, "learning_rate": 2.3101265822784813e-05, "loss": 0.6628, "step": 87 }, { "epoch": 5.4375, "eval_accuracy": 0.548, "eval_loss": 0.6882919669151306, "eval_runtime": 5.3123, "eval_samples_per_second": 47.061, "eval_steps_per_second": 1.506, "step": 87 }, { "epoch": 5.5, "grad_norm": 11.805291175842285, "learning_rate": 2.278481012658228e-05, "loss": 0.69, "step": 88 }, { "epoch": 5.5, "eval_accuracy": 0.536, "eval_loss": 0.6885942220687866, "eval_runtime": 5.311, "eval_samples_per_second": 47.072, "eval_steps_per_second": 1.506, "step": 88 }, { "epoch": 5.5625, "grad_norm": 2.115865468978882, "learning_rate": 2.246835443037975e-05, "loss": 0.637, "step": 89 }, { "epoch": 5.5625, "eval_accuracy": 0.544, "eval_loss": 0.6866220831871033, "eval_runtime": 5.3087, "eval_samples_per_second": 47.093, "eval_steps_per_second": 1.507, "step": 89 }, { "epoch": 5.625, "grad_norm": 5.957005023956299, "learning_rate": 2.2151898734177217e-05, "loss": 0.6177, "step": 90 }, { "epoch": 5.625, "eval_accuracy": 0.532, "eval_loss": 0.6857876181602478, "eval_runtime": 5.3149, "eval_samples_per_second": 47.037, "eval_steps_per_second": 1.505, "step": 90 }, { "epoch": 5.6875, "grad_norm": 2.0642597675323486, "learning_rate": 2.1835443037974685e-05, "loss": 0.6189, "step": 91 }, { "epoch": 5.6875, "eval_accuracy": 0.528, "eval_loss": 0.6860429644584656, "eval_runtime": 5.32, "eval_samples_per_second": 46.993, "eval_steps_per_second": 1.504, "step": 91 }, { "epoch": 5.75, "grad_norm": 4.555762767791748, "learning_rate": 2.1518987341772153e-05, "loss": 0.6034, "step": 92 }, { "epoch": 5.75, "eval_accuracy": 0.556, "eval_loss": 0.6854721903800964, "eval_runtime": 5.3096, "eval_samples_per_second": 47.085, "eval_steps_per_second": 1.507, "step": 92 }, { "epoch": 5.8125, "grad_norm": 2.375642776489258, "learning_rate": 2.120253164556962e-05, "loss": 0.6416, "step": 93 }, { "epoch": 5.8125, "eval_accuracy": 0.532, "eval_loss": 0.684140145778656, "eval_runtime": 5.3159, "eval_samples_per_second": 47.029, "eval_steps_per_second": 1.505, "step": 93 }, { "epoch": 5.875, "grad_norm": 6.5690226554870605, "learning_rate": 2.088607594936709e-05, "loss": 0.6129, "step": 94 }, { "epoch": 5.875, "eval_accuracy": 0.544, "eval_loss": 0.6858144402503967, "eval_runtime": 5.3132, "eval_samples_per_second": 47.053, "eval_steps_per_second": 1.506, "step": 94 }, { "epoch": 5.9375, "grad_norm": 1.774743676185608, "learning_rate": 2.056962025316456e-05, "loss": 0.62, "step": 95 }, { "epoch": 5.9375, "eval_accuracy": 0.528, "eval_loss": 0.6840459108352661, "eval_runtime": 5.313, "eval_samples_per_second": 47.054, "eval_steps_per_second": 1.506, "step": 95 }, { "epoch": 6.0, "grad_norm": 7.703030586242676, "learning_rate": 2.0253164556962025e-05, "loss": 0.6521, "step": 96 }, { "epoch": 6.0, "eval_accuracy": 0.552, "eval_loss": 0.6837988495826721, "eval_runtime": 5.3128, "eval_samples_per_second": 47.056, "eval_steps_per_second": 1.506, "step": 96 }, { "epoch": 6.0625, "grad_norm": 4.748805046081543, "learning_rate": 1.9936708860759496e-05, "loss": 0.6026, "step": 97 }, { "epoch": 6.0625, "eval_accuracy": 0.592, "eval_loss": 0.6840049028396606, "eval_runtime": 5.3155, "eval_samples_per_second": 47.032, "eval_steps_per_second": 1.505, "step": 97 }, { "epoch": 6.125, "grad_norm": 2.8667399883270264, "learning_rate": 1.962025316455696e-05, "loss": 0.6371, "step": 98 }, { "epoch": 6.125, "eval_accuracy": 0.588, "eval_loss": 0.6844116449356079, "eval_runtime": 5.3126, "eval_samples_per_second": 47.058, "eval_steps_per_second": 1.506, "step": 98 }, { "epoch": 6.1875, "grad_norm": 3.6409852504730225, "learning_rate": 1.9303797468354432e-05, "loss": 0.6306, "step": 99 }, { "epoch": 6.1875, "eval_accuracy": 0.596, "eval_loss": 0.6820478439331055, "eval_runtime": 5.3117, "eval_samples_per_second": 47.066, "eval_steps_per_second": 1.506, "step": 99 }, { "epoch": 6.25, "grad_norm": 4.963222503662109, "learning_rate": 1.89873417721519e-05, "loss": 0.6391, "step": 100 }, { "epoch": 6.25, "eval_accuracy": 0.608, "eval_loss": 0.6823159456253052, "eval_runtime": 5.3114, "eval_samples_per_second": 47.068, "eval_steps_per_second": 1.506, "step": 100 }, { "epoch": 6.3125, "grad_norm": 1.682558298110962, "learning_rate": 1.8670886075949368e-05, "loss": 0.6195, "step": 101 }, { "epoch": 6.3125, "eval_accuracy": 0.624, "eval_loss": 0.6814062595367432, "eval_runtime": 5.3109, "eval_samples_per_second": 47.073, "eval_steps_per_second": 1.506, "step": 101 }, { "epoch": 6.375, "grad_norm": 2.050185203552246, "learning_rate": 1.8354430379746836e-05, "loss": 0.6386, "step": 102 }, { "epoch": 6.375, "eval_accuracy": 0.608, "eval_loss": 0.6834206581115723, "eval_runtime": 5.3138, "eval_samples_per_second": 47.047, "eval_steps_per_second": 1.506, "step": 102 }, { "epoch": 6.4375, "grad_norm": 15.974775314331055, "learning_rate": 1.8037974683544304e-05, "loss": 0.6763, "step": 103 }, { "epoch": 6.4375, "eval_accuracy": 0.628, "eval_loss": 0.6807863712310791, "eval_runtime": 5.3117, "eval_samples_per_second": 47.066, "eval_steps_per_second": 1.506, "step": 103 }, { "epoch": 6.5, "grad_norm": 5.336279392242432, "learning_rate": 1.7721518987341772e-05, "loss": 0.6451, "step": 104 }, { "epoch": 6.5, "eval_accuracy": 0.616, "eval_loss": 0.6820644736289978, "eval_runtime": 5.3156, "eval_samples_per_second": 47.031, "eval_steps_per_second": 1.505, "step": 104 }, { "epoch": 6.5625, "grad_norm": 2.329730749130249, "learning_rate": 1.7405063291139243e-05, "loss": 0.6277, "step": 105 }, { "epoch": 6.5625, "eval_accuracy": 0.612, "eval_loss": 0.6823762059211731, "eval_runtime": 5.314, "eval_samples_per_second": 47.046, "eval_steps_per_second": 1.505, "step": 105 }, { "epoch": 6.625, "grad_norm": 4.839900970458984, "learning_rate": 1.7088607594936708e-05, "loss": 0.6026, "step": 106 }, { "epoch": 6.625, "eval_accuracy": 0.62, "eval_loss": 0.6794611811637878, "eval_runtime": 5.3123, "eval_samples_per_second": 47.061, "eval_steps_per_second": 1.506, "step": 106 }, { "epoch": 6.6875, "grad_norm": 3.032259702682495, "learning_rate": 1.677215189873418e-05, "loss": 0.5925, "step": 107 }, { "epoch": 6.6875, "eval_accuracy": 0.608, "eval_loss": 0.6825341582298279, "eval_runtime": 5.313, "eval_samples_per_second": 47.055, "eval_steps_per_second": 1.506, "step": 107 }, { "epoch": 6.75, "grad_norm": 8.187138557434082, "learning_rate": 1.6455696202531644e-05, "loss": 0.6207, "step": 108 }, { "epoch": 6.75, "eval_accuracy": 0.608, "eval_loss": 0.6813283562660217, "eval_runtime": 5.3086, "eval_samples_per_second": 47.093, "eval_steps_per_second": 1.507, "step": 108 }, { "epoch": 6.8125, "grad_norm": 5.521825313568115, "learning_rate": 1.6139240506329115e-05, "loss": 0.6052, "step": 109 }, { "epoch": 6.8125, "eval_accuracy": 0.576, "eval_loss": 0.6814047694206238, "eval_runtime": 5.3113, "eval_samples_per_second": 47.069, "eval_steps_per_second": 1.506, "step": 109 }, { "epoch": 6.875, "grad_norm": 7.56046724319458, "learning_rate": 1.5822784810126583e-05, "loss": 0.6089, "step": 110 }, { "epoch": 6.875, "eval_accuracy": 0.56, "eval_loss": 0.6806631088256836, "eval_runtime": 5.3189, "eval_samples_per_second": 47.002, "eval_steps_per_second": 1.504, "step": 110 }, { "epoch": 6.9375, "grad_norm": 1.980968952178955, "learning_rate": 1.550632911392405e-05, "loss": 0.6069, "step": 111 }, { "epoch": 6.9375, "eval_accuracy": 0.548, "eval_loss": 0.6806347370147705, "eval_runtime": 5.3128, "eval_samples_per_second": 47.057, "eval_steps_per_second": 1.506, "step": 111 }, { "epoch": 7.0, "grad_norm": 7.8832573890686035, "learning_rate": 1.5189873417721521e-05, "loss": 0.5864, "step": 112 }, { "epoch": 7.0, "eval_accuracy": 0.556, "eval_loss": 0.6805583238601685, "eval_runtime": 5.2659, "eval_samples_per_second": 47.475, "eval_steps_per_second": 1.519, "step": 112 }, { "epoch": 7.0625, "grad_norm": 2.3190677165985107, "learning_rate": 1.4873417721518987e-05, "loss": 0.6027, "step": 113 }, { "epoch": 7.0625, "eval_accuracy": 0.568, "eval_loss": 0.681329607963562, "eval_runtime": 5.3114, "eval_samples_per_second": 47.069, "eval_steps_per_second": 1.506, "step": 113 }, { "epoch": 7.125, "grad_norm": 9.695440292358398, "learning_rate": 1.4556962025316457e-05, "loss": 0.6683, "step": 114 }, { "epoch": 7.125, "eval_accuracy": 0.58, "eval_loss": 0.6799863576889038, "eval_runtime": 5.3176, "eval_samples_per_second": 47.014, "eval_steps_per_second": 1.504, "step": 114 }, { "epoch": 7.1875, "grad_norm": 1.5382860898971558, "learning_rate": 1.4240506329113925e-05, "loss": 0.6095, "step": 115 }, { "epoch": 7.1875, "eval_accuracy": 0.596, "eval_loss": 0.6808972358703613, "eval_runtime": 5.3123, "eval_samples_per_second": 47.06, "eval_steps_per_second": 1.506, "step": 115 }, { "epoch": 7.25, "grad_norm": 3.808166027069092, "learning_rate": 1.3924050632911393e-05, "loss": 0.5888, "step": 116 }, { "epoch": 7.25, "eval_accuracy": 0.616, "eval_loss": 0.6814682483673096, "eval_runtime": 5.3131, "eval_samples_per_second": 47.053, "eval_steps_per_second": 1.506, "step": 116 }, { "epoch": 7.3125, "grad_norm": 2.3935582637786865, "learning_rate": 1.3607594936708861e-05, "loss": 0.6377, "step": 117 }, { "epoch": 7.3125, "eval_accuracy": 0.624, "eval_loss": 0.6837414503097534, "eval_runtime": 5.3178, "eval_samples_per_second": 47.012, "eval_steps_per_second": 1.504, "step": 117 }, { "epoch": 7.375, "grad_norm": 9.851938247680664, "learning_rate": 1.3291139240506329e-05, "loss": 0.5808, "step": 118 }, { "epoch": 7.375, "eval_accuracy": 0.628, "eval_loss": 0.6859772801399231, "eval_runtime": 5.3156, "eval_samples_per_second": 47.031, "eval_steps_per_second": 1.505, "step": 118 }, { "epoch": 7.4375, "grad_norm": 4.824713706970215, "learning_rate": 1.2974683544303799e-05, "loss": 0.6078, "step": 119 }, { "epoch": 7.4375, "eval_accuracy": 0.624, "eval_loss": 0.6843095421791077, "eval_runtime": 5.3153, "eval_samples_per_second": 47.034, "eval_steps_per_second": 1.505, "step": 119 }, { "epoch": 7.5, "grad_norm": 6.523806571960449, "learning_rate": 1.2658227848101267e-05, "loss": 0.6079, "step": 120 }, { "epoch": 7.5, "eval_accuracy": 0.636, "eval_loss": 0.6858022212982178, "eval_runtime": 5.3164, "eval_samples_per_second": 47.025, "eval_steps_per_second": 1.505, "step": 120 }, { "epoch": 7.5625, "grad_norm": 2.3785831928253174, "learning_rate": 1.2341772151898735e-05, "loss": 0.5977, "step": 121 }, { "epoch": 7.5625, "eval_accuracy": 0.624, "eval_loss": 0.6857990622520447, "eval_runtime": 5.2701, "eval_samples_per_second": 47.437, "eval_steps_per_second": 1.518, "step": 121 }, { "epoch": 7.625, "grad_norm": 3.375392436981201, "learning_rate": 1.2025316455696203e-05, "loss": 0.6302, "step": 122 }, { "epoch": 7.625, "eval_accuracy": 0.624, "eval_loss": 0.6846601366996765, "eval_runtime": 5.3246, "eval_samples_per_second": 46.952, "eval_steps_per_second": 1.502, "step": 122 }, { "epoch": 7.6875, "grad_norm": 1.7652422189712524, "learning_rate": 1.170886075949367e-05, "loss": 0.5844, "step": 123 }, { "epoch": 7.6875, "eval_accuracy": 0.616, "eval_loss": 0.684487521648407, "eval_runtime": 5.3177, "eval_samples_per_second": 47.013, "eval_steps_per_second": 1.504, "step": 123 }, { "epoch": 7.75, "grad_norm": 2.071542263031006, "learning_rate": 1.139240506329114e-05, "loss": 0.5954, "step": 124 }, { "epoch": 7.75, "eval_accuracy": 0.596, "eval_loss": 0.6850346922874451, "eval_runtime": 5.3238, "eval_samples_per_second": 46.959, "eval_steps_per_second": 1.503, "step": 124 }, { "epoch": 7.8125, "grad_norm": 8.903618812561035, "learning_rate": 1.1075949367088608e-05, "loss": 0.6133, "step": 125 }, { "epoch": 7.8125, "eval_accuracy": 0.576, "eval_loss": 0.6850595474243164, "eval_runtime": 5.3186, "eval_samples_per_second": 47.005, "eval_steps_per_second": 1.504, "step": 125 }, { "epoch": 7.875, "grad_norm": 3.892320156097412, "learning_rate": 1.0759493670886076e-05, "loss": 0.6233, "step": 126 }, { "epoch": 7.875, "eval_accuracy": 0.576, "eval_loss": 0.6834313869476318, "eval_runtime": 5.3197, "eval_samples_per_second": 46.995, "eval_steps_per_second": 1.504, "step": 126 }, { "epoch": 7.9375, "grad_norm": 1.781283974647522, "learning_rate": 1.0443037974683544e-05, "loss": 0.614, "step": 127 }, { "epoch": 7.9375, "eval_accuracy": 0.56, "eval_loss": 0.6822507381439209, "eval_runtime": 5.315, "eval_samples_per_second": 47.036, "eval_steps_per_second": 1.505, "step": 127 }, { "epoch": 8.0, "grad_norm": 6.411435127258301, "learning_rate": 1.0126582278481012e-05, "loss": 0.6076, "step": 128 }, { "epoch": 8.0, "eval_accuracy": 0.544, "eval_loss": 0.6811806559562683, "eval_runtime": 5.3173, "eval_samples_per_second": 47.016, "eval_steps_per_second": 1.505, "step": 128 }, { "epoch": 8.0625, "grad_norm": 5.01564359664917, "learning_rate": 9.81012658227848e-06, "loss": 0.6302, "step": 129 }, { "epoch": 8.0625, "eval_accuracy": 0.548, "eval_loss": 0.6792832016944885, "eval_runtime": 5.3135, "eval_samples_per_second": 47.05, "eval_steps_per_second": 1.506, "step": 129 }, { "epoch": 8.125, "grad_norm": 9.113856315612793, "learning_rate": 9.49367088607595e-06, "loss": 0.6034, "step": 130 }, { "epoch": 8.125, "eval_accuracy": 0.54, "eval_loss": 0.6796222925186157, "eval_runtime": 5.316, "eval_samples_per_second": 47.028, "eval_steps_per_second": 1.505, "step": 130 }, { "epoch": 8.1875, "grad_norm": 7.10761022567749, "learning_rate": 9.177215189873418e-06, "loss": 0.5996, "step": 131 }, { "epoch": 8.1875, "eval_accuracy": 0.548, "eval_loss": 0.6793979406356812, "eval_runtime": 5.3223, "eval_samples_per_second": 46.972, "eval_steps_per_second": 1.503, "step": 131 }, { "epoch": 8.25, "grad_norm": 3.1391570568084717, "learning_rate": 8.860759493670886e-06, "loss": 0.6148, "step": 132 }, { "epoch": 8.25, "eval_accuracy": 0.544, "eval_loss": 0.6803398728370667, "eval_runtime": 5.3155, "eval_samples_per_second": 47.032, "eval_steps_per_second": 1.505, "step": 132 }, { "epoch": 8.3125, "grad_norm": 4.992683410644531, "learning_rate": 8.544303797468354e-06, "loss": 0.6273, "step": 133 }, { "epoch": 8.3125, "eval_accuracy": 0.556, "eval_loss": 0.6810378432273865, "eval_runtime": 5.3212, "eval_samples_per_second": 46.982, "eval_steps_per_second": 1.503, "step": 133 }, { "epoch": 8.375, "grad_norm": 6.507674694061279, "learning_rate": 8.227848101265822e-06, "loss": 0.6111, "step": 134 }, { "epoch": 8.375, "eval_accuracy": 0.552, "eval_loss": 0.6796850562095642, "eval_runtime": 5.318, "eval_samples_per_second": 47.01, "eval_steps_per_second": 1.504, "step": 134 }, { "epoch": 8.4375, "grad_norm": 2.2606754302978516, "learning_rate": 7.911392405063292e-06, "loss": 0.6564, "step": 135 }, { "epoch": 8.4375, "eval_accuracy": 0.556, "eval_loss": 0.6825842261314392, "eval_runtime": 5.3142, "eval_samples_per_second": 47.044, "eval_steps_per_second": 1.505, "step": 135 }, { "epoch": 8.5, "grad_norm": 6.89772891998291, "learning_rate": 7.5949367088607605e-06, "loss": 0.5715, "step": 136 }, { "epoch": 8.5, "eval_accuracy": 0.564, "eval_loss": 0.6805503964424133, "eval_runtime": 5.318, "eval_samples_per_second": 47.01, "eval_steps_per_second": 1.504, "step": 136 }, { "epoch": 8.5625, "grad_norm": 6.177364349365234, "learning_rate": 7.2784810126582285e-06, "loss": 0.632, "step": 137 }, { "epoch": 8.5625, "eval_accuracy": 0.564, "eval_loss": 0.6819558143615723, "eval_runtime": 5.3138, "eval_samples_per_second": 47.047, "eval_steps_per_second": 1.506, "step": 137 }, { "epoch": 8.625, "grad_norm": 4.602504253387451, "learning_rate": 6.9620253164556965e-06, "loss": 0.5721, "step": 138 }, { "epoch": 8.625, "eval_accuracy": 0.568, "eval_loss": 0.682602047920227, "eval_runtime": 5.3181, "eval_samples_per_second": 47.009, "eval_steps_per_second": 1.504, "step": 138 }, { "epoch": 8.6875, "grad_norm": 6.823434829711914, "learning_rate": 6.6455696202531645e-06, "loss": 0.5641, "step": 139 }, { "epoch": 8.6875, "eval_accuracy": 0.596, "eval_loss": 0.683552622795105, "eval_runtime": 5.3141, "eval_samples_per_second": 47.045, "eval_steps_per_second": 1.505, "step": 139 }, { "epoch": 8.75, "grad_norm": 1.5306897163391113, "learning_rate": 6.329113924050633e-06, "loss": 0.5734, "step": 140 }, { "epoch": 8.75, "eval_accuracy": 0.588, "eval_loss": 0.6846555471420288, "eval_runtime": 5.318, "eval_samples_per_second": 47.01, "eval_steps_per_second": 1.504, "step": 140 }, { "epoch": 8.8125, "grad_norm": 2.248267650604248, "learning_rate": 6.012658227848101e-06, "loss": 0.5735, "step": 141 }, { "epoch": 8.8125, "eval_accuracy": 0.58, "eval_loss": 0.6854619383811951, "eval_runtime": 5.3165, "eval_samples_per_second": 47.024, "eval_steps_per_second": 1.505, "step": 141 }, { "epoch": 8.875, "grad_norm": 3.2699761390686035, "learning_rate": 5.69620253164557e-06, "loss": 0.5866, "step": 142 }, { "epoch": 8.875, "eval_accuracy": 0.604, "eval_loss": 0.686599612236023, "eval_runtime": 5.3145, "eval_samples_per_second": 47.041, "eval_steps_per_second": 1.505, "step": 142 }, { "epoch": 8.9375, "grad_norm": 3.8013508319854736, "learning_rate": 5.379746835443038e-06, "loss": 0.6378, "step": 143 }, { "epoch": 8.9375, "eval_accuracy": 0.612, "eval_loss": 0.6867551207542419, "eval_runtime": 5.3188, "eval_samples_per_second": 47.003, "eval_steps_per_second": 1.504, "step": 143 }, { "epoch": 9.0, "grad_norm": 5.138718128204346, "learning_rate": 5.063291139240506e-06, "loss": 0.5879, "step": 144 }, { "epoch": 9.0, "eval_accuracy": 0.62, "eval_loss": 0.6862714886665344, "eval_runtime": 5.3141, "eval_samples_per_second": 47.044, "eval_steps_per_second": 1.505, "step": 144 }, { "epoch": 9.0625, "grad_norm": 8.058981895446777, "learning_rate": 4.746835443037975e-06, "loss": 0.6328, "step": 145 }, { "epoch": 9.0625, "eval_accuracy": 0.62, "eval_loss": 0.6870667934417725, "eval_runtime": 5.3138, "eval_samples_per_second": 47.047, "eval_steps_per_second": 1.506, "step": 145 }, { "epoch": 9.125, "grad_norm": 3.1329164505004883, "learning_rate": 4.430379746835443e-06, "loss": 0.6065, "step": 146 }, { "epoch": 9.125, "eval_accuracy": 0.64, "eval_loss": 0.6861372590065002, "eval_runtime": 5.3252, "eval_samples_per_second": 46.947, "eval_steps_per_second": 1.502, "step": 146 }, { "epoch": 9.1875, "grad_norm": 3.5430028438568115, "learning_rate": 4.113924050632911e-06, "loss": 0.5956, "step": 147 }, { "epoch": 9.1875, "eval_accuracy": 0.652, "eval_loss": 0.6884194612503052, "eval_runtime": 5.3177, "eval_samples_per_second": 47.012, "eval_steps_per_second": 1.504, "step": 147 }, { "epoch": 9.25, "grad_norm": 8.4904146194458, "learning_rate": 3.7974683544303802e-06, "loss": 0.6129, "step": 148 }, { "epoch": 9.25, "eval_accuracy": 0.644, "eval_loss": 0.6896327137947083, "eval_runtime": 5.3169, "eval_samples_per_second": 47.019, "eval_steps_per_second": 1.505, "step": 148 }, { "epoch": 9.3125, "grad_norm": 3.3191404342651367, "learning_rate": 3.4810126582278482e-06, "loss": 0.562, "step": 149 }, { "epoch": 9.3125, "eval_accuracy": 0.628, "eval_loss": 0.6911370158195496, "eval_runtime": 5.3098, "eval_samples_per_second": 47.083, "eval_steps_per_second": 1.507, "step": 149 }, { "epoch": 9.375, "grad_norm": 4.453939914703369, "learning_rate": 3.1645569620253167e-06, "loss": 0.6041, "step": 150 }, { "epoch": 9.375, "eval_accuracy": 0.64, "eval_loss": 0.6905979514122009, "eval_runtime": 5.3171, "eval_samples_per_second": 47.019, "eval_steps_per_second": 1.505, "step": 150 }, { "epoch": 9.4375, "grad_norm": 2.314422130584717, "learning_rate": 2.848101265822785e-06, "loss": 0.5941, "step": 151 }, { "epoch": 9.4375, "eval_accuracy": 0.64, "eval_loss": 0.6917807459831238, "eval_runtime": 5.3171, "eval_samples_per_second": 47.018, "eval_steps_per_second": 1.505, "step": 151 }, { "epoch": 9.5, "grad_norm": 1.290116786956787, "learning_rate": 2.531645569620253e-06, "loss": 0.5764, "step": 152 }, { "epoch": 9.5, "eval_accuracy": 0.632, "eval_loss": 0.6930723190307617, "eval_runtime": 5.3153, "eval_samples_per_second": 47.034, "eval_steps_per_second": 1.505, "step": 152 }, { "epoch": 9.5625, "grad_norm": 5.727312088012695, "learning_rate": 2.2151898734177215e-06, "loss": 0.543, "step": 153 }, { "epoch": 9.5625, "eval_accuracy": 0.644, "eval_loss": 0.6937922239303589, "eval_runtime": 5.3186, "eval_samples_per_second": 47.005, "eval_steps_per_second": 1.504, "step": 153 }, { "epoch": 9.625, "grad_norm": 2.591679811477661, "learning_rate": 1.8987341772151901e-06, "loss": 0.5774, "step": 154 }, { "epoch": 9.625, "eval_accuracy": 0.636, "eval_loss": 0.6921583414077759, "eval_runtime": 5.3205, "eval_samples_per_second": 46.988, "eval_steps_per_second": 1.504, "step": 154 }, { "epoch": 9.6875, "grad_norm": 6.093884468078613, "learning_rate": 1.5822784810126583e-06, "loss": 0.5699, "step": 155 }, { "epoch": 9.6875, "eval_accuracy": 0.644, "eval_loss": 0.6911875009536743, "eval_runtime": 5.2694, "eval_samples_per_second": 47.443, "eval_steps_per_second": 1.518, "step": 155 }, { "epoch": 9.75, "grad_norm": 9.202909469604492, "learning_rate": 1.2658227848101265e-06, "loss": 0.5794, "step": 156 }, { "epoch": 9.75, "eval_accuracy": 0.64, "eval_loss": 0.6915637850761414, "eval_runtime": 5.3245, "eval_samples_per_second": 46.953, "eval_steps_per_second": 1.502, "step": 156 }, { "epoch": 9.8125, "grad_norm": 1.493701696395874, "learning_rate": 9.493670886075951e-07, "loss": 0.5696, "step": 157 }, { "epoch": 9.8125, "eval_accuracy": 0.632, "eval_loss": 0.6913649439811707, "eval_runtime": 5.3147, "eval_samples_per_second": 47.04, "eval_steps_per_second": 1.505, "step": 157 }, { "epoch": 9.875, "grad_norm": 4.0445876121521, "learning_rate": 6.329113924050633e-07, "loss": 0.6054, "step": 158 }, { "epoch": 9.875, "eval_accuracy": 0.64, "eval_loss": 0.6925042867660522, "eval_runtime": 5.3166, "eval_samples_per_second": 47.022, "eval_steps_per_second": 1.505, "step": 158 }, { "epoch": 9.9375, "grad_norm": 5.018632411956787, "learning_rate": 3.1645569620253163e-07, "loss": 0.6027, "step": 159 }, { "epoch": 9.9375, "eval_accuracy": 0.632, "eval_loss": 0.6916110515594482, "eval_runtime": 5.3164, "eval_samples_per_second": 47.024, "eval_steps_per_second": 1.505, "step": 159 }, { "epoch": 10.0, "grad_norm": 4.067493438720703, "learning_rate": 0.0, "loss": 0.5915, "step": 160 }, { "epoch": 10.0, "eval_accuracy": 0.64, "eval_loss": 0.6912569403648376, "eval_runtime": 5.3132, "eval_samples_per_second": 47.053, "eval_steps_per_second": 1.506, "step": 160 }, { "epoch": 10.0, "step": 160, "total_flos": 5.798258932711424e+16, "train_loss": 0.6541111502796412, "train_runtime": 1664.9314, "train_samples_per_second": 6.006, "train_steps_per_second": 0.096 } ], "logging_steps": 1, "max_steps": 160, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 5.798258932711424e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }