{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.984, "eval_steps": 1, "global_step": 124, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 19.792200088500977, "learning_rate": 2.5e-05, "loss": 0.9587, "step": 1 }, { "epoch": 0.016, "eval_accuracy": 0.4, "eval_loss": 1.2008212804794312, "eval_runtime": 9.5896, "eval_samples_per_second": 26.07, "eval_steps_per_second": 3.337, "step": 1 }, { "epoch": 0.032, "grad_norm": 26.413536071777344, "learning_rate": 5e-05, "loss": 1.0902, "step": 2 }, { "epoch": 0.032, "eval_accuracy": 0.4, "eval_loss": 1.1015820503234863, "eval_runtime": 9.4487, "eval_samples_per_second": 26.459, "eval_steps_per_second": 3.387, "step": 2 }, { "epoch": 0.048, "grad_norm": 16.49271011352539, "learning_rate": 4.959016393442623e-05, "loss": 0.8807, "step": 3 }, { "epoch": 0.048, "eval_accuracy": 0.428, "eval_loss": 0.9215332269668579, "eval_runtime": 9.4596, "eval_samples_per_second": 26.428, "eval_steps_per_second": 3.383, "step": 3 }, { "epoch": 0.064, "grad_norm": 30.666654586791992, "learning_rate": 4.918032786885246e-05, "loss": 0.9722, "step": 4 }, { "epoch": 0.064, "eval_accuracy": 0.476, "eval_loss": 0.8115702867507935, "eval_runtime": 9.4643, "eval_samples_per_second": 26.415, "eval_steps_per_second": 3.381, "step": 4 }, { "epoch": 0.08, "grad_norm": 10.605666160583496, "learning_rate": 4.8770491803278687e-05, "loss": 0.7957, "step": 5 }, { "epoch": 0.08, "eval_accuracy": 0.476, "eval_loss": 0.7770839929580688, "eval_runtime": 9.4208, "eval_samples_per_second": 26.537, "eval_steps_per_second": 3.397, "step": 5 }, { "epoch": 0.096, "grad_norm": 15.425003051757812, "learning_rate": 4.836065573770492e-05, "loss": 0.5228, "step": 6 }, { "epoch": 0.096, "eval_accuracy": 0.504, "eval_loss": 0.805004894733429, "eval_runtime": 9.4526, "eval_samples_per_second": 26.448, "eval_steps_per_second": 3.385, "step": 6 }, { "epoch": 0.112, "grad_norm": 10.599884033203125, "learning_rate": 4.795081967213115e-05, "loss": 0.739, "step": 7 }, { "epoch": 0.112, "eval_accuracy": 0.532, "eval_loss": 0.8277338743209839, "eval_runtime": 9.4867, "eval_samples_per_second": 26.353, "eval_steps_per_second": 3.373, "step": 7 }, { "epoch": 0.128, "grad_norm": 32.59059143066406, "learning_rate": 4.754098360655738e-05, "loss": 1.2898, "step": 8 }, { "epoch": 0.128, "eval_accuracy": 0.564, "eval_loss": 0.8153437376022339, "eval_runtime": 9.4204, "eval_samples_per_second": 26.538, "eval_steps_per_second": 3.397, "step": 8 }, { "epoch": 0.144, "grad_norm": 22.269615173339844, "learning_rate": 4.713114754098361e-05, "loss": 0.9083, "step": 9 }, { "epoch": 0.144, "eval_accuracy": 0.612, "eval_loss": 0.7878813743591309, "eval_runtime": 9.4054, "eval_samples_per_second": 26.581, "eval_steps_per_second": 3.402, "step": 9 }, { "epoch": 0.16, "grad_norm": 8.692429542541504, "learning_rate": 4.672131147540984e-05, "loss": 0.5176, "step": 10 }, { "epoch": 0.16, "eval_accuracy": 0.6, "eval_loss": 0.7594003677368164, "eval_runtime": 9.4569, "eval_samples_per_second": 26.436, "eval_steps_per_second": 3.384, "step": 10 }, { "epoch": 0.176, "grad_norm": 9.585773468017578, "learning_rate": 4.631147540983607e-05, "loss": 0.7224, "step": 11 }, { "epoch": 0.176, "eval_accuracy": 0.604, "eval_loss": 0.7379999756813049, "eval_runtime": 9.4177, "eval_samples_per_second": 26.546, "eval_steps_per_second": 3.398, "step": 11 }, { "epoch": 0.192, "grad_norm": 10.833252906799316, "learning_rate": 4.59016393442623e-05, "loss": 0.8363, "step": 12 }, { "epoch": 0.192, "eval_accuracy": 0.588, "eval_loss": 0.7211699485778809, "eval_runtime": 9.4924, "eval_samples_per_second": 26.337, "eval_steps_per_second": 3.371, "step": 12 }, { "epoch": 0.208, "grad_norm": 17.36851692199707, "learning_rate": 4.549180327868853e-05, "loss": 0.868, "step": 13 }, { "epoch": 0.208, "eval_accuracy": 0.58, "eval_loss": 0.7057519555091858, "eval_runtime": 9.4694, "eval_samples_per_second": 26.401, "eval_steps_per_second": 3.379, "step": 13 }, { "epoch": 0.224, "grad_norm": 9.86408805847168, "learning_rate": 4.508196721311476e-05, "loss": 0.5603, "step": 14 }, { "epoch": 0.224, "eval_accuracy": 0.584, "eval_loss": 0.7084277272224426, "eval_runtime": 9.4106, "eval_samples_per_second": 26.566, "eval_steps_per_second": 3.4, "step": 14 }, { "epoch": 0.24, "grad_norm": 6.7643585205078125, "learning_rate": 4.467213114754098e-05, "loss": 0.6958, "step": 15 }, { "epoch": 0.24, "eval_accuracy": 0.572, "eval_loss": 0.7116777300834656, "eval_runtime": 9.4526, "eval_samples_per_second": 26.448, "eval_steps_per_second": 3.385, "step": 15 }, { "epoch": 0.256, "grad_norm": 13.049300193786621, "learning_rate": 4.426229508196721e-05, "loss": 0.5868, "step": 16 }, { "epoch": 0.256, "eval_accuracy": 0.576, "eval_loss": 0.7166406512260437, "eval_runtime": 9.4248, "eval_samples_per_second": 26.526, "eval_steps_per_second": 3.395, "step": 16 }, { "epoch": 0.272, "grad_norm": 12.840044021606445, "learning_rate": 4.3852459016393444e-05, "loss": 0.5497, "step": 17 }, { "epoch": 0.272, "eval_accuracy": 0.588, "eval_loss": 0.7239003777503967, "eval_runtime": 9.4276, "eval_samples_per_second": 26.518, "eval_steps_per_second": 3.394, "step": 17 }, { "epoch": 0.288, "grad_norm": 9.021048545837402, "learning_rate": 4.3442622950819674e-05, "loss": 0.7557, "step": 18 }, { "epoch": 0.288, "eval_accuracy": 0.592, "eval_loss": 0.728591799736023, "eval_runtime": 9.4602, "eval_samples_per_second": 26.427, "eval_steps_per_second": 3.383, "step": 18 }, { "epoch": 0.304, "grad_norm": 15.930183410644531, "learning_rate": 4.3032786885245904e-05, "loss": 0.8174, "step": 19 }, { "epoch": 0.304, "eval_accuracy": 0.588, "eval_loss": 0.7309414148330688, "eval_runtime": 9.4241, "eval_samples_per_second": 26.528, "eval_steps_per_second": 3.396, "step": 19 }, { "epoch": 0.32, "grad_norm": 25.526287078857422, "learning_rate": 4.262295081967213e-05, "loss": 0.9582, "step": 20 }, { "epoch": 0.32, "eval_accuracy": 0.58, "eval_loss": 0.7253652215003967, "eval_runtime": 9.487, "eval_samples_per_second": 26.352, "eval_steps_per_second": 3.373, "step": 20 }, { "epoch": 0.336, "grad_norm": 16.851058959960938, "learning_rate": 4.2213114754098365e-05, "loss": 0.7394, "step": 21 }, { "epoch": 0.336, "eval_accuracy": 0.572, "eval_loss": 0.721359372138977, "eval_runtime": 9.4839, "eval_samples_per_second": 26.361, "eval_steps_per_second": 3.374, "step": 21 }, { "epoch": 0.352, "grad_norm": 16.92612648010254, "learning_rate": 4.1803278688524595e-05, "loss": 0.7682, "step": 22 }, { "epoch": 0.352, "eval_accuracy": 0.58, "eval_loss": 0.7189823985099792, "eval_runtime": 9.4414, "eval_samples_per_second": 26.479, "eval_steps_per_second": 3.389, "step": 22 }, { "epoch": 0.368, "grad_norm": 9.329913139343262, "learning_rate": 4.1393442622950826e-05, "loss": 0.5394, "step": 23 }, { "epoch": 0.368, "eval_accuracy": 0.564, "eval_loss": 0.7176367044448853, "eval_runtime": 9.4362, "eval_samples_per_second": 26.494, "eval_steps_per_second": 3.391, "step": 23 }, { "epoch": 0.384, "grad_norm": 16.587936401367188, "learning_rate": 4.098360655737705e-05, "loss": 0.7886, "step": 24 }, { "epoch": 0.384, "eval_accuracy": 0.572, "eval_loss": 0.7161562442779541, "eval_runtime": 9.4353, "eval_samples_per_second": 26.496, "eval_steps_per_second": 3.392, "step": 24 }, { "epoch": 0.4, "grad_norm": 15.896271705627441, "learning_rate": 4.057377049180328e-05, "loss": 0.5579, "step": 25 }, { "epoch": 0.4, "eval_accuracy": 0.572, "eval_loss": 0.7171699404716492, "eval_runtime": 9.4618, "eval_samples_per_second": 26.422, "eval_steps_per_second": 3.382, "step": 25 }, { "epoch": 0.416, "grad_norm": 6.284942626953125, "learning_rate": 4.016393442622951e-05, "loss": 0.619, "step": 26 }, { "epoch": 0.416, "eval_accuracy": 0.576, "eval_loss": 0.7149707078933716, "eval_runtime": 9.4462, "eval_samples_per_second": 26.466, "eval_steps_per_second": 3.388, "step": 26 }, { "epoch": 0.432, "grad_norm": 7.851229667663574, "learning_rate": 3.975409836065574e-05, "loss": 0.6796, "step": 27 }, { "epoch": 0.432, "eval_accuracy": 0.572, "eval_loss": 0.7145332098007202, "eval_runtime": 9.4497, "eval_samples_per_second": 26.456, "eval_steps_per_second": 3.386, "step": 27 }, { "epoch": 0.448, "grad_norm": 6.50039529800415, "learning_rate": 3.934426229508197e-05, "loss": 0.8046, "step": 28 }, { "epoch": 0.448, "eval_accuracy": 0.568, "eval_loss": 0.7118340134620667, "eval_runtime": 9.4429, "eval_samples_per_second": 26.475, "eval_steps_per_second": 3.389, "step": 28 }, { "epoch": 0.464, "grad_norm": 10.894524574279785, "learning_rate": 3.89344262295082e-05, "loss": 0.6829, "step": 29 }, { "epoch": 0.464, "eval_accuracy": 0.568, "eval_loss": 0.7091230750083923, "eval_runtime": 9.4254, "eval_samples_per_second": 26.524, "eval_steps_per_second": 3.395, "step": 29 }, { "epoch": 0.48, "grad_norm": 17.76140594482422, "learning_rate": 3.8524590163934424e-05, "loss": 0.8194, "step": 30 }, { "epoch": 0.48, "eval_accuracy": 0.548, "eval_loss": 0.7109335660934448, "eval_runtime": 9.4302, "eval_samples_per_second": 26.511, "eval_steps_per_second": 3.393, "step": 30 }, { "epoch": 0.496, "grad_norm": 4.884728908538818, "learning_rate": 3.8114754098360655e-05, "loss": 0.6432, "step": 31 }, { "epoch": 0.496, "eval_accuracy": 0.536, "eval_loss": 0.7137030959129333, "eval_runtime": 9.4055, "eval_samples_per_second": 26.58, "eval_steps_per_second": 3.402, "step": 31 }, { "epoch": 0.512, "grad_norm": 8.217907905578613, "learning_rate": 3.7704918032786885e-05, "loss": 0.6199, "step": 32 }, { "epoch": 0.512, "eval_accuracy": 0.536, "eval_loss": 0.7147109508514404, "eval_runtime": 9.4314, "eval_samples_per_second": 26.507, "eval_steps_per_second": 3.393, "step": 32 }, { "epoch": 0.528, "grad_norm": 5.067286014556885, "learning_rate": 3.729508196721312e-05, "loss": 0.5238, "step": 33 }, { "epoch": 0.528, "eval_accuracy": 0.528, "eval_loss": 0.7139023542404175, "eval_runtime": 9.4057, "eval_samples_per_second": 26.579, "eval_steps_per_second": 3.402, "step": 33 }, { "epoch": 0.544, "grad_norm": 9.185476303100586, "learning_rate": 3.6885245901639346e-05, "loss": 0.5065, "step": 34 }, { "epoch": 0.544, "eval_accuracy": 0.54, "eval_loss": 0.7080722451210022, "eval_runtime": 9.4148, "eval_samples_per_second": 26.554, "eval_steps_per_second": 3.399, "step": 34 }, { "epoch": 0.56, "grad_norm": 10.447481155395508, "learning_rate": 3.6475409836065576e-05, "loss": 0.7825, "step": 35 }, { "epoch": 0.56, "eval_accuracy": 0.556, "eval_loss": 0.7053359150886536, "eval_runtime": 9.4329, "eval_samples_per_second": 26.503, "eval_steps_per_second": 3.392, "step": 35 }, { "epoch": 0.576, "grad_norm": 9.977537155151367, "learning_rate": 3.6065573770491806e-05, "loss": 0.7256, "step": 36 }, { "epoch": 0.576, "eval_accuracy": 0.556, "eval_loss": 0.7060820460319519, "eval_runtime": 9.426, "eval_samples_per_second": 26.522, "eval_steps_per_second": 3.395, "step": 36 }, { "epoch": 0.592, "grad_norm": 8.119141578674316, "learning_rate": 3.5655737704918037e-05, "loss": 0.7407, "step": 37 }, { "epoch": 0.592, "eval_accuracy": 0.544, "eval_loss": 0.7100077867507935, "eval_runtime": 9.4303, "eval_samples_per_second": 26.51, "eval_steps_per_second": 3.393, "step": 37 }, { "epoch": 0.608, "grad_norm": 13.609740257263184, "learning_rate": 3.524590163934427e-05, "loss": 0.6665, "step": 38 }, { "epoch": 0.608, "eval_accuracy": 0.544, "eval_loss": 0.7075429558753967, "eval_runtime": 9.4113, "eval_samples_per_second": 26.564, "eval_steps_per_second": 3.4, "step": 38 }, { "epoch": 0.624, "grad_norm": 22.365285873413086, "learning_rate": 3.483606557377049e-05, "loss": 0.8188, "step": 39 }, { "epoch": 0.624, "eval_accuracy": 0.564, "eval_loss": 0.7029336094856262, "eval_runtime": 9.4257, "eval_samples_per_second": 26.523, "eval_steps_per_second": 3.395, "step": 39 }, { "epoch": 0.64, "grad_norm": 10.358452796936035, "learning_rate": 3.442622950819672e-05, "loss": 0.6671, "step": 40 }, { "epoch": 0.64, "eval_accuracy": 0.568, "eval_loss": 0.6954512000083923, "eval_runtime": 9.4493, "eval_samples_per_second": 26.457, "eval_steps_per_second": 3.386, "step": 40 }, { "epoch": 0.656, "grad_norm": 15.979942321777344, "learning_rate": 3.401639344262295e-05, "loss": 0.7222, "step": 41 }, { "epoch": 0.656, "eval_accuracy": 0.568, "eval_loss": 0.6924257874488831, "eval_runtime": 9.4502, "eval_samples_per_second": 26.454, "eval_steps_per_second": 3.386, "step": 41 }, { "epoch": 0.672, "grad_norm": 16.25983428955078, "learning_rate": 3.360655737704918e-05, "loss": 0.7285, "step": 42 }, { "epoch": 0.672, "eval_accuracy": 0.576, "eval_loss": 0.6920918226242065, "eval_runtime": 9.4123, "eval_samples_per_second": 26.561, "eval_steps_per_second": 3.4, "step": 42 }, { "epoch": 0.688, "grad_norm": 7.8817853927612305, "learning_rate": 3.319672131147541e-05, "loss": 0.7068, "step": 43 }, { "epoch": 0.688, "eval_accuracy": 0.588, "eval_loss": 0.693978488445282, "eval_runtime": 9.4142, "eval_samples_per_second": 26.556, "eval_steps_per_second": 3.399, "step": 43 }, { "epoch": 0.704, "grad_norm": 11.203206062316895, "learning_rate": 3.2786885245901635e-05, "loss": 0.613, "step": 44 }, { "epoch": 0.704, "eval_accuracy": 0.6, "eval_loss": 0.6923867464065552, "eval_runtime": 9.4098, "eval_samples_per_second": 26.568, "eval_steps_per_second": 3.401, "step": 44 }, { "epoch": 0.72, "grad_norm": 8.55033016204834, "learning_rate": 3.237704918032787e-05, "loss": 0.5672, "step": 45 }, { "epoch": 0.72, "eval_accuracy": 0.604, "eval_loss": 0.695925772190094, "eval_runtime": 9.4467, "eval_samples_per_second": 26.464, "eval_steps_per_second": 3.387, "step": 45 }, { "epoch": 0.736, "grad_norm": 9.487948417663574, "learning_rate": 3.19672131147541e-05, "loss": 0.6208, "step": 46 }, { "epoch": 0.736, "eval_accuracy": 0.604, "eval_loss": 0.7002148628234863, "eval_runtime": 9.4163, "eval_samples_per_second": 26.55, "eval_steps_per_second": 3.398, "step": 46 }, { "epoch": 0.752, "grad_norm": 7.840662479400635, "learning_rate": 3.155737704918033e-05, "loss": 0.6282, "step": 47 }, { "epoch": 0.752, "eval_accuracy": 0.608, "eval_loss": 0.7034921646118164, "eval_runtime": 9.4244, "eval_samples_per_second": 26.527, "eval_steps_per_second": 3.395, "step": 47 }, { "epoch": 0.768, "grad_norm": 6.098258972167969, "learning_rate": 3.114754098360656e-05, "loss": 0.6129, "step": 48 }, { "epoch": 0.768, "eval_accuracy": 0.604, "eval_loss": 0.7040849328041077, "eval_runtime": 9.3957, "eval_samples_per_second": 26.608, "eval_steps_per_second": 3.406, "step": 48 }, { "epoch": 0.784, "grad_norm": 7.861691951751709, "learning_rate": 3.073770491803279e-05, "loss": 0.6396, "step": 49 }, { "epoch": 0.784, "eval_accuracy": 0.608, "eval_loss": 0.7040830254554749, "eval_runtime": 9.396, "eval_samples_per_second": 26.607, "eval_steps_per_second": 3.406, "step": 49 }, { "epoch": 0.8, "grad_norm": 9.376338958740234, "learning_rate": 3.0327868852459017e-05, "loss": 0.5983, "step": 50 }, { "epoch": 0.8, "eval_accuracy": 0.608, "eval_loss": 0.7050849795341492, "eval_runtime": 9.4089, "eval_samples_per_second": 26.571, "eval_steps_per_second": 3.401, "step": 50 }, { "epoch": 0.816, "grad_norm": 8.683838844299316, "learning_rate": 2.9918032786885248e-05, "loss": 0.6681, "step": 51 }, { "epoch": 0.816, "eval_accuracy": 0.604, "eval_loss": 0.705935537815094, "eval_runtime": 9.3804, "eval_samples_per_second": 26.651, "eval_steps_per_second": 3.411, "step": 51 }, { "epoch": 0.832, "grad_norm": 17.765621185302734, "learning_rate": 2.9508196721311478e-05, "loss": 0.8503, "step": 52 }, { "epoch": 0.832, "eval_accuracy": 0.604, "eval_loss": 0.6994922161102295, "eval_runtime": 9.4185, "eval_samples_per_second": 26.544, "eval_steps_per_second": 3.398, "step": 52 }, { "epoch": 0.848, "grad_norm": 15.548516273498535, "learning_rate": 2.9098360655737705e-05, "loss": 0.7585, "step": 53 }, { "epoch": 0.848, "eval_accuracy": 0.6, "eval_loss": 0.692019522190094, "eval_runtime": 9.5871, "eval_samples_per_second": 26.077, "eval_steps_per_second": 3.338, "step": 53 }, { "epoch": 0.864, "grad_norm": 8.666825294494629, "learning_rate": 2.8688524590163935e-05, "loss": 0.5713, "step": 54 }, { "epoch": 0.864, "eval_accuracy": 0.6, "eval_loss": 0.68896484375, "eval_runtime": 9.4277, "eval_samples_per_second": 26.518, "eval_steps_per_second": 3.394, "step": 54 }, { "epoch": 0.88, "grad_norm": 16.585477828979492, "learning_rate": 2.8278688524590162e-05, "loss": 0.7261, "step": 55 }, { "epoch": 0.88, "eval_accuracy": 0.6, "eval_loss": 0.6847422122955322, "eval_runtime": 9.4025, "eval_samples_per_second": 26.589, "eval_steps_per_second": 3.403, "step": 55 }, { "epoch": 0.896, "grad_norm": 17.52354621887207, "learning_rate": 2.7868852459016392e-05, "loss": 0.7457, "step": 56 }, { "epoch": 0.896, "eval_accuracy": 0.604, "eval_loss": 0.6801777482032776, "eval_runtime": 9.4431, "eval_samples_per_second": 26.474, "eval_steps_per_second": 3.389, "step": 56 }, { "epoch": 0.912, "grad_norm": 14.731335639953613, "learning_rate": 2.7459016393442626e-05, "loss": 0.8242, "step": 57 }, { "epoch": 0.912, "eval_accuracy": 0.576, "eval_loss": 0.6787323951721191, "eval_runtime": 9.4268, "eval_samples_per_second": 26.52, "eval_steps_per_second": 3.395, "step": 57 }, { "epoch": 0.928, "grad_norm": 6.853959083557129, "learning_rate": 2.7049180327868856e-05, "loss": 0.7688, "step": 58 }, { "epoch": 0.928, "eval_accuracy": 0.568, "eval_loss": 0.6817187666893005, "eval_runtime": 9.4078, "eval_samples_per_second": 26.574, "eval_steps_per_second": 3.401, "step": 58 }, { "epoch": 0.944, "grad_norm": 13.072829246520996, "learning_rate": 2.6639344262295087e-05, "loss": 0.5804, "step": 59 }, { "epoch": 0.944, "eval_accuracy": 0.572, "eval_loss": 0.685714840888977, "eval_runtime": 9.422, "eval_samples_per_second": 26.534, "eval_steps_per_second": 3.396, "step": 59 }, { "epoch": 0.96, "grad_norm": 8.29138469696045, "learning_rate": 2.6229508196721314e-05, "loss": 0.8167, "step": 60 }, { "epoch": 0.96, "eval_accuracy": 0.568, "eval_loss": 0.6867265701293945, "eval_runtime": 9.4234, "eval_samples_per_second": 26.53, "eval_steps_per_second": 3.396, "step": 60 }, { "epoch": 0.976, "grad_norm": 5.209651470184326, "learning_rate": 2.5819672131147544e-05, "loss": 0.5874, "step": 61 }, { "epoch": 0.976, "eval_accuracy": 0.576, "eval_loss": 0.6885351538658142, "eval_runtime": 9.4091, "eval_samples_per_second": 26.57, "eval_steps_per_second": 3.401, "step": 61 }, { "epoch": 0.992, "grad_norm": 8.127976417541504, "learning_rate": 2.540983606557377e-05, "loss": 0.6197, "step": 62 }, { "epoch": 0.992, "eval_accuracy": 0.572, "eval_loss": 0.6853671669960022, "eval_runtime": 9.4343, "eval_samples_per_second": 26.499, "eval_steps_per_second": 3.392, "step": 62 }, { "epoch": 1.008, "grad_norm": 4.938397407531738, "learning_rate": 2.5e-05, "loss": 0.6458, "step": 63 }, { "epoch": 1.008, "eval_accuracy": 0.584, "eval_loss": 0.6829023361206055, "eval_runtime": 9.4315, "eval_samples_per_second": 26.507, "eval_steps_per_second": 3.393, "step": 63 }, { "epoch": 1.024, "grad_norm": 15.248034477233887, "learning_rate": 2.459016393442623e-05, "loss": 0.7218, "step": 64 }, { "epoch": 1.024, "eval_accuracy": 0.592, "eval_loss": 0.6791366934776306, "eval_runtime": 9.4284, "eval_samples_per_second": 26.516, "eval_steps_per_second": 3.394, "step": 64 }, { "epoch": 1.04, "grad_norm": 5.217968463897705, "learning_rate": 2.418032786885246e-05, "loss": 0.6869, "step": 65 }, { "epoch": 1.04, "eval_accuracy": 0.592, "eval_loss": 0.6775898337364197, "eval_runtime": 9.4135, "eval_samples_per_second": 26.558, "eval_steps_per_second": 3.399, "step": 65 }, { "epoch": 1.056, "grad_norm": 8.960049629211426, "learning_rate": 2.377049180327869e-05, "loss": 0.7135, "step": 66 }, { "epoch": 1.056, "eval_accuracy": 0.592, "eval_loss": 0.6763710975646973, "eval_runtime": 9.43, "eval_samples_per_second": 26.511, "eval_steps_per_second": 3.393, "step": 66 }, { "epoch": 1.072, "grad_norm": 14.524127960205078, "learning_rate": 2.336065573770492e-05, "loss": 0.7343, "step": 67 }, { "epoch": 1.072, "eval_accuracy": 0.596, "eval_loss": 0.673941433429718, "eval_runtime": 9.4381, "eval_samples_per_second": 26.488, "eval_steps_per_second": 3.39, "step": 67 }, { "epoch": 1.088, "grad_norm": 14.215781211853027, "learning_rate": 2.295081967213115e-05, "loss": 0.7439, "step": 68 }, { "epoch": 1.088, "eval_accuracy": 0.596, "eval_loss": 0.6748945116996765, "eval_runtime": 9.4059, "eval_samples_per_second": 26.579, "eval_steps_per_second": 3.402, "step": 68 }, { "epoch": 1.104, "grad_norm": 5.426934719085693, "learning_rate": 2.254098360655738e-05, "loss": 0.5504, "step": 69 }, { "epoch": 1.104, "eval_accuracy": 0.6, "eval_loss": 0.6768242120742798, "eval_runtime": 9.4117, "eval_samples_per_second": 26.563, "eval_steps_per_second": 3.4, "step": 69 }, { "epoch": 1.12, "grad_norm": 14.354090690612793, "learning_rate": 2.2131147540983607e-05, "loss": 0.696, "step": 70 }, { "epoch": 1.12, "eval_accuracy": 0.596, "eval_loss": 0.6765508055686951, "eval_runtime": 9.4291, "eval_samples_per_second": 26.514, "eval_steps_per_second": 3.394, "step": 70 }, { "epoch": 1.1360000000000001, "grad_norm": 11.328275680541992, "learning_rate": 2.1721311475409837e-05, "loss": 0.6042, "step": 71 }, { "epoch": 1.1360000000000001, "eval_accuracy": 0.596, "eval_loss": 0.6768398284912109, "eval_runtime": 9.4156, "eval_samples_per_second": 26.552, "eval_steps_per_second": 3.399, "step": 71 }, { "epoch": 1.152, "grad_norm": 9.158403396606445, "learning_rate": 2.1311475409836064e-05, "loss": 0.4853, "step": 72 }, { "epoch": 1.152, "eval_accuracy": 0.604, "eval_loss": 0.6750390529632568, "eval_runtime": 9.4378, "eval_samples_per_second": 26.489, "eval_steps_per_second": 3.391, "step": 72 }, { "epoch": 1.168, "grad_norm": 7.848287105560303, "learning_rate": 2.0901639344262298e-05, "loss": 0.6744, "step": 73 }, { "epoch": 1.168, "eval_accuracy": 0.6, "eval_loss": 0.6753163933753967, "eval_runtime": 9.4125, "eval_samples_per_second": 26.56, "eval_steps_per_second": 3.4, "step": 73 }, { "epoch": 1.184, "grad_norm": 11.083074569702148, "learning_rate": 2.0491803278688525e-05, "loss": 0.7398, "step": 74 }, { "epoch": 1.184, "eval_accuracy": 0.596, "eval_loss": 0.676925778388977, "eval_runtime": 9.421, "eval_samples_per_second": 26.536, "eval_steps_per_second": 3.397, "step": 74 }, { "epoch": 1.2, "grad_norm": 8.224617958068848, "learning_rate": 2.0081967213114755e-05, "loss": 0.6029, "step": 75 }, { "epoch": 1.2, "eval_accuracy": 0.596, "eval_loss": 0.677783191204071, "eval_runtime": 9.4291, "eval_samples_per_second": 26.514, "eval_steps_per_second": 3.394, "step": 75 }, { "epoch": 1.216, "grad_norm": 17.132051467895508, "learning_rate": 1.9672131147540985e-05, "loss": 0.6935, "step": 76 }, { "epoch": 1.216, "eval_accuracy": 0.596, "eval_loss": 0.6787539124488831, "eval_runtime": 9.4075, "eval_samples_per_second": 26.575, "eval_steps_per_second": 3.402, "step": 76 }, { "epoch": 1.232, "grad_norm": 8.447811126708984, "learning_rate": 1.9262295081967212e-05, "loss": 0.7292, "step": 77 }, { "epoch": 1.232, "eval_accuracy": 0.6, "eval_loss": 0.6795663833618164, "eval_runtime": 9.4049, "eval_samples_per_second": 26.582, "eval_steps_per_second": 3.402, "step": 77 }, { "epoch": 1.248, "grad_norm": 4.971631050109863, "learning_rate": 1.8852459016393442e-05, "loss": 0.6192, "step": 78 }, { "epoch": 1.248, "eval_accuracy": 0.6, "eval_loss": 0.6786601543426514, "eval_runtime": 9.4102, "eval_samples_per_second": 26.567, "eval_steps_per_second": 3.401, "step": 78 }, { "epoch": 1.264, "grad_norm": 8.30854320526123, "learning_rate": 1.8442622950819673e-05, "loss": 0.6979, "step": 79 }, { "epoch": 1.264, "eval_accuracy": 0.6, "eval_loss": 0.6776171922683716, "eval_runtime": 9.4206, "eval_samples_per_second": 26.537, "eval_steps_per_second": 3.397, "step": 79 }, { "epoch": 1.28, "grad_norm": 9.044068336486816, "learning_rate": 1.8032786885245903e-05, "loss": 0.7554, "step": 80 }, { "epoch": 1.28, "eval_accuracy": 0.596, "eval_loss": 0.6768652200698853, "eval_runtime": 9.4398, "eval_samples_per_second": 26.484, "eval_steps_per_second": 3.39, "step": 80 }, { "epoch": 1.296, "grad_norm": 22.36913299560547, "learning_rate": 1.7622950819672133e-05, "loss": 0.7857, "step": 81 }, { "epoch": 1.296, "eval_accuracy": 0.584, "eval_loss": 0.6760781407356262, "eval_runtime": 9.4344, "eval_samples_per_second": 26.499, "eval_steps_per_second": 3.392, "step": 81 }, { "epoch": 1.312, "grad_norm": 9.494186401367188, "learning_rate": 1.721311475409836e-05, "loss": 0.7903, "step": 82 }, { "epoch": 1.312, "eval_accuracy": 0.576, "eval_loss": 0.6796757578849792, "eval_runtime": 9.3991, "eval_samples_per_second": 26.598, "eval_steps_per_second": 3.405, "step": 82 }, { "epoch": 1.328, "grad_norm": 6.161738395690918, "learning_rate": 1.680327868852459e-05, "loss": 0.714, "step": 83 }, { "epoch": 1.328, "eval_accuracy": 0.576, "eval_loss": 0.6806288957595825, "eval_runtime": 9.424, "eval_samples_per_second": 26.528, "eval_steps_per_second": 3.396, "step": 83 }, { "epoch": 1.3439999999999999, "grad_norm": 10.077332496643066, "learning_rate": 1.6393442622950818e-05, "loss": 0.7107, "step": 84 }, { "epoch": 1.3439999999999999, "eval_accuracy": 0.584, "eval_loss": 0.6848242282867432, "eval_runtime": 9.4189, "eval_samples_per_second": 26.542, "eval_steps_per_second": 3.397, "step": 84 }, { "epoch": 1.3599999999999999, "grad_norm": 14.34889030456543, "learning_rate": 1.598360655737705e-05, "loss": 0.6276, "step": 85 }, { "epoch": 1.3599999999999999, "eval_accuracy": 0.588, "eval_loss": 0.6862617135047913, "eval_runtime": 9.4148, "eval_samples_per_second": 26.554, "eval_steps_per_second": 3.399, "step": 85 }, { "epoch": 1.376, "grad_norm": 9.223981857299805, "learning_rate": 1.557377049180328e-05, "loss": 0.7295, "step": 86 }, { "epoch": 1.376, "eval_accuracy": 0.588, "eval_loss": 0.6857773661613464, "eval_runtime": 9.4043, "eval_samples_per_second": 26.584, "eval_steps_per_second": 3.403, "step": 86 }, { "epoch": 1.392, "grad_norm": 13.143969535827637, "learning_rate": 1.5163934426229509e-05, "loss": 0.6597, "step": 87 }, { "epoch": 1.392, "eval_accuracy": 0.588, "eval_loss": 0.6872578263282776, "eval_runtime": 9.4212, "eval_samples_per_second": 26.536, "eval_steps_per_second": 3.397, "step": 87 }, { "epoch": 1.408, "grad_norm": 22.58281898498535, "learning_rate": 1.4754098360655739e-05, "loss": 0.6335, "step": 88 }, { "epoch": 1.408, "eval_accuracy": 0.58, "eval_loss": 0.6847929954528809, "eval_runtime": 9.4232, "eval_samples_per_second": 26.53, "eval_steps_per_second": 3.396, "step": 88 }, { "epoch": 1.424, "grad_norm": 12.670473098754883, "learning_rate": 1.4344262295081968e-05, "loss": 0.7245, "step": 89 }, { "epoch": 1.424, "eval_accuracy": 0.572, "eval_loss": 0.6834453344345093, "eval_runtime": 9.4138, "eval_samples_per_second": 26.557, "eval_steps_per_second": 3.399, "step": 89 }, { "epoch": 1.44, "grad_norm": 20.81968879699707, "learning_rate": 1.3934426229508196e-05, "loss": 0.5546, "step": 90 }, { "epoch": 1.44, "eval_accuracy": 0.568, "eval_loss": 0.6808554530143738, "eval_runtime": 9.4208, "eval_samples_per_second": 26.537, "eval_steps_per_second": 3.397, "step": 90 }, { "epoch": 1.456, "grad_norm": 8.033720016479492, "learning_rate": 1.3524590163934428e-05, "loss": 0.6482, "step": 91 }, { "epoch": 1.456, "eval_accuracy": 0.568, "eval_loss": 0.6760781407356262, "eval_runtime": 9.408, "eval_samples_per_second": 26.573, "eval_steps_per_second": 3.401, "step": 91 }, { "epoch": 1.472, "grad_norm": 9.656173706054688, "learning_rate": 1.3114754098360657e-05, "loss": 0.6814, "step": 92 }, { "epoch": 1.472, "eval_accuracy": 0.572, "eval_loss": 0.6791015863418579, "eval_runtime": 9.4039, "eval_samples_per_second": 26.585, "eval_steps_per_second": 3.403, "step": 92 }, { "epoch": 1.488, "grad_norm": 4.5396599769592285, "learning_rate": 1.2704918032786885e-05, "loss": 0.5693, "step": 93 }, { "epoch": 1.488, "eval_accuracy": 0.584, "eval_loss": 0.6775078177452087, "eval_runtime": 9.4321, "eval_samples_per_second": 26.505, "eval_steps_per_second": 3.393, "step": 93 }, { "epoch": 1.504, "grad_norm": 11.05844783782959, "learning_rate": 1.2295081967213116e-05, "loss": 0.5369, "step": 94 }, { "epoch": 1.504, "eval_accuracy": 0.58, "eval_loss": 0.6771523356437683, "eval_runtime": 9.4156, "eval_samples_per_second": 26.552, "eval_steps_per_second": 3.399, "step": 94 }, { "epoch": 1.52, "grad_norm": 19.972246170043945, "learning_rate": 1.1885245901639344e-05, "loss": 0.7144, "step": 95 }, { "epoch": 1.52, "eval_accuracy": 0.576, "eval_loss": 0.6779101490974426, "eval_runtime": 9.4028, "eval_samples_per_second": 26.588, "eval_steps_per_second": 3.403, "step": 95 }, { "epoch": 1.536, "grad_norm": 11.014993667602539, "learning_rate": 1.1475409836065575e-05, "loss": 0.6405, "step": 96 }, { "epoch": 1.536, "eval_accuracy": 0.564, "eval_loss": 0.6772187352180481, "eval_runtime": 9.4126, "eval_samples_per_second": 26.56, "eval_steps_per_second": 3.4, "step": 96 }, { "epoch": 1.552, "grad_norm": 8.04190444946289, "learning_rate": 1.1065573770491803e-05, "loss": 0.7893, "step": 97 }, { "epoch": 1.552, "eval_accuracy": 0.584, "eval_loss": 0.6751992106437683, "eval_runtime": 9.4142, "eval_samples_per_second": 26.556, "eval_steps_per_second": 3.399, "step": 97 }, { "epoch": 1.568, "grad_norm": 8.616044044494629, "learning_rate": 1.0655737704918032e-05, "loss": 0.6448, "step": 98 }, { "epoch": 1.568, "eval_accuracy": 0.568, "eval_loss": 0.6759804487228394, "eval_runtime": 9.4235, "eval_samples_per_second": 26.529, "eval_steps_per_second": 3.396, "step": 98 }, { "epoch": 1.584, "grad_norm": 12.122180938720703, "learning_rate": 1.0245901639344262e-05, "loss": 0.5828, "step": 99 }, { "epoch": 1.584, "eval_accuracy": 0.576, "eval_loss": 0.6741952896118164, "eval_runtime": 9.4162, "eval_samples_per_second": 26.55, "eval_steps_per_second": 3.398, "step": 99 }, { "epoch": 1.6, "grad_norm": 15.246779441833496, "learning_rate": 9.836065573770493e-06, "loss": 0.6762, "step": 100 }, { "epoch": 1.6, "eval_accuracy": 0.572, "eval_loss": 0.6730703115463257, "eval_runtime": 9.406, "eval_samples_per_second": 26.579, "eval_steps_per_second": 3.402, "step": 100 }, { "epoch": 1.616, "grad_norm": 16.69089126586914, "learning_rate": 9.426229508196721e-06, "loss": 0.6432, "step": 101 }, { "epoch": 1.616, "eval_accuracy": 0.584, "eval_loss": 0.6738671660423279, "eval_runtime": 9.4165, "eval_samples_per_second": 26.549, "eval_steps_per_second": 3.398, "step": 101 }, { "epoch": 1.6320000000000001, "grad_norm": 8.9694242477417, "learning_rate": 9.016393442622952e-06, "loss": 0.5826, "step": 102 }, { "epoch": 1.6320000000000001, "eval_accuracy": 0.58, "eval_loss": 0.6729843616485596, "eval_runtime": 9.4844, "eval_samples_per_second": 26.359, "eval_steps_per_second": 3.374, "step": 102 }, { "epoch": 1.6480000000000001, "grad_norm": 9.330092430114746, "learning_rate": 8.60655737704918e-06, "loss": 0.6224, "step": 103 }, { "epoch": 1.6480000000000001, "eval_accuracy": 0.584, "eval_loss": 0.673214852809906, "eval_runtime": 9.4192, "eval_samples_per_second": 26.541, "eval_steps_per_second": 3.397, "step": 103 }, { "epoch": 1.6640000000000001, "grad_norm": 7.138861179351807, "learning_rate": 8.196721311475409e-06, "loss": 0.6262, "step": 104 }, { "epoch": 1.6640000000000001, "eval_accuracy": 0.592, "eval_loss": 0.6745429635047913, "eval_runtime": 9.4226, "eval_samples_per_second": 26.532, "eval_steps_per_second": 3.396, "step": 104 }, { "epoch": 1.6800000000000002, "grad_norm": 7.4160356521606445, "learning_rate": 7.78688524590164e-06, "loss": 0.6451, "step": 105 }, { "epoch": 1.6800000000000002, "eval_accuracy": 0.592, "eval_loss": 0.6730429530143738, "eval_runtime": 9.4489, "eval_samples_per_second": 26.458, "eval_steps_per_second": 3.387, "step": 105 }, { "epoch": 1.696, "grad_norm": 5.479573726654053, "learning_rate": 7.3770491803278695e-06, "loss": 0.5948, "step": 106 }, { "epoch": 1.696, "eval_accuracy": 0.6, "eval_loss": 0.6731171607971191, "eval_runtime": 9.4414, "eval_samples_per_second": 26.479, "eval_steps_per_second": 3.389, "step": 106 }, { "epoch": 1.712, "grad_norm": 9.357452392578125, "learning_rate": 6.967213114754098e-06, "loss": 0.7451, "step": 107 }, { "epoch": 1.712, "eval_accuracy": 0.58, "eval_loss": 0.6747695207595825, "eval_runtime": 9.4087, "eval_samples_per_second": 26.571, "eval_steps_per_second": 3.401, "step": 107 }, { "epoch": 1.728, "grad_norm": 10.986834526062012, "learning_rate": 6.557377049180328e-06, "loss": 0.5922, "step": 108 }, { "epoch": 1.728, "eval_accuracy": 0.588, "eval_loss": 0.6725429892539978, "eval_runtime": 9.4208, "eval_samples_per_second": 26.537, "eval_steps_per_second": 3.397, "step": 108 }, { "epoch": 1.744, "grad_norm": 6.625186920166016, "learning_rate": 6.147540983606558e-06, "loss": 0.6454, "step": 109 }, { "epoch": 1.744, "eval_accuracy": 0.592, "eval_loss": 0.6714960932731628, "eval_runtime": 9.4316, "eval_samples_per_second": 26.507, "eval_steps_per_second": 3.393, "step": 109 }, { "epoch": 1.76, "grad_norm": 9.619455337524414, "learning_rate": 5.737704918032787e-06, "loss": 0.601, "step": 110 }, { "epoch": 1.76, "eval_accuracy": 0.596, "eval_loss": 0.671625018119812, "eval_runtime": 9.4295, "eval_samples_per_second": 26.512, "eval_steps_per_second": 3.394, "step": 110 }, { "epoch": 1.776, "grad_norm": 10.5454683303833, "learning_rate": 5.327868852459016e-06, "loss": 0.7236, "step": 111 }, { "epoch": 1.776, "eval_accuracy": 0.592, "eval_loss": 0.6704453229904175, "eval_runtime": 9.4138, "eval_samples_per_second": 26.557, "eval_steps_per_second": 3.399, "step": 111 }, { "epoch": 1.792, "grad_norm": 9.553342819213867, "learning_rate": 4.918032786885246e-06, "loss": 0.7825, "step": 112 }, { "epoch": 1.792, "eval_accuracy": 0.596, "eval_loss": 0.673535168170929, "eval_runtime": 9.4206, "eval_samples_per_second": 26.538, "eval_steps_per_second": 3.397, "step": 112 }, { "epoch": 1.808, "grad_norm": 7.810243129730225, "learning_rate": 4.508196721311476e-06, "loss": 0.6302, "step": 113 }, { "epoch": 1.808, "eval_accuracy": 0.584, "eval_loss": 0.670703113079071, "eval_runtime": 9.5051, "eval_samples_per_second": 26.302, "eval_steps_per_second": 3.367, "step": 113 }, { "epoch": 1.8239999999999998, "grad_norm": 15.086982727050781, "learning_rate": 4.098360655737704e-06, "loss": 0.6824, "step": 114 }, { "epoch": 1.8239999999999998, "eval_accuracy": 0.584, "eval_loss": 0.6711757779121399, "eval_runtime": 9.432, "eval_samples_per_second": 26.505, "eval_steps_per_second": 3.393, "step": 114 }, { "epoch": 1.8399999999999999, "grad_norm": 13.564058303833008, "learning_rate": 3.6885245901639347e-06, "loss": 0.6208, "step": 115 }, { "epoch": 1.8399999999999999, "eval_accuracy": 0.588, "eval_loss": 0.6693046689033508, "eval_runtime": 9.4215, "eval_samples_per_second": 26.535, "eval_steps_per_second": 3.396, "step": 115 }, { "epoch": 1.8559999999999999, "grad_norm": 7.943946361541748, "learning_rate": 3.278688524590164e-06, "loss": 0.6987, "step": 116 }, { "epoch": 1.8559999999999999, "eval_accuracy": 0.588, "eval_loss": 0.671625018119812, "eval_runtime": 9.4001, "eval_samples_per_second": 26.595, "eval_steps_per_second": 3.404, "step": 116 }, { "epoch": 1.8719999999999999, "grad_norm": 6.293920993804932, "learning_rate": 2.8688524590163937e-06, "loss": 0.5587, "step": 117 }, { "epoch": 1.8719999999999999, "eval_accuracy": 0.588, "eval_loss": 0.670785129070282, "eval_runtime": 9.3933, "eval_samples_per_second": 26.615, "eval_steps_per_second": 3.407, "step": 117 }, { "epoch": 1.888, "grad_norm": 5.374147415161133, "learning_rate": 2.459016393442623e-06, "loss": 0.6304, "step": 118 }, { "epoch": 1.888, "eval_accuracy": 0.592, "eval_loss": 0.6705155968666077, "eval_runtime": 9.4015, "eval_samples_per_second": 26.592, "eval_steps_per_second": 3.404, "step": 118 }, { "epoch": 1.904, "grad_norm": 11.269082069396973, "learning_rate": 2.049180327868852e-06, "loss": 0.4528, "step": 119 }, { "epoch": 1.904, "eval_accuracy": 0.584, "eval_loss": 0.6711132526397705, "eval_runtime": 9.4407, "eval_samples_per_second": 26.481, "eval_steps_per_second": 3.39, "step": 119 }, { "epoch": 1.92, "grad_norm": 20.449726104736328, "learning_rate": 1.639344262295082e-06, "loss": 0.7061, "step": 120 }, { "epoch": 1.92, "eval_accuracy": 0.58, "eval_loss": 0.6705625057220459, "eval_runtime": 9.4641, "eval_samples_per_second": 26.416, "eval_steps_per_second": 3.381, "step": 120 }, { "epoch": 1.936, "grad_norm": 13.892779350280762, "learning_rate": 1.2295081967213116e-06, "loss": 0.5595, "step": 121 }, { "epoch": 1.936, "eval_accuracy": 0.588, "eval_loss": 0.670035183429718, "eval_runtime": 9.4443, "eval_samples_per_second": 26.471, "eval_steps_per_second": 3.388, "step": 121 }, { "epoch": 1.952, "grad_norm": 4.646062850952148, "learning_rate": 8.19672131147541e-07, "loss": 0.5968, "step": 122 }, { "epoch": 1.952, "eval_accuracy": 0.588, "eval_loss": 0.6705195307731628, "eval_runtime": 9.4452, "eval_samples_per_second": 26.468, "eval_steps_per_second": 3.388, "step": 122 }, { "epoch": 1.968, "grad_norm": 5.045331001281738, "learning_rate": 4.098360655737705e-07, "loss": 0.577, "step": 123 }, { "epoch": 1.968, "eval_accuracy": 0.584, "eval_loss": 0.6710820198059082, "eval_runtime": 9.4702, "eval_samples_per_second": 26.399, "eval_steps_per_second": 3.379, "step": 123 }, { "epoch": 1.984, "grad_norm": 12.286917686462402, "learning_rate": 0.0, "loss": 0.5765, "step": 124 }, { "epoch": 1.984, "eval_accuracy": 0.58, "eval_loss": 0.6720273494720459, "eval_runtime": 9.4365, "eval_samples_per_second": 26.493, "eval_steps_per_second": 3.391, "step": 124 }, { "epoch": 1.984, "step": 124, "total_flos": 1.3708912645636096e+16, "train_loss": 0.6877071011450983, "train_runtime": 1489.9136, "train_samples_per_second": 1.342, "train_steps_per_second": 0.083 } ], "logging_steps": 1, "max_steps": 124, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 1.3708912645636096e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }