{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 1, "global_step": 80, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.125, "grad_norm": 12.1875, "learning_rate": 2.5e-05, "loss": 0.7832, "step": 1 }, { "epoch": 0.125, "eval_accuracy": 0.504, "eval_loss": 0.7432675957679749, "eval_runtime": 3.7871, "eval_samples_per_second": 66.013, "eval_steps_per_second": 2.112, "step": 1 }, { "epoch": 0.25, "grad_norm": 7.90625, "learning_rate": 5e-05, "loss": 0.7611, "step": 2 }, { "epoch": 0.25, "eval_accuracy": 0.504, "eval_loss": 0.7380077838897705, "eval_runtime": 3.9236, "eval_samples_per_second": 63.717, "eval_steps_per_second": 2.039, "step": 2 }, { "epoch": 0.375, "grad_norm": 6.84375, "learning_rate": 4.935897435897436e-05, "loss": 0.7547, "step": 3 }, { "epoch": 0.375, "eval_accuracy": 0.508, "eval_loss": 0.729250967502594, "eval_runtime": 3.7777, "eval_samples_per_second": 66.178, "eval_steps_per_second": 2.118, "step": 3 }, { "epoch": 0.5, "grad_norm": 8.0625, "learning_rate": 4.871794871794872e-05, "loss": 0.775, "step": 4 }, { "epoch": 0.5, "eval_accuracy": 0.5, "eval_loss": 0.7301250100135803, "eval_runtime": 3.9789, "eval_samples_per_second": 62.832, "eval_steps_per_second": 2.011, "step": 4 }, { "epoch": 0.625, "grad_norm": 6.03125, "learning_rate": 4.8076923076923084e-05, "loss": 0.7001, "step": 5 }, { "epoch": 0.625, "eval_accuracy": 0.492, "eval_loss": 0.7357714772224426, "eval_runtime": 3.8343, "eval_samples_per_second": 65.202, "eval_steps_per_second": 2.086, "step": 5 }, { "epoch": 0.75, "grad_norm": 2.265625, "learning_rate": 4.7435897435897435e-05, "loss": 0.7124, "step": 6 }, { "epoch": 0.75, "eval_accuracy": 0.492, "eval_loss": 0.7345156073570251, "eval_runtime": 3.8283, "eval_samples_per_second": 65.303, "eval_steps_per_second": 2.09, "step": 6 }, { "epoch": 0.875, "grad_norm": 2.0625, "learning_rate": 4.67948717948718e-05, "loss": 0.7225, "step": 7 }, { "epoch": 0.875, "eval_accuracy": 0.492, "eval_loss": 0.7287343740463257, "eval_runtime": 3.8296, "eval_samples_per_second": 65.281, "eval_steps_per_second": 2.089, "step": 7 }, { "epoch": 1.0, "grad_norm": 2.53125, "learning_rate": 4.615384615384616e-05, "loss": 0.7265, "step": 8 }, { "epoch": 1.0, "eval_accuracy": 0.512, "eval_loss": 0.7232968807220459, "eval_runtime": 3.8309, "eval_samples_per_second": 65.259, "eval_steps_per_second": 2.088, "step": 8 }, { "epoch": 1.125, "grad_norm": 2.046875, "learning_rate": 4.5512820512820516e-05, "loss": 0.7436, "step": 9 }, { "epoch": 1.125, "eval_accuracy": 0.492, "eval_loss": 0.720089852809906, "eval_runtime": 3.8302, "eval_samples_per_second": 65.271, "eval_steps_per_second": 2.089, "step": 9 }, { "epoch": 1.25, "grad_norm": 2.15625, "learning_rate": 4.4871794871794874e-05, "loss": 0.7222, "step": 10 }, { "epoch": 1.25, "eval_accuracy": 0.492, "eval_loss": 0.7188398241996765, "eval_runtime": 3.8334, "eval_samples_per_second": 65.217, "eval_steps_per_second": 2.087, "step": 10 }, { "epoch": 1.375, "grad_norm": 2.234375, "learning_rate": 4.423076923076923e-05, "loss": 0.7124, "step": 11 }, { "epoch": 1.375, "eval_accuracy": 0.484, "eval_loss": 0.7185254096984863, "eval_runtime": 3.8304, "eval_samples_per_second": 65.268, "eval_steps_per_second": 2.089, "step": 11 }, { "epoch": 1.5, "grad_norm": 4.84375, "learning_rate": 4.358974358974359e-05, "loss": 0.7443, "step": 12 }, { "epoch": 1.5, "eval_accuracy": 0.556, "eval_loss": 0.7176758050918579, "eval_runtime": 3.8288, "eval_samples_per_second": 65.295, "eval_steps_per_second": 2.089, "step": 12 }, { "epoch": 1.625, "grad_norm": 3.171875, "learning_rate": 4.294871794871795e-05, "loss": 0.7088, "step": 13 }, { "epoch": 1.625, "eval_accuracy": 0.508, "eval_loss": 0.7167324423789978, "eval_runtime": 3.8259, "eval_samples_per_second": 65.344, "eval_steps_per_second": 2.091, "step": 13 }, { "epoch": 1.75, "grad_norm": 1.8125, "learning_rate": 4.230769230769231e-05, "loss": 0.6932, "step": 14 }, { "epoch": 1.75, "eval_accuracy": 0.512, "eval_loss": 0.716666042804718, "eval_runtime": 3.8301, "eval_samples_per_second": 65.273, "eval_steps_per_second": 2.089, "step": 14 }, { "epoch": 1.875, "grad_norm": 1.7890625, "learning_rate": 4.166666666666667e-05, "loss": 0.7183, "step": 15 }, { "epoch": 1.875, "eval_accuracy": 0.508, "eval_loss": 0.7168632745742798, "eval_runtime": 3.8301, "eval_samples_per_second": 65.272, "eval_steps_per_second": 2.089, "step": 15 }, { "epoch": 2.0, "grad_norm": 2.125, "learning_rate": 4.1025641025641023e-05, "loss": 0.7103, "step": 16 }, { "epoch": 2.0, "eval_accuracy": 0.484, "eval_loss": 0.715988278388977, "eval_runtime": 3.8292, "eval_samples_per_second": 65.288, "eval_steps_per_second": 2.089, "step": 16 }, { "epoch": 2.125, "grad_norm": 4.0625, "learning_rate": 4.038461538461539e-05, "loss": 0.6919, "step": 17 }, { "epoch": 2.125, "eval_accuracy": 0.484, "eval_loss": 0.7158437371253967, "eval_runtime": 3.8275, "eval_samples_per_second": 65.317, "eval_steps_per_second": 2.09, "step": 17 }, { "epoch": 2.25, "grad_norm": 2.1875, "learning_rate": 3.974358974358974e-05, "loss": 0.709, "step": 18 }, { "epoch": 2.25, "eval_accuracy": 0.484, "eval_loss": 0.7181054949760437, "eval_runtime": 3.828, "eval_samples_per_second": 65.309, "eval_steps_per_second": 2.09, "step": 18 }, { "epoch": 2.375, "grad_norm": 1.375, "learning_rate": 3.9102564102564105e-05, "loss": 0.7114, "step": 19 }, { "epoch": 2.375, "eval_accuracy": 0.484, "eval_loss": 0.7208398580551147, "eval_runtime": 3.8314, "eval_samples_per_second": 65.25, "eval_steps_per_second": 2.088, "step": 19 }, { "epoch": 2.5, "grad_norm": 3.515625, "learning_rate": 3.846153846153846e-05, "loss": 0.6523, "step": 20 }, { "epoch": 2.5, "eval_accuracy": 0.48, "eval_loss": 0.7245039343833923, "eval_runtime": 3.8292, "eval_samples_per_second": 65.288, "eval_steps_per_second": 2.089, "step": 20 }, { "epoch": 2.625, "grad_norm": 2.765625, "learning_rate": 3.782051282051282e-05, "loss": 0.7068, "step": 21 }, { "epoch": 2.625, "eval_accuracy": 0.5, "eval_loss": 0.7264218926429749, "eval_runtime": 3.8298, "eval_samples_per_second": 65.278, "eval_steps_per_second": 2.089, "step": 21 }, { "epoch": 2.75, "grad_norm": 3.546875, "learning_rate": 3.717948717948718e-05, "loss": 0.6983, "step": 22 }, { "epoch": 2.75, "eval_accuracy": 0.5, "eval_loss": 0.7264081835746765, "eval_runtime": 3.8282, "eval_samples_per_second": 65.305, "eval_steps_per_second": 2.09, "step": 22 }, { "epoch": 2.875, "grad_norm": 3.75, "learning_rate": 3.653846153846154e-05, "loss": 0.6958, "step": 23 }, { "epoch": 2.875, "eval_accuracy": 0.5, "eval_loss": 0.7263144254684448, "eval_runtime": 3.824, "eval_samples_per_second": 65.376, "eval_steps_per_second": 2.092, "step": 23 }, { "epoch": 3.0, "grad_norm": 4.28125, "learning_rate": 3.58974358974359e-05, "loss": 0.696, "step": 24 }, { "epoch": 3.0, "eval_accuracy": 0.5, "eval_loss": 0.7253711223602295, "eval_runtime": 3.7784, "eval_samples_per_second": 66.166, "eval_steps_per_second": 2.117, "step": 24 }, { "epoch": 3.125, "grad_norm": 2.546875, "learning_rate": 3.525641025641026e-05, "loss": 0.7316, "step": 25 }, { "epoch": 3.125, "eval_accuracy": 0.5, "eval_loss": 0.7229355573654175, "eval_runtime": 3.8248, "eval_samples_per_second": 65.363, "eval_steps_per_second": 2.092, "step": 25 }, { "epoch": 3.25, "grad_norm": 0.9453125, "learning_rate": 3.461538461538462e-05, "loss": 0.6574, "step": 26 }, { "epoch": 3.25, "eval_accuracy": 0.5, "eval_loss": 0.7210351824760437, "eval_runtime": 3.7805, "eval_samples_per_second": 66.129, "eval_steps_per_second": 2.116, "step": 26 }, { "epoch": 3.375, "grad_norm": 1.9140625, "learning_rate": 3.397435897435898e-05, "loss": 0.6521, "step": 27 }, { "epoch": 3.375, "eval_accuracy": 0.504, "eval_loss": 0.7196875214576721, "eval_runtime": 3.7838, "eval_samples_per_second": 66.072, "eval_steps_per_second": 2.114, "step": 27 }, { "epoch": 3.5, "grad_norm": 2.734375, "learning_rate": 3.3333333333333335e-05, "loss": 0.6884, "step": 28 }, { "epoch": 3.5, "eval_accuracy": 0.504, "eval_loss": 0.7171445488929749, "eval_runtime": 3.7809, "eval_samples_per_second": 66.122, "eval_steps_per_second": 2.116, "step": 28 }, { "epoch": 3.625, "grad_norm": 4.90625, "learning_rate": 3.269230769230769e-05, "loss": 0.6949, "step": 29 }, { "epoch": 3.625, "eval_accuracy": 0.496, "eval_loss": 0.7141953110694885, "eval_runtime": 3.7732, "eval_samples_per_second": 66.258, "eval_steps_per_second": 2.12, "step": 29 }, { "epoch": 3.75, "grad_norm": 2.984375, "learning_rate": 3.205128205128206e-05, "loss": 0.6877, "step": 30 }, { "epoch": 3.75, "eval_accuracy": 0.5, "eval_loss": 0.7117382884025574, "eval_runtime": 3.7862, "eval_samples_per_second": 66.029, "eval_steps_per_second": 2.113, "step": 30 }, { "epoch": 3.875, "grad_norm": 4.03125, "learning_rate": 3.141025641025641e-05, "loss": 0.695, "step": 31 }, { "epoch": 3.875, "eval_accuracy": 0.508, "eval_loss": 0.710156261920929, "eval_runtime": 3.831, "eval_samples_per_second": 65.256, "eval_steps_per_second": 2.088, "step": 31 }, { "epoch": 4.0, "grad_norm": 1.6015625, "learning_rate": 3.0769230769230774e-05, "loss": 0.6641, "step": 32 }, { "epoch": 4.0, "eval_accuracy": 0.516, "eval_loss": 0.7094960808753967, "eval_runtime": 3.8235, "eval_samples_per_second": 65.385, "eval_steps_per_second": 2.092, "step": 32 }, { "epoch": 4.125, "grad_norm": 5.65625, "learning_rate": 3.012820512820513e-05, "loss": 0.6936, "step": 33 }, { "epoch": 4.125, "eval_accuracy": 0.508, "eval_loss": 0.7094511985778809, "eval_runtime": 3.8342, "eval_samples_per_second": 65.203, "eval_steps_per_second": 2.087, "step": 33 }, { "epoch": 4.25, "grad_norm": 1.3515625, "learning_rate": 2.948717948717949e-05, "loss": 0.6511, "step": 34 }, { "epoch": 4.25, "eval_accuracy": 0.512, "eval_loss": 0.709640622138977, "eval_runtime": 3.8302, "eval_samples_per_second": 65.271, "eval_steps_per_second": 2.089, "step": 34 }, { "epoch": 4.375, "grad_norm": 2.21875, "learning_rate": 2.8846153846153845e-05, "loss": 0.6737, "step": 35 }, { "epoch": 4.375, "eval_accuracy": 0.516, "eval_loss": 0.7100146412849426, "eval_runtime": 3.8242, "eval_samples_per_second": 65.373, "eval_steps_per_second": 2.092, "step": 35 }, { "epoch": 4.5, "grad_norm": 7.625, "learning_rate": 2.8205128205128207e-05, "loss": 0.686, "step": 36 }, { "epoch": 4.5, "eval_accuracy": 0.512, "eval_loss": 0.7103955149650574, "eval_runtime": 3.829, "eval_samples_per_second": 65.292, "eval_steps_per_second": 2.089, "step": 36 }, { "epoch": 4.625, "grad_norm": 1.9609375, "learning_rate": 2.756410256410257e-05, "loss": 0.6819, "step": 37 }, { "epoch": 4.625, "eval_accuracy": 0.516, "eval_loss": 0.7098662257194519, "eval_runtime": 3.8308, "eval_samples_per_second": 65.261, "eval_steps_per_second": 2.088, "step": 37 }, { "epoch": 4.75, "grad_norm": 4.0625, "learning_rate": 2.6923076923076923e-05, "loss": 0.6557, "step": 38 }, { "epoch": 4.75, "eval_accuracy": 0.512, "eval_loss": 0.709054708480835, "eval_runtime": 3.8339, "eval_samples_per_second": 65.208, "eval_steps_per_second": 2.087, "step": 38 }, { "epoch": 4.875, "grad_norm": 4.90625, "learning_rate": 2.6282051282051285e-05, "loss": 0.6788, "step": 39 }, { "epoch": 4.875, "eval_accuracy": 0.512, "eval_loss": 0.708984375, "eval_runtime": 3.7793, "eval_samples_per_second": 66.149, "eval_steps_per_second": 2.117, "step": 39 }, { "epoch": 5.0, "grad_norm": 2.703125, "learning_rate": 2.564102564102564e-05, "loss": 0.6821, "step": 40 }, { "epoch": 5.0, "eval_accuracy": 0.516, "eval_loss": 0.7079521417617798, "eval_runtime": 3.8284, "eval_samples_per_second": 65.302, "eval_steps_per_second": 2.09, "step": 40 }, { "epoch": 5.125, "grad_norm": 1.90625, "learning_rate": 2.5e-05, "loss": 0.6955, "step": 41 }, { "epoch": 5.125, "eval_accuracy": 0.512, "eval_loss": 0.7075204849243164, "eval_runtime": 3.7781, "eval_samples_per_second": 66.171, "eval_steps_per_second": 2.117, "step": 41 }, { "epoch": 5.25, "grad_norm": 2.890625, "learning_rate": 2.435897435897436e-05, "loss": 0.6678, "step": 42 }, { "epoch": 5.25, "eval_accuracy": 0.512, "eval_loss": 0.708126962184906, "eval_runtime": 3.8286, "eval_samples_per_second": 65.298, "eval_steps_per_second": 2.09, "step": 42 }, { "epoch": 5.375, "grad_norm": 3.265625, "learning_rate": 2.3717948717948718e-05, "loss": 0.6633, "step": 43 }, { "epoch": 5.375, "eval_accuracy": 0.508, "eval_loss": 0.7086679935455322, "eval_runtime": 3.832, "eval_samples_per_second": 65.24, "eval_steps_per_second": 2.088, "step": 43 }, { "epoch": 5.5, "grad_norm": 4.96875, "learning_rate": 2.307692307692308e-05, "loss": 0.673, "step": 44 }, { "epoch": 5.5, "eval_accuracy": 0.508, "eval_loss": 0.709298849105835, "eval_runtime": 3.7804, "eval_samples_per_second": 66.131, "eval_steps_per_second": 2.116, "step": 44 }, { "epoch": 5.625, "grad_norm": 4.40625, "learning_rate": 2.2435897435897437e-05, "loss": 0.6819, "step": 45 }, { "epoch": 5.625, "eval_accuracy": 0.504, "eval_loss": 0.7100507616996765, "eval_runtime": 3.8246, "eval_samples_per_second": 65.366, "eval_steps_per_second": 2.092, "step": 45 }, { "epoch": 5.75, "grad_norm": 3.796875, "learning_rate": 2.1794871794871795e-05, "loss": 0.6804, "step": 46 }, { "epoch": 5.75, "eval_accuracy": 0.504, "eval_loss": 0.7107539176940918, "eval_runtime": 3.8234, "eval_samples_per_second": 65.386, "eval_steps_per_second": 2.092, "step": 46 }, { "epoch": 5.875, "grad_norm": 2.453125, "learning_rate": 2.1153846153846154e-05, "loss": 0.6526, "step": 47 }, { "epoch": 5.875, "eval_accuracy": 0.504, "eval_loss": 0.7117363214492798, "eval_runtime": 3.825, "eval_samples_per_second": 65.359, "eval_steps_per_second": 2.091, "step": 47 }, { "epoch": 6.0, "grad_norm": 3.859375, "learning_rate": 2.0512820512820512e-05, "loss": 0.6603, "step": 48 }, { "epoch": 6.0, "eval_accuracy": 0.504, "eval_loss": 0.7116543054580688, "eval_runtime": 3.8234, "eval_samples_per_second": 65.387, "eval_steps_per_second": 2.092, "step": 48 }, { "epoch": 6.125, "grad_norm": 2.640625, "learning_rate": 1.987179487179487e-05, "loss": 0.6694, "step": 49 }, { "epoch": 6.125, "eval_accuracy": 0.5, "eval_loss": 0.7120312452316284, "eval_runtime": 3.7768, "eval_samples_per_second": 66.194, "eval_steps_per_second": 2.118, "step": 49 }, { "epoch": 6.25, "grad_norm": 1.2890625, "learning_rate": 1.923076923076923e-05, "loss": 0.6666, "step": 50 }, { "epoch": 6.25, "eval_accuracy": 0.504, "eval_loss": 0.7120312452316284, "eval_runtime": 3.7768, "eval_samples_per_second": 66.193, "eval_steps_per_second": 2.118, "step": 50 }, { "epoch": 6.375, "grad_norm": 1.3828125, "learning_rate": 1.858974358974359e-05, "loss": 0.6517, "step": 51 }, { "epoch": 6.375, "eval_accuracy": 0.5, "eval_loss": 0.7119590044021606, "eval_runtime": 3.7825, "eval_samples_per_second": 66.094, "eval_steps_per_second": 2.115, "step": 51 }, { "epoch": 6.5, "grad_norm": 8.875, "learning_rate": 1.794871794871795e-05, "loss": 0.6673, "step": 52 }, { "epoch": 6.5, "eval_accuracy": 0.504, "eval_loss": 0.7129882574081421, "eval_runtime": 3.7803, "eval_samples_per_second": 66.133, "eval_steps_per_second": 2.116, "step": 52 }, { "epoch": 6.625, "grad_norm": 1.3046875, "learning_rate": 1.730769230769231e-05, "loss": 0.646, "step": 53 }, { "epoch": 6.625, "eval_accuracy": 0.508, "eval_loss": 0.712039053440094, "eval_runtime": 3.7834, "eval_samples_per_second": 66.077, "eval_steps_per_second": 2.114, "step": 53 }, { "epoch": 6.75, "grad_norm": 3.640625, "learning_rate": 1.6666666666666667e-05, "loss": 0.6822, "step": 54 }, { "epoch": 6.75, "eval_accuracy": 0.512, "eval_loss": 0.7116581797599792, "eval_runtime": 3.7796, "eval_samples_per_second": 66.145, "eval_steps_per_second": 2.117, "step": 54 }, { "epoch": 6.875, "grad_norm": 2.21875, "learning_rate": 1.602564102564103e-05, "loss": 0.6642, "step": 55 }, { "epoch": 6.875, "eval_accuracy": 0.512, "eval_loss": 0.7108300924301147, "eval_runtime": 3.833, "eval_samples_per_second": 65.223, "eval_steps_per_second": 2.087, "step": 55 }, { "epoch": 7.0, "grad_norm": 4.75, "learning_rate": 1.5384615384615387e-05, "loss": 0.6719, "step": 56 }, { "epoch": 7.0, "eval_accuracy": 0.508, "eval_loss": 0.7102011442184448, "eval_runtime": 3.7842, "eval_samples_per_second": 66.065, "eval_steps_per_second": 2.114, "step": 56 }, { "epoch": 7.125, "grad_norm": 5.625, "learning_rate": 1.4743589743589745e-05, "loss": 0.6645, "step": 57 }, { "epoch": 7.125, "eval_accuracy": 0.512, "eval_loss": 0.7102500200271606, "eval_runtime": 3.8277, "eval_samples_per_second": 65.313, "eval_steps_per_second": 2.09, "step": 57 }, { "epoch": 7.25, "grad_norm": 2.15625, "learning_rate": 1.4102564102564104e-05, "loss": 0.6448, "step": 58 }, { "epoch": 7.25, "eval_accuracy": 0.508, "eval_loss": 0.7099394798278809, "eval_runtime": 3.8298, "eval_samples_per_second": 65.278, "eval_steps_per_second": 2.089, "step": 58 }, { "epoch": 7.375, "grad_norm": 4.46875, "learning_rate": 1.3461538461538462e-05, "loss": 0.6623, "step": 59 }, { "epoch": 7.375, "eval_accuracy": 0.512, "eval_loss": 0.7101836204528809, "eval_runtime": 3.8325, "eval_samples_per_second": 65.231, "eval_steps_per_second": 2.087, "step": 59 }, { "epoch": 7.5, "grad_norm": 4.3125, "learning_rate": 1.282051282051282e-05, "loss": 0.6618, "step": 60 }, { "epoch": 7.5, "eval_accuracy": 0.512, "eval_loss": 0.7102265357971191, "eval_runtime": 3.7819, "eval_samples_per_second": 66.104, "eval_steps_per_second": 2.115, "step": 60 }, { "epoch": 7.625, "grad_norm": 1.09375, "learning_rate": 1.217948717948718e-05, "loss": 0.6535, "step": 61 }, { "epoch": 7.625, "eval_accuracy": 0.512, "eval_loss": 0.710568368434906, "eval_runtime": 3.787, "eval_samples_per_second": 66.016, "eval_steps_per_second": 2.113, "step": 61 }, { "epoch": 7.75, "grad_norm": 1.90625, "learning_rate": 1.153846153846154e-05, "loss": 0.6585, "step": 62 }, { "epoch": 7.75, "eval_accuracy": 0.512, "eval_loss": 0.7093847393989563, "eval_runtime": 3.7788, "eval_samples_per_second": 66.159, "eval_steps_per_second": 2.117, "step": 62 }, { "epoch": 7.875, "grad_norm": 6.03125, "learning_rate": 1.0897435897435898e-05, "loss": 0.7038, "step": 63 }, { "epoch": 7.875, "eval_accuracy": 0.512, "eval_loss": 0.7079941630363464, "eval_runtime": 3.7843, "eval_samples_per_second": 66.063, "eval_steps_per_second": 2.114, "step": 63 }, { "epoch": 8.0, "grad_norm": 1.3125, "learning_rate": 1.0256410256410256e-05, "loss": 0.6549, "step": 64 }, { "epoch": 8.0, "eval_accuracy": 0.508, "eval_loss": 0.7070527076721191, "eval_runtime": 3.8257, "eval_samples_per_second": 65.348, "eval_steps_per_second": 2.091, "step": 64 }, { "epoch": 8.125, "grad_norm": 4.8125, "learning_rate": 9.615384615384616e-06, "loss": 0.6655, "step": 65 }, { "epoch": 8.125, "eval_accuracy": 0.512, "eval_loss": 0.7069512009620667, "eval_runtime": 3.8309, "eval_samples_per_second": 65.26, "eval_steps_per_second": 2.088, "step": 65 }, { "epoch": 8.25, "grad_norm": 5.3125, "learning_rate": 8.974358974358976e-06, "loss": 0.6607, "step": 66 }, { "epoch": 8.25, "eval_accuracy": 0.512, "eval_loss": 0.7063554525375366, "eval_runtime": 3.8238, "eval_samples_per_second": 65.38, "eval_steps_per_second": 2.092, "step": 66 }, { "epoch": 8.375, "grad_norm": 2.640625, "learning_rate": 8.333333333333334e-06, "loss": 0.6749, "step": 67 }, { "epoch": 8.375, "eval_accuracy": 0.512, "eval_loss": 0.7062519788742065, "eval_runtime": 3.8329, "eval_samples_per_second": 65.224, "eval_steps_per_second": 2.087, "step": 67 }, { "epoch": 8.5, "grad_norm": 2.25, "learning_rate": 7.692307692307694e-06, "loss": 0.6647, "step": 68 }, { "epoch": 8.5, "eval_accuracy": 0.512, "eval_loss": 0.7055859565734863, "eval_runtime": 3.828, "eval_samples_per_second": 65.308, "eval_steps_per_second": 2.09, "step": 68 }, { "epoch": 8.625, "grad_norm": 1.3828125, "learning_rate": 7.051282051282052e-06, "loss": 0.6979, "step": 69 }, { "epoch": 8.625, "eval_accuracy": 0.512, "eval_loss": 0.7064355611801147, "eval_runtime": 3.8307, "eval_samples_per_second": 65.262, "eval_steps_per_second": 2.088, "step": 69 }, { "epoch": 8.75, "grad_norm": 5.53125, "learning_rate": 6.41025641025641e-06, "loss": 0.6492, "step": 70 }, { "epoch": 8.75, "eval_accuracy": 0.512, "eval_loss": 0.7062558531761169, "eval_runtime": 3.826, "eval_samples_per_second": 65.342, "eval_steps_per_second": 2.091, "step": 70 }, { "epoch": 8.875, "grad_norm": 2.109375, "learning_rate": 5.76923076923077e-06, "loss": 0.6561, "step": 71 }, { "epoch": 8.875, "eval_accuracy": 0.512, "eval_loss": 0.7061426043510437, "eval_runtime": 3.8252, "eval_samples_per_second": 65.356, "eval_steps_per_second": 2.091, "step": 71 }, { "epoch": 9.0, "grad_norm": 4.3125, "learning_rate": 5.128205128205128e-06, "loss": 0.6545, "step": 72 }, { "epoch": 9.0, "eval_accuracy": 0.512, "eval_loss": 0.7061015367507935, "eval_runtime": 3.7808, "eval_samples_per_second": 66.124, "eval_steps_per_second": 2.116, "step": 72 }, { "epoch": 9.125, "grad_norm": 3.453125, "learning_rate": 4.487179487179488e-06, "loss": 0.678, "step": 73 }, { "epoch": 9.125, "eval_accuracy": 0.508, "eval_loss": 0.7067734599113464, "eval_runtime": 3.826, "eval_samples_per_second": 65.342, "eval_steps_per_second": 2.091, "step": 73 }, { "epoch": 9.25, "grad_norm": 4.5, "learning_rate": 3.846153846153847e-06, "loss": 0.6868, "step": 74 }, { "epoch": 9.25, "eval_accuracy": 0.512, "eval_loss": 0.7066249847412109, "eval_runtime": 3.8335, "eval_samples_per_second": 65.214, "eval_steps_per_second": 2.087, "step": 74 }, { "epoch": 9.375, "grad_norm": 2.140625, "learning_rate": 3.205128205128205e-06, "loss": 0.6619, "step": 75 }, { "epoch": 9.375, "eval_accuracy": 0.508, "eval_loss": 0.7071660161018372, "eval_runtime": 3.774, "eval_samples_per_second": 66.242, "eval_steps_per_second": 2.12, "step": 75 }, { "epoch": 9.5, "grad_norm": 1.296875, "learning_rate": 2.564102564102564e-06, "loss": 0.6624, "step": 76 }, { "epoch": 9.5, "eval_accuracy": 0.508, "eval_loss": 0.707388699054718, "eval_runtime": 3.8273, "eval_samples_per_second": 65.32, "eval_steps_per_second": 2.09, "step": 76 }, { "epoch": 9.625, "grad_norm": 3.390625, "learning_rate": 1.9230769230769234e-06, "loss": 0.6669, "step": 77 }, { "epoch": 9.625, "eval_accuracy": 0.512, "eval_loss": 0.7070527076721191, "eval_runtime": 3.834, "eval_samples_per_second": 65.206, "eval_steps_per_second": 2.087, "step": 77 }, { "epoch": 9.75, "grad_norm": 6.875, "learning_rate": 1.282051282051282e-06, "loss": 0.6496, "step": 78 }, { "epoch": 9.75, "eval_accuracy": 0.508, "eval_loss": 0.7079198956489563, "eval_runtime": 3.8324, "eval_samples_per_second": 65.233, "eval_steps_per_second": 2.087, "step": 78 }, { "epoch": 9.875, "grad_norm": 1.7734375, "learning_rate": 6.41025641025641e-07, "loss": 0.6783, "step": 79 }, { "epoch": 9.875, "eval_accuracy": 0.508, "eval_loss": 0.7070019245147705, "eval_runtime": 3.8328, "eval_samples_per_second": 65.226, "eval_steps_per_second": 2.087, "step": 79 }, { "epoch": 10.0, "grad_norm": 5.5, "learning_rate": 0.0, "loss": 0.6588, "step": 80 }, { "epoch": 10.0, "eval_accuracy": 0.508, "eval_loss": 0.7080761790275574, "eval_runtime": 3.7884, "eval_samples_per_second": 65.99, "eval_steps_per_second": 2.112, "step": 80 }, { "epoch": 10.0, "step": 80, "total_flos": 5.798258899156992e+16, "train_loss": 0.6848556816577911, "train_runtime": 722.7927, "train_samples_per_second": 13.835, "train_steps_per_second": 0.111 } ], "logging_steps": 1, "max_steps": 80, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.798258899156992e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }