{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 1, "global_step": 70, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14285714285714285, "grad_norm": 3.998234510421753, "learning_rate": 2.5e-05, "loss": 0.7117, "step": 1 }, { "epoch": 0.14285714285714285, "eval_loss": 0.6798907518386841, "eval_matthews_correlation": 0.02588070098335507, "eval_runtime": 2.5395, "eval_samples_per_second": 87.024, "eval_steps_per_second": 1.575, "step": 1 }, { "epoch": 0.2857142857142857, "grad_norm": 7.397937297821045, "learning_rate": 5e-05, "loss": 0.7295, "step": 2 }, { "epoch": 0.2857142857142857, "eval_loss": 0.6786512732505798, "eval_matthews_correlation": 0.06415201276791879, "eval_runtime": 2.5854, "eval_samples_per_second": 85.481, "eval_steps_per_second": 1.547, "step": 2 }, { "epoch": 0.42857142857142855, "grad_norm": 2.171586751937866, "learning_rate": 4.9264705882352944e-05, "loss": 0.6526, "step": 3 }, { "epoch": 0.42857142857142855, "eval_loss": 0.6761562824249268, "eval_matthews_correlation": 0.06415201276791879, "eval_runtime": 2.5858, "eval_samples_per_second": 85.465, "eval_steps_per_second": 1.547, "step": 3 }, { "epoch": 0.5714285714285714, "grad_norm": 1.377044677734375, "learning_rate": 4.8529411764705885e-05, "loss": 0.7017, "step": 4 }, { "epoch": 0.5714285714285714, "eval_loss": 0.6824426651000977, "eval_matthews_correlation": 0.03840151481124831, "eval_runtime": 2.5843, "eval_samples_per_second": 85.516, "eval_steps_per_second": 1.548, "step": 4 }, { "epoch": 0.7142857142857143, "grad_norm": 1.5057487487792969, "learning_rate": 4.7794117647058826e-05, "loss": 0.7353, "step": 5 }, { "epoch": 0.7142857142857143, "eval_loss": 0.6836904287338257, "eval_matthews_correlation": 0.03840151481124831, "eval_runtime": 2.5832, "eval_samples_per_second": 85.554, "eval_steps_per_second": 1.548, "step": 5 }, { "epoch": 0.8571428571428571, "grad_norm": 2.314178228378296, "learning_rate": 4.705882352941177e-05, "loss": 0.7249, "step": 6 }, { "epoch": 0.8571428571428571, "eval_loss": 0.6898430585861206, "eval_matthews_correlation": 0.07515566518214702, "eval_runtime": 2.748, "eval_samples_per_second": 80.422, "eval_steps_per_second": 1.456, "step": 6 }, { "epoch": 1.0, "grad_norm": 2.5473382472991943, "learning_rate": 4.632352941176471e-05, "loss": 0.7272, "step": 7 }, { "epoch": 1.0, "eval_loss": 0.6884390115737915, "eval_matthews_correlation": 0.06254519425349994, "eval_runtime": 2.5815, "eval_samples_per_second": 85.608, "eval_steps_per_second": 1.549, "step": 7 }, { "epoch": 1.1428571428571428, "grad_norm": 1.2883397340774536, "learning_rate": 4.558823529411765e-05, "loss": 0.6804, "step": 8 }, { "epoch": 1.1428571428571428, "eval_loss": 0.6875110268592834, "eval_matthews_correlation": 0.08901797683850127, "eval_runtime": 2.5789, "eval_samples_per_second": 85.695, "eval_steps_per_second": 1.551, "step": 8 }, { "epoch": 1.2857142857142856, "grad_norm": 9.519600868225098, "learning_rate": 4.485294117647059e-05, "loss": 0.7318, "step": 9 }, { "epoch": 1.2857142857142856, "eval_loss": 0.6818560361862183, "eval_matthews_correlation": 0.1265779624120371, "eval_runtime": 2.5834, "eval_samples_per_second": 85.546, "eval_steps_per_second": 1.548, "step": 9 }, { "epoch": 1.4285714285714286, "grad_norm": 12.696111679077148, "learning_rate": 4.411764705882353e-05, "loss": 0.714, "step": 10 }, { "epoch": 1.4285714285714286, "eval_loss": 0.6748389601707458, "eval_matthews_correlation": -0.10535441464971222, "eval_runtime": 2.5799, "eval_samples_per_second": 85.662, "eval_steps_per_second": 1.55, "step": 10 }, { "epoch": 1.5714285714285714, "grad_norm": 3.595411539077759, "learning_rate": 4.3382352941176474e-05, "loss": 0.7136, "step": 11 }, { "epoch": 1.5714285714285714, "eval_loss": 0.6693562865257263, "eval_matthews_correlation": -0.07344997524675997, "eval_runtime": 2.582, "eval_samples_per_second": 85.593, "eval_steps_per_second": 1.549, "step": 11 }, { "epoch": 1.7142857142857144, "grad_norm": 3.0389199256896973, "learning_rate": 4.2647058823529415e-05, "loss": 0.7148, "step": 12 }, { "epoch": 1.7142857142857144, "eval_loss": 0.6688193678855896, "eval_matthews_correlation": -0.05644705286285884, "eval_runtime": 2.5824, "eval_samples_per_second": 85.581, "eval_steps_per_second": 1.549, "step": 12 }, { "epoch": 1.8571428571428572, "grad_norm": 3.855015754699707, "learning_rate": 4.1911764705882356e-05, "loss": 0.7435, "step": 13 }, { "epoch": 1.8571428571428572, "eval_loss": 0.6662454009056091, "eval_matthews_correlation": -0.05644705286285884, "eval_runtime": 2.5839, "eval_samples_per_second": 85.531, "eval_steps_per_second": 1.548, "step": 13 }, { "epoch": 2.0, "grad_norm": 2.5934810638427734, "learning_rate": 4.11764705882353e-05, "loss": 0.7235, "step": 14 }, { "epoch": 2.0, "eval_loss": 0.6664111018180847, "eval_matthews_correlation": -0.08839144444490248, "eval_runtime": 2.5848, "eval_samples_per_second": 85.5, "eval_steps_per_second": 1.548, "step": 14 }, { "epoch": 2.142857142857143, "grad_norm": 4.137197971343994, "learning_rate": 4.044117647058824e-05, "loss": 0.735, "step": 15 }, { "epoch": 2.142857142857143, "eval_loss": 0.6680322885513306, "eval_matthews_correlation": -0.016145842093015665, "eval_runtime": 2.5832, "eval_samples_per_second": 85.554, "eval_steps_per_second": 1.548, "step": 15 }, { "epoch": 2.2857142857142856, "grad_norm": 1.7002750635147095, "learning_rate": 3.970588235294117e-05, "loss": 0.7067, "step": 16 }, { "epoch": 2.2857142857142856, "eval_loss": 0.6730608940124512, "eval_matthews_correlation": -0.01955413854015369, "eval_runtime": 2.5331, "eval_samples_per_second": 87.246, "eval_steps_per_second": 1.579, "step": 16 }, { "epoch": 2.4285714285714284, "grad_norm": 5.263991355895996, "learning_rate": 3.897058823529412e-05, "loss": 0.677, "step": 17 }, { "epoch": 2.4285714285714284, "eval_loss": 0.6752498745918274, "eval_matthews_correlation": -0.003660370980642071, "eval_runtime": 2.5815, "eval_samples_per_second": 85.609, "eval_steps_per_second": 1.549, "step": 17 }, { "epoch": 2.571428571428571, "grad_norm": 3.7233338356018066, "learning_rate": 3.8235294117647055e-05, "loss": 0.7148, "step": 18 }, { "epoch": 2.571428571428571, "eval_loss": 0.6821073889732361, "eval_matthews_correlation": 0.03233650264505481, "eval_runtime": 2.5867, "eval_samples_per_second": 85.438, "eval_steps_per_second": 1.546, "step": 18 }, { "epoch": 2.7142857142857144, "grad_norm": 2.5118980407714844, "learning_rate": 3.7500000000000003e-05, "loss": 0.6816, "step": 19 }, { "epoch": 2.7142857142857144, "eval_loss": 0.6831817030906677, "eval_matthews_correlation": 0.03233650264505481, "eval_runtime": 2.5301, "eval_samples_per_second": 87.348, "eval_steps_per_second": 1.581, "step": 19 }, { "epoch": 2.857142857142857, "grad_norm": 2.3825275897979736, "learning_rate": 3.6764705882352945e-05, "loss": 0.7313, "step": 20 }, { "epoch": 2.857142857142857, "eval_loss": 0.6824917793273926, "eval_matthews_correlation": 0.03233650264505481, "eval_runtime": 2.583, "eval_samples_per_second": 85.558, "eval_steps_per_second": 1.549, "step": 20 }, { "epoch": 3.0, "grad_norm": 3.6262497901916504, "learning_rate": 3.6029411764705886e-05, "loss": 0.7177, "step": 21 }, { "epoch": 3.0, "eval_loss": 0.6839202046394348, "eval_matthews_correlation": 0.02099447073916433, "eval_runtime": 2.5829, "eval_samples_per_second": 85.562, "eval_steps_per_second": 1.549, "step": 21 }, { "epoch": 3.142857142857143, "grad_norm": 5.570082187652588, "learning_rate": 3.529411764705883e-05, "loss": 0.7039, "step": 22 }, { "epoch": 3.142857142857143, "eval_loss": 0.6828458905220032, "eval_matthews_correlation": 0.03233650264505481, "eval_runtime": 2.5864, "eval_samples_per_second": 85.448, "eval_steps_per_second": 1.547, "step": 22 }, { "epoch": 3.2857142857142856, "grad_norm": 2.2818679809570312, "learning_rate": 3.455882352941177e-05, "loss": 0.6952, "step": 23 }, { "epoch": 3.2857142857142856, "eval_loss": 0.6800376772880554, "eval_matthews_correlation": 0.03233650264505481, "eval_runtime": 2.5831, "eval_samples_per_second": 85.556, "eval_steps_per_second": 1.549, "step": 23 }, { "epoch": 3.4285714285714284, "grad_norm": 4.441514015197754, "learning_rate": 3.382352941176471e-05, "loss": 0.6898, "step": 24 }, { "epoch": 3.4285714285714284, "eval_loss": 0.6810070872306824, "eval_matthews_correlation": 0.03233650264505481, "eval_runtime": 2.5837, "eval_samples_per_second": 85.538, "eval_steps_per_second": 1.548, "step": 24 }, { "epoch": 3.571428571428571, "grad_norm": 2.3354990482330322, "learning_rate": 3.308823529411765e-05, "loss": 0.6589, "step": 25 }, { "epoch": 3.571428571428571, "eval_loss": 0.6801570057868958, "eval_matthews_correlation": 0.014648552723664804, "eval_runtime": 2.5834, "eval_samples_per_second": 85.547, "eval_steps_per_second": 1.548, "step": 25 }, { "epoch": 3.7142857142857144, "grad_norm": 1.5001767873764038, "learning_rate": 3.235294117647059e-05, "loss": 0.6952, "step": 26 }, { "epoch": 3.7142857142857144, "eval_loss": 0.6867631673812866, "eval_matthews_correlation": 0.03233650264505481, "eval_runtime": 2.5804, "eval_samples_per_second": 85.647, "eval_steps_per_second": 1.55, "step": 26 }, { "epoch": 3.857142857142857, "grad_norm": 8.003545761108398, "learning_rate": 3.161764705882353e-05, "loss": 0.6509, "step": 27 }, { "epoch": 3.857142857142857, "eval_loss": 0.6861798763275146, "eval_matthews_correlation": 0.02099447073916433, "eval_runtime": 2.5355, "eval_samples_per_second": 87.161, "eval_steps_per_second": 1.578, "step": 27 }, { "epoch": 4.0, "grad_norm": 2.25142502784729, "learning_rate": 3.0882352941176475e-05, "loss": 0.6892, "step": 28 }, { "epoch": 4.0, "eval_loss": 0.6863400340080261, "eval_matthews_correlation": 0.02099447073916433, "eval_runtime": 2.5286, "eval_samples_per_second": 87.399, "eval_steps_per_second": 1.582, "step": 28 }, { "epoch": 4.142857142857143, "grad_norm": 8.721115112304688, "learning_rate": 3.0147058823529413e-05, "loss": 0.6586, "step": 29 }, { "epoch": 4.142857142857143, "eval_loss": 0.6862174272537231, "eval_matthews_correlation": 0.02099447073916433, "eval_runtime": 2.5809, "eval_samples_per_second": 85.63, "eval_steps_per_second": 1.55, "step": 29 }, { "epoch": 4.285714285714286, "grad_norm": 4.0326619148254395, "learning_rate": 2.9411764705882354e-05, "loss": 0.7479, "step": 30 }, { "epoch": 4.285714285714286, "eval_loss": 0.6849657893180847, "eval_matthews_correlation": 0.02099447073916433, "eval_runtime": 2.5316, "eval_samples_per_second": 87.298, "eval_steps_per_second": 1.58, "step": 30 }, { "epoch": 4.428571428571429, "grad_norm": 3.4629993438720703, "learning_rate": 2.8676470588235295e-05, "loss": 0.7311, "step": 31 }, { "epoch": 4.428571428571429, "eval_loss": 0.6831032633781433, "eval_matthews_correlation": 0.03233650264505481, "eval_runtime": 2.5314, "eval_samples_per_second": 87.305, "eval_steps_per_second": 1.58, "step": 31 }, { "epoch": 4.571428571428571, "grad_norm": 1.6142446994781494, "learning_rate": 2.7941176470588236e-05, "loss": 0.6869, "step": 32 }, { "epoch": 4.571428571428571, "eval_loss": 0.6772472262382507, "eval_matthews_correlation": -0.003660370980642071, "eval_runtime": 2.5782, "eval_samples_per_second": 85.72, "eval_steps_per_second": 1.551, "step": 32 }, { "epoch": 4.714285714285714, "grad_norm": 2.363823890686035, "learning_rate": 2.7205882352941174e-05, "loss": 0.6625, "step": 33 }, { "epoch": 4.714285714285714, "eval_loss": 0.674756646156311, "eval_matthews_correlation": 0.007873691885759546, "eval_runtime": 2.5793, "eval_samples_per_second": 85.683, "eval_steps_per_second": 1.551, "step": 33 }, { "epoch": 4.857142857142857, "grad_norm": 1.2323307991027832, "learning_rate": 2.647058823529412e-05, "loss": 0.6749, "step": 34 }, { "epoch": 4.857142857142857, "eval_loss": 0.6726891994476318, "eval_matthews_correlation": 0.019864667834482774, "eval_runtime": 2.5302, "eval_samples_per_second": 87.346, "eval_steps_per_second": 1.581, "step": 34 }, { "epoch": 5.0, "grad_norm": 5.784065246582031, "learning_rate": 2.5735294117647057e-05, "loss": 0.6787, "step": 35 }, { "epoch": 5.0, "eval_loss": 0.6702554821968079, "eval_matthews_correlation": 0.007873691885759546, "eval_runtime": 2.5301, "eval_samples_per_second": 87.347, "eval_steps_per_second": 1.581, "step": 35 }, { "epoch": 5.142857142857143, "grad_norm": 2.1666295528411865, "learning_rate": 2.5e-05, "loss": 0.6622, "step": 36 }, { "epoch": 5.142857142857143, "eval_loss": 0.6678881049156189, "eval_matthews_correlation": 0.019864667834482774, "eval_runtime": 2.5786, "eval_samples_per_second": 85.706, "eval_steps_per_second": 1.551, "step": 36 }, { "epoch": 5.285714285714286, "grad_norm": 1.818786382675171, "learning_rate": 2.4264705882352942e-05, "loss": 0.7103, "step": 37 }, { "epoch": 5.285714285714286, "eval_loss": 0.6679052114486694, "eval_matthews_correlation": 0.019864667834482774, "eval_runtime": 2.5803, "eval_samples_per_second": 85.649, "eval_steps_per_second": 1.55, "step": 37 }, { "epoch": 5.428571428571429, "grad_norm": 4.267655849456787, "learning_rate": 2.3529411764705884e-05, "loss": 0.7146, "step": 38 }, { "epoch": 5.428571428571429, "eval_loss": 0.6662653088569641, "eval_matthews_correlation": 0.007873691885759546, "eval_runtime": 2.5787, "eval_samples_per_second": 85.702, "eval_steps_per_second": 1.551, "step": 38 }, { "epoch": 5.571428571428571, "grad_norm": 1.2934308052062988, "learning_rate": 2.2794117647058825e-05, "loss": 0.7013, "step": 39 }, { "epoch": 5.571428571428571, "eval_loss": 0.667534589767456, "eval_matthews_correlation": 0.03236368357125948, "eval_runtime": 2.577, "eval_samples_per_second": 85.759, "eval_steps_per_second": 1.552, "step": 39 }, { "epoch": 5.714285714285714, "grad_norm": 5.849837779998779, "learning_rate": 2.2058823529411766e-05, "loss": 0.6958, "step": 40 }, { "epoch": 5.714285714285714, "eval_loss": 0.6637371778488159, "eval_matthews_correlation": 0.019864667834482774, "eval_runtime": 2.5784, "eval_samples_per_second": 85.711, "eval_steps_per_second": 1.551, "step": 40 }, { "epoch": 5.857142857142857, "grad_norm": 1.7092323303222656, "learning_rate": 2.1323529411764707e-05, "loss": 0.6783, "step": 41 }, { "epoch": 5.857142857142857, "eval_loss": 0.6648374795913696, "eval_matthews_correlation": 0.03236368357125948, "eval_runtime": 2.58, "eval_samples_per_second": 85.659, "eval_steps_per_second": 1.55, "step": 41 }, { "epoch": 6.0, "grad_norm": 5.318603038787842, "learning_rate": 2.058823529411765e-05, "loss": 0.6721, "step": 42 }, { "epoch": 6.0, "eval_loss": 0.6657968759536743, "eval_matthews_correlation": 0.05100006919207634, "eval_runtime": 2.5284, "eval_samples_per_second": 87.405, "eval_steps_per_second": 1.582, "step": 42 }, { "epoch": 6.142857142857143, "grad_norm": 2.208892345428467, "learning_rate": 1.9852941176470586e-05, "loss": 0.7382, "step": 43 }, { "epoch": 6.142857142857143, "eval_loss": 0.6654295921325684, "eval_matthews_correlation": 0.03840151481124831, "eval_runtime": 2.5782, "eval_samples_per_second": 85.719, "eval_steps_per_second": 1.551, "step": 43 }, { "epoch": 6.285714285714286, "grad_norm": 1.805138349533081, "learning_rate": 1.9117647058823528e-05, "loss": 0.7029, "step": 44 }, { "epoch": 6.285714285714286, "eval_loss": 0.6661415696144104, "eval_matthews_correlation": 0.03840151481124831, "eval_runtime": 2.5785, "eval_samples_per_second": 85.708, "eval_steps_per_second": 1.551, "step": 44 }, { "epoch": 6.428571428571429, "grad_norm": 1.8971737623214722, "learning_rate": 1.8382352941176472e-05, "loss": 0.712, "step": 45 }, { "epoch": 6.428571428571429, "eval_loss": 0.6690033078193665, "eval_matthews_correlation": 0.03840151481124831, "eval_runtime": 2.5783, "eval_samples_per_second": 85.714, "eval_steps_per_second": 1.551, "step": 45 }, { "epoch": 6.571428571428571, "grad_norm": 7.983123779296875, "learning_rate": 1.7647058823529414e-05, "loss": 0.607, "step": 46 }, { "epoch": 6.571428571428571, "eval_loss": 0.6701312065124512, "eval_matthews_correlation": 0.03840151481124831, "eval_runtime": 2.5814, "eval_samples_per_second": 85.613, "eval_steps_per_second": 1.55, "step": 46 }, { "epoch": 6.714285714285714, "grad_norm": 1.1388919353485107, "learning_rate": 1.6911764705882355e-05, "loss": 0.662, "step": 47 }, { "epoch": 6.714285714285714, "eval_loss": 0.6674517393112183, "eval_matthews_correlation": 0.019864667834482774, "eval_runtime": 2.5834, "eval_samples_per_second": 85.547, "eval_steps_per_second": 1.548, "step": 47 }, { "epoch": 6.857142857142857, "grad_norm": 1.0342700481414795, "learning_rate": 1.6176470588235296e-05, "loss": 0.6474, "step": 48 }, { "epoch": 6.857142857142857, "eval_loss": 0.667188823223114, "eval_matthews_correlation": 0.03840151481124831, "eval_runtime": 2.58, "eval_samples_per_second": 85.659, "eval_steps_per_second": 1.55, "step": 48 }, { "epoch": 7.0, "grad_norm": 2.212453603744507, "learning_rate": 1.5441176470588237e-05, "loss": 0.6797, "step": 49 }, { "epoch": 7.0, "eval_loss": 0.665696918964386, "eval_matthews_correlation": 0.019864667834482774, "eval_runtime": 2.5784, "eval_samples_per_second": 85.712, "eval_steps_per_second": 1.551, "step": 49 }, { "epoch": 7.142857142857143, "grad_norm": 1.651175618171692, "learning_rate": 1.4705882352941177e-05, "loss": 0.682, "step": 50 }, { "epoch": 7.142857142857143, "eval_loss": 0.6662675142288208, "eval_matthews_correlation": 0.007873691885759546, "eval_runtime": 2.5819, "eval_samples_per_second": 85.597, "eval_steps_per_second": 1.549, "step": 50 }, { "epoch": 7.285714285714286, "grad_norm": 1.319028377532959, "learning_rate": 1.3970588235294118e-05, "loss": 0.7054, "step": 51 }, { "epoch": 7.285714285714286, "eval_loss": 0.665385365486145, "eval_matthews_correlation": 0.05626125766542442, "eval_runtime": 2.5824, "eval_samples_per_second": 85.58, "eval_steps_per_second": 1.549, "step": 51 }, { "epoch": 7.428571428571429, "grad_norm": 1.2058743238449097, "learning_rate": 1.323529411764706e-05, "loss": 0.6679, "step": 52 }, { "epoch": 7.428571428571429, "eval_loss": 0.6671490669250488, "eval_matthews_correlation": -0.003660370980642071, "eval_runtime": 2.5799, "eval_samples_per_second": 85.663, "eval_steps_per_second": 1.55, "step": 52 }, { "epoch": 7.571428571428571, "grad_norm": 2.41142201423645, "learning_rate": 1.25e-05, "loss": 0.6798, "step": 53 }, { "epoch": 7.571428571428571, "eval_loss": 0.6681118011474609, "eval_matthews_correlation": 0.04945936664931965, "eval_runtime": 2.5821, "eval_samples_per_second": 85.589, "eval_steps_per_second": 1.549, "step": 53 }, { "epoch": 7.714285714285714, "grad_norm": 1.5525087118148804, "learning_rate": 1.1764705882352942e-05, "loss": 0.7081, "step": 54 }, { "epoch": 7.714285714285714, "eval_loss": 0.6653820872306824, "eval_matthews_correlation": 0.02099447073916433, "eval_runtime": 2.5792, "eval_samples_per_second": 85.687, "eval_steps_per_second": 1.551, "step": 54 }, { "epoch": 7.857142857142857, "grad_norm": 5.988497257232666, "learning_rate": 1.1029411764705883e-05, "loss": 0.6459, "step": 55 }, { "epoch": 7.857142857142857, "eval_loss": 0.665358304977417, "eval_matthews_correlation": 0.07350910094057247, "eval_runtime": 2.5843, "eval_samples_per_second": 85.516, "eval_steps_per_second": 1.548, "step": 55 }, { "epoch": 8.0, "grad_norm": 7.899259090423584, "learning_rate": 1.0294117647058824e-05, "loss": 0.6267, "step": 56 }, { "epoch": 8.0, "eval_loss": 0.6640177369117737, "eval_matthews_correlation": 0.07792426255947497, "eval_runtime": 2.585, "eval_samples_per_second": 85.493, "eval_steps_per_second": 1.547, "step": 56 }, { "epoch": 8.142857142857142, "grad_norm": 7.863129615783691, "learning_rate": 9.558823529411764e-06, "loss": 0.6544, "step": 57 }, { "epoch": 8.142857142857142, "eval_loss": 0.6616404056549072, "eval_matthews_correlation": 0.06892933773673708, "eval_runtime": 2.5328, "eval_samples_per_second": 87.255, "eval_steps_per_second": 1.579, "step": 57 }, { "epoch": 8.285714285714286, "grad_norm": 4.004273891448975, "learning_rate": 8.823529411764707e-06, "loss": 0.608, "step": 58 }, { "epoch": 8.285714285714286, "eval_loss": 0.6617823839187622, "eval_matthews_correlation": 0.06415201276791879, "eval_runtime": 2.5837, "eval_samples_per_second": 85.537, "eval_steps_per_second": 1.548, "step": 58 }, { "epoch": 8.428571428571429, "grad_norm": 3.1885335445404053, "learning_rate": 8.088235294117648e-06, "loss": 0.698, "step": 59 }, { "epoch": 8.428571428571429, "eval_loss": 0.6621480584144592, "eval_matthews_correlation": 0.08213547054966991, "eval_runtime": 2.5804, "eval_samples_per_second": 85.645, "eval_steps_per_second": 1.55, "step": 59 }, { "epoch": 8.571428571428571, "grad_norm": 1.903214454650879, "learning_rate": 7.3529411764705884e-06, "loss": 0.6541, "step": 60 }, { "epoch": 8.571428571428571, "eval_loss": 0.6629019975662231, "eval_matthews_correlation": 0.06892933773673708, "eval_runtime": 2.5325, "eval_samples_per_second": 87.266, "eval_steps_per_second": 1.579, "step": 60 }, { "epoch": 8.714285714285714, "grad_norm": 2.6434895992279053, "learning_rate": 6.61764705882353e-06, "loss": 0.6375, "step": 61 }, { "epoch": 8.714285714285714, "eval_loss": 0.6642066836357117, "eval_matthews_correlation": 0.03840151481124831, "eval_runtime": 2.5848, "eval_samples_per_second": 85.5, "eval_steps_per_second": 1.548, "step": 61 }, { "epoch": 8.857142857142858, "grad_norm": 1.5652414560317993, "learning_rate": 5.882352941176471e-06, "loss": 0.6667, "step": 62 }, { "epoch": 8.857142857142858, "eval_loss": 0.6663409471511841, "eval_matthews_correlation": 0.04407882683211864, "eval_runtime": 2.5811, "eval_samples_per_second": 85.622, "eval_steps_per_second": 1.55, "step": 62 }, { "epoch": 9.0, "grad_norm": 7.361441135406494, "learning_rate": 5.147058823529412e-06, "loss": 0.6357, "step": 63 }, { "epoch": 9.0, "eval_loss": 0.6640652418136597, "eval_matthews_correlation": 0.04945936664931965, "eval_runtime": 2.5855, "eval_samples_per_second": 85.475, "eval_steps_per_second": 1.547, "step": 63 }, { "epoch": 9.142857142857142, "grad_norm": 2.824963331222534, "learning_rate": 4.411764705882353e-06, "loss": 0.6433, "step": 64 }, { "epoch": 9.142857142857142, "eval_loss": 0.6646701097488403, "eval_matthews_correlation": 0.04945936664931965, "eval_runtime": 2.5911, "eval_samples_per_second": 85.293, "eval_steps_per_second": 1.544, "step": 64 }, { "epoch": 9.285714285714286, "grad_norm": 7.149904251098633, "learning_rate": 3.6764705882352942e-06, "loss": 0.6288, "step": 65 }, { "epoch": 9.285714285714286, "eval_loss": 0.6667016744613647, "eval_matthews_correlation": 0.04945936664931965, "eval_runtime": 2.5847, "eval_samples_per_second": 85.502, "eval_steps_per_second": 1.548, "step": 65 }, { "epoch": 9.428571428571429, "grad_norm": 1.3270394802093506, "learning_rate": 2.9411764705882355e-06, "loss": 0.678, "step": 66 }, { "epoch": 9.428571428571429, "eval_loss": 0.6667911410331726, "eval_matthews_correlation": 0.04945936664931965, "eval_runtime": 2.5833, "eval_samples_per_second": 85.551, "eval_steps_per_second": 1.548, "step": 66 }, { "epoch": 9.571428571428571, "grad_norm": 7.60737419128418, "learning_rate": 2.2058823529411767e-06, "loss": 0.6888, "step": 67 }, { "epoch": 9.571428571428571, "eval_loss": 0.6648308038711548, "eval_matthews_correlation": 0.06606632151098657, "eval_runtime": 2.5895, "eval_samples_per_second": 85.343, "eval_steps_per_second": 1.545, "step": 67 }, { "epoch": 9.714285714285714, "grad_norm": 6.667550563812256, "learning_rate": 1.4705882352941177e-06, "loss": 0.6281, "step": 68 }, { "epoch": 9.714285714285714, "eval_loss": 0.6676240563392639, "eval_matthews_correlation": 0.04945936664931965, "eval_runtime": 2.5817, "eval_samples_per_second": 85.603, "eval_steps_per_second": 1.549, "step": 68 }, { "epoch": 9.857142857142858, "grad_norm": 2.7164316177368164, "learning_rate": 7.352941176470589e-07, "loss": 0.6405, "step": 69 }, { "epoch": 9.857142857142858, "eval_loss": 0.6660658717155457, "eval_matthews_correlation": 0.06606632151098657, "eval_runtime": 2.5799, "eval_samples_per_second": 85.663, "eval_steps_per_second": 1.55, "step": 69 }, { "epoch": 10.0, "grad_norm": 1.6632885932922363, "learning_rate": 0.0, "loss": 0.6593, "step": 70 }, { "epoch": 10.0, "eval_loss": 0.6662001013755798, "eval_matthews_correlation": 0.06606632151098657, "eval_runtime": 2.5806, "eval_samples_per_second": 85.639, "eval_steps_per_second": 1.55, "step": 70 }, { "epoch": 10.0, "step": 70, "total_flos": 3.203834021989581e+16, "train_loss": 0.6859068019049508, "train_runtime": 582.463, "train_samples_per_second": 15.16, "train_steps_per_second": 0.12 } ], "logging_steps": 1, "max_steps": 70, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 3.203834021989581e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }