{ "best_metric": 0.5284, "best_model_checkpoint": "resnet-50-finetuned-eurosat/checkpoint-3869", "epoch": 11.974413646055437, "eval_steps": 500, "global_step": 4212, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 1.8705593347549438, "learning_rate": 2.3696682464454978e-07, "loss": 6.5183, "step": 10 }, { "epoch": 0.06, "grad_norm": 4516.62451171875, "learning_rate": 4.7393364928909956e-07, "loss": 10.2708, "step": 20 }, { "epoch": 0.09, "grad_norm": 15346.5517578125, "learning_rate": 7.109004739336493e-07, "loss": 18.4806, "step": 30 }, { "epoch": 0.11, "grad_norm": 28745.083984375, "learning_rate": 9.478672985781991e-07, "loss": 22.8441, "step": 40 }, { "epoch": 0.14, "grad_norm": 31542.3671875, "learning_rate": 1.184834123222749e-06, "loss": 29.9695, "step": 50 }, { "epoch": 0.17, "grad_norm": 1.8697186708450317, "learning_rate": 1.4218009478672987e-06, "loss": 5.864, "step": 60 }, { "epoch": 0.2, "grad_norm": 37601.5234375, "learning_rate": 1.6587677725118483e-06, "loss": 15.9819, "step": 70 }, { "epoch": 0.23, "grad_norm": 19746.09765625, "learning_rate": 1.8957345971563982e-06, "loss": 15.2462, "step": 80 }, { "epoch": 0.26, "grad_norm": 63597.546875, "learning_rate": 2.1327014218009483e-06, "loss": 30.304, "step": 90 }, { "epoch": 0.28, "grad_norm": 2.0636484622955322, "learning_rate": 2.369668246445498e-06, "loss": 19.0087, "step": 100 }, { "epoch": 0.31, "grad_norm": 98734.3203125, "learning_rate": 2.606635071090048e-06, "loss": 44.2014, "step": 110 }, { "epoch": 0.34, "grad_norm": 30639.279296875, "learning_rate": 2.8436018957345973e-06, "loss": 24.2432, "step": 120 }, { "epoch": 0.37, "grad_norm": 43809.24609375, "learning_rate": 3.0805687203791474e-06, "loss": 56.0469, "step": 130 }, { "epoch": 0.4, "grad_norm": 24138.275390625, "learning_rate": 3.3175355450236967e-06, "loss": 28.4808, "step": 140 }, { "epoch": 0.43, "grad_norm": 21418.955078125, "learning_rate": 3.5545023696682468e-06, "loss": 21.946, "step": 150 }, { "epoch": 0.45, "grad_norm": 17024.44140625, "learning_rate": 3.7914691943127964e-06, "loss": 21.7602, "step": 160 }, { "epoch": 0.48, "grad_norm": 35208.97265625, "learning_rate": 4.0284360189573465e-06, "loss": 22.4126, "step": 170 }, { "epoch": 0.51, "grad_norm": 39777.86328125, "learning_rate": 4.265402843601897e-06, "loss": 24.0016, "step": 180 }, { "epoch": 0.54, "grad_norm": 20732.916015625, "learning_rate": 4.502369668246446e-06, "loss": 31.3397, "step": 190 }, { "epoch": 0.57, "grad_norm": 33167.265625, "learning_rate": 4.739336492890996e-06, "loss": 16.8009, "step": 200 }, { "epoch": 0.6, "grad_norm": 2.895967721939087, "learning_rate": 4.976303317535545e-06, "loss": 13.1856, "step": 210 }, { "epoch": 0.63, "grad_norm": 30878.47265625, "learning_rate": 5.213270142180096e-06, "loss": 15.2357, "step": 220 }, { "epoch": 0.65, "grad_norm": 2.3504858016967773, "learning_rate": 5.4502369668246446e-06, "loss": 19.679, "step": 230 }, { "epoch": 0.68, "grad_norm": 39848.65625, "learning_rate": 5.687203791469195e-06, "loss": 20.6292, "step": 240 }, { "epoch": 0.71, "grad_norm": 14600.3056640625, "learning_rate": 5.924170616113745e-06, "loss": 14.6482, "step": 250 }, { "epoch": 0.74, "grad_norm": 68617.703125, "learning_rate": 6.161137440758295e-06, "loss": 23.6278, "step": 260 }, { "epoch": 0.77, "grad_norm": 50267.3671875, "learning_rate": 6.398104265402843e-06, "loss": 18.2338, "step": 270 }, { "epoch": 0.8, "grad_norm": 49318.7265625, "learning_rate": 6.635071090047393e-06, "loss": 14.3167, "step": 280 }, { "epoch": 0.82, "grad_norm": 19000.4765625, "learning_rate": 6.8720379146919435e-06, "loss": 23.2961, "step": 290 }, { "epoch": 0.85, "grad_norm": 2.1456470489501953, "learning_rate": 7.1090047393364935e-06, "loss": 27.7204, "step": 300 }, { "epoch": 0.88, "grad_norm": 38115.8046875, "learning_rate": 7.345971563981044e-06, "loss": 16.021, "step": 310 }, { "epoch": 0.91, "grad_norm": 37622.7265625, "learning_rate": 7.582938388625593e-06, "loss": 12.3883, "step": 320 }, { "epoch": 0.94, "grad_norm": 1.9154870510101318, "learning_rate": 7.819905213270143e-06, "loss": 26.5582, "step": 330 }, { "epoch": 0.97, "grad_norm": 4383.86767578125, "learning_rate": 8.056872037914693e-06, "loss": 22.5727, "step": 340 }, { "epoch": 1.0, "grad_norm": 121000.0078125, "learning_rate": 8.293838862559243e-06, "loss": 19.4471, "step": 350 }, { "epoch": 1.0, "eval_accuracy": 0.4514, "eval_loss": 25.238075256347656, "eval_runtime": 48.9771, "eval_samples_per_second": 102.089, "eval_steps_per_second": 3.206, "step": 351 }, { "epoch": 1.02, "grad_norm": 64430.546875, "learning_rate": 8.530805687203793e-06, "loss": 14.327, "step": 360 }, { "epoch": 1.05, "grad_norm": 30426.388671875, "learning_rate": 8.767772511848342e-06, "loss": 8.7723, "step": 370 }, { "epoch": 1.08, "grad_norm": 11503.71875, "learning_rate": 9.004739336492892e-06, "loss": 16.3649, "step": 380 }, { "epoch": 1.11, "grad_norm": 2.256610155105591, "learning_rate": 9.241706161137442e-06, "loss": 8.7562, "step": 390 }, { "epoch": 1.14, "grad_norm": 30465.013671875, "learning_rate": 9.478672985781992e-06, "loss": 8.3802, "step": 400 }, { "epoch": 1.17, "grad_norm": 54011.71484375, "learning_rate": 9.715639810426542e-06, "loss": 23.3823, "step": 410 }, { "epoch": 1.19, "grad_norm": 19838.44921875, "learning_rate": 9.95260663507109e-06, "loss": 12.8482, "step": 420 }, { "epoch": 1.22, "grad_norm": 40503.9453125, "learning_rate": 9.978891820580475e-06, "loss": 23.521, "step": 430 }, { "epoch": 1.25, "grad_norm": 27022.01171875, "learning_rate": 9.95250659630607e-06, "loss": 13.9863, "step": 440 }, { "epoch": 1.28, "grad_norm": 45974.8828125, "learning_rate": 9.926121372031664e-06, "loss": 31.3028, "step": 450 }, { "epoch": 1.31, "grad_norm": 32368.458984375, "learning_rate": 9.899736147757257e-06, "loss": 15.9645, "step": 460 }, { "epoch": 1.34, "grad_norm": 60147.5078125, "learning_rate": 9.87335092348285e-06, "loss": 16.7716, "step": 470 }, { "epoch": 1.36, "grad_norm": 2.2444117069244385, "learning_rate": 9.846965699208444e-06, "loss": 17.0073, "step": 480 }, { "epoch": 1.39, "grad_norm": 62614.7578125, "learning_rate": 9.820580474934037e-06, "loss": 22.8753, "step": 490 }, { "epoch": 1.42, "grad_norm": 1.647746205329895, "learning_rate": 9.79419525065963e-06, "loss": 25.4411, "step": 500 }, { "epoch": 1.45, "grad_norm": 26367.078125, "learning_rate": 9.767810026385224e-06, "loss": 17.3336, "step": 510 }, { "epoch": 1.48, "grad_norm": 31225.44921875, "learning_rate": 9.741424802110818e-06, "loss": 23.2953, "step": 520 }, { "epoch": 1.51, "grad_norm": 40510.99609375, "learning_rate": 9.715039577836413e-06, "loss": 9.5293, "step": 530 }, { "epoch": 1.54, "grad_norm": 41800.3671875, "learning_rate": 9.688654353562006e-06, "loss": 31.3173, "step": 540 }, { "epoch": 1.56, "grad_norm": 3.1635913848876953, "learning_rate": 9.6622691292876e-06, "loss": 14.864, "step": 550 }, { "epoch": 1.59, "grad_norm": 65725.7734375, "learning_rate": 9.635883905013193e-06, "loss": 19.294, "step": 560 }, { "epoch": 1.62, "grad_norm": 2.843966484069824, "learning_rate": 9.609498680738787e-06, "loss": 16.0631, "step": 570 }, { "epoch": 1.65, "grad_norm": 36806.65625, "learning_rate": 9.58311345646438e-06, "loss": 20.6014, "step": 580 }, { "epoch": 1.68, "grad_norm": 26330.318359375, "learning_rate": 9.556728232189975e-06, "loss": 14.8391, "step": 590 }, { "epoch": 1.71, "grad_norm": 42348.09765625, "learning_rate": 9.530343007915567e-06, "loss": 14.5358, "step": 600 }, { "epoch": 1.73, "grad_norm": 46159.625, "learning_rate": 9.50395778364116e-06, "loss": 14.4153, "step": 610 }, { "epoch": 1.76, "grad_norm": 13231.01953125, "learning_rate": 9.477572559366756e-06, "loss": 7.9408, "step": 620 }, { "epoch": 1.79, "grad_norm": 9599.7880859375, "learning_rate": 9.45118733509235e-06, "loss": 10.06, "step": 630 }, { "epoch": 1.82, "grad_norm": 35902.42578125, "learning_rate": 9.424802110817943e-06, "loss": 4.7903, "step": 640 }, { "epoch": 1.85, "grad_norm": 4.08818244934082, "learning_rate": 9.398416886543536e-06, "loss": 5.3186, "step": 650 }, { "epoch": 1.88, "grad_norm": 49990.7421875, "learning_rate": 9.37203166226913e-06, "loss": 20.1198, "step": 660 }, { "epoch": 1.9, "grad_norm": 59561.36328125, "learning_rate": 9.345646437994725e-06, "loss": 18.1733, "step": 670 }, { "epoch": 1.93, "grad_norm": 27601.34375, "learning_rate": 9.319261213720318e-06, "loss": 26.7961, "step": 680 }, { "epoch": 1.96, "grad_norm": 45803.765625, "learning_rate": 9.292875989445912e-06, "loss": 21.5834, "step": 690 }, { "epoch": 1.99, "grad_norm": 3.440899133682251, "learning_rate": 9.266490765171505e-06, "loss": 14.378, "step": 700 }, { "epoch": 2.0, "eval_accuracy": 0.4594, "eval_loss": 24.59226417541504, "eval_runtime": 45.1325, "eval_samples_per_second": 110.785, "eval_steps_per_second": 3.479, "step": 703 }, { "epoch": 2.02, "grad_norm": 45057.88671875, "learning_rate": 9.240105540897099e-06, "loss": 19.4438, "step": 710 }, { "epoch": 2.05, "grad_norm": 38772.54296875, "learning_rate": 9.213720316622692e-06, "loss": 33.2932, "step": 720 }, { "epoch": 2.08, "grad_norm": 2.6954495906829834, "learning_rate": 9.187335092348285e-06, "loss": 21.963, "step": 730 }, { "epoch": 2.1, "grad_norm": 21965.80859375, "learning_rate": 9.160949868073879e-06, "loss": 11.3273, "step": 740 }, { "epoch": 2.13, "grad_norm": 25026.427734375, "learning_rate": 9.134564643799472e-06, "loss": 25.0493, "step": 750 }, { "epoch": 2.16, "grad_norm": 26746.955078125, "learning_rate": 9.108179419525068e-06, "loss": 17.1204, "step": 760 }, { "epoch": 2.19, "grad_norm": 22665.927734375, "learning_rate": 9.081794195250661e-06, "loss": 11.7094, "step": 770 }, { "epoch": 2.22, "grad_norm": 19639.458984375, "learning_rate": 9.055408970976254e-06, "loss": 9.6691, "step": 780 }, { "epoch": 2.25, "grad_norm": 3.374030590057373, "learning_rate": 9.029023746701848e-06, "loss": 23.1671, "step": 790 }, { "epoch": 2.27, "grad_norm": 2.6674258708953857, "learning_rate": 9.002638522427441e-06, "loss": 25.5004, "step": 800 }, { "epoch": 2.3, "grad_norm": 64243.41796875, "learning_rate": 8.976253298153035e-06, "loss": 11.5501, "step": 810 }, { "epoch": 2.33, "grad_norm": 52738.89453125, "learning_rate": 8.94986807387863e-06, "loss": 18.2777, "step": 820 }, { "epoch": 2.36, "grad_norm": 3.1866891384124756, "learning_rate": 8.923482849604222e-06, "loss": 26.3919, "step": 830 }, { "epoch": 2.39, "grad_norm": 27522.453125, "learning_rate": 8.897097625329815e-06, "loss": 13.0593, "step": 840 }, { "epoch": 2.42, "grad_norm": 29027.912109375, "learning_rate": 8.87071240105541e-06, "loss": 12.8464, "step": 850 }, { "epoch": 2.44, "grad_norm": 70082.3984375, "learning_rate": 8.844327176781004e-06, "loss": 11.5423, "step": 860 }, { "epoch": 2.47, "grad_norm": 28935.212890625, "learning_rate": 8.817941952506597e-06, "loss": 13.5174, "step": 870 }, { "epoch": 2.5, "grad_norm": 24620.404296875, "learning_rate": 8.79155672823219e-06, "loss": 15.3363, "step": 880 }, { "epoch": 2.53, "grad_norm": 13174.8828125, "learning_rate": 8.765171503957784e-06, "loss": 16.6491, "step": 890 }, { "epoch": 2.56, "grad_norm": 4.222790241241455, "learning_rate": 8.738786279683378e-06, "loss": 16.673, "step": 900 }, { "epoch": 2.59, "grad_norm": 57918.12109375, "learning_rate": 8.712401055408973e-06, "loss": 30.1329, "step": 910 }, { "epoch": 2.62, "grad_norm": 32034.955078125, "learning_rate": 8.686015831134566e-06, "loss": 16.7075, "step": 920 }, { "epoch": 2.64, "grad_norm": 51441.37890625, "learning_rate": 8.659630606860158e-06, "loss": 12.6608, "step": 930 }, { "epoch": 2.67, "grad_norm": 34879.83203125, "learning_rate": 8.633245382585753e-06, "loss": 22.7977, "step": 940 }, { "epoch": 2.7, "grad_norm": 50555.06640625, "learning_rate": 8.606860158311347e-06, "loss": 10.3796, "step": 950 }, { "epoch": 2.73, "grad_norm": 4.60589599609375, "learning_rate": 8.58047493403694e-06, "loss": 17.7149, "step": 960 }, { "epoch": 2.76, "grad_norm": 29151.978515625, "learning_rate": 8.554089709762534e-06, "loss": 12.9757, "step": 970 }, { "epoch": 2.79, "grad_norm": 14650.404296875, "learning_rate": 8.527704485488127e-06, "loss": 18.8433, "step": 980 }, { "epoch": 2.81, "grad_norm": 2.882780075073242, "learning_rate": 8.50131926121372e-06, "loss": 21.3776, "step": 990 }, { "epoch": 2.84, "grad_norm": 12802.4921875, "learning_rate": 8.474934036939316e-06, "loss": 29.6767, "step": 1000 }, { "epoch": 2.87, "grad_norm": 79775.125, "learning_rate": 8.448548812664909e-06, "loss": 26.4151, "step": 1010 }, { "epoch": 2.9, "grad_norm": 29102.322265625, "learning_rate": 8.422163588390503e-06, "loss": 14.3842, "step": 1020 }, { "epoch": 2.93, "grad_norm": 10350.70703125, "learning_rate": 8.395778364116096e-06, "loss": 20.5911, "step": 1030 }, { "epoch": 2.96, "grad_norm": 50394.54296875, "learning_rate": 8.36939313984169e-06, "loss": 25.6745, "step": 1040 }, { "epoch": 2.99, "grad_norm": 43292.83203125, "learning_rate": 8.343007915567283e-06, "loss": 20.7257, "step": 1050 }, { "epoch": 3.0, "eval_accuracy": 0.4706, "eval_loss": 24.335988998413086, "eval_runtime": 45.1942, "eval_samples_per_second": 110.634, "eval_steps_per_second": 3.474, "step": 1055 }, { "epoch": 3.01, "grad_norm": 23486.609375, "learning_rate": 8.316622691292876e-06, "loss": 18.8314, "step": 1060 }, { "epoch": 3.04, "grad_norm": 56185.265625, "learning_rate": 8.29023746701847e-06, "loss": 25.6348, "step": 1070 }, { "epoch": 3.07, "grad_norm": 7961.529296875, "learning_rate": 8.263852242744063e-06, "loss": 9.3684, "step": 1080 }, { "epoch": 3.1, "grad_norm": 2.4720194339752197, "learning_rate": 8.237467018469659e-06, "loss": 15.2763, "step": 1090 }, { "epoch": 3.13, "grad_norm": 25756.095703125, "learning_rate": 8.211081794195252e-06, "loss": 20.1983, "step": 1100 }, { "epoch": 3.16, "grad_norm": 3.966672420501709, "learning_rate": 8.184696569920845e-06, "loss": 7.367, "step": 1110 }, { "epoch": 3.18, "grad_norm": 53999.3515625, "learning_rate": 8.158311345646439e-06, "loss": 25.2676, "step": 1120 }, { "epoch": 3.21, "grad_norm": 67693.546875, "learning_rate": 8.131926121372032e-06, "loss": 13.1406, "step": 1130 }, { "epoch": 3.24, "grad_norm": 39821.26953125, "learning_rate": 8.105540897097626e-06, "loss": 24.3738, "step": 1140 }, { "epoch": 3.27, "grad_norm": 37856.0859375, "learning_rate": 8.079155672823221e-06, "loss": 10.8707, "step": 1150 }, { "epoch": 3.3, "grad_norm": 23978.22265625, "learning_rate": 8.052770448548813e-06, "loss": 15.1604, "step": 1160 }, { "epoch": 3.33, "grad_norm": 22707.345703125, "learning_rate": 8.026385224274406e-06, "loss": 9.807, "step": 1170 }, { "epoch": 3.35, "grad_norm": 20809.11328125, "learning_rate": 8.000000000000001e-06, "loss": 13.3375, "step": 1180 }, { "epoch": 3.38, "grad_norm": 27979.55859375, "learning_rate": 7.973614775725595e-06, "loss": 25.7842, "step": 1190 }, { "epoch": 3.41, "grad_norm": 41703.5234375, "learning_rate": 7.947229551451188e-06, "loss": 20.69, "step": 1200 }, { "epoch": 3.44, "grad_norm": 16662.30078125, "learning_rate": 7.920844327176782e-06, "loss": 21.4575, "step": 1210 }, { "epoch": 3.47, "grad_norm": 61241.4765625, "learning_rate": 7.894459102902375e-06, "loss": 14.6511, "step": 1220 }, { "epoch": 3.5, "grad_norm": 4.406251430511475, "learning_rate": 7.868073878627969e-06, "loss": 15.7202, "step": 1230 }, { "epoch": 3.53, "grad_norm": 19873.681640625, "learning_rate": 7.841688654353564e-06, "loss": 11.2091, "step": 1240 }, { "epoch": 3.55, "grad_norm": 139053.625, "learning_rate": 7.815303430079156e-06, "loss": 15.7887, "step": 1250 }, { "epoch": 3.58, "grad_norm": 36517.13671875, "learning_rate": 7.788918205804749e-06, "loss": 13.9419, "step": 1260 }, { "epoch": 3.61, "grad_norm": 24077.20703125, "learning_rate": 7.762532981530344e-06, "loss": 10.6665, "step": 1270 }, { "epoch": 3.64, "grad_norm": 22051.26171875, "learning_rate": 7.736147757255938e-06, "loss": 19.0224, "step": 1280 }, { "epoch": 3.67, "grad_norm": 22119.998046875, "learning_rate": 7.709762532981531e-06, "loss": 12.9177, "step": 1290 }, { "epoch": 3.7, "grad_norm": 2.928393840789795, "learning_rate": 7.683377308707125e-06, "loss": 35.1944, "step": 1300 }, { "epoch": 3.72, "grad_norm": 56515.19140625, "learning_rate": 7.656992084432718e-06, "loss": 9.9593, "step": 1310 }, { "epoch": 3.75, "grad_norm": 22500.84375, "learning_rate": 7.630606860158311e-06, "loss": 11.3214, "step": 1320 }, { "epoch": 3.78, "grad_norm": 51895.48046875, "learning_rate": 7.604221635883906e-06, "loss": 14.0507, "step": 1330 }, { "epoch": 3.81, "grad_norm": 36253.95703125, "learning_rate": 7.577836411609499e-06, "loss": 8.2554, "step": 1340 }, { "epoch": 3.84, "grad_norm": 3.3514811992645264, "learning_rate": 7.551451187335093e-06, "loss": 15.6688, "step": 1350 }, { "epoch": 3.87, "grad_norm": 24293.787109375, "learning_rate": 7.525065963060687e-06, "loss": 19.9979, "step": 1360 }, { "epoch": 3.89, "grad_norm": 57937.61328125, "learning_rate": 7.4986807387862805e-06, "loss": 13.2916, "step": 1370 }, { "epoch": 3.92, "grad_norm": 35850.4140625, "learning_rate": 7.472295514511874e-06, "loss": 17.6864, "step": 1380 }, { "epoch": 3.95, "grad_norm": 40921.08984375, "learning_rate": 7.445910290237468e-06, "loss": 22.2915, "step": 1390 }, { "epoch": 3.98, "grad_norm": 26940.921875, "learning_rate": 7.419525065963062e-06, "loss": 23.0579, "step": 1400 }, { "epoch": 4.0, "eval_accuracy": 0.479, "eval_loss": 17.927661895751953, "eval_runtime": 45.9245, "eval_samples_per_second": 108.874, "eval_steps_per_second": 3.419, "step": 1407 }, { "epoch": 4.01, "grad_norm": 2.540001392364502, "learning_rate": 7.393139841688654e-06, "loss": 26.1677, "step": 1410 }, { "epoch": 4.04, "grad_norm": 25591.275390625, "learning_rate": 7.366754617414249e-06, "loss": 11.7632, "step": 1420 }, { "epoch": 4.07, "grad_norm": 16780.751953125, "learning_rate": 7.340369393139842e-06, "loss": 15.4046, "step": 1430 }, { "epoch": 4.09, "grad_norm": 50002.53515625, "learning_rate": 7.3139841688654355e-06, "loss": 13.8249, "step": 1440 }, { "epoch": 4.12, "grad_norm": 3.906174898147583, "learning_rate": 7.28759894459103e-06, "loss": 14.6412, "step": 1450 }, { "epoch": 4.15, "grad_norm": 7839.53759765625, "learning_rate": 7.261213720316623e-06, "loss": 11.677, "step": 1460 }, { "epoch": 4.18, "grad_norm": 43679.359375, "learning_rate": 7.234828496042217e-06, "loss": 28.4914, "step": 1470 }, { "epoch": 4.21, "grad_norm": 4.5355224609375, "learning_rate": 7.208443271767811e-06, "loss": 18.0921, "step": 1480 }, { "epoch": 4.24, "grad_norm": 3.0454983711242676, "learning_rate": 7.1820580474934045e-06, "loss": 12.2903, "step": 1490 }, { "epoch": 4.26, "grad_norm": 52284.54296875, "learning_rate": 7.155672823218998e-06, "loss": 13.3073, "step": 1500 }, { "epoch": 4.29, "grad_norm": 46066.03515625, "learning_rate": 7.129287598944592e-06, "loss": 26.3335, "step": 1510 }, { "epoch": 4.32, "grad_norm": 5.04480504989624, "learning_rate": 7.102902374670185e-06, "loss": 10.9462, "step": 1520 }, { "epoch": 4.35, "grad_norm": 3.8758034706115723, "learning_rate": 7.076517150395778e-06, "loss": 26.3002, "step": 1530 }, { "epoch": 4.38, "grad_norm": 14430.576171875, "learning_rate": 7.050131926121373e-06, "loss": 6.2562, "step": 1540 }, { "epoch": 4.41, "grad_norm": 19713.322265625, "learning_rate": 7.023746701846966e-06, "loss": 19.0434, "step": 1550 }, { "epoch": 4.43, "grad_norm": 32164.08203125, "learning_rate": 6.99736147757256e-06, "loss": 9.6638, "step": 1560 }, { "epoch": 4.46, "grad_norm": 6464.00341796875, "learning_rate": 6.970976253298154e-06, "loss": 21.096, "step": 1570 }, { "epoch": 4.49, "grad_norm": 3.788201332092285, "learning_rate": 6.944591029023747e-06, "loss": 16.4134, "step": 1580 }, { "epoch": 4.52, "grad_norm": 35681.3515625, "learning_rate": 6.918205804749341e-06, "loss": 34.0506, "step": 1590 }, { "epoch": 4.55, "grad_norm": 19604.365234375, "learning_rate": 6.891820580474935e-06, "loss": 13.2112, "step": 1600 }, { "epoch": 4.58, "grad_norm": 35878.91796875, "learning_rate": 6.8654353562005286e-06, "loss": 13.8717, "step": 1610 }, { "epoch": 4.61, "grad_norm": 4.150086879730225, "learning_rate": 6.839050131926121e-06, "loss": 9.4475, "step": 1620 }, { "epoch": 4.63, "grad_norm": 3.6677896976470947, "learning_rate": 6.812664907651716e-06, "loss": 13.0993, "step": 1630 }, { "epoch": 4.66, "grad_norm": 11255.166015625, "learning_rate": 6.786279683377309e-06, "loss": 11.9428, "step": 1640 }, { "epoch": 4.69, "grad_norm": 9635.9912109375, "learning_rate": 6.759894459102902e-06, "loss": 19.0856, "step": 1650 }, { "epoch": 4.72, "grad_norm": 4.402582168579102, "learning_rate": 6.733509234828497e-06, "loss": 7.2455, "step": 1660 }, { "epoch": 4.75, "grad_norm": 18808.263671875, "learning_rate": 6.70712401055409e-06, "loss": 18.2399, "step": 1670 }, { "epoch": 4.78, "grad_norm": 47149.54296875, "learning_rate": 6.680738786279684e-06, "loss": 14.1329, "step": 1680 }, { "epoch": 4.8, "grad_norm": 53427.94140625, "learning_rate": 6.654353562005278e-06, "loss": 14.964, "step": 1690 }, { "epoch": 4.83, "grad_norm": 3.8708295822143555, "learning_rate": 6.627968337730871e-06, "loss": 14.1862, "step": 1700 }, { "epoch": 4.86, "grad_norm": 21101.42578125, "learning_rate": 6.601583113456465e-06, "loss": 15.3858, "step": 1710 }, { "epoch": 4.89, "grad_norm": 18543.30859375, "learning_rate": 6.575197889182059e-06, "loss": 20.203, "step": 1720 }, { "epoch": 4.92, "grad_norm": 51301.85546875, "learning_rate": 6.548812664907653e-06, "loss": 16.0301, "step": 1730 }, { "epoch": 4.95, "grad_norm": 2.9157021045684814, "learning_rate": 6.522427440633245e-06, "loss": 12.4107, "step": 1740 }, { "epoch": 4.98, "grad_norm": 40848.80859375, "learning_rate": 6.4960422163588396e-06, "loss": 16.7616, "step": 1750 }, { "epoch": 5.0, "eval_accuracy": 0.4808, "eval_loss": 24.001291275024414, "eval_runtime": 45.0786, "eval_samples_per_second": 110.917, "eval_steps_per_second": 3.483, "step": 1758 }, { "epoch": 5.0, "grad_norm": 32241.318359375, "learning_rate": 6.469656992084433e-06, "loss": 16.9512, "step": 1760 }, { "epoch": 5.03, "grad_norm": 38065.63671875, "learning_rate": 6.4432717678100265e-06, "loss": 34.917, "step": 1770 }, { "epoch": 5.06, "grad_norm": 76571.4296875, "learning_rate": 6.416886543535621e-06, "loss": 10.8024, "step": 1780 }, { "epoch": 5.09, "grad_norm": 16919.375, "learning_rate": 6.390501319261214e-06, "loss": 11.519, "step": 1790 }, { "epoch": 5.12, "grad_norm": 14353.009765625, "learning_rate": 6.364116094986808e-06, "loss": 23.6153, "step": 1800 }, { "epoch": 5.15, "grad_norm": 91696.828125, "learning_rate": 6.337730870712402e-06, "loss": 28.1094, "step": 1810 }, { "epoch": 5.17, "grad_norm": 3.3631858825683594, "learning_rate": 6.3113456464379955e-06, "loss": 9.9939, "step": 1820 }, { "epoch": 5.2, "grad_norm": 3.1543691158294678, "learning_rate": 6.284960422163588e-06, "loss": 19.7281, "step": 1830 }, { "epoch": 5.23, "grad_norm": 14223.375, "learning_rate": 6.258575197889183e-06, "loss": 20.8196, "step": 1840 }, { "epoch": 5.26, "grad_norm": 20589.833984375, "learning_rate": 6.232189973614776e-06, "loss": 10.6144, "step": 1850 }, { "epoch": 5.29, "grad_norm": 43285.2109375, "learning_rate": 6.205804749340369e-06, "loss": 26.6982, "step": 1860 }, { "epoch": 5.32, "grad_norm": 41181.64453125, "learning_rate": 6.179419525065964e-06, "loss": 13.477, "step": 1870 }, { "epoch": 5.34, "grad_norm": 4.323703289031982, "learning_rate": 6.153034300791557e-06, "loss": 17.2591, "step": 1880 }, { "epoch": 5.37, "grad_norm": 4.063042640686035, "learning_rate": 6.1266490765171505e-06, "loss": 19.0261, "step": 1890 }, { "epoch": 5.4, "grad_norm": 6825.7470703125, "learning_rate": 6.100263852242745e-06, "loss": 20.4592, "step": 1900 }, { "epoch": 5.43, "grad_norm": 12758.51953125, "learning_rate": 6.073878627968338e-06, "loss": 10.1658, "step": 1910 }, { "epoch": 5.46, "grad_norm": 32590.52734375, "learning_rate": 6.047493403693932e-06, "loss": 31.1392, "step": 1920 }, { "epoch": 5.49, "grad_norm": 10071.1376953125, "learning_rate": 6.021108179419526e-06, "loss": 22.3136, "step": 1930 }, { "epoch": 5.52, "grad_norm": 12942.5498046875, "learning_rate": 5.9947229551451195e-06, "loss": 13.154, "step": 1940 }, { "epoch": 5.54, "grad_norm": 20000.166015625, "learning_rate": 5.968337730870712e-06, "loss": 7.8325, "step": 1950 }, { "epoch": 5.57, "grad_norm": 26191.10546875, "learning_rate": 5.9419525065963064e-06, "loss": 10.9377, "step": 1960 }, { "epoch": 5.6, "grad_norm": 26854.51171875, "learning_rate": 5.9155672823219e-06, "loss": 26.5239, "step": 1970 }, { "epoch": 5.63, "grad_norm": 3.6520581245422363, "learning_rate": 5.889182058047493e-06, "loss": 13.576, "step": 1980 }, { "epoch": 5.66, "grad_norm": 46974.328125, "learning_rate": 5.862796833773088e-06, "loss": 19.6556, "step": 1990 }, { "epoch": 5.69, "grad_norm": 72692.6796875, "learning_rate": 5.836411609498681e-06, "loss": 18.7958, "step": 2000 }, { "epoch": 5.71, "grad_norm": 27918.220703125, "learning_rate": 5.810026385224275e-06, "loss": 9.4042, "step": 2010 }, { "epoch": 5.74, "grad_norm": 8622.783203125, "learning_rate": 5.783641160949869e-06, "loss": 14.2554, "step": 2020 }, { "epoch": 5.77, "grad_norm": 12704.0400390625, "learning_rate": 5.757255936675462e-06, "loss": 12.6341, "step": 2030 }, { "epoch": 5.8, "grad_norm": 3.309465169906616, "learning_rate": 5.730870712401056e-06, "loss": 16.3332, "step": 2040 }, { "epoch": 5.83, "grad_norm": 3.172189712524414, "learning_rate": 5.70448548812665e-06, "loss": 9.6934, "step": 2050 }, { "epoch": 5.86, "grad_norm": 62337.05859375, "learning_rate": 5.678100263852243e-06, "loss": 17.4837, "step": 2060 }, { "epoch": 5.88, "grad_norm": 71888.5859375, "learning_rate": 5.651715039577836e-06, "loss": 19.7688, "step": 2070 }, { "epoch": 5.91, "grad_norm": 9631.845703125, "learning_rate": 5.6253298153034305e-06, "loss": 21.72, "step": 2080 }, { "epoch": 5.94, "grad_norm": 8002.171875, "learning_rate": 5.598944591029024e-06, "loss": 16.0429, "step": 2090 }, { "epoch": 5.97, "grad_norm": 12119.623046875, "learning_rate": 5.572559366754617e-06, "loss": 11.8946, "step": 2100 }, { "epoch": 6.0, "grad_norm": 19956.529296875, "learning_rate": 5.546174142480212e-06, "loss": 13.2407, "step": 2110 }, { "epoch": 6.0, "eval_accuracy": 0.4888, "eval_loss": 16.81440544128418, "eval_runtime": 45.2206, "eval_samples_per_second": 110.569, "eval_steps_per_second": 3.472, "step": 2110 }, { "epoch": 6.03, "grad_norm": 31582.849609375, "learning_rate": 5.519788918205805e-06, "loss": 11.0814, "step": 2120 }, { "epoch": 6.06, "grad_norm": 3.686307907104492, "learning_rate": 5.493403693931399e-06, "loss": 7.7683, "step": 2130 }, { "epoch": 6.08, "grad_norm": 31997.40234375, "learning_rate": 5.467018469656993e-06, "loss": 7.9998, "step": 2140 }, { "epoch": 6.11, "grad_norm": 56679.56640625, "learning_rate": 5.440633245382586e-06, "loss": 30.4386, "step": 2150 }, { "epoch": 6.14, "grad_norm": 43711.375, "learning_rate": 5.414248021108179e-06, "loss": 14.9644, "step": 2160 }, { "epoch": 6.17, "grad_norm": 5.386730670928955, "learning_rate": 5.387862796833774e-06, "loss": 5.2579, "step": 2170 }, { "epoch": 6.2, "grad_norm": 35897.203125, "learning_rate": 5.361477572559367e-06, "loss": 7.865, "step": 2180 }, { "epoch": 6.23, "grad_norm": 8337.6298828125, "learning_rate": 5.33509234828496e-06, "loss": 9.0662, "step": 2190 }, { "epoch": 6.25, "grad_norm": 13576.43359375, "learning_rate": 5.3087071240105546e-06, "loss": 17.4631, "step": 2200 }, { "epoch": 6.28, "grad_norm": 24255.123046875, "learning_rate": 5.282321899736148e-06, "loss": 11.8401, "step": 2210 }, { "epoch": 6.31, "grad_norm": 13836.0986328125, "learning_rate": 5.2559366754617415e-06, "loss": 23.3019, "step": 2220 }, { "epoch": 6.34, "grad_norm": 38204.4921875, "learning_rate": 5.229551451187336e-06, "loss": 12.2579, "step": 2230 }, { "epoch": 6.37, "grad_norm": 68799.796875, "learning_rate": 5.203166226912929e-06, "loss": 13.6891, "step": 2240 }, { "epoch": 6.4, "grad_norm": 6920.53662109375, "learning_rate": 5.176781002638523e-06, "loss": 8.0952, "step": 2250 }, { "epoch": 6.43, "grad_norm": 5.1690874099731445, "learning_rate": 5.150395778364117e-06, "loss": 17.0515, "step": 2260 }, { "epoch": 6.45, "grad_norm": 388.1389465332031, "learning_rate": 5.12401055408971e-06, "loss": 5.2641, "step": 2270 }, { "epoch": 6.48, "grad_norm": 43087.90625, "learning_rate": 5.097625329815303e-06, "loss": 20.8513, "step": 2280 }, { "epoch": 6.51, "grad_norm": 14357.72265625, "learning_rate": 5.071240105540897e-06, "loss": 7.3095, "step": 2290 }, { "epoch": 6.54, "grad_norm": 8868.662109375, "learning_rate": 5.044854881266491e-06, "loss": 22.2034, "step": 2300 }, { "epoch": 6.57, "grad_norm": 45453.5625, "learning_rate": 5.018469656992084e-06, "loss": 35.0015, "step": 2310 }, { "epoch": 6.6, "grad_norm": 3.170788288116455, "learning_rate": 4.992084432717679e-06, "loss": 31.3804, "step": 2320 }, { "epoch": 6.62, "grad_norm": 18835.568359375, "learning_rate": 4.965699208443272e-06, "loss": 28.6304, "step": 2330 }, { "epoch": 6.65, "grad_norm": 9002.6005859375, "learning_rate": 4.9393139841688655e-06, "loss": 21.0833, "step": 2340 }, { "epoch": 6.68, "grad_norm": 5.6249237060546875, "learning_rate": 4.91292875989446e-06, "loss": 10.8386, "step": 2350 }, { "epoch": 6.71, "grad_norm": 1778.7213134765625, "learning_rate": 4.886543535620053e-06, "loss": 13.7267, "step": 2360 }, { "epoch": 6.74, "grad_norm": 22283.333984375, "learning_rate": 4.860158311345647e-06, "loss": 11.4599, "step": 2370 }, { "epoch": 6.77, "grad_norm": 27862.04296875, "learning_rate": 4.833773087071241e-06, "loss": 12.8282, "step": 2380 }, { "epoch": 6.79, "grad_norm": 4.5571160316467285, "learning_rate": 4.807387862796834e-06, "loss": 15.8908, "step": 2390 }, { "epoch": 6.82, "grad_norm": 10238.275390625, "learning_rate": 4.781002638522428e-06, "loss": 9.3482, "step": 2400 }, { "epoch": 6.85, "grad_norm": 17660.458984375, "learning_rate": 4.7546174142480214e-06, "loss": 16.7676, "step": 2410 }, { "epoch": 6.88, "grad_norm": 34884.1640625, "learning_rate": 4.728232189973615e-06, "loss": 18.4743, "step": 2420 }, { "epoch": 6.91, "grad_norm": 23877.84375, "learning_rate": 4.701846965699209e-06, "loss": 19.6808, "step": 2430 }, { "epoch": 6.94, "grad_norm": 27960.5546875, "learning_rate": 4.675461741424803e-06, "loss": 6.8063, "step": 2440 }, { "epoch": 6.97, "grad_norm": 12199.77734375, "learning_rate": 4.649076517150396e-06, "loss": 20.9414, "step": 2450 }, { "epoch": 6.99, "grad_norm": 11235.28515625, "learning_rate": 4.62269129287599e-06, "loss": 12.5439, "step": 2460 }, { "epoch": 7.0, "eval_accuracy": 0.496, "eval_loss": 10.816112518310547, "eval_runtime": 50.5252, "eval_samples_per_second": 98.961, "eval_steps_per_second": 3.107, "step": 2462 }, { "epoch": 7.02, "grad_norm": 14523.2421875, "learning_rate": 4.596306068601584e-06, "loss": 6.7719, "step": 2470 }, { "epoch": 7.05, "grad_norm": 14875.3134765625, "learning_rate": 4.569920844327177e-06, "loss": 7.6423, "step": 2480 }, { "epoch": 7.08, "grad_norm": 4.714503765106201, "learning_rate": 4.543535620052771e-06, "loss": 14.27, "step": 2490 }, { "epoch": 7.11, "grad_norm": 27521.392578125, "learning_rate": 4.517150395778364e-06, "loss": 16.9158, "step": 2500 }, { "epoch": 7.14, "grad_norm": 5.09544038772583, "learning_rate": 4.490765171503958e-06, "loss": 6.6798, "step": 2510 }, { "epoch": 7.16, "grad_norm": 10889.9306640625, "learning_rate": 4.464379947229552e-06, "loss": 15.3132, "step": 2520 }, { "epoch": 7.19, "grad_norm": 35052.25390625, "learning_rate": 4.4379947229551455e-06, "loss": 16.6837, "step": 2530 }, { "epoch": 7.22, "grad_norm": 12993.140625, "learning_rate": 4.411609498680739e-06, "loss": 8.1167, "step": 2540 }, { "epoch": 7.25, "grad_norm": 3.494886636734009, "learning_rate": 4.3852242744063324e-06, "loss": 15.8404, "step": 2550 }, { "epoch": 7.28, "grad_norm": 45734.86328125, "learning_rate": 4.358839050131927e-06, "loss": 16.2751, "step": 2560 }, { "epoch": 7.31, "grad_norm": 3.4322822093963623, "learning_rate": 4.33245382585752e-06, "loss": 14.5818, "step": 2570 }, { "epoch": 7.33, "grad_norm": 65198.58984375, "learning_rate": 4.306068601583114e-06, "loss": 14.4943, "step": 2580 }, { "epoch": 7.36, "grad_norm": 3.6506009101867676, "learning_rate": 4.279683377308708e-06, "loss": 29.6269, "step": 2590 }, { "epoch": 7.39, "grad_norm": 3.7057137489318848, "learning_rate": 4.2532981530343006e-06, "loss": 19.3266, "step": 2600 }, { "epoch": 7.42, "grad_norm": 5.194668292999268, "learning_rate": 4.226912928759895e-06, "loss": 7.3371, "step": 2610 }, { "epoch": 7.45, "grad_norm": 22885.326171875, "learning_rate": 4.200527704485488e-06, "loss": 11.3142, "step": 2620 }, { "epoch": 7.48, "grad_norm": 3.5124611854553223, "learning_rate": 4.174142480211082e-06, "loss": 9.6943, "step": 2630 }, { "epoch": 7.51, "grad_norm": 37121.1875, "learning_rate": 4.147757255936676e-06, "loss": 18.6008, "step": 2640 }, { "epoch": 7.53, "grad_norm": 16882.5546875, "learning_rate": 4.1213720316622696e-06, "loss": 5.7419, "step": 2650 }, { "epoch": 7.56, "grad_norm": 47388.609375, "learning_rate": 4.094986807387863e-06, "loss": 26.2545, "step": 2660 }, { "epoch": 7.59, "grad_norm": 27090.69140625, "learning_rate": 4.0686015831134565e-06, "loss": 15.8451, "step": 2670 }, { "epoch": 7.62, "grad_norm": 58406.08984375, "learning_rate": 4.042216358839051e-06, "loss": 12.3944, "step": 2680 }, { "epoch": 7.65, "grad_norm": 21176.11328125, "learning_rate": 4.015831134564644e-06, "loss": 18.6717, "step": 2690 }, { "epoch": 7.68, "grad_norm": 20264.244140625, "learning_rate": 3.989445910290238e-06, "loss": 27.9921, "step": 2700 }, { "epoch": 7.7, "grad_norm": 9458.7236328125, "learning_rate": 3.963060686015832e-06, "loss": 6.9878, "step": 2710 }, { "epoch": 7.73, "grad_norm": 65288.0234375, "learning_rate": 3.936675461741425e-06, "loss": 6.4625, "step": 2720 }, { "epoch": 7.76, "grad_norm": 27124.0390625, "learning_rate": 3.910290237467019e-06, "loss": 13.301, "step": 2730 }, { "epoch": 7.79, "grad_norm": 30827.23828125, "learning_rate": 3.883905013192612e-06, "loss": 9.9931, "step": 2740 }, { "epoch": 7.82, "grad_norm": 31747.015625, "learning_rate": 3.857519788918206e-06, "loss": 11.3358, "step": 2750 }, { "epoch": 7.85, "grad_norm": 40866.578125, "learning_rate": 3.831134564643799e-06, "loss": 22.6144, "step": 2760 }, { "epoch": 7.87, "grad_norm": 11538.0966796875, "learning_rate": 3.8047493403693936e-06, "loss": 16.5795, "step": 2770 }, { "epoch": 7.9, "grad_norm": 40162.5546875, "learning_rate": 3.778364116094987e-06, "loss": 24.256, "step": 2780 }, { "epoch": 7.93, "grad_norm": 42002.49609375, "learning_rate": 3.7519788918205805e-06, "loss": 13.8153, "step": 2790 }, { "epoch": 7.96, "grad_norm": 3.6676483154296875, "learning_rate": 3.7255936675461744e-06, "loss": 13.5529, "step": 2800 }, { "epoch": 7.99, "grad_norm": 5.180718898773193, "learning_rate": 3.699208443271768e-06, "loss": 10.301, "step": 2810 }, { "epoch": 8.0, "eval_accuracy": 0.5066, "eval_loss": 14.157267570495605, "eval_runtime": 46.1687, "eval_samples_per_second": 108.298, "eval_steps_per_second": 3.401, "step": 2814 }, { "epoch": 8.02, "grad_norm": 43407.69921875, "learning_rate": 3.6728232189973618e-06, "loss": 7.1806, "step": 2820 }, { "epoch": 8.05, "grad_norm": 34019.515625, "learning_rate": 3.6464379947229557e-06, "loss": 12.0117, "step": 2830 }, { "epoch": 8.07, "grad_norm": 3.6186134815216064, "learning_rate": 3.6200527704485487e-06, "loss": 23.7395, "step": 2840 }, { "epoch": 8.1, "grad_norm": 3.713313579559326, "learning_rate": 3.5936675461741426e-06, "loss": 15.2477, "step": 2850 }, { "epoch": 8.13, "grad_norm": 3.1282858848571777, "learning_rate": 3.5672823218997365e-06, "loss": 12.6871, "step": 2860 }, { "epoch": 8.16, "grad_norm": 19212.1640625, "learning_rate": 3.54089709762533e-06, "loss": 14.7677, "step": 2870 }, { "epoch": 8.19, "grad_norm": 5.48822546005249, "learning_rate": 3.514511873350924e-06, "loss": 13.8763, "step": 2880 }, { "epoch": 8.22, "grad_norm": 4.162522792816162, "learning_rate": 3.4881266490765177e-06, "loss": 7.4292, "step": 2890 }, { "epoch": 8.24, "grad_norm": 3.5952889919281006, "learning_rate": 3.4617414248021107e-06, "loss": 12.4824, "step": 2900 }, { "epoch": 8.27, "grad_norm": 34536.01953125, "learning_rate": 3.4353562005277046e-06, "loss": 10.0536, "step": 2910 }, { "epoch": 8.3, "grad_norm": 4.813022136688232, "learning_rate": 3.4089709762532985e-06, "loss": 5.8745, "step": 2920 }, { "epoch": 8.33, "grad_norm": 7918.7265625, "learning_rate": 3.382585751978892e-06, "loss": 11.4748, "step": 2930 }, { "epoch": 8.36, "grad_norm": 30790.84375, "learning_rate": 3.356200527704486e-06, "loss": 10.8125, "step": 2940 }, { "epoch": 8.39, "grad_norm": 45355.0703125, "learning_rate": 3.3298153034300797e-06, "loss": 22.0736, "step": 2950 }, { "epoch": 8.42, "grad_norm": 30917.537109375, "learning_rate": 3.3034300791556727e-06, "loss": 13.6627, "step": 2960 }, { "epoch": 8.44, "grad_norm": 43979.453125, "learning_rate": 3.2770448548812666e-06, "loss": 11.2351, "step": 2970 }, { "epoch": 8.47, "grad_norm": 24439.71875, "learning_rate": 3.2506596306068605e-06, "loss": 10.9467, "step": 2980 }, { "epoch": 8.5, "grad_norm": 14647.2001953125, "learning_rate": 3.2242744063324544e-06, "loss": 13.553, "step": 2990 }, { "epoch": 8.53, "grad_norm": 12101.857421875, "learning_rate": 3.197889182058048e-06, "loss": 8.8052, "step": 3000 }, { "epoch": 8.56, "grad_norm": 4.919346332550049, "learning_rate": 3.1715039577836413e-06, "loss": 8.7727, "step": 3010 }, { "epoch": 8.59, "grad_norm": 63979.8671875, "learning_rate": 3.145118733509235e-06, "loss": 17.3421, "step": 3020 }, { "epoch": 8.61, "grad_norm": 26554.1953125, "learning_rate": 3.1187335092348287e-06, "loss": 23.6886, "step": 3030 }, { "epoch": 8.64, "grad_norm": 6.986400127410889, "learning_rate": 3.0923482849604225e-06, "loss": 10.6609, "step": 3040 }, { "epoch": 8.67, "grad_norm": 76165.25, "learning_rate": 3.0659630606860164e-06, "loss": 18.4048, "step": 3050 }, { "epoch": 8.7, "grad_norm": 56495.66796875, "learning_rate": 3.0395778364116095e-06, "loss": 15.648, "step": 3060 }, { "epoch": 8.73, "grad_norm": 47611.59375, "learning_rate": 3.0131926121372033e-06, "loss": 22.4894, "step": 3070 }, { "epoch": 8.76, "grad_norm": 41161.15234375, "learning_rate": 2.9868073878627972e-06, "loss": 16.8503, "step": 3080 }, { "epoch": 8.78, "grad_norm": 25355.17578125, "learning_rate": 2.9604221635883907e-06, "loss": 8.8893, "step": 3090 }, { "epoch": 8.81, "grad_norm": 19881.41796875, "learning_rate": 2.9340369393139846e-06, "loss": 8.531, "step": 3100 }, { "epoch": 8.84, "grad_norm": 3.6396067142486572, "learning_rate": 2.9076517150395785e-06, "loss": 18.1681, "step": 3110 }, { "epoch": 8.87, "grad_norm": 12494.5126953125, "learning_rate": 2.8812664907651715e-06, "loss": 9.6084, "step": 3120 }, { "epoch": 8.9, "grad_norm": 16214.6875, "learning_rate": 2.8548812664907654e-06, "loss": 13.8055, "step": 3130 }, { "epoch": 8.93, "grad_norm": 51600.5703125, "learning_rate": 2.8284960422163593e-06, "loss": 14.8168, "step": 3140 }, { "epoch": 8.96, "grad_norm": 35349.5859375, "learning_rate": 2.8021108179419527e-06, "loss": 13.4314, "step": 3150 }, { "epoch": 8.98, "grad_norm": 10837.5458984375, "learning_rate": 2.7757255936675466e-06, "loss": 18.2068, "step": 3160 }, { "epoch": 9.0, "eval_accuracy": 0.5054, "eval_loss": 15.783122062683105, "eval_runtime": 47.8138, "eval_samples_per_second": 104.572, "eval_steps_per_second": 3.284, "step": 3165 }, { "epoch": 9.01, "grad_norm": 5.2953009605407715, "learning_rate": 2.7493403693931405e-06, "loss": 7.8406, "step": 3170 }, { "epoch": 9.04, "grad_norm": 5.277254581451416, "learning_rate": 2.7229551451187335e-06, "loss": 7.9094, "step": 3180 }, { "epoch": 9.07, "grad_norm": 8876.9833984375, "learning_rate": 2.6965699208443274e-06, "loss": 8.504, "step": 3190 }, { "epoch": 9.1, "grad_norm": 5.480013370513916, "learning_rate": 2.6701846965699213e-06, "loss": 20.42, "step": 3200 }, { "epoch": 9.13, "grad_norm": 17570.037109375, "learning_rate": 2.6437994722955147e-06, "loss": 22.3004, "step": 3210 }, { "epoch": 9.15, "grad_norm": 41648.78125, "learning_rate": 2.6174142480211086e-06, "loss": 15.4409, "step": 3220 }, { "epoch": 9.18, "grad_norm": 10240.6328125, "learning_rate": 2.591029023746702e-06, "loss": 5.6755, "step": 3230 }, { "epoch": 9.21, "grad_norm": 10833.125, "learning_rate": 2.5646437994722956e-06, "loss": 26.6077, "step": 3240 }, { "epoch": 9.24, "grad_norm": 13891.32421875, "learning_rate": 2.5382585751978894e-06, "loss": 7.8003, "step": 3250 }, { "epoch": 9.27, "grad_norm": 39645.2265625, "learning_rate": 2.5118733509234833e-06, "loss": 30.8708, "step": 3260 }, { "epoch": 9.3, "grad_norm": 30990.80859375, "learning_rate": 2.4854881266490768e-06, "loss": 12.1299, "step": 3270 }, { "epoch": 9.32, "grad_norm": 21565.091796875, "learning_rate": 2.4591029023746702e-06, "loss": 16.0867, "step": 3280 }, { "epoch": 9.35, "grad_norm": 41620.01953125, "learning_rate": 2.432717678100264e-06, "loss": 17.6403, "step": 3290 }, { "epoch": 9.38, "grad_norm": 30027.357421875, "learning_rate": 2.406332453825858e-06, "loss": 27.302, "step": 3300 }, { "epoch": 9.41, "grad_norm": 19158.068359375, "learning_rate": 2.3799472295514515e-06, "loss": 16.9498, "step": 3310 }, { "epoch": 9.44, "grad_norm": 4.843263149261475, "learning_rate": 2.353562005277045e-06, "loss": 16.1065, "step": 3320 }, { "epoch": 9.47, "grad_norm": 10113.2021484375, "learning_rate": 2.327176781002639e-06, "loss": 30.102, "step": 3330 }, { "epoch": 9.5, "grad_norm": 4.67209005355835, "learning_rate": 2.3007915567282323e-06, "loss": 12.136, "step": 3340 }, { "epoch": 9.52, "grad_norm": 28773.3828125, "learning_rate": 2.274406332453826e-06, "loss": 9.9983, "step": 3350 }, { "epoch": 9.55, "grad_norm": 29744.68359375, "learning_rate": 2.2480211081794196e-06, "loss": 7.8049, "step": 3360 }, { "epoch": 9.58, "grad_norm": 55136.67578125, "learning_rate": 2.2216358839050135e-06, "loss": 11.5446, "step": 3370 }, { "epoch": 9.61, "grad_norm": 4.848857402801514, "learning_rate": 2.195250659630607e-06, "loss": 11.939, "step": 3380 }, { "epoch": 9.64, "grad_norm": 34092.7578125, "learning_rate": 2.168865435356201e-06, "loss": 23.4238, "step": 3390 }, { "epoch": 9.67, "grad_norm": 10920.62109375, "learning_rate": 2.1424802110817943e-06, "loss": 11.9045, "step": 3400 }, { "epoch": 9.69, "grad_norm": 77582.25, "learning_rate": 2.1160949868073878e-06, "loss": 13.2217, "step": 3410 }, { "epoch": 9.72, "grad_norm": 8.04971981048584, "learning_rate": 2.0897097625329816e-06, "loss": 26.735, "step": 3420 }, { "epoch": 9.75, "grad_norm": 8471.4970703125, "learning_rate": 2.0633245382585755e-06, "loss": 9.9453, "step": 3430 }, { "epoch": 9.78, "grad_norm": 4.887660503387451, "learning_rate": 2.036939313984169e-06, "loss": 9.0241, "step": 3440 }, { "epoch": 9.81, "grad_norm": 20093.53125, "learning_rate": 2.010554089709763e-06, "loss": 7.2606, "step": 3450 }, { "epoch": 9.84, "grad_norm": 22952.759765625, "learning_rate": 1.9841688654353563e-06, "loss": 20.1506, "step": 3460 }, { "epoch": 9.86, "grad_norm": 22027.55078125, "learning_rate": 1.9577836411609498e-06, "loss": 11.949, "step": 3470 }, { "epoch": 9.89, "grad_norm": 29166.6953125, "learning_rate": 1.9313984168865437e-06, "loss": 14.21, "step": 3480 }, { "epoch": 9.92, "grad_norm": 6533.68896484375, "learning_rate": 1.9050131926121373e-06, "loss": 16.2336, "step": 3490 }, { "epoch": 9.95, "grad_norm": 21790.36328125, "learning_rate": 1.878627968337731e-06, "loss": 12.3446, "step": 3500 }, { "epoch": 9.98, "grad_norm": 4.611354827880859, "learning_rate": 1.8522427440633247e-06, "loss": 5.7088, "step": 3510 }, { "epoch": 10.0, "eval_accuracy": 0.512, "eval_loss": 14.330853462219238, "eval_runtime": 49.9643, "eval_samples_per_second": 100.072, "eval_steps_per_second": 3.142, "step": 3517 }, { "epoch": 10.01, "grad_norm": 14744.13671875, "learning_rate": 1.8258575197889184e-06, "loss": 12.3285, "step": 3520 }, { "epoch": 10.04, "grad_norm": 34220.26953125, "learning_rate": 1.7994722955145118e-06, "loss": 10.1598, "step": 3530 }, { "epoch": 10.06, "grad_norm": 23474.5078125, "learning_rate": 1.7730870712401057e-06, "loss": 13.9591, "step": 3540 }, { "epoch": 10.09, "grad_norm": 9619.984375, "learning_rate": 1.7467018469656994e-06, "loss": 6.0775, "step": 3550 }, { "epoch": 10.12, "grad_norm": 16567.521484375, "learning_rate": 1.7203166226912928e-06, "loss": 12.7605, "step": 3560 }, { "epoch": 10.15, "grad_norm": 25935.73046875, "learning_rate": 1.6939313984168867e-06, "loss": 18.8492, "step": 3570 }, { "epoch": 10.18, "grad_norm": 13734.125, "learning_rate": 1.6675461741424804e-06, "loss": 33.0552, "step": 3580 }, { "epoch": 10.21, "grad_norm": 16731.1328125, "learning_rate": 1.6411609498680738e-06, "loss": 17.3606, "step": 3590 }, { "epoch": 10.23, "grad_norm": 57974.08203125, "learning_rate": 1.6147757255936677e-06, "loss": 13.9039, "step": 3600 }, { "epoch": 10.26, "grad_norm": 3936.139404296875, "learning_rate": 1.5883905013192614e-06, "loss": 6.4631, "step": 3610 }, { "epoch": 10.29, "grad_norm": 4.282674789428711, "learning_rate": 1.5620052770448549e-06, "loss": 14.2391, "step": 3620 }, { "epoch": 10.32, "grad_norm": 9716.8974609375, "learning_rate": 1.5356200527704487e-06, "loss": 12.9602, "step": 3630 }, { "epoch": 10.35, "grad_norm": 105246.859375, "learning_rate": 1.5092348284960422e-06, "loss": 5.6431, "step": 3640 }, { "epoch": 10.38, "grad_norm": 28066.587890625, "learning_rate": 1.4828496042216359e-06, "loss": 27.5527, "step": 3650 }, { "epoch": 10.41, "grad_norm": 18242.50390625, "learning_rate": 1.4564643799472298e-06, "loss": 8.1481, "step": 3660 }, { "epoch": 10.43, "grad_norm": 22079.322265625, "learning_rate": 1.4300791556728232e-06, "loss": 21.5088, "step": 3670 }, { "epoch": 10.46, "grad_norm": 12174.9892578125, "learning_rate": 1.4036939313984169e-06, "loss": 14.1888, "step": 3680 }, { "epoch": 10.49, "grad_norm": 5.95954704284668, "learning_rate": 1.3773087071240108e-06, "loss": 15.7346, "step": 3690 }, { "epoch": 10.52, "grad_norm": 19023.048828125, "learning_rate": 1.3509234828496042e-06, "loss": 16.955, "step": 3700 }, { "epoch": 10.55, "grad_norm": 13843.515625, "learning_rate": 1.3245382585751981e-06, "loss": 16.371, "step": 3710 }, { "epoch": 10.58, "grad_norm": 21204.41015625, "learning_rate": 1.2981530343007918e-06, "loss": 7.4397, "step": 3720 }, { "epoch": 10.6, "grad_norm": 3.4024298191070557, "learning_rate": 1.2717678100263852e-06, "loss": 6.7214, "step": 3730 }, { "epoch": 10.63, "grad_norm": 42470.30859375, "learning_rate": 1.245382585751979e-06, "loss": 12.0161, "step": 3740 }, { "epoch": 10.66, "grad_norm": 14318.4609375, "learning_rate": 1.2189973614775728e-06, "loss": 12.1081, "step": 3750 }, { "epoch": 10.69, "grad_norm": 10011.2294921875, "learning_rate": 1.1926121372031663e-06, "loss": 6.5746, "step": 3760 }, { "epoch": 10.72, "grad_norm": 20765.578125, "learning_rate": 1.16622691292876e-06, "loss": 12.4505, "step": 3770 }, { "epoch": 10.75, "grad_norm": 3.3577842712402344, "learning_rate": 1.1398416886543536e-06, "loss": 16.3994, "step": 3780 }, { "epoch": 10.77, "grad_norm": 34584.66796875, "learning_rate": 1.1134564643799473e-06, "loss": 8.4269, "step": 3790 }, { "epoch": 10.8, "grad_norm": 6.110357284545898, "learning_rate": 1.087071240105541e-06, "loss": 13.8447, "step": 3800 }, { "epoch": 10.83, "grad_norm": 6753.76708984375, "learning_rate": 1.0606860158311346e-06, "loss": 13.934, "step": 3810 }, { "epoch": 10.86, "grad_norm": 4.714809417724609, "learning_rate": 1.0343007915567283e-06, "loss": 10.1007, "step": 3820 }, { "epoch": 10.89, "grad_norm": 22406.89453125, "learning_rate": 1.007915567282322e-06, "loss": 9.933, "step": 3830 }, { "epoch": 10.92, "grad_norm": 23237.966796875, "learning_rate": 9.815303430079156e-07, "loss": 13.3623, "step": 3840 }, { "epoch": 10.95, "grad_norm": 8239.9091796875, "learning_rate": 9.551451187335093e-07, "loss": 4.7175, "step": 3850 }, { "epoch": 10.97, "grad_norm": 24220.6953125, "learning_rate": 9.28759894459103e-07, "loss": 18.9725, "step": 3860 }, { "epoch": 11.0, "eval_accuracy": 0.5284, "eval_loss": 21.657846450805664, "eval_runtime": 49.4255, "eval_samples_per_second": 101.162, "eval_steps_per_second": 3.176, "step": 3869 }, { "epoch": 11.0, "grad_norm": 90386.609375, "learning_rate": 9.023746701846966e-07, "loss": 17.9928, "step": 3870 }, { "epoch": 11.03, "grad_norm": 9651.0390625, "learning_rate": 8.759894459102903e-07, "loss": 23.2039, "step": 3880 }, { "epoch": 11.06, "grad_norm": 5.884151458740234, "learning_rate": 8.49604221635884e-07, "loss": 2.7096, "step": 3890 }, { "epoch": 11.09, "grad_norm": 27521.5625, "learning_rate": 8.232189973614777e-07, "loss": 13.1323, "step": 3900 }, { "epoch": 11.12, "grad_norm": 10829.33203125, "learning_rate": 7.968337730870713e-07, "loss": 13.2876, "step": 3910 }, { "epoch": 11.14, "grad_norm": 40882.875, "learning_rate": 7.704485488126649e-07, "loss": 18.2464, "step": 3920 }, { "epoch": 11.17, "grad_norm": 15569.154296875, "learning_rate": 7.440633245382587e-07, "loss": 8.1155, "step": 3930 }, { "epoch": 11.2, "grad_norm": 4905.3935546875, "learning_rate": 7.176781002638523e-07, "loss": 5.9305, "step": 3940 }, { "epoch": 11.23, "grad_norm": 18239.060546875, "learning_rate": 6.912928759894459e-07, "loss": 10.8597, "step": 3950 }, { "epoch": 11.26, "grad_norm": 11243.37109375, "learning_rate": 6.649076517150396e-07, "loss": 7.2837, "step": 3960 }, { "epoch": 11.29, "grad_norm": 39662.703125, "learning_rate": 6.385224274406334e-07, "loss": 9.5539, "step": 3970 }, { "epoch": 11.31, "grad_norm": 3.6288795471191406, "learning_rate": 6.121372031662269e-07, "loss": 2.5246, "step": 3980 }, { "epoch": 11.34, "grad_norm": 149521.96875, "learning_rate": 5.857519788918206e-07, "loss": 20.4766, "step": 3990 }, { "epoch": 11.37, "grad_norm": 53100.82421875, "learning_rate": 5.593667546174144e-07, "loss": 12.6192, "step": 4000 }, { "epoch": 11.4, "grad_norm": 19199.001953125, "learning_rate": 5.329815303430079e-07, "loss": 18.3155, "step": 4010 }, { "epoch": 11.43, "grad_norm": 20008.158203125, "learning_rate": 5.065963060686016e-07, "loss": 17.0249, "step": 4020 }, { "epoch": 11.46, "grad_norm": 26673.271484375, "learning_rate": 4.802110817941953e-07, "loss": 8.6804, "step": 4030 }, { "epoch": 11.49, "grad_norm": 4560.06787109375, "learning_rate": 4.5382585751978896e-07, "loss": 10.9282, "step": 4040 }, { "epoch": 11.51, "grad_norm": 26306.35546875, "learning_rate": 4.274406332453826e-07, "loss": 4.3387, "step": 4050 }, { "epoch": 11.54, "grad_norm": 46509.32421875, "learning_rate": 4.010554089709763e-07, "loss": 6.683, "step": 4060 }, { "epoch": 11.57, "grad_norm": 13722.4189453125, "learning_rate": 3.7467018469656997e-07, "loss": 10.8325, "step": 4070 }, { "epoch": 11.6, "grad_norm": 5.297541618347168, "learning_rate": 3.482849604221636e-07, "loss": 7.9766, "step": 4080 }, { "epoch": 11.63, "grad_norm": 20999.05859375, "learning_rate": 3.218997361477573e-07, "loss": 8.6599, "step": 4090 }, { "epoch": 11.66, "grad_norm": 33970.34375, "learning_rate": 2.9551451187335093e-07, "loss": 18.0011, "step": 4100 }, { "epoch": 11.68, "grad_norm": 54935.26953125, "learning_rate": 2.691292875989446e-07, "loss": 9.0449, "step": 4110 }, { "epoch": 11.71, "grad_norm": 30675.240234375, "learning_rate": 2.427440633245383e-07, "loss": 12.3345, "step": 4120 }, { "epoch": 11.74, "grad_norm": 9968.326171875, "learning_rate": 2.1635883905013192e-07, "loss": 11.519, "step": 4130 }, { "epoch": 11.77, "grad_norm": 3.7177724838256836, "learning_rate": 1.899736147757256e-07, "loss": 5.1236, "step": 4140 }, { "epoch": 11.8, "grad_norm": 25476.486328125, "learning_rate": 1.635883905013193e-07, "loss": 9.1192, "step": 4150 }, { "epoch": 11.83, "grad_norm": 4.210846424102783, "learning_rate": 1.3720316622691293e-07, "loss": 4.8955, "step": 4160 }, { "epoch": 11.86, "grad_norm": 45358.33203125, "learning_rate": 1.108179419525066e-07, "loss": 14.9257, "step": 4170 }, { "epoch": 11.88, "grad_norm": 32215.34765625, "learning_rate": 8.443271767810026e-08, "loss": 14.8244, "step": 4180 }, { "epoch": 11.91, "grad_norm": 7220.693359375, "learning_rate": 5.8047493403693936e-08, "loss": 33.6707, "step": 4190 }, { "epoch": 11.94, "grad_norm": 4.6804280281066895, "learning_rate": 3.16622691292876e-08, "loss": 6.343, "step": 4200 }, { "epoch": 11.97, "grad_norm": 33938.75, "learning_rate": 5.2770448548812665e-09, "loss": 16.9049, "step": 4210 }, { "epoch": 11.97, "eval_accuracy": 0.5204, "eval_loss": 12.967041969299316, "eval_runtime": 48.68, "eval_samples_per_second": 102.712, "eval_steps_per_second": 3.225, "step": 4212 }, { "epoch": 11.97, "step": 4212, "total_flos": 1.1451142170353959e+19, "train_loss": 15.888186119673712, "train_runtime": 6961.331, "train_samples_per_second": 77.571, "train_steps_per_second": 0.605 } ], "logging_steps": 10, "max_steps": 4212, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 500, "total_flos": 1.1451142170353959e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }