{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9926873857404024, "eval_steps": 500, "global_step": 1092, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 4.545454545454545e-08, "loss": 6.3231, "step": 10 }, { "epoch": 0.07, "learning_rate": 9.09090909090909e-08, "loss": 5.5694, "step": 20 }, { "epoch": 0.11, "learning_rate": 1.3636363636363635e-07, "loss": 3.253, "step": 30 }, { "epoch": 0.15, "learning_rate": 1.818181818181818e-07, "loss": 1.3696, "step": 40 }, { "epoch": 0.18, "learning_rate": 2.2727272727272726e-07, "loss": 0.5924, "step": 50 }, { "epoch": 0.22, "learning_rate": 2.727272727272727e-07, "loss": 0.2712, "step": 60 }, { "epoch": 0.26, "learning_rate": 3.1818181818181815e-07, "loss": 0.1977, "step": 70 }, { "epoch": 0.29, "learning_rate": 3.636363636363636e-07, "loss": 0.1782, "step": 80 }, { "epoch": 0.33, "learning_rate": 4.090909090909091e-07, "loss": 0.1586, "step": 90 }, { "epoch": 0.37, "learning_rate": 4.545454545454545e-07, "loss": 0.1282, "step": 100 }, { "epoch": 0.4, "learning_rate": 5e-07, "loss": 0.083, "step": 110 }, { "epoch": 0.44, "learning_rate": 4.998720766742264e-07, "loss": 0.0686, "step": 120 }, { "epoch": 0.48, "learning_rate": 4.994884376119235e-07, "loss": 0.0636, "step": 130 }, { "epoch": 0.51, "learning_rate": 4.988494754241696e-07, "loss": 0.0618, "step": 140 }, { "epoch": 0.55, "learning_rate": 4.979558440163095e-07, "loss": 0.0609, "step": 150 }, { "epoch": 0.59, "learning_rate": 4.968084579187565e-07, "loss": 0.0572, "step": 160 }, { "epoch": 0.62, "learning_rate": 4.954084913510755e-07, "loss": 0.0576, "step": 170 }, { "epoch": 0.66, "learning_rate": 4.937573770203005e-07, "loss": 0.0616, "step": 180 }, { "epoch": 0.69, "learning_rate": 4.918568046547231e-07, "loss": 0.058, "step": 190 }, { "epoch": 0.73, "learning_rate": 4.897087192746464e-07, "loss": 0.0581, "step": 200 }, { "epoch": 0.77, "learning_rate": 4.873153192018773e-07, "loss": 0.0561, "step": 210 }, { "epoch": 0.8, "learning_rate": 4.846790538099934e-07, "loss": 0.0527, "step": 220 }, { "epoch": 0.84, "learning_rate": 4.818026210176872e-07, "loss": 0.0527, "step": 230 }, { "epoch": 0.88, "learning_rate": 4.786889645277519e-07, "loss": 0.0546, "step": 240 }, { "epoch": 0.91, "learning_rate": 4.7534127081453554e-07, "loss": 0.0513, "step": 250 }, { "epoch": 0.95, "learning_rate": 4.71762965862946e-07, "loss": 0.0489, "step": 260 }, { "epoch": 0.99, "learning_rate": 4.679577116623435e-07, "loss": 0.0511, "step": 270 }, { "epoch": 1.0, "eval_loss": 0.051615118980407715, "eval_runtime": 48.6349, "eval_samples_per_second": 7.135, "eval_steps_per_second": 0.596, "step": 273 }, { "epoch": 1.02, "learning_rate": 4.639294024589101e-07, "loss": 0.0476, "step": 280 }, { "epoch": 1.06, "learning_rate": 4.596821607703302e-07, "loss": 0.0486, "step": 290 }, { "epoch": 1.1, "learning_rate": 4.5522033316686106e-07, "loss": 0.0473, "step": 300 }, { "epoch": 1.13, "learning_rate": 4.5054848582311134e-07, "loss": 0.0501, "step": 310 }, { "epoch": 1.17, "learning_rate": 4.456713998450786e-07, "loss": 0.0435, "step": 320 }, { "epoch": 1.21, "learning_rate": 4.405940663772302e-07, "loss": 0.0393, "step": 330 }, { "epoch": 1.24, "learning_rate": 4.353216814946321e-07, "loss": 0.0489, "step": 340 }, { "epoch": 1.28, "learning_rate": 4.2985964088535603e-07, "loss": 0.039, "step": 350 }, { "epoch": 1.32, "learning_rate": 4.2421353432860386e-07, "loss": 0.045, "step": 360 }, { "epoch": 1.35, "learning_rate": 4.1838913997420285e-07, "loss": 0.0389, "step": 370 }, { "epoch": 1.39, "learning_rate": 4.1239241842932446e-07, "loss": 0.0407, "step": 380 }, { "epoch": 1.43, "learning_rate": 4.0622950665847893e-07, "loss": 0.0442, "step": 390 }, { "epoch": 1.46, "learning_rate": 3.9990671170302747e-07, "loss": 0.04, "step": 400 }, { "epoch": 1.5, "learning_rate": 3.934305042266413e-07, "loss": 0.0398, "step": 410 }, { "epoch": 1.54, "learning_rate": 3.868075118933106e-07, "loss": 0.0478, "step": 420 }, { "epoch": 1.57, "learning_rate": 3.8004451258468224e-07, "loss": 0.0426, "step": 430 }, { "epoch": 1.61, "learning_rate": 3.7314842746366625e-07, "loss": 0.0398, "step": 440 }, { "epoch": 1.65, "learning_rate": 3.6612631389141073e-07, "loss": 0.0407, "step": 450 }, { "epoch": 1.68, "learning_rate": 3.5898535820489257e-07, "loss": 0.0418, "step": 460 }, { "epoch": 1.72, "learning_rate": 3.5173286836251683e-07, "loss": 0.0411, "step": 470 }, { "epoch": 1.76, "learning_rate": 3.4437626646524965e-07, "loss": 0.0363, "step": 480 }, { "epoch": 1.79, "learning_rate": 3.369230811609397e-07, "loss": 0.0397, "step": 490 }, { "epoch": 1.83, "learning_rate": 3.2938093993960107e-07, "loss": 0.0439, "step": 500 }, { "epoch": 1.86, "learning_rate": 3.217575613275414e-07, "loss": 0.0437, "step": 510 }, { "epoch": 1.9, "learning_rate": 3.1406074698832646e-07, "loss": 0.0445, "step": 520 }, { "epoch": 1.94, "learning_rate": 3.0629837373866113e-07, "loss": 0.0401, "step": 530 }, { "epoch": 1.97, "learning_rate": 2.984783854873614e-07, "loss": 0.0412, "step": 540 }, { "epoch": 2.0, "eval_loss": 0.04081326350569725, "eval_runtime": 48.5536, "eval_samples_per_second": 7.147, "eval_steps_per_second": 0.597, "step": 547 }, { "epoch": 2.01, "learning_rate": 2.9060878510566426e-07, "loss": 0.0354, "step": 550 }, { "epoch": 2.05, "learning_rate": 2.8269762623719636e-07, "loss": 0.0325, "step": 560 }, { "epoch": 2.08, "learning_rate": 2.747530050559831e-07, "loss": 0.0327, "step": 570 }, { "epoch": 2.12, "learning_rate": 2.6678305198093267e-07, "loss": 0.0373, "step": 580 }, { "epoch": 2.16, "learning_rate": 2.587959233552739e-07, "loss": 0.0377, "step": 590 }, { "epoch": 2.19, "learning_rate": 2.507997930994643e-07, "loss": 0.0328, "step": 600 }, { "epoch": 2.23, "learning_rate": 2.42802844346109e-07, "loss": 0.0357, "step": 610 }, { "epoch": 2.27, "learning_rate": 2.3481326106545248e-07, "loss": 0.0334, "step": 620 }, { "epoch": 2.3, "learning_rate": 2.2683921969001343e-07, "loss": 0.0318, "step": 630 }, { "epoch": 2.34, "learning_rate": 2.1888888074693235e-07, "loss": 0.0324, "step": 640 }, { "epoch": 2.38, "learning_rate": 2.109703805065985e-07, "loss": 0.0321, "step": 650 }, { "epoch": 2.41, "learning_rate": 2.0309182265609888e-07, "loss": 0.0328, "step": 660 }, { "epoch": 2.45, "learning_rate": 1.952612700060137e-07, "loss": 0.0332, "step": 670 }, { "epoch": 2.49, "learning_rate": 1.874867362390442e-07, "loss": 0.0329, "step": 680 }, { "epoch": 2.52, "learning_rate": 1.7977617770891678e-07, "loss": 0.0328, "step": 690 }, { "epoch": 2.56, "learning_rate": 1.7213748529795756e-07, "loss": 0.0272, "step": 700 }, { "epoch": 2.6, "learning_rate": 1.645784763416686e-07, "loss": 0.0373, "step": 710 }, { "epoch": 2.63, "learning_rate": 1.5710688662857186e-07, "loss": 0.031, "step": 720 }, { "epoch": 2.67, "learning_rate": 1.4973036248350665e-07, "loss": 0.0342, "step": 730 }, { "epoch": 2.71, "learning_rate": 1.4245645294248333e-07, "loss": 0.037, "step": 740 }, { "epoch": 2.74, "learning_rate": 1.352926020271006e-07, "loss": 0.0334, "step": 750 }, { "epoch": 2.78, "learning_rate": 1.2824614112643428e-07, "loss": 0.034, "step": 760 }, { "epoch": 2.82, "learning_rate": 1.2132428149419105e-07, "loss": 0.036, "step": 770 }, { "epoch": 2.85, "learning_rate": 1.1453410686880843e-07, "loss": 0.0383, "step": 780 }, { "epoch": 2.89, "learning_rate": 1.0788256622405165e-07, "loss": 0.0266, "step": 790 }, { "epoch": 2.93, "learning_rate": 1.0137646665752717e-07, "loss": 0.0318, "step": 800 }, { "epoch": 2.96, "learning_rate": 9.502246642438996e-08, "loss": 0.0336, "step": 810 }, { "epoch": 3.0, "learning_rate": 8.882706812337432e-08, "loss": 0.032, "step": 820 }, { "epoch": 3.0, "eval_loss": 0.03941405192017555, "eval_runtime": 48.6039, "eval_samples_per_second": 7.139, "eval_steps_per_second": 0.597, "step": 820 }, { "epoch": 3.03, "learning_rate": 8.279661204212143e-08, "loss": 0.0254, "step": 830 }, { "epoch": 3.07, "learning_rate": 7.693726966861405e-08, "loss": 0.0328, "step": 840 }, { "epoch": 3.11, "learning_rate": 7.125503737535843e-08, "loss": 0.0231, "step": 850 }, { "epoch": 3.14, "learning_rate": 6.575573028277659e-08, "loss": 0.0276, "step": 860 }, { "epoch": 3.18, "learning_rate": 6.044497630809053e-08, "loss": 0.0239, "step": 870 }, { "epoch": 3.22, "learning_rate": 5.5328210405786696e-08, "loss": 0.0251, "step": 880 }, { "epoch": 3.25, "learning_rate": 5.041066900555646e-08, "loss": 0.025, "step": 890 }, { "epoch": 3.29, "learning_rate": 4.569738465340414e-08, "loss": 0.0281, "step": 900 }, { "epoch": 3.33, "learning_rate": 4.1193180861406965e-08, "loss": 0.0246, "step": 910 }, { "epoch": 3.36, "learning_rate": 3.690266717139728e-08, "loss": 0.0252, "step": 920 }, { "epoch": 3.4, "learning_rate": 3.2830234437619035e-08, "loss": 0.0293, "step": 930 }, { "epoch": 3.44, "learning_rate": 2.8980050333186595e-08, "loss": 0.027, "step": 940 }, { "epoch": 3.47, "learning_rate": 2.5356055084943763e-08, "loss": 0.025, "step": 950 }, { "epoch": 3.51, "learning_rate": 2.1961957441088557e-08, "loss": 0.0255, "step": 960 }, { "epoch": 3.55, "learning_rate": 1.8801230875689688e-08, "loss": 0.0273, "step": 970 }, { "epoch": 3.58, "learning_rate": 1.5877110033980152e-08, "loss": 0.0291, "step": 980 }, { "epoch": 3.62, "learning_rate": 1.3192587422064166e-08, "loss": 0.0273, "step": 990 }, { "epoch": 3.66, "learning_rate": 1.0750410344426781e-08, "loss": 0.0303, "step": 1000 }, { "epoch": 3.69, "learning_rate": 8.553078092379123e-09, "loss": 0.0302, "step": 1010 }, { "epoch": 3.73, "learning_rate": 6.6028393863173935e-09, "loss": 0.024, "step": 1020 }, { "epoch": 3.77, "learning_rate": 4.901690074412185e-09, "loss": 0.0245, "step": 1030 }, { "epoch": 3.8, "learning_rate": 3.451371090084476e-09, "loss": 0.0211, "step": 1040 }, { "epoch": 3.84, "learning_rate": 2.2533666703574983e-09, "loss": 0.0224, "step": 1050 }, { "epoch": 3.88, "learning_rate": 1.308902836908543e-09, "loss": 0.0201, "step": 1060 }, { "epoch": 3.91, "learning_rate": 6.189461413748098e-10, "loss": 0.0242, "step": 1070 }, { "epoch": 3.95, "learning_rate": 1.842026761973703e-10, "loss": 0.0301, "step": 1080 }, { "epoch": 3.99, "learning_rate": 5.117352015610033e-12, "loss": 0.0258, "step": 1090 }, { "epoch": 3.99, "eval_loss": 0.04152332618832588, "eval_runtime": 48.6503, "eval_samples_per_second": 7.133, "eval_steps_per_second": 0.596, "step": 1092 }, { "epoch": 3.99, "step": 1092, "total_flos": 193702916259840.0, "train_loss": 0.20003320886702328, "train_runtime": 14516.5669, "train_samples_per_second": 1.806, "train_steps_per_second": 0.075 } ], "logging_steps": 10, "max_steps": 1092, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "total_flos": 193702916259840.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }