{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.18562874251497, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11976047904191617, "grad_norm": 2.8749659061431885, "learning_rate": 0.0002, "loss": 1.4673, "step": 10 }, { "epoch": 0.23952095808383234, "grad_norm": 3.4398715496063232, "learning_rate": 0.0002, "loss": 1.0019, "step": 20 }, { "epoch": 0.3592814371257485, "grad_norm": 1.9059951305389404, "learning_rate": 0.0002, "loss": 1.0575, "step": 30 }, { "epoch": 0.47904191616766467, "grad_norm": 4.149394989013672, "learning_rate": 0.0002, "loss": 0.8164, "step": 40 }, { "epoch": 0.5988023952095808, "grad_norm": 1.4866076707839966, "learning_rate": 0.0002, "loss": 0.8684, "step": 50 }, { "epoch": 0.718562874251497, "grad_norm": 3.1927452087402344, "learning_rate": 0.0002, "loss": 0.8016, "step": 60 }, { "epoch": 0.8383233532934131, "grad_norm": 1.1162314414978027, "learning_rate": 0.0002, "loss": 0.6809, "step": 70 }, { "epoch": 0.9580838323353293, "grad_norm": 2.829102039337158, "learning_rate": 0.0002, "loss": 0.6962, "step": 80 }, { "epoch": 1.0778443113772456, "grad_norm": 1.2642532587051392, "learning_rate": 0.0002, "loss": 0.5769, "step": 90 }, { "epoch": 1.1976047904191618, "grad_norm": 1.3799452781677246, "learning_rate": 0.0002, "loss": 0.4128, "step": 100 }, { "epoch": 1.3173652694610778, "grad_norm": 2.3143367767333984, "learning_rate": 0.0002, "loss": 0.6441, "step": 110 }, { "epoch": 1.437125748502994, "grad_norm": 1.085919976234436, "learning_rate": 0.0002, "loss": 0.393, "step": 120 }, { "epoch": 1.55688622754491, "grad_norm": 1.2423957586288452, "learning_rate": 0.0002, "loss": 0.5582, "step": 130 }, { "epoch": 1.6766467065868262, "grad_norm": 1.2964059114456177, "learning_rate": 0.0002, "loss": 0.4276, "step": 140 }, { "epoch": 1.7964071856287425, "grad_norm": 1.8397400379180908, "learning_rate": 0.0002, "loss": 0.6162, "step": 150 }, { "epoch": 1.9161676646706587, "grad_norm": 1.0209627151489258, "learning_rate": 0.0002, "loss": 0.4565, "step": 160 }, { "epoch": 2.035928143712575, "grad_norm": 0.8725757598876953, "learning_rate": 0.0002, "loss": 0.4807, "step": 170 }, { "epoch": 2.155688622754491, "grad_norm": 1.1269447803497314, "learning_rate": 0.0002, "loss": 0.3895, "step": 180 }, { "epoch": 2.2754491017964074, "grad_norm": 1.528011679649353, "learning_rate": 0.0002, "loss": 0.3553, "step": 190 }, { "epoch": 2.3952095808383236, "grad_norm": 0.8296527862548828, "learning_rate": 0.0002, "loss": 0.3516, "step": 200 }, { "epoch": 2.5149700598802394, "grad_norm": 1.301917552947998, "learning_rate": 0.0002, "loss": 0.3918, "step": 210 }, { "epoch": 2.6347305389221556, "grad_norm": 0.8420801758766174, "learning_rate": 0.0002, "loss": 0.3497, "step": 220 }, { "epoch": 2.754491017964072, "grad_norm": 1.1430580615997314, "learning_rate": 0.0002, "loss": 0.4311, "step": 230 }, { "epoch": 2.874251497005988, "grad_norm": 0.9065356850624084, "learning_rate": 0.0002, "loss": 0.3551, "step": 240 }, { "epoch": 2.9940119760479043, "grad_norm": 1.1302285194396973, "learning_rate": 0.0002, "loss": 0.3513, "step": 250 }, { "epoch": 3.1137724550898205, "grad_norm": 0.9960314631462097, "learning_rate": 0.0002, "loss": 0.3124, "step": 260 }, { "epoch": 3.2335329341317367, "grad_norm": 1.680296778678894, "learning_rate": 0.0002, "loss": 0.3065, "step": 270 }, { "epoch": 3.3532934131736525, "grad_norm": 1.1697853803634644, "learning_rate": 0.0002, "loss": 0.3009, 
"step": 280 }, { "epoch": 3.4730538922155687, "grad_norm": 1.9219907522201538, "learning_rate": 0.0002, "loss": 0.2802, "step": 290 }, { "epoch": 3.592814371257485, "grad_norm": 1.384773850440979, "learning_rate": 0.0002, "loss": 0.3419, "step": 300 }, { "epoch": 3.712574850299401, "grad_norm": 1.3956997394561768, "learning_rate": 0.0002, "loss": 0.3172, "step": 310 }, { "epoch": 3.8323353293413174, "grad_norm": 1.058669924736023, "learning_rate": 0.0002, "loss": 0.3723, "step": 320 }, { "epoch": 3.9520958083832336, "grad_norm": 1.5626955032348633, "learning_rate": 0.0002, "loss": 0.325, "step": 330 }, { "epoch": 4.07185628742515, "grad_norm": 1.2782564163208008, "learning_rate": 0.0002, "loss": 0.2912, "step": 340 }, { "epoch": 4.191616766467066, "grad_norm": 1.0916423797607422, "learning_rate": 0.0002, "loss": 0.233, "step": 350 }, { "epoch": 4.311377245508982, "grad_norm": 0.8613762855529785, "learning_rate": 0.0002, "loss": 0.3058, "step": 360 }, { "epoch": 4.431137724550898, "grad_norm": 0.6293674111366272, "learning_rate": 0.0002, "loss": 0.2334, "step": 370 }, { "epoch": 4.550898203592815, "grad_norm": 1.6042566299438477, "learning_rate": 0.0002, "loss": 0.3287, "step": 380 }, { "epoch": 4.6706586826347305, "grad_norm": 0.8140411376953125, "learning_rate": 0.0002, "loss": 0.2372, "step": 390 }, { "epoch": 4.790419161676647, "grad_norm": 1.5365833044052124, "learning_rate": 0.0002, "loss": 0.3266, "step": 400 }, { "epoch": 4.910179640718563, "grad_norm": 0.9418448805809021, "learning_rate": 0.0002, "loss": 0.2513, "step": 410 }, { "epoch": 5.029940119760479, "grad_norm": 0.6695829033851624, "learning_rate": 0.0002, "loss": 0.2688, "step": 420 }, { "epoch": 5.149700598802395, "grad_norm": 0.628887414932251, "learning_rate": 0.0002, "loss": 0.2149, "step": 430 }, { "epoch": 5.269461077844311, "grad_norm": 0.964766263961792, "learning_rate": 0.0002, "loss": 0.2606, "step": 440 }, { "epoch": 5.389221556886228, "grad_norm": 0.5990360975265503, "learning_rate": 0.0002, "loss": 0.2364, "step": 450 }, { "epoch": 5.508982035928144, "grad_norm": 0.8189520835876465, "learning_rate": 0.0002, "loss": 0.2857, "step": 460 }, { "epoch": 5.62874251497006, "grad_norm": 0.5583224296569824, "learning_rate": 0.0002, "loss": 0.2414, "step": 470 }, { "epoch": 5.748502994011976, "grad_norm": 0.7695009708404541, "learning_rate": 0.0002, "loss": 0.2434, "step": 480 }, { "epoch": 5.868263473053892, "grad_norm": 0.3456665575504303, "learning_rate": 0.0002, "loss": 0.2597, "step": 490 }, { "epoch": 5.9880239520958085, "grad_norm": 0.7596808671951294, "learning_rate": 0.0002, "loss": 0.2983, "step": 500 }, { "epoch": 6.107784431137724, "grad_norm": 0.9513673782348633, "learning_rate": 0.0002, "loss": 0.2139, "step": 510 }, { "epoch": 6.227544910179641, "grad_norm": 1.0958881378173828, "learning_rate": 0.0002, "loss": 0.2211, "step": 520 }, { "epoch": 6.347305389221557, "grad_norm": 0.6882690787315369, "learning_rate": 0.0002, "loss": 0.2347, "step": 530 }, { "epoch": 6.467065868263473, "grad_norm": 1.0562934875488281, "learning_rate": 0.0002, "loss": 0.2276, "step": 540 }, { "epoch": 6.586826347305389, "grad_norm": 1.1535356044769287, "learning_rate": 0.0002, "loss": 0.2469, "step": 550 }, { "epoch": 6.706586826347305, "grad_norm": 0.9436424970626831, "learning_rate": 0.0002, "loss": 0.2713, "step": 560 }, { "epoch": 6.826347305389222, "grad_norm": 1.0283164978027344, "learning_rate": 0.0002, "loss": 0.2449, "step": 570 }, { "epoch": 6.946107784431137, "grad_norm": 1.3945902585983276, "learning_rate": 0.0002, 
"loss": 0.2193, "step": 580 }, { "epoch": 7.065868263473054, "grad_norm": 0.5662649869918823, "learning_rate": 0.0002, "loss": 0.2415, "step": 590 }, { "epoch": 7.18562874251497, "grad_norm": 0.4687662720680237, "learning_rate": 0.0002, "loss": 0.1792, "step": 600 } ], "logging_steps": 10, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 100, "total_flos": 1753733775298560.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }