{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.42182968626417083, "eval_steps": 50, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010545742156604272, "grad_norm": 0.8949686288833618, "learning_rate": 2.0000000000000003e-06, "loss": 2.0803, "step": 10 }, { "epoch": 0.021091484313208543, "grad_norm": 0.8410115838050842, "learning_rate": 4.000000000000001e-06, "loss": 2.0584, "step": 20 }, { "epoch": 0.03163722646981281, "grad_norm": 0.8865960836410522, "learning_rate": 6e-06, "loss": 2.0496, "step": 30 }, { "epoch": 0.042182968626417086, "grad_norm": 0.7669851779937744, "learning_rate": 8.000000000000001e-06, "loss": 2.0115, "step": 40 }, { "epoch": 0.052728710783021354, "grad_norm": 0.7927244901657104, "learning_rate": 1e-05, "loss": 1.999, "step": 50 }, { "epoch": 0.052728710783021354, "eval_loss": 1.9660409688949585, "eval_runtime": 1845.8031, "eval_samples_per_second": 1.827, "eval_steps_per_second": 0.457, "step": 50 }, { "epoch": 0.06327445293962562, "grad_norm": 0.6458373069763184, "learning_rate": 1.2e-05, "loss": 1.9319, "step": 60 }, { "epoch": 0.07382019509622989, "grad_norm": 0.566554069519043, "learning_rate": 1.4000000000000001e-05, "loss": 1.9084, "step": 70 }, { "epoch": 0.08436593725283417, "grad_norm": 0.4990901052951813, "learning_rate": 1.6000000000000003e-05, "loss": 1.8971, "step": 80 }, { "epoch": 0.09491167940943844, "grad_norm": 0.4458348751068115, "learning_rate": 1.8e-05, "loss": 1.8216, "step": 90 }, { "epoch": 0.10545742156604271, "grad_norm": 0.4464481472969055, "learning_rate": 2e-05, "loss": 1.8433, "step": 100 }, { "epoch": 0.10545742156604271, "eval_loss": 1.8134711980819702, "eval_runtime": 1846.165, "eval_samples_per_second": 1.826, "eval_steps_per_second": 0.457, "step": 100 }, { "epoch": 0.11600316372264698, "grad_norm": 0.4543227255344391, "learning_rate": 2.2000000000000003e-05, "loss": 1.8193, "step": 110 }, { "epoch": 0.12654890587925124, "grad_norm": 0.4522910416126251, "learning_rate": 2.4e-05, "loss": 1.7737, "step": 120 }, { "epoch": 0.13709464803585553, "grad_norm": 0.4522746801376343, "learning_rate": 2.6000000000000002e-05, "loss": 1.7885, "step": 130 }, { "epoch": 0.14764039019245978, "grad_norm": 0.44632086157798767, "learning_rate": 2.8000000000000003e-05, "loss": 1.7669, "step": 140 }, { "epoch": 0.15818613234906406, "grad_norm": 0.4553607404232025, "learning_rate": 3e-05, "loss": 1.7605, "step": 150 }, { "epoch": 0.15818613234906406, "eval_loss": 1.7562175989151, "eval_runtime": 1836.9664, "eval_samples_per_second": 1.836, "eval_steps_per_second": 0.459, "step": 150 }, { "epoch": 0.16873187450566834, "grad_norm": 0.4852316975593567, "learning_rate": 3.2000000000000005e-05, "loss": 1.7367, "step": 160 }, { "epoch": 0.1792776166622726, "grad_norm": 0.49629154801368713, "learning_rate": 3.4000000000000007e-05, "loss": 1.7447, "step": 170 }, { "epoch": 0.18982335881887688, "grad_norm": 0.5253108739852905, "learning_rate": 3.6e-05, "loss": 1.7378, "step": 180 }, { "epoch": 0.20036910097548116, "grad_norm": 0.6104539036750793, "learning_rate": 3.8e-05, "loss": 1.737, "step": 190 }, { "epoch": 0.21091484313208542, "grad_norm": 0.564102828502655, "learning_rate": 4e-05, "loss": 1.7325, "step": 200 }, { "epoch": 0.21091484313208542, "eval_loss": 1.727364420890808, "eval_runtime": 1836.4408, "eval_samples_per_second": 1.836, "eval_steps_per_second": 0.459, "step": 200 }, { "epoch": 0.2214605852886897, "grad_norm": 0.6158842444419861, "learning_rate": 4.2e-05, "loss": 1.7168, "step": 210 }, { "epoch": 0.23200632744529395, "grad_norm": 0.699800431728363, "learning_rate": 4.4000000000000006e-05, "loss": 1.738, "step": 220 }, { "epoch": 0.24255206960189823, "grad_norm": 0.6008381843566895, "learning_rate": 4.600000000000001e-05, "loss": 1.7256, "step": 230 }, { "epoch": 0.2530978117585025, "grad_norm": 0.633844792842865, "learning_rate": 4.8e-05, "loss": 1.7033, "step": 240 }, { "epoch": 0.2636435539151068, "grad_norm": 0.6631755232810974, "learning_rate": 5e-05, "loss": 1.6999, "step": 250 }, { "epoch": 0.2636435539151068, "eval_loss": 1.7032877206802368, "eval_runtime": 1836.0737, "eval_samples_per_second": 1.837, "eval_steps_per_second": 0.459, "step": 250 }, { "epoch": 0.27418929607171105, "grad_norm": 0.6652688980102539, "learning_rate": 5.2000000000000004e-05, "loss": 1.6866, "step": 260 }, { "epoch": 0.2847350382283153, "grad_norm": 0.6938503980636597, "learning_rate": 5.4000000000000005e-05, "loss": 1.7029, "step": 270 }, { "epoch": 0.29528078038491956, "grad_norm": 0.686392605304718, "learning_rate": 5.6000000000000006e-05, "loss": 1.6989, "step": 280 }, { "epoch": 0.30582652254152387, "grad_norm": 0.7344717979431152, "learning_rate": 5.8e-05, "loss": 1.6685, "step": 290 }, { "epoch": 0.3163722646981281, "grad_norm": 0.6960188150405884, "learning_rate": 6e-05, "loss": 1.6705, "step": 300 }, { "epoch": 0.3163722646981281, "eval_loss": 1.6844133138656616, "eval_runtime": 1836.0829, "eval_samples_per_second": 1.837, "eval_steps_per_second": 0.459, "step": 300 }, { "epoch": 0.3269180068547324, "grad_norm": 0.6903261542320251, "learning_rate": 6.2e-05, "loss": 1.6672, "step": 310 }, { "epoch": 0.3374637490113367, "grad_norm": 0.7028161883354187, "learning_rate": 6.400000000000001e-05, "loss": 1.6934, "step": 320 }, { "epoch": 0.34800949116794094, "grad_norm": 0.7435672879219055, "learning_rate": 6.6e-05, "loss": 1.6712, "step": 330 }, { "epoch": 0.3585552333245452, "grad_norm": 0.7246424555778503, "learning_rate": 6.800000000000001e-05, "loss": 1.6608, "step": 340 }, { "epoch": 0.3691009754811495, "grad_norm": 0.6919660568237305, "learning_rate": 7e-05, "loss": 1.6568, "step": 350 }, { "epoch": 0.3691009754811495, "eval_loss": 1.6683924198150635, "eval_runtime": 1838.3493, "eval_samples_per_second": 1.834, "eval_steps_per_second": 0.459, "step": 350 }, { "epoch": 0.37964671763775376, "grad_norm": 0.7590942978858948, "learning_rate": 7.2e-05, "loss": 1.6742, "step": 360 }, { "epoch": 0.390192459794358, "grad_norm": 0.6980053186416626, "learning_rate": 7.4e-05, "loss": 1.6744, "step": 370 }, { "epoch": 0.4007382019509623, "grad_norm": 0.7112457752227783, "learning_rate": 7.6e-05, "loss": 1.648, "step": 380 }, { "epoch": 0.4112839441075666, "grad_norm": 0.7157771587371826, "learning_rate": 7.800000000000001e-05, "loss": 1.6634, "step": 390 }, { "epoch": 0.42182968626417083, "grad_norm": 0.7440850138664246, "learning_rate": 8e-05, "loss": 1.6567, "step": 400 }, { "epoch": 0.42182968626417083, "eval_loss": 1.6561506986618042, "eval_runtime": 1822.2187, "eval_samples_per_second": 1.85, "eval_steps_per_second": 0.463, "step": 400 } ], "logging_steps": 10, "max_steps": 2844, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.274968196448256e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }