{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.056603773584905, "eval_steps": 10, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3018867924528302, "grad_norm": 0.12168226391077042, "learning_rate": 6.666666666666667e-06, "loss": 2.5839, "step": 10 }, { "epoch": 0.3018867924528302, "eval_loss": 2.4508910179138184, "eval_runtime": 356.5361, "eval_samples_per_second": 0.367, "eval_steps_per_second": 0.367, "step": 10 }, { "epoch": 0.6037735849056604, "grad_norm": 0.13410530984401703, "learning_rate": 1.3333333333333333e-05, "loss": 2.5791, "step": 20 }, { "epoch": 0.6037735849056604, "eval_loss": 2.438781261444092, "eval_runtime": 355.7723, "eval_samples_per_second": 0.368, "eval_steps_per_second": 0.368, "step": 20 }, { "epoch": 0.9056603773584906, "grad_norm": 0.16412803530693054, "learning_rate": 2e-05, "loss": 2.561, "step": 30 }, { "epoch": 0.9056603773584906, "eval_loss": 2.409200429916382, "eval_runtime": 356.6858, "eval_samples_per_second": 0.367, "eval_steps_per_second": 0.367, "step": 30 }, { "epoch": 1.2075471698113207, "grad_norm": 0.1962829828262329, "learning_rate": 1.9932383577419432e-05, "loss": 2.515, "step": 40 }, { "epoch": 1.2075471698113207, "eval_loss": 2.369290828704834, "eval_runtime": 356.4653, "eval_samples_per_second": 0.367, "eval_steps_per_second": 0.367, "step": 40 }, { "epoch": 1.509433962264151, "grad_norm": 0.2138909250497818, "learning_rate": 1.973044870579824e-05, "loss": 2.4646, "step": 50 }, { "epoch": 1.509433962264151, "eval_loss": 2.330085039138794, "eval_runtime": 355.0797, "eval_samples_per_second": 0.369, "eval_steps_per_second": 0.369, "step": 50 }, { "epoch": 1.8113207547169812, "grad_norm": 0.24827434122562408, "learning_rate": 1.9396926207859085e-05, "loss": 2.4343, "step": 60 }, { "epoch": 1.8113207547169812, "eval_loss": 2.2919955253601074, "eval_runtime": 355.0754, "eval_samples_per_second": 0.369, "eval_steps_per_second": 0.369, "step": 60 }, { "epoch": 2.1132075471698113, "grad_norm": 0.27827900648117065, "learning_rate": 1.8936326403234125e-05, "loss": 2.4112, "step": 70 }, { "epoch": 2.1132075471698113, "eval_loss": 2.257754325866699, "eval_runtime": 355.8986, "eval_samples_per_second": 0.368, "eval_steps_per_second": 0.368, "step": 70 }, { "epoch": 2.4150943396226414, "grad_norm": 0.26660796999931335, "learning_rate": 1.8354878114129368e-05, "loss": 2.3651, "step": 80 }, { "epoch": 2.4150943396226414, "eval_loss": 2.2276153564453125, "eval_runtime": 355.81, "eval_samples_per_second": 0.368, "eval_steps_per_second": 0.368, "step": 80 }, { "epoch": 2.7169811320754715, "grad_norm": 0.2869923412799835, "learning_rate": 1.766044443118978e-05, "loss": 2.3413, "step": 90 }, { "epoch": 2.7169811320754715, "eval_loss": 2.1996243000030518, "eval_runtime": 356.1237, "eval_samples_per_second": 0.368, "eval_steps_per_second": 0.368, "step": 90 }, { "epoch": 3.018867924528302, "grad_norm": 0.29539480805397034, "learning_rate": 1.686241637868734e-05, "loss": 2.309, "step": 100 }, { "epoch": 3.018867924528302, "eval_loss": 2.175126552581787, "eval_runtime": 355.2283, "eval_samples_per_second": 0.369, "eval_steps_per_second": 0.369, "step": 100 }, { "epoch": 3.3207547169811322, "grad_norm": 0.2869047224521637, "learning_rate": 1.5971585917027864e-05, "loss": 2.2908, "step": 110 }, { "epoch": 3.3207547169811322, "eval_loss": 2.1576268672943115, "eval_runtime": 354.9528, "eval_samples_per_second": 0.369, "eval_steps_per_second": 0.369, "step": 110 }, { "epoch": 3.6226415094339623, "grad_norm": 0.29853248596191406, "learning_rate": 1.5000000000000002e-05, "loss": 2.2538, "step": 120 }, { "epoch": 3.6226415094339623, "eval_loss": 2.1427152156829834, "eval_runtime": 355.8889, "eval_samples_per_second": 0.368, "eval_steps_per_second": 0.368, "step": 120 }, { "epoch": 3.9245283018867925, "grad_norm": 0.3196108043193817, "learning_rate": 1.396079766039157e-05, "loss": 2.262, "step": 130 }, { "epoch": 3.9245283018867925, "eval_loss": 2.1301257610321045, "eval_runtime": 356.3065, "eval_samples_per_second": 0.368, "eval_steps_per_second": 0.368, "step": 130 }, { "epoch": 4.226415094339623, "grad_norm": 0.32266777753829956, "learning_rate": 1.2868032327110904e-05, "loss": 2.2209, "step": 140 }, { "epoch": 4.226415094339623, "eval_loss": 2.119654893875122, "eval_runtime": 354.8329, "eval_samples_per_second": 0.369, "eval_steps_per_second": 0.369, "step": 140 }, { "epoch": 4.528301886792453, "grad_norm": 0.32279086112976074, "learning_rate": 1.1736481776669307e-05, "loss": 2.2248, "step": 150 }, { "epoch": 4.528301886792453, "eval_loss": 2.11035418510437, "eval_runtime": 355.4708, "eval_samples_per_second": 0.369, "eval_steps_per_second": 0.369, "step": 150 }, { "epoch": 4.830188679245283, "grad_norm": 0.3195267617702484, "learning_rate": 1.0581448289104759e-05, "loss": 2.2249, "step": 160 }, { "epoch": 4.830188679245283, "eval_loss": 2.102231025695801, "eval_runtime": 355.8543, "eval_samples_per_second": 0.368, "eval_steps_per_second": 0.368, "step": 160 }, { "epoch": 5.132075471698113, "grad_norm": 0.3310501277446747, "learning_rate": 9.418551710895243e-06, "loss": 2.2063, "step": 170 }, { "epoch": 5.132075471698113, "eval_loss": 2.094794750213623, "eval_runtime": 357.0843, "eval_samples_per_second": 0.367, "eval_steps_per_second": 0.367, "step": 170 }, { "epoch": 5.433962264150943, "grad_norm": 0.33955734968185425, "learning_rate": 8.263518223330698e-06, "loss": 2.1997, "step": 180 }, { "epoch": 5.433962264150943, "eval_loss": 2.0882046222686768, "eval_runtime": 356.6814, "eval_samples_per_second": 0.367, "eval_steps_per_second": 0.367, "step": 180 }, { "epoch": 5.735849056603773, "grad_norm": 0.3333246111869812, "learning_rate": 7.131967672889101e-06, "loss": 2.1985, "step": 190 }, { "epoch": 5.735849056603773, "eval_loss": 2.0826594829559326, "eval_runtime": 355.813, "eval_samples_per_second": 0.368, "eval_steps_per_second": 0.368, "step": 190 }, { "epoch": 6.037735849056604, "grad_norm": 0.34075412154197693, "learning_rate": 6.039202339608432e-06, "loss": 2.1992, "step": 200 }, { "epoch": 6.037735849056604, "eval_loss": 2.078132390975952, "eval_runtime": 357.7349, "eval_samples_per_second": 0.366, "eval_steps_per_second": 0.366, "step": 200 }, { "epoch": 6.339622641509434, "grad_norm": 0.3513352870941162, "learning_rate": 5.000000000000003e-06, "loss": 2.1851, "step": 210 }, { "epoch": 6.339622641509434, "eval_loss": 2.0746688842773438, "eval_runtime": 355.6074, "eval_samples_per_second": 0.368, "eval_steps_per_second": 0.368, "step": 210 }, { "epoch": 6.6415094339622645, "grad_norm": 0.359353631734848, "learning_rate": 4.028414082972141e-06, "loss": 2.178, "step": 220 }, { "epoch": 6.6415094339622645, "eval_loss": 2.0718541145324707, "eval_runtime": 354.8685, "eval_samples_per_second": 0.369, "eval_steps_per_second": 0.369, "step": 220 }, { "epoch": 6.943396226415095, "grad_norm": 0.3545942008495331, "learning_rate": 3.1375836213126653e-06, "loss": 2.1698, "step": 230 }, { "epoch": 6.943396226415095, "eval_loss": 2.069652557373047, "eval_runtime": 356.0491, "eval_samples_per_second": 0.368, "eval_steps_per_second": 0.368, "step": 230 }, { "epoch": 7.245283018867925, "grad_norm": 0.35219329595565796, "learning_rate": 2.339555568810221e-06, "loss": 2.1768, "step": 240 }, { "epoch": 7.245283018867925, "eval_loss": 2.068044424057007, "eval_runtime": 355.5515, "eval_samples_per_second": 0.368, "eval_steps_per_second": 0.368, "step": 240 }, { "epoch": 7.547169811320755, "grad_norm": 0.3606317341327667, "learning_rate": 1.6451218858706374e-06, "loss": 2.17, "step": 250 }, { "epoch": 7.547169811320755, "eval_loss": 2.0668373107910156, "eval_runtime": 355.9606, "eval_samples_per_second": 0.368, "eval_steps_per_second": 0.368, "step": 250 }, { "epoch": 7.849056603773585, "grad_norm": 0.345058798789978, "learning_rate": 1.0636735967658785e-06, "loss": 2.1755, "step": 260 }, { "epoch": 7.849056603773585, "eval_loss": 2.066086769104004, "eval_runtime": 354.824, "eval_samples_per_second": 0.369, "eval_steps_per_second": 0.369, "step": 260 }, { "epoch": 8.150943396226415, "grad_norm": 0.3428496718406677, "learning_rate": 6.030737921409169e-07, "loss": 2.1665, "step": 270 }, { "epoch": 8.150943396226415, "eval_loss": 2.065619707107544, "eval_runtime": 355.814, "eval_samples_per_second": 0.368, "eval_steps_per_second": 0.368, "step": 270 }, { "epoch": 8.452830188679245, "grad_norm": 0.34520721435546875, "learning_rate": 2.6955129420176193e-07, "loss": 2.1752, "step": 280 }, { "epoch": 8.452830188679245, "eval_loss": 2.0653860569000244, "eval_runtime": 357.4979, "eval_samples_per_second": 0.366, "eval_steps_per_second": 0.366, "step": 280 }, { "epoch": 8.754716981132075, "grad_norm": 0.3415066599845886, "learning_rate": 6.761642258056977e-08, "loss": 2.1724, "step": 290 }, { "epoch": 8.754716981132075, "eval_loss": 2.065295934677124, "eval_runtime": 357.1321, "eval_samples_per_second": 0.367, "eval_steps_per_second": 0.367, "step": 290 }, { "epoch": 9.056603773584905, "grad_norm": 0.35265278816223145, "learning_rate": 0.0, "loss": 2.168, "step": 300 }, { "epoch": 9.056603773584905, "eval_loss": 2.0652811527252197, "eval_runtime": 357.0379, "eval_samples_per_second": 0.367, "eval_steps_per_second": 0.367, "step": 300 } ], "logging_steps": 10, "max_steps": 300, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.97219261251584e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }