{ "best_metric": null, "best_model_checkpoint": null, "epoch": 13.636363636363637, "eval_steps": 500, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18, "grad_norm": 220.0, "learning_rate": 2.5e-05, "loss": 24.9691, "step": 1 }, { "epoch": 0.91, "grad_norm": 35.25, "learning_rate": 0.000125, "loss": 21.9058, "step": 5 }, { "epoch": 0.91, "eval_loss": 7.6562018394470215, "eval_runtime": 0.5686, "eval_samples_per_second": 3.517, "eval_steps_per_second": 1.759, "step": 5 }, { "epoch": 1.82, "grad_norm": 9.0, "learning_rate": 0.00019956059820218982, "loss": 13.5645, "step": 10 }, { "epoch": 2.0, "eval_loss": 6.635939121246338, "eval_runtime": 0.5597, "eval_samples_per_second": 3.573, "eval_steps_per_second": 1.787, "step": 11 }, { "epoch": 2.73, "grad_norm": 4.5625, "learning_rate": 0.00019466156752904343, "loss": 10.2613, "step": 15 }, { "epoch": 2.91, "eval_loss": 6.0754241943359375, "eval_runtime": 0.5741, "eval_samples_per_second": 3.484, "eval_steps_per_second": 1.742, "step": 16 }, { "epoch": 3.64, "grad_norm": 14.6875, "learning_rate": 0.00018458320592590975, "loss": 9.903, "step": 20 }, { "epoch": 4.0, "eval_loss": 3.1116435527801514, "eval_runtime": 0.5647, "eval_samples_per_second": 3.542, "eval_steps_per_second": 1.771, "step": 22 }, { "epoch": 4.55, "grad_norm": 32.75, "learning_rate": 0.00016987694277788417, "loss": 4.594, "step": 25 }, { "epoch": 4.91, "eval_loss": 1.6371122598648071, "eval_runtime": 0.5752, "eval_samples_per_second": 3.477, "eval_steps_per_second": 1.739, "step": 27 }, { "epoch": 5.45, "grad_norm": 3.125, "learning_rate": 0.0001513474193514842, "loss": 1.6122, "step": 30 }, { "epoch": 6.0, "eval_loss": 1.4159561395645142, "eval_runtime": 0.5662, "eval_samples_per_second": 3.532, "eval_steps_per_second": 1.766, "step": 33 }, { "epoch": 6.36, "grad_norm": 1.03125, "learning_rate": 0.0001300084635000341, "loss": 1.3971, "step": 35 }, { "epoch": 6.91, "eval_loss": 1.3411411046981812, "eval_runtime": 0.5782, "eval_samples_per_second": 3.459, "eval_steps_per_second": 1.73, "step": 38 }, { "epoch": 7.27, "grad_norm": 1.65625, "learning_rate": 0.0001070276188945293, "loss": 1.2757, "step": 40 }, { "epoch": 8.0, "eval_loss": 1.307417631149292, "eval_runtime": 0.5668, "eval_samples_per_second": 3.529, "eval_steps_per_second": 1.764, "step": 44 }, { "epoch": 8.18, "grad_norm": 1.796875, "learning_rate": 8.366226381814697e-05, "loss": 1.1233, "step": 45 }, { "epoch": 8.91, "eval_loss": 1.275590419769287, "eval_runtime": 0.5782, "eval_samples_per_second": 3.459, "eval_steps_per_second": 1.729, "step": 49 }, { "epoch": 9.09, "grad_norm": 0.99609375, "learning_rate": 6.119081473277501e-05, "loss": 1.0599, "step": 50 }, { "epoch": 10.0, "grad_norm": 0.90234375, "learning_rate": 4.084277875864776e-05, "loss": 0.9741, "step": 55 }, { "epoch": 10.0, "eval_loss": 1.273629903793335, "eval_runtime": 0.5636, "eval_samples_per_second": 3.549, "eval_steps_per_second": 1.774, "step": 55 }, { "epoch": 10.91, "grad_norm": 0.62890625, "learning_rate": 2.3731482188961818e-05, "loss": 0.9266, "step": 60 }, { "epoch": 10.91, "eval_loss": 1.279077410697937, "eval_runtime": 0.5768, "eval_samples_per_second": 3.467, "eval_steps_per_second": 1.734, "step": 60 }, { "epoch": 11.82, "grad_norm": 0.5234375, "learning_rate": 1.0793155744261351e-05, "loss": 0.8584, "step": 65 }, { "epoch": 12.0, "eval_loss": 1.2753326892852783, "eval_runtime": 0.5671, "eval_samples_per_second": 3.527, "eval_steps_per_second": 1.763, "step": 66 }, { "epoch": 12.73, "grad_norm": 0.515625, "learning_rate": 2.735709467518699e-06, "loss": 0.8714, "step": 70 }, { "epoch": 12.91, "eval_loss": 1.2842026948928833, "eval_runtime": 0.7036, "eval_samples_per_second": 2.842, "eval_steps_per_second": 1.421, "step": 71 }, { "epoch": 13.64, "grad_norm": 0.56640625, "learning_rate": 0.0, "loss": 0.8421, "step": 75 }, { "epoch": 13.64, "eval_loss": 1.2808171510696411, "eval_runtime": 0.5646, "eval_samples_per_second": 3.543, "eval_steps_per_second": 1.771, "step": 75 }, { "epoch": 13.64, "step": 75, "total_flos": 1.1503415671442637e+17, "train_loss": 4.785474745432536, "train_runtime": 283.6494, "train_samples_per_second": 4.654, "train_steps_per_second": 0.264 } ], "logging_steps": 5, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 100, "total_flos": 1.1503415671442637e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }