{ "best_metric": null, "best_model_checkpoint": null, "epoch": 13.636363636363637, "eval_steps": 500, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18, "grad_norm": 220.0, "learning_rate": 2.5e-05, "loss": 24.9691, "step": 1 }, { "epoch": 0.91, "grad_norm": 35.25, "learning_rate": 0.000125, "loss": 21.906, "step": 5 }, { "epoch": 0.91, "eval_loss": 7.653346538543701, "eval_runtime": 0.5608, "eval_samples_per_second": 3.567, "eval_steps_per_second": 1.783, "step": 5 }, { "epoch": 1.82, "grad_norm": 9.3125, "learning_rate": 0.00019956059820218982, "loss": 13.5603, "step": 10 }, { "epoch": 2.0, "eval_loss": 6.644189357757568, "eval_runtime": 0.5604, "eval_samples_per_second": 3.569, "eval_steps_per_second": 1.784, "step": 11 }, { "epoch": 2.73, "grad_norm": 4.625, "learning_rate": 0.00019466156752904343, "loss": 10.2605, "step": 15 }, { "epoch": 2.91, "eval_loss": 6.081549644470215, "eval_runtime": 0.5806, "eval_samples_per_second": 3.445, "eval_steps_per_second": 1.722, "step": 16 }, { "epoch": 3.64, "grad_norm": 15.125, "learning_rate": 0.00018458320592590975, "loss": 9.9129, "step": 20 }, { "epoch": 4.0, "eval_loss": 3.114753484725952, "eval_runtime": 0.5656, "eval_samples_per_second": 3.536, "eval_steps_per_second": 1.768, "step": 22 }, { "epoch": 4.55, "grad_norm": 2.96875, "learning_rate": 0.00016987694277788417, "loss": 4.5895, "step": 25 }, { "epoch": 4.91, "eval_loss": 1.6582958698272705, "eval_runtime": 0.5842, "eval_samples_per_second": 3.424, "eval_steps_per_second": 1.712, "step": 27 }, { "epoch": 5.45, "grad_norm": 3.171875, "learning_rate": 0.0001513474193514842, "loss": 1.6316, "step": 30 }, { "epoch": 6.0, "eval_loss": 1.415539264678955, "eval_runtime": 0.5662, "eval_samples_per_second": 3.532, "eval_steps_per_second": 1.766, "step": 33 }, { "epoch": 6.36, "grad_norm": 7.1875, "learning_rate": 0.0001300084635000341, "loss": 1.4115, "step": 35 }, { "epoch": 6.91, "eval_loss": 1.3542958498001099, "eval_runtime": 0.58, "eval_samples_per_second": 3.448, "eval_steps_per_second": 1.724, "step": 38 }, { "epoch": 7.27, "grad_norm": 1.296875, "learning_rate": 0.0001070276188945293, "loss": 1.2971, "step": 40 }, { "epoch": 8.0, "eval_loss": 1.313336968421936, "eval_runtime": 0.5751, "eval_samples_per_second": 3.478, "eval_steps_per_second": 1.739, "step": 44 }, { "epoch": 8.18, "grad_norm": 1.6171875, "learning_rate": 8.366226381814697e-05, "loss": 1.1321, "step": 45 }, { "epoch": 8.91, "eval_loss": 1.2903474569320679, "eval_runtime": 0.5848, "eval_samples_per_second": 3.42, "eval_steps_per_second": 1.71, "step": 49 }, { "epoch": 9.09, "grad_norm": 0.9375, "learning_rate": 6.119081473277501e-05, "loss": 1.062, "step": 50 }, { "epoch": 10.0, "grad_norm": 0.90234375, "learning_rate": 4.084277875864776e-05, "loss": 0.9739, "step": 55 }, { "epoch": 10.0, "eval_loss": 1.2820332050323486, "eval_runtime": 0.5646, "eval_samples_per_second": 3.542, "eval_steps_per_second": 1.771, "step": 55 }, { "epoch": 10.91, "grad_norm": 1.2109375, "learning_rate": 2.3731482188961818e-05, "loss": 0.917, "step": 60 }, { "epoch": 10.91, "eval_loss": 1.2888375520706177, "eval_runtime": 0.5836, "eval_samples_per_second": 3.427, "eval_steps_per_second": 1.714, "step": 60 }, { "epoch": 11.82, "grad_norm": 0.609375, "learning_rate": 1.0793155744261351e-05, "loss": 0.8541, "step": 65 }, { "epoch": 12.0, "eval_loss": 1.27806556224823, "eval_runtime": 0.5683, "eval_samples_per_second": 3.519, "eval_steps_per_second": 1.76, "step": 66 }, { "epoch": 12.73, "grad_norm": 0.6796875, "learning_rate": 2.735709467518699e-06, "loss": 0.8659, "step": 70 }, { "epoch": 12.91, "eval_loss": 1.2891546487808228, "eval_runtime": 0.7414, "eval_samples_per_second": 2.698, "eval_steps_per_second": 1.349, "step": 71 }, { "epoch": 13.64, "grad_norm": 0.515625, "learning_rate": 0.0, "loss": 0.8354, "step": 75 }, { "epoch": 13.64, "eval_loss": 1.2830009460449219, "eval_runtime": 0.5659, "eval_samples_per_second": 3.534, "eval_steps_per_second": 1.767, "step": 75 }, { "epoch": 13.64, "step": 75, "total_flos": 1.1503415671442637e+17, "train_loss": 4.788167775472005, "train_runtime": 283.7979, "train_samples_per_second": 4.651, "train_steps_per_second": 0.264 } ], "logging_steps": 5, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 100, "total_flos": 1.1503415671442637e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }