{ "best_metric": null, "best_model_checkpoint": null, "epoch": 50.63291139240506, "eval_steps": 25, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.2658227848101267, "grad_norm": 0.0, "learning_rate": 0.00019696969696969698, "loss": 3.653, "step": 25 }, { "epoch": 1.2658227848101267, "eval_loss": 3.791062116622925, "eval_runtime": 0.3728, "eval_samples_per_second": 53.654, "eval_steps_per_second": 8.048, "step": 25 }, { "epoch": 2.5316455696202533, "grad_norm": 0.0, "learning_rate": 0.00019191919191919191, "loss": 3.661, "step": 50 }, { "epoch": 2.5316455696202533, "eval_loss": 3.791062116622925, "eval_runtime": 0.366, "eval_samples_per_second": 54.652, "eval_steps_per_second": 8.198, "step": 50 }, { "epoch": 3.7974683544303796, "grad_norm": 0.0, "learning_rate": 0.00018686868686868687, "loss": 3.6566, "step": 75 }, { "epoch": 3.7974683544303796, "eval_loss": 3.791062116622925, "eval_runtime": 0.36, "eval_samples_per_second": 55.559, "eval_steps_per_second": 8.334, "step": 75 }, { "epoch": 5.063291139240507, "grad_norm": 0.0, "learning_rate": 0.00018181818181818183, "loss": 3.6562, "step": 100 }, { "epoch": 5.063291139240507, "eval_loss": 3.791062116622925, "eval_runtime": 0.3575, "eval_samples_per_second": 55.943, "eval_steps_per_second": 8.391, "step": 100 }, { "epoch": 6.329113924050633, "grad_norm": 0.0, "learning_rate": 0.0001767676767676768, "loss": 3.6384, "step": 125 }, { "epoch": 6.329113924050633, "eval_loss": 3.791062116622925, "eval_runtime": 0.3524, "eval_samples_per_second": 56.761, "eval_steps_per_second": 8.514, "step": 125 }, { "epoch": 7.594936708860759, "grad_norm": 0.0, "learning_rate": 0.00017171717171717173, "loss": 3.6585, "step": 150 }, { "epoch": 7.594936708860759, "eval_loss": 3.791062116622925, "eval_runtime": 0.3546, "eval_samples_per_second": 56.397, "eval_steps_per_second": 8.459, "step": 150 }, { "epoch": 8.860759493670885, "grad_norm": 0.0, "learning_rate": 0.0001666666666666667, "loss": 3.6454, "step": 175 }, { "epoch": 8.860759493670885, "eval_loss": 3.791062116622925, "eval_runtime": 0.3607, "eval_samples_per_second": 55.454, "eval_steps_per_second": 8.318, "step": 175 }, { "epoch": 10.126582278481013, "grad_norm": 0.0, "learning_rate": 0.00016161616161616162, "loss": 3.667, "step": 200 }, { "epoch": 10.126582278481013, "eval_loss": 3.791062116622925, "eval_runtime": 0.3664, "eval_samples_per_second": 54.588, "eval_steps_per_second": 8.188, "step": 200 }, { "epoch": 11.39240506329114, "grad_norm": 0.0, "learning_rate": 0.00015656565656565658, "loss": 3.6564, "step": 225 }, { "epoch": 11.39240506329114, "eval_loss": 3.791062116622925, "eval_runtime": 0.366, "eval_samples_per_second": 54.65, "eval_steps_per_second": 8.198, "step": 225 }, { "epoch": 12.658227848101266, "grad_norm": 0.0, "learning_rate": 0.00015151515151515152, "loss": 3.6437, "step": 250 }, { "epoch": 12.658227848101266, "eval_loss": 3.791062116622925, "eval_runtime": 0.3608, "eval_samples_per_second": 55.435, "eval_steps_per_second": 8.315, "step": 250 }, { "epoch": 13.924050632911392, "grad_norm": 0.0, "learning_rate": 0.00014646464646464648, "loss": 3.6569, "step": 275 }, { "epoch": 13.924050632911392, "eval_loss": 3.791062116622925, "eval_runtime": 0.3681, "eval_samples_per_second": 54.334, "eval_steps_per_second": 8.15, "step": 275 }, { "epoch": 15.189873417721518, "grad_norm": 0.0, "learning_rate": 0.0001414141414141414, "loss": 3.6336, "step": 300 }, { "epoch": 15.189873417721518, "eval_loss": 3.791062116622925, "eval_runtime": 0.3786, "eval_samples_per_second": 52.829, "eval_steps_per_second": 7.924, "step": 300 }, { "epoch": 16.455696202531644, "grad_norm": 0.0, "learning_rate": 0.00013636363636363637, "loss": 3.6463, "step": 325 }, { "epoch": 16.455696202531644, "eval_loss": 3.791062116622925, "eval_runtime": 0.3653, "eval_samples_per_second": 54.756, "eval_steps_per_second": 8.213, "step": 325 }, { "epoch": 17.72151898734177, "grad_norm": 0.0, "learning_rate": 0.00013131313131313133, "loss": 3.6615, "step": 350 }, { "epoch": 17.72151898734177, "eval_loss": 3.791062116622925, "eval_runtime": 0.3677, "eval_samples_per_second": 54.397, "eval_steps_per_second": 8.16, "step": 350 }, { "epoch": 18.9873417721519, "grad_norm": 0.0, "learning_rate": 0.00012626262626262626, "loss": 3.6409, "step": 375 }, { "epoch": 18.9873417721519, "eval_loss": 3.791062116622925, "eval_runtime": 0.365, "eval_samples_per_second": 54.799, "eval_steps_per_second": 8.22, "step": 375 }, { "epoch": 20.253164556962027, "grad_norm": 0.0, "learning_rate": 0.00012121212121212122, "loss": 3.647, "step": 400 }, { "epoch": 20.253164556962027, "eval_loss": 3.791062116622925, "eval_runtime": 0.3733, "eval_samples_per_second": 53.579, "eval_steps_per_second": 8.037, "step": 400 }, { "epoch": 21.518987341772153, "grad_norm": 0.0, "learning_rate": 0.00011616161616161616, "loss": 3.6702, "step": 425 }, { "epoch": 21.518987341772153, "eval_loss": 3.791062116622925, "eval_runtime": 0.3653, "eval_samples_per_second": 54.748, "eval_steps_per_second": 8.212, "step": 425 }, { "epoch": 22.78481012658228, "grad_norm": 0.0, "learning_rate": 0.00011111111111111112, "loss": 3.6336, "step": 450 }, { "epoch": 22.78481012658228, "eval_loss": 3.791062116622925, "eval_runtime": 0.3648, "eval_samples_per_second": 54.832, "eval_steps_per_second": 8.225, "step": 450 }, { "epoch": 24.050632911392405, "grad_norm": 0.0, "learning_rate": 0.00010606060606060606, "loss": 3.6546, "step": 475 }, { "epoch": 24.050632911392405, "eval_loss": 3.791062116622925, "eval_runtime": 0.3614, "eval_samples_per_second": 55.34, "eval_steps_per_second": 8.301, "step": 475 }, { "epoch": 25.31645569620253, "grad_norm": 0.0, "learning_rate": 0.00010101010101010102, "loss": 3.6487, "step": 500 }, { "epoch": 25.31645569620253, "eval_loss": 3.791062116622925, "eval_runtime": 0.3549, "eval_samples_per_second": 56.359, "eval_steps_per_second": 8.454, "step": 500 }, { "epoch": 26.582278481012658, "grad_norm": 0.0, "learning_rate": 9.595959595959596e-05, "loss": 3.6221, "step": 525 }, { "epoch": 26.582278481012658, "eval_loss": 3.791062116622925, "eval_runtime": 0.3694, "eval_samples_per_second": 54.137, "eval_steps_per_second": 8.121, "step": 525 }, { "epoch": 27.848101265822784, "grad_norm": 0.0, "learning_rate": 9.090909090909092e-05, "loss": 3.644, "step": 550 }, { "epoch": 27.848101265822784, "eval_loss": 3.791062116622925, "eval_runtime": 0.3655, "eval_samples_per_second": 54.72, "eval_steps_per_second": 8.208, "step": 550 }, { "epoch": 29.11392405063291, "grad_norm": 0.0, "learning_rate": 8.585858585858586e-05, "loss": 3.6389, "step": 575 }, { "epoch": 29.11392405063291, "eval_loss": 3.791062116622925, "eval_runtime": 0.3643, "eval_samples_per_second": 54.896, "eval_steps_per_second": 8.234, "step": 575 }, { "epoch": 30.379746835443036, "grad_norm": 0.0, "learning_rate": 8.080808080808081e-05, "loss": 3.656, "step": 600 }, { "epoch": 30.379746835443036, "eval_loss": 3.791062116622925, "eval_runtime": 0.3693, "eval_samples_per_second": 54.156, "eval_steps_per_second": 8.123, "step": 600 }, { "epoch": 31.645569620253166, "grad_norm": 0.0, "learning_rate": 7.575757575757576e-05, "loss": 3.6414, "step": 625 }, { "epoch": 31.645569620253166, "eval_loss": 3.791062116622925, "eval_runtime": 0.3634, "eval_samples_per_second": 55.039, "eval_steps_per_second": 8.256, "step": 625 }, { "epoch": 32.91139240506329, "grad_norm": 0.0, "learning_rate": 7.07070707070707e-05, "loss": 3.6422, "step": 650 }, { "epoch": 32.91139240506329, "eval_loss": 3.791062116622925, "eval_runtime": 0.3731, "eval_samples_per_second": 53.611, "eval_steps_per_second": 8.042, "step": 650 }, { "epoch": 34.177215189873415, "grad_norm": 0.0, "learning_rate": 6.565656565656566e-05, "loss": 3.6396, "step": 675 }, { "epoch": 34.177215189873415, "eval_loss": 3.791062116622925, "eval_runtime": 0.3707, "eval_samples_per_second": 53.955, "eval_steps_per_second": 8.093, "step": 675 }, { "epoch": 35.44303797468354, "grad_norm": 0.0, "learning_rate": 6.060606060606061e-05, "loss": 3.6658, "step": 700 }, { "epoch": 35.44303797468354, "eval_loss": 3.791062116622925, "eval_runtime": 0.3884, "eval_samples_per_second": 51.489, "eval_steps_per_second": 7.723, "step": 700 }, { "epoch": 36.70886075949367, "grad_norm": 0.0, "learning_rate": 5.555555555555556e-05, "loss": 3.6473, "step": 725 }, { "epoch": 36.70886075949367, "eval_loss": 3.791062116622925, "eval_runtime": 0.3649, "eval_samples_per_second": 54.809, "eval_steps_per_second": 8.221, "step": 725 }, { "epoch": 37.9746835443038, "grad_norm": 0.0, "learning_rate": 5.050505050505051e-05, "loss": 3.6263, "step": 750 }, { "epoch": 37.9746835443038, "eval_loss": 3.791062116622925, "eval_runtime": 0.3636, "eval_samples_per_second": 55.009, "eval_steps_per_second": 8.251, "step": 750 }, { "epoch": 39.24050632911393, "grad_norm": 0.0, "learning_rate": 4.545454545454546e-05, "loss": 3.6617, "step": 775 }, { "epoch": 39.24050632911393, "eval_loss": 3.791062116622925, "eval_runtime": 0.3597, "eval_samples_per_second": 55.595, "eval_steps_per_second": 8.339, "step": 775 }, { "epoch": 40.50632911392405, "grad_norm": 0.0, "learning_rate": 4.0404040404040405e-05, "loss": 3.6533, "step": 800 }, { "epoch": 40.50632911392405, "eval_loss": 3.791062116622925, "eval_runtime": 0.3586, "eval_samples_per_second": 55.77, "eval_steps_per_second": 8.366, "step": 800 }, { "epoch": 41.77215189873418, "grad_norm": 0.0, "learning_rate": 3.535353535353535e-05, "loss": 3.6451, "step": 825 }, { "epoch": 41.77215189873418, "eval_loss": 3.791062116622925, "eval_runtime": 0.365, "eval_samples_per_second": 54.797, "eval_steps_per_second": 8.22, "step": 825 }, { "epoch": 43.037974683544306, "grad_norm": 0.0, "learning_rate": 3.0303030303030306e-05, "loss": 3.6378, "step": 850 }, { "epoch": 43.037974683544306, "eval_loss": 3.791062116622925, "eval_runtime": 0.3685, "eval_samples_per_second": 54.277, "eval_steps_per_second": 8.142, "step": 850 }, { "epoch": 44.30379746835443, "grad_norm": 0.0, "learning_rate": 2.5252525252525256e-05, "loss": 3.6434, "step": 875 }, { "epoch": 44.30379746835443, "eval_loss": 3.791062116622925, "eval_runtime": 0.3608, "eval_samples_per_second": 55.439, "eval_steps_per_second": 8.316, "step": 875 }, { "epoch": 45.56962025316456, "grad_norm": 0.0, "learning_rate": 2.0202020202020203e-05, "loss": 3.6663, "step": 900 }, { "epoch": 45.56962025316456, "eval_loss": 3.791062116622925, "eval_runtime": 0.3668, "eval_samples_per_second": 54.52, "eval_steps_per_second": 8.178, "step": 900 }, { "epoch": 46.835443037974684, "grad_norm": 0.0, "learning_rate": 1.5151515151515153e-05, "loss": 3.6625, "step": 925 }, { "epoch": 46.835443037974684, "eval_loss": 3.791062116622925, "eval_runtime": 0.3641, "eval_samples_per_second": 54.926, "eval_steps_per_second": 8.239, "step": 925 }, { "epoch": 48.10126582278481, "grad_norm": 0.0, "learning_rate": 1.0101010101010101e-05, "loss": 3.6488, "step": 950 }, { "epoch": 48.10126582278481, "eval_loss": 3.791062116622925, "eval_runtime": 0.3619, "eval_samples_per_second": 55.257, "eval_steps_per_second": 8.289, "step": 950 }, { "epoch": 49.36708860759494, "grad_norm": 0.0, "learning_rate": 5.050505050505051e-06, "loss": 3.6441, "step": 975 }, { "epoch": 49.36708860759494, "eval_loss": 3.791062116622925, "eval_runtime": 0.3685, "eval_samples_per_second": 54.269, "eval_steps_per_second": 8.14, "step": 975 }, { "epoch": 50.63291139240506, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 3.6187, "step": 1000 }, { "epoch": 50.63291139240506, "eval_loss": 3.791062116622925, "eval_runtime": 0.3719, "eval_samples_per_second": 53.775, "eval_steps_per_second": 8.066, "step": 1000 } ], "logging_steps": 25, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 53, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2138672638863360.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }