{ "best_metric": null, "best_model_checkpoint": null, "epoch": 50.0, "eval_steps": 500, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.33, "grad_norm": 74.0, "learning_rate": 1.3333333333333333e-05, "loss": 17.168, "step": 1 }, { "epoch": 1.0, "eval_loss": 15.087593078613281, "eval_runtime": 1.3097, "eval_samples_per_second": 1.527, "eval_steps_per_second": 0.764, "step": 3 }, { "epoch": 1.67, "grad_norm": 67.5, "learning_rate": 6.666666666666667e-05, "loss": 14.9207, "step": 5 }, { "epoch": 2.0, "eval_loss": 8.764444351196289, "eval_runtime": 1.3178, "eval_samples_per_second": 1.518, "eval_steps_per_second": 0.759, "step": 6 }, { "epoch": 3.0, "eval_loss": 4.842519760131836, "eval_runtime": 1.321, "eval_samples_per_second": 1.514, "eval_steps_per_second": 0.757, "step": 9 }, { "epoch": 3.33, "grad_norm": 15.875, "learning_rate": 0.00013333333333333334, "loss": 7.3214, "step": 10 }, { "epoch": 4.0, "eval_loss": 3.023890495300293, "eval_runtime": 1.3205, "eval_samples_per_second": 1.515, "eval_steps_per_second": 0.757, "step": 12 }, { "epoch": 5.0, "grad_norm": 5.53125, "learning_rate": 0.0002, "loss": 2.9627, "step": 15 }, { "epoch": 5.0, "eval_loss": 2.256518602371216, "eval_runtime": 1.3203, "eval_samples_per_second": 1.515, "eval_steps_per_second": 0.757, "step": 15 }, { "epoch": 6.0, "eval_loss": 1.8792051076889038, "eval_runtime": 1.3249, "eval_samples_per_second": 1.51, "eval_steps_per_second": 0.755, "step": 18 }, { "epoch": 6.67, "grad_norm": 1.640625, "learning_rate": 0.00019932383577419432, "loss": 1.7971, "step": 20 }, { "epoch": 7.0, "eval_loss": 1.7647607326507568, "eval_runtime": 1.324, "eval_samples_per_second": 1.511, "eval_steps_per_second": 0.755, "step": 21 }, { "epoch": 8.0, "eval_loss": 1.7012015581130981, "eval_runtime": 1.3248, "eval_samples_per_second": 1.51, "eval_steps_per_second": 0.755, "step": 24 }, { "epoch": 8.33, "grad_norm": 2.234375, "learning_rate": 0.00019730448705798239, "loss": 1.4939, "step": 25 }, { "epoch": 9.0, "eval_loss": 1.547886610031128, "eval_runtime": 1.3259, "eval_samples_per_second": 1.508, "eval_steps_per_second": 0.754, "step": 27 }, { "epoch": 10.0, "grad_norm": 1.78125, "learning_rate": 0.00019396926207859084, "loss": 1.2756, "step": 30 }, { "epoch": 10.0, "eval_loss": 1.5050724744796753, "eval_runtime": 1.3222, "eval_samples_per_second": 1.513, "eval_steps_per_second": 0.756, "step": 30 }, { "epoch": 11.0, "eval_loss": 1.3975391387939453, "eval_runtime": 1.3256, "eval_samples_per_second": 1.509, "eval_steps_per_second": 0.754, "step": 33 }, { "epoch": 11.67, "grad_norm": 1.1796875, "learning_rate": 0.00018936326403234125, "loss": 1.0884, "step": 35 }, { "epoch": 12.0, "eval_loss": 1.444014549255371, "eval_runtime": 1.325, "eval_samples_per_second": 1.509, "eval_steps_per_second": 0.755, "step": 36 }, { "epoch": 13.0, "eval_loss": 1.413475513458252, "eval_runtime": 1.3258, "eval_samples_per_second": 1.509, "eval_steps_per_second": 0.754, "step": 39 }, { "epoch": 13.33, "grad_norm": 1.3046875, "learning_rate": 0.00018354878114129367, "loss": 0.9429, "step": 40 }, { "epoch": 14.0, "eval_loss": 1.4587093591690063, "eval_runtime": 1.3247, "eval_samples_per_second": 1.51, "eval_steps_per_second": 0.755, "step": 42 }, { "epoch": 15.0, "grad_norm": 2.21875, "learning_rate": 0.0001766044443118978, "loss": 0.7653, "step": 45 }, { "epoch": 15.0, "eval_loss": 1.487448811531067, "eval_runtime": 1.3231, "eval_samples_per_second": 1.512, "eval_steps_per_second": 0.756, "step": 45 }, { "epoch": 16.0, "eval_loss": 1.5958000421524048, "eval_runtime": 1.3242, "eval_samples_per_second": 1.51, "eval_steps_per_second": 0.755, "step": 48 }, { "epoch": 16.67, "grad_norm": 0.9921875, "learning_rate": 0.0001686241637868734, "loss": 0.6424, "step": 50 }, { "epoch": 17.0, "eval_loss": 1.592842698097229, "eval_runtime": 1.3242, "eval_samples_per_second": 1.51, "eval_steps_per_second": 0.755, "step": 51 }, { "epoch": 18.0, "eval_loss": 1.683807373046875, "eval_runtime": 1.3252, "eval_samples_per_second": 1.509, "eval_steps_per_second": 0.755, "step": 54 }, { "epoch": 18.33, "grad_norm": 4.78125, "learning_rate": 0.00015971585917027862, "loss": 0.5346, "step": 55 }, { "epoch": 19.0, "eval_loss": 1.8263951539993286, "eval_runtime": 1.3248, "eval_samples_per_second": 1.51, "eval_steps_per_second": 0.755, "step": 57 }, { "epoch": 20.0, "grad_norm": 0.99609375, "learning_rate": 0.00015000000000000001, "loss": 0.4249, "step": 60 }, { "epoch": 20.0, "eval_loss": 1.9654637575149536, "eval_runtime": 1.3233, "eval_samples_per_second": 1.511, "eval_steps_per_second": 0.756, "step": 60 }, { "epoch": 21.0, "eval_loss": 2.137000560760498, "eval_runtime": 1.3247, "eval_samples_per_second": 1.51, "eval_steps_per_second": 0.755, "step": 63 }, { "epoch": 21.67, "grad_norm": 1.484375, "learning_rate": 0.0001396079766039157, "loss": 0.3347, "step": 65 }, { "epoch": 22.0, "eval_loss": 2.698075294494629, "eval_runtime": 1.3255, "eval_samples_per_second": 1.509, "eval_steps_per_second": 0.754, "step": 66 }, { "epoch": 23.0, "eval_loss": 2.713052272796631, "eval_runtime": 1.3259, "eval_samples_per_second": 1.508, "eval_steps_per_second": 0.754, "step": 69 }, { "epoch": 23.33, "grad_norm": 1.71875, "learning_rate": 0.00012868032327110904, "loss": 0.2655, "step": 70 }, { "epoch": 24.0, "eval_loss": 2.7668490409851074, "eval_runtime": 1.3252, "eval_samples_per_second": 1.509, "eval_steps_per_second": 0.755, "step": 72 }, { "epoch": 25.0, "grad_norm": 0.734375, "learning_rate": 0.00011736481776669306, "loss": 0.2026, "step": 75 }, { "epoch": 25.0, "eval_loss": 2.8614566326141357, "eval_runtime": 1.3226, "eval_samples_per_second": 1.512, "eval_steps_per_second": 0.756, "step": 75 }, { "epoch": 26.0, "eval_loss": 3.1595633029937744, "eval_runtime": 1.3267, "eval_samples_per_second": 1.507, "eval_steps_per_second": 0.754, "step": 78 }, { "epoch": 26.67, "grad_norm": 0.73828125, "learning_rate": 0.00010581448289104758, "loss": 0.1588, "step": 80 }, { "epoch": 27.0, "eval_loss": 3.3285796642303467, "eval_runtime": 1.3243, "eval_samples_per_second": 1.51, "eval_steps_per_second": 0.755, "step": 81 }, { "epoch": 28.0, "eval_loss": 3.546278953552246, "eval_runtime": 1.3248, "eval_samples_per_second": 1.51, "eval_steps_per_second": 0.755, "step": 84 }, { "epoch": 28.33, "grad_norm": 1.015625, "learning_rate": 9.418551710895243e-05, "loss": 0.1319, "step": 85 }, { "epoch": 29.0, "eval_loss": 3.3686463832855225, "eval_runtime": 1.3245, "eval_samples_per_second": 1.51, "eval_steps_per_second": 0.755, "step": 87 }, { "epoch": 30.0, "grad_norm": 0.56640625, "learning_rate": 8.263518223330697e-05, "loss": 0.1111, "step": 90 }, { "epoch": 30.0, "eval_loss": 3.685863733291626, "eval_runtime": 1.3215, "eval_samples_per_second": 1.513, "eval_steps_per_second": 0.757, "step": 90 }, { "epoch": 31.0, "eval_loss": 3.780993700027466, "eval_runtime": 1.3257, "eval_samples_per_second": 1.509, "eval_steps_per_second": 0.754, "step": 93 }, { "epoch": 31.67, "grad_norm": 0.4375, "learning_rate": 7.131967672889101e-05, "loss": 0.0939, "step": 95 }, { "epoch": 32.0, "eval_loss": 3.7559256553649902, "eval_runtime": 1.3244, "eval_samples_per_second": 1.51, "eval_steps_per_second": 0.755, "step": 96 }, { "epoch": 33.0, "eval_loss": 3.916355848312378, "eval_runtime": 1.3252, "eval_samples_per_second": 1.509, "eval_steps_per_second": 0.755, "step": 99 }, { "epoch": 33.33, "grad_norm": 0.390625, "learning_rate": 6.039202339608432e-05, "loss": 0.082, "step": 100 }, { "epoch": 34.0, "eval_loss": 3.9693491458892822, "eval_runtime": 1.3217, "eval_samples_per_second": 1.513, "eval_steps_per_second": 0.757, "step": 102 }, { "epoch": 35.0, "grad_norm": 0.283203125, "learning_rate": 5.000000000000002e-05, "loss": 0.0709, "step": 105 }, { "epoch": 35.0, "eval_loss": 4.04301118850708, "eval_runtime": 1.3186, "eval_samples_per_second": 1.517, "eval_steps_per_second": 0.758, "step": 105 }, { "epoch": 36.0, "eval_loss": 4.101677417755127, "eval_runtime": 1.3238, "eval_samples_per_second": 1.511, "eval_steps_per_second": 0.755, "step": 108 }, { "epoch": 36.67, "grad_norm": 0.27734375, "learning_rate": 4.028414082972141e-05, "loss": 0.0638, "step": 110 }, { "epoch": 37.0, "eval_loss": 4.144949913024902, "eval_runtime": 1.3248, "eval_samples_per_second": 1.51, "eval_steps_per_second": 0.755, "step": 111 }, { "epoch": 38.0, "eval_loss": 4.1638994216918945, "eval_runtime": 1.3218, "eval_samples_per_second": 1.513, "eval_steps_per_second": 0.757, "step": 114 }, { "epoch": 38.33, "grad_norm": 0.3125, "learning_rate": 3.137583621312665e-05, "loss": 0.0597, "step": 115 }, { "epoch": 39.0, "eval_loss": 4.187974452972412, "eval_runtime": 1.3246, "eval_samples_per_second": 1.51, "eval_steps_per_second": 0.755, "step": 117 }, { "epoch": 40.0, "grad_norm": 0.318359375, "learning_rate": 2.339555568810221e-05, "loss": 0.0556, "step": 120 }, { "epoch": 40.0, "eval_loss": 4.212304592132568, "eval_runtime": 1.3229, "eval_samples_per_second": 1.512, "eval_steps_per_second": 0.756, "step": 120 }, { "epoch": 41.0, "eval_loss": 4.219560623168945, "eval_runtime": 1.3269, "eval_samples_per_second": 1.507, "eval_steps_per_second": 0.754, "step": 123 }, { "epoch": 41.67, "grad_norm": 0.201171875, "learning_rate": 1.6451218858706374e-05, "loss": 0.0535, "step": 125 }, { "epoch": 42.0, "eval_loss": 4.226192951202393, "eval_runtime": 1.3243, "eval_samples_per_second": 1.51, "eval_steps_per_second": 0.755, "step": 126 }, { "epoch": 43.0, "eval_loss": 4.230067729949951, "eval_runtime": 1.3252, "eval_samples_per_second": 1.509, "eval_steps_per_second": 0.755, "step": 129 }, { "epoch": 43.33, "grad_norm": 0.2490234375, "learning_rate": 1.0636735967658784e-05, "loss": 0.0521, "step": 130 }, { "epoch": 44.0, "eval_loss": 4.231449604034424, "eval_runtime": 1.3262, "eval_samples_per_second": 1.508, "eval_steps_per_second": 0.754, "step": 132 }, { "epoch": 45.0, "grad_norm": 0.10986328125, "learning_rate": 6.030737921409169e-06, "loss": 0.0521, "step": 135 }, { "epoch": 45.0, "eval_loss": 4.2365241050720215, "eval_runtime": 1.3238, "eval_samples_per_second": 1.511, "eval_steps_per_second": 0.755, "step": 135 }, { "epoch": 46.0, "eval_loss": 4.234958171844482, "eval_runtime": 1.3255, "eval_samples_per_second": 1.509, "eval_steps_per_second": 0.754, "step": 138 }, { "epoch": 46.67, "grad_norm": 0.12451171875, "learning_rate": 2.6955129420176196e-06, "loss": 0.0525, "step": 140 }, { "epoch": 47.0, "eval_loss": 4.236445426940918, "eval_runtime": 1.3261, "eval_samples_per_second": 1.508, "eval_steps_per_second": 0.754, "step": 141 }, { "epoch": 48.0, "eval_loss": 4.231955051422119, "eval_runtime": 1.3237, "eval_samples_per_second": 1.511, "eval_steps_per_second": 0.755, "step": 144 }, { "epoch": 48.33, "grad_norm": 0.1357421875, "learning_rate": 6.761642258056978e-07, "loss": 0.0509, "step": 145 }, { "epoch": 49.0, "eval_loss": 4.236112594604492, "eval_runtime": 1.3264, "eval_samples_per_second": 1.508, "eval_steps_per_second": 0.754, "step": 147 }, { "epoch": 50.0, "grad_norm": 0.09326171875, "learning_rate": 0.0, "loss": 0.0505, "step": 150 }, { "epoch": 50.0, "eval_loss": 4.238922595977783, "eval_runtime": 1.3223, "eval_samples_per_second": 1.513, "eval_steps_per_second": 0.756, "step": 150 }, { "epoch": 50.0, "step": 150, "total_flos": 4.601366279314473e+17, "train_loss": 1.2187181319793066, "train_runtime": 1336.0597, "train_samples_per_second": 3.256, "train_steps_per_second": 0.112 } ], "logging_steps": 5, "max_steps": 150, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 100, "total_flos": 4.601366279314473e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }