{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 1830, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.273224043715847, "grad_norm": 1.0141575336456299, "learning_rate": 0.00019995065603657316, "loss": 1.8945, "step": 50 }, { "epoch": 0.546448087431694, "grad_norm": 0.9003917574882507, "learning_rate": 0.00019980267284282717, "loss": 1.3737, "step": 100 }, { "epoch": 0.819672131147541, "grad_norm": 0.855492889881134, "learning_rate": 0.00019955619646030802, "loss": 1.3506, "step": 150 }, { "epoch": 1.092896174863388, "grad_norm": 1.1789681911468506, "learning_rate": 0.0001992114701314478, "loss": 1.2946, "step": 200 }, { "epoch": 1.366120218579235, "grad_norm": 0.9413456320762634, "learning_rate": 0.00019876883405951377, "loss": 1.2914, "step": 250 }, { "epoch": 1.639344262295082, "grad_norm": 0.8401021957397461, "learning_rate": 0.0001982287250728689, "loss": 1.2756, "step": 300 }, { "epoch": 1.9125683060109289, "grad_norm": 0.9392536878585815, "learning_rate": 0.00019759167619387476, "loss": 1.2785, "step": 350 }, { "epoch": 2.185792349726776, "grad_norm": 0.918136477470398, "learning_rate": 0.0001968583161128631, "loss": 1.2199, "step": 400 }, { "epoch": 2.459016393442623, "grad_norm": 0.9809663891792297, "learning_rate": 0.0001960293685676943, "loss": 1.201, "step": 450 }, { "epoch": 2.73224043715847, "grad_norm": 1.0254710912704468, "learning_rate": 0.00019510565162951537, "loss": 1.1842, "step": 500 }, { "epoch": 3.0054644808743167, "grad_norm": 1.1089431047439575, "learning_rate": 0.00019408807689542257, "loss": 1.1819, "step": 550 }, { "epoch": 3.278688524590164, "grad_norm": 1.2321062088012695, "learning_rate": 0.00019297764858882514, "loss": 1.1113, "step": 600 }, { "epoch": 3.551912568306011, "grad_norm": 1.0911256074905396, "learning_rate": 0.00019177546256839812, "loss": 1.1212, "step": 650 }, { "epoch": 3.8251366120218577, "grad_norm": 1.1500061750411987, "learning_rate": 0.00019048270524660196, "loss": 1.1247, "step": 700 }, { "epoch": 4.098360655737705, "grad_norm": 1.259513258934021, "learning_rate": 0.0001891006524188368, "loss": 1.0826, "step": 750 }, { "epoch": 4.371584699453552, "grad_norm": 1.377414345741272, "learning_rate": 0.00018763066800438636, "loss": 1.0593, "step": 800 }, { "epoch": 4.644808743169399, "grad_norm": 1.2397098541259766, "learning_rate": 0.0001860742027003944, "loss": 1.0414, "step": 850 }, { "epoch": 4.918032786885246, "grad_norm": 1.2820392847061157, "learning_rate": 0.00018443279255020152, "loss": 1.0601, "step": 900 }, { "epoch": 5.191256830601093, "grad_norm": 1.6708155870437622, "learning_rate": 0.00018270805742745617, "loss": 0.973, "step": 950 }, { "epoch": 5.46448087431694, "grad_norm": 1.546794056892395, "learning_rate": 0.00018090169943749476, "loss": 0.9904, "step": 1000 }, { "epoch": 5.737704918032787, "grad_norm": 1.437908411026001, "learning_rate": 0.00017901550123756906, "loss": 0.9863, "step": 1050 }, { "epoch": 6.0109289617486334, "grad_norm": 1.4555143117904663, "learning_rate": 0.00017705132427757895, "loss": 0.9768, "step": 1100 }, { "epoch": 6.284153005464481, "grad_norm": 1.494957447052002, "learning_rate": 0.00017501110696304596, "loss": 0.8969, "step": 1150 }, { "epoch": 6.557377049180328, "grad_norm": 1.4257054328918457, "learning_rate": 0.00017289686274214118, "loss": 0.9207, "step": 1200 }, { "epoch": 6.830601092896175, "grad_norm": 1.6431266069412231, "learning_rate": 0.00017071067811865476, 
"loss": 0.9116, "step": 1250 }, { "epoch": 7.103825136612022, "grad_norm": 1.4786570072174072, "learning_rate": 0.00016845471059286887, "loss": 0.8975, "step": 1300 }, { "epoch": 7.377049180327869, "grad_norm": 1.5059996843338013, "learning_rate": 0.00016613118653236518, "loss": 0.8519, "step": 1350 }, { "epoch": 7.6502732240437155, "grad_norm": 1.5110268592834473, "learning_rate": 0.000163742398974869, "loss": 0.8471, "step": 1400 }, { "epoch": 7.923497267759563, "grad_norm": 1.6930420398712158, "learning_rate": 0.00016129070536529766, "loss": 0.8544, "step": 1450 }, { "epoch": 8.19672131147541, "grad_norm": 1.8286707401275635, "learning_rate": 0.00015877852522924732, "loss": 0.8102, "step": 1500 }, { "epoch": 8.469945355191257, "grad_norm": 1.4673559665679932, "learning_rate": 0.00015620833778521307, "loss": 0.7986, "step": 1550 }, { "epoch": 8.743169398907105, "grad_norm": 1.6546106338500977, "learning_rate": 0.00015358267949789966, "loss": 0.7985, "step": 1600 }, { "epoch": 9.01639344262295, "grad_norm": 1.7138121128082275, "learning_rate": 0.00015090414157503714, "loss": 0.8194, "step": 1650 }, { "epoch": 9.289617486338798, "grad_norm": 1.5631183385849, "learning_rate": 0.00014817536741017152, "loss": 0.7317, "step": 1700 }, { "epoch": 9.562841530054644, "grad_norm": 1.936880111694336, "learning_rate": 0.00014539904997395468, "loss": 0.7479, "step": 1750 }, { "epoch": 9.836065573770492, "grad_norm": 1.6515196561813354, "learning_rate": 0.00014257792915650728, "loss": 0.7435, "step": 1800 } ], "logging_steps": 50, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 28, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.968783346244608e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }