{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.86046511627907, "eval_steps": 500, "global_step": 530, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18604651162790697, "grad_norm": 2.80859375, "learning_rate": 0.00019982437317643217, "loss": 0.7797, "step": 10 }, { "epoch": 0.37209302325581395, "grad_norm": 3.423828125, "learning_rate": 0.00019929810960135172, "loss": 0.7869, "step": 20 }, { "epoch": 0.5581395348837209, "grad_norm": 3.751953125, "learning_rate": 0.00019842305779475968, "loss": 0.7876, "step": 30 }, { "epoch": 0.7441860465116279, "grad_norm": 5.54296875, "learning_rate": 0.0001972022914080411, "loss": 0.8025, "step": 40 }, { "epoch": 0.9302325581395349, "grad_norm": 5.96875, "learning_rate": 0.00019564009842765225, "loss": 0.7775, "step": 50 }, { "epoch": 1.1162790697674418, "grad_norm": 7.3984375, "learning_rate": 0.0001937419661134121, "loss": 0.7599, "step": 60 }, { "epoch": 1.302325581395349, "grad_norm": 6.05078125, "learning_rate": 0.00019151456172430183, "loss": 0.7773, "step": 70 }, { "epoch": 1.4883720930232558, "grad_norm": 5.4375, "learning_rate": 0.00018896570909947475, "loss": 0.7747, "step": 80 }, { "epoch": 1.6744186046511627, "grad_norm": 5.05078125, "learning_rate": 0.00018610436117673555, "loss": 0.7561, "step": 90 }, { "epoch": 1.8604651162790697, "grad_norm": 5.4765625, "learning_rate": 0.0001829405685450202, "loss": 0.7699, "step": 100 }, { "epoch": 2.046511627906977, "grad_norm": 4.58203125, "learning_rate": 0.00017948544414133534, "loss": 0.7668, "step": 110 }, { "epoch": 2.2325581395348837, "grad_norm": 5.43359375, "learning_rate": 0.00017575112421616202, "loss": 0.763, "step": 120 }, { "epoch": 2.4186046511627906, "grad_norm": 4.69140625, "learning_rate": 0.00017175072570443312, "loss": 0.7538, "step": 130 }, { "epoch": 2.604651162790698, "grad_norm": 4.97265625, "learning_rate": 0.00016749830015182107, "loss": 0.773, "step": 140 }, { "epoch": 2.7906976744186047, "grad_norm": 7.2109375, "learning_rate": 0.00016300878435817113, "loss": 0.7781, "step": 150 }, { "epoch": 2.9767441860465116, "grad_norm": 5.51171875, "learning_rate": 0.0001582979479114472, "loss": 0.7505, "step": 160 }, { "epoch": 3.1627906976744184, "grad_norm": 5.9375, "learning_rate": 0.0001533823377964791, "loss": 0.7353, "step": 170 }, { "epoch": 3.3488372093023258, "grad_norm": 5.92578125, "learning_rate": 0.00014827922027307451, "loss": 0.7321, "step": 180 }, { "epoch": 3.5348837209302326, "grad_norm": 10.3828125, "learning_rate": 0.00014300652022765207, "loss": 0.7498, "step": 190 }, { "epoch": 3.7209302325581395, "grad_norm": 10.515625, "learning_rate": 0.00013758275821142382, "loss": 0.7595, "step": 200 }, { "epoch": 3.9069767441860463, "grad_norm": 7.015625, "learning_rate": 0.00013202698538628376, "loss": 0.7685, "step": 210 }, { "epoch": 4.093023255813954, "grad_norm": 5.71484375, "learning_rate": 0.00012635871660690676, "loss": 0.7613, "step": 220 }, { "epoch": 4.27906976744186, "grad_norm": 5.6171875, "learning_rate": 0.00012059786187410984, "loss": 0.7483, "step": 230 }, { "epoch": 4.465116279069767, "grad_norm": 6.875, "learning_rate": 0.00011476465640024814, "loss": 0.7515, "step": 240 }, { "epoch": 4.651162790697675, "grad_norm": 5.3125, "learning_rate": 0.00010887958953229349, "loss": 0.7514, "step": 250 }, { "epoch": 4.837209302325581, "grad_norm": 6.1875, "learning_rate": 0.00010296333278225599, "loss": 0.7544, "step": 260 }, { "epoch": 5.023255813953488, "grad_norm": 7.9609375, "learning_rate": 9.703666721774402e-05, "loss": 0.7221, "step": 270 }, { "epoch": 5.209302325581396, "grad_norm": 6.609375, "learning_rate": 9.112041046770653e-05, "loss": 0.7334, "step": 280 }, { "epoch": 5.395348837209302, "grad_norm": 4.640625, "learning_rate": 8.523534359975189e-05, "loss": 0.7175, "step": 290 }, { "epoch": 5.5813953488372094, "grad_norm": 4.47265625, "learning_rate": 7.940213812589018e-05, "loss": 0.7294, "step": 300 }, { "epoch": 5.767441860465116, "grad_norm": 6.55859375, "learning_rate": 7.364128339309326e-05, "loss": 0.75, "step": 310 }, { "epoch": 5.953488372093023, "grad_norm": 7.55859375, "learning_rate": 6.797301461371625e-05, "loss": 0.7479, "step": 320 }, { "epoch": 6.1395348837209305, "grad_norm": 7.60546875, "learning_rate": 6.24172417885762e-05, "loss": 0.7285, "step": 330 }, { "epoch": 6.325581395348837, "grad_norm": 7.0078125, "learning_rate": 5.699347977234799e-05, "loss": 0.7402, "step": 340 }, { "epoch": 6.511627906976744, "grad_norm": 5.78515625, "learning_rate": 5.172077972692553e-05, "loss": 0.7532, "step": 350 }, { "epoch": 6.6976744186046515, "grad_norm": 8.921875, "learning_rate": 4.661766220352097e-05, "loss": 0.714, "step": 360 }, { "epoch": 6.883720930232558, "grad_norm": 8.34375, "learning_rate": 4.170205208855281e-05, "loss": 0.736, "step": 370 }, { "epoch": 7.069767441860465, "grad_norm": 5.9921875, "learning_rate": 3.69912156418289e-05, "loss": 0.7404, "step": 380 }, { "epoch": 7.2558139534883725, "grad_norm": 8.7890625, "learning_rate": 3.250169984817897e-05, "loss": 0.7355, "step": 390 }, { "epoch": 7.441860465116279, "grad_norm": 6.76953125, "learning_rate": 2.8249274295566864e-05, "loss": 0.73, "step": 400 }, { "epoch": 7.627906976744186, "grad_norm": 6.28125, "learning_rate": 2.4248875783837987e-05, "loss": 0.7192, "step": 410 }, { "epoch": 7.813953488372093, "grad_norm": 5.9140625, "learning_rate": 2.0514555858664663e-05, "loss": 0.7425, "step": 420 }, { "epoch": 8.0, "grad_norm": 6.7265625, "learning_rate": 1.7059431454979824e-05, "loss": 0.7263, "step": 430 }, { "epoch": 8.186046511627907, "grad_norm": 6.07421875, "learning_rate": 1.3895638823264446e-05, "loss": 0.7217, "step": 440 }, { "epoch": 8.372093023255815, "grad_norm": 6.421875, "learning_rate": 1.103429090052528e-05, "loss": 0.7507, "step": 450 }, { "epoch": 8.55813953488372, "grad_norm": 9.28125, "learning_rate": 8.485438275698154e-06, "loss": 0.7174, "step": 460 }, { "epoch": 8.744186046511627, "grad_norm": 7.484375, "learning_rate": 6.258033886587911e-06, "loss": 0.7317, "step": 470 }, { "epoch": 8.930232558139535, "grad_norm": 5.91015625, "learning_rate": 4.359901572347758e-06, "loss": 0.7517, "step": 480 }, { "epoch": 9.116279069767442, "grad_norm": 5.78125, "learning_rate": 2.7977085919589254e-06, "loss": 0.7121, "step": 490 }, { "epoch": 9.30232558139535, "grad_norm": 6.01171875, "learning_rate": 1.576942205240317e-06, "loss": 0.7469, "step": 500 }, { "epoch": 9.488372093023255, "grad_norm": 5.47265625, "learning_rate": 7.018903986483083e-07, "loss": 0.7216, "step": 510 }, { "epoch": 9.674418604651162, "grad_norm": 7.90625, "learning_rate": 1.7562682356786487e-07, "loss": 0.7351, "step": 520 }, { "epoch": 9.86046511627907, "grad_norm": 6.05078125, "learning_rate": 0.0, "loss": 0.7443, "step": 530 }, { "epoch": 9.86046511627907, "step": 530, "total_flos": 3.46013457973248e+16, "train_loss": 0.7484581434501791, "train_runtime": 447.8277, "train_samples_per_second": 4.801, "train_steps_per_second": 1.183 } ], "logging_steps": 10, "max_steps": 530, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 3.46013457973248e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }