{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.96, "eval_steps": 500, "global_step": 2400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 14.472631454467773, "learning_rate": 2.0000000000000003e-06, "loss": 5.6365, "step": 5 }, { "epoch": 0.0, "grad_norm": NaN, "learning_rate": 6e-06, "loss": 7.9556, "step": 10 }, { "epoch": 0.01, "grad_norm": 9.6599702835083, "learning_rate": 1.1000000000000001e-05, "loss": 6.5246, "step": 15 }, { "epoch": 0.01, "grad_norm": 40.07120895385742, "learning_rate": 1.6000000000000003e-05, "loss": 7.7818, "step": 20 }, { "epoch": 0.01, "grad_norm": 5.662575721740723, "learning_rate": 2.1e-05, "loss": 5.5899, "step": 25 }, { "epoch": 0.01, "grad_norm": NaN, "learning_rate": 2.5e-05, "loss": 7.1589, "step": 30 }, { "epoch": 0.01, "grad_norm": 7.6021246910095215, "learning_rate": 3e-05, "loss": 4.9643, "step": 35 }, { "epoch": 0.02, "grad_norm": 7.397390365600586, "learning_rate": 3.5e-05, "loss": 5.944, "step": 40 }, { "epoch": 0.02, "grad_norm": 10.122846603393555, "learning_rate": 4e-05, "loss": 4.2179, "step": 45 }, { "epoch": 0.02, "grad_norm": 7.520490646362305, "learning_rate": 4.5e-05, "loss": 3.9576, "step": 50 }, { "epoch": 0.02, "grad_norm": 12.957700729370117, "learning_rate": 5e-05, "loss": 4.6752, "step": 55 }, { "epoch": 0.02, "grad_norm": 11.252644538879395, "learning_rate": 4.999948617395915e-05, "loss": 4.586, "step": 60 }, { "epoch": 0.03, "grad_norm": 14.29404354095459, "learning_rate": 4.9997944716957985e-05, "loss": 4.2009, "step": 65 }, { "epoch": 0.03, "grad_norm": 12.03260612487793, "learning_rate": 4.9995375692359755e-05, "loss": 4.7383, "step": 70 }, { "epoch": 0.03, "grad_norm": 7.339493751525879, "learning_rate": 4.9991779205767e-05, "loss": 4.0294, "step": 75 }, { "epoch": 0.03, "grad_norm": 5.51794958114624, "learning_rate": 4.99871554050172e-05, "loss": 3.8225, "step": 80 }, { "epoch": 0.03, "grad_norm": 10.195987701416016, "learning_rate": 4.9981504480176696e-05, "loss": 4.1299, "step": 85 }, { "epoch": 0.04, "grad_norm": 14.785408973693848, "learning_rate": 4.997482666353287e-05, "loss": 4.1754, "step": 90 }, { "epoch": 0.04, "grad_norm": 9.586732864379883, "learning_rate": 4.996712222958461e-05, "loss": 3.2669, "step": 95 }, { "epoch": 0.04, "grad_norm": 25.427532196044922, "learning_rate": 4.9958391495031026e-05, "loss": 4.1074, "step": 100 }, { "epoch": 0.04, "grad_norm": 7.9541730880737305, "learning_rate": 4.994863481875841e-05, "loss": 4.2565, "step": 105 }, { "epoch": 0.04, "grad_norm": 8.371864318847656, "learning_rate": 4.993785260182552e-05, "loss": 3.3137, "step": 110 }, { "epoch": 0.05, "grad_norm": 9.015960693359375, "learning_rate": 4.992604528744705e-05, "loss": 3.4324, "step": 115 }, { "epoch": 0.05, "grad_norm": 6.063784599304199, "learning_rate": 4.991321336097546e-05, "loss": 3.3621, "step": 120 }, { "epoch": 0.05, "grad_norm": 7.260740280151367, "learning_rate": 4.989935734988098e-05, "loss": 4.7443, "step": 125 }, { "epoch": 0.05, "grad_norm": 9.744063377380371, "learning_rate": 4.9884477823729956e-05, "loss": 4.085, "step": 130 }, { "epoch": 0.05, "grad_norm": 9.156418800354004, "learning_rate": 4.986857539416144e-05, "loss": 3.5387, "step": 135 }, { "epoch": 0.06, "grad_norm": 7.296212673187256, "learning_rate": 4.9851650714862006e-05, "loss": 3.348, "step": 140 }, { "epoch": 0.06, "grad_norm": 8.042045593261719, "learning_rate": 4.983370448153896e-05, "loss": 3.5621, "step": 145 }, { "epoch": 0.06, "grad_norm": 12.138426780700684, "learning_rate": 4.981473743189163e-05, "loss": 3.4145, "step": 150 }, { "epoch": 0.06, "grad_norm": 12.3366060256958, "learning_rate": 4.979475034558115e-05, "loss": 3.9396, "step": 155 }, { "epoch": 0.06, "grad_norm": 6.696476936340332, "learning_rate": 4.977374404419837e-05, "loss": 3.5372, "step": 160 }, { "epoch": 0.07, "grad_norm": 6.023372173309326, "learning_rate": 4.975171939123005e-05, "loss": 3.2419, "step": 165 }, { "epoch": 0.07, "grad_norm": 7.861164093017578, "learning_rate": 4.9728677292023405e-05, "loss": 3.7565, "step": 170 }, { "epoch": 0.07, "grad_norm": 6.757652282714844, "learning_rate": 4.970461869374889e-05, "loss": 3.2182, "step": 175 }, { "epoch": 0.07, "grad_norm": 9.434078216552734, "learning_rate": 4.967954458536126e-05, "loss": 3.229, "step": 180 }, { "epoch": 0.07, "grad_norm": 27.395694732666016, "learning_rate": 4.965345599755887e-05, "loss": 3.903, "step": 185 }, { "epoch": 0.08, "grad_norm": 6.09558629989624, "learning_rate": 4.962635400274142e-05, "loss": 3.1898, "step": 190 }, { "epoch": 0.08, "grad_norm": 4.992712497711182, "learning_rate": 4.959823971496574e-05, "loss": 3.2185, "step": 195 }, { "epoch": 0.08, "grad_norm": 7.1479411125183105, "learning_rate": 4.95691142899001e-05, "loss": 3.2792, "step": 200 }, { "epoch": 0.08, "grad_norm": 8.431621551513672, "learning_rate": 4.9538978924776634e-05, "loss": 3.1418, "step": 205 }, { "epoch": 0.08, "grad_norm": 23.00255584716797, "learning_rate": 4.9507834858342186e-05, "loss": 3.9976, "step": 210 }, { "epoch": 0.09, "grad_norm": 10.47463321685791, "learning_rate": 4.9475683370807326e-05, "loss": 3.4157, "step": 215 }, { "epoch": 0.09, "grad_norm": 18.375465393066406, "learning_rate": 4.9442525783793794e-05, "loss": 3.5929, "step": 220 }, { "epoch": 0.09, "grad_norm": 8.78403091430664, "learning_rate": 4.940836346028011e-05, "loss": 3.1698, "step": 225 }, { "epoch": 0.09, "grad_norm": 7.213769912719727, "learning_rate": 4.937319780454559e-05, "loss": 3.9643, "step": 230 }, { "epoch": 0.09, "grad_norm": 7.919612884521484, "learning_rate": 4.933703026211262e-05, "loss": 3.6494, "step": 235 }, { "epoch": 0.1, "grad_norm": 4.598107814788818, "learning_rate": 4.9299862319687204e-05, "loss": 3.5917, "step": 240 }, { "epoch": 0.1, "grad_norm": 37.90977096557617, "learning_rate": 4.926169550509787e-05, "loss": 3.7991, "step": 245 }, { "epoch": 0.1, "grad_norm": 12.698101997375488, "learning_rate": 4.9222531387232885e-05, "loss": 3.5116, "step": 250 }, { "epoch": 0.1, "grad_norm": 9.609795570373535, "learning_rate": 4.9182371575975736e-05, "loss": 3.4126, "step": 255 }, { "epoch": 0.1, "grad_norm": 8.108003616333008, "learning_rate": 4.914121772213898e-05, "loss": 3.2962, "step": 260 }, { "epoch": 0.11, "grad_norm": 11.94726848602295, "learning_rate": 4.909907151739633e-05, "loss": 3.6953, "step": 265 }, { "epoch": 0.11, "grad_norm": 8.730710983276367, "learning_rate": 4.905593469421323e-05, "loss": 3.4841, "step": 270 }, { "epoch": 0.11, "grad_norm": 6.423217296600342, "learning_rate": 4.9011809025775486e-05, "loss": 3.1157, "step": 275 }, { "epoch": 0.11, "grad_norm": 10.736145973205566, "learning_rate": 4.8966696325916515e-05, "loss": 3.244, "step": 280 }, { "epoch": 0.11, "grad_norm": 21.597434997558594, "learning_rate": 4.892059844904272e-05, "loss": 3.172, "step": 285 }, { "epoch": 0.12, "grad_norm": 23.540430068969727, "learning_rate": 4.887351729005726e-05, "loss": 3.1889, "step": 290 }, { "epoch": 0.12, "grad_norm": 11.286609649658203, "learning_rate": 4.882545478428218e-05, "loss": 3.5211, "step": 295 }, { "epoch": 0.12, "grad_norm": 7.204074382781982, "learning_rate": 4.877641290737884e-05, "loss": 3.255, "step": 300 }, { "epoch": 0.12, "grad_norm": 25.35500717163086, "learning_rate": 4.8726393675266716e-05, "loss": 3.4832, "step": 305 }, { "epoch": 0.12, "grad_norm": 6.76464319229126, "learning_rate": 4.8675399144040537e-05, "loss": 2.9075, "step": 310 }, { "epoch": 0.13, "grad_norm": 10.19654655456543, "learning_rate": 4.862343140988573e-05, "loss": 3.0107, "step": 315 }, { "epoch": 0.13, "grad_norm": 7.545323848724365, "learning_rate": 4.8570492608992325e-05, "loss": 2.7479, "step": 320 }, { "epoch": 0.13, "grad_norm": 15.334161758422852, "learning_rate": 4.851658491746707e-05, "loss": 2.8654, "step": 325 }, { "epoch": 0.13, "grad_norm": 14.28708553314209, "learning_rate": 4.846171055124401e-05, "loss": 3.1851, "step": 330 }, { "epoch": 0.13, "grad_norm": 22.403162002563477, "learning_rate": 4.8405871765993433e-05, "loss": 2.8862, "step": 335 }, { "epoch": 0.14, "grad_norm": 5.840334415435791, "learning_rate": 4.834907085702908e-05, "loss": 2.968, "step": 340 }, { "epoch": 0.14, "grad_norm": 12.307807922363281, "learning_rate": 4.829131015921385e-05, "loss": 2.9458, "step": 345 }, { "epoch": 0.14, "grad_norm": 12.92584228515625, "learning_rate": 4.82325920468638e-05, "loss": 3.4169, "step": 350 }, { "epoch": 0.14, "grad_norm": 13.389299392700195, "learning_rate": 4.817291893365055e-05, "loss": 3.1601, "step": 355 }, { "epoch": 0.14, "grad_norm": 16.73059844970703, "learning_rate": 4.8112293272502043e-05, "loss": 3.2846, "step": 360 }, { "epoch": 0.15, "grad_norm": 8.608989715576172, "learning_rate": 4.805071755550177e-05, "loss": 3.0091, "step": 365 }, { "epoch": 0.15, "grad_norm": 5.731265544891357, "learning_rate": 4.7988194313786275e-05, "loss": 3.3688, "step": 370 }, { "epoch": 0.15, "grad_norm": 11.152880668640137, "learning_rate": 4.7924726117441135e-05, "loss": 3.3177, "step": 375 }, { "epoch": 0.15, "grad_norm": 8.759363174438477, "learning_rate": 4.7860315575395316e-05, "loss": 2.9184, "step": 380 }, { "epoch": 0.15, "grad_norm": 37.01634216308594, "learning_rate": 4.7794965335313926e-05, "loss": 3.1494, "step": 385 }, { "epoch": 0.16, "grad_norm": 5.318065643310547, "learning_rate": 4.772867808348938e-05, "loss": 3.7067, "step": 390 }, { "epoch": 0.16, "grad_norm": 14.883944511413574, "learning_rate": 4.766145654473095e-05, "loss": 2.9832, "step": 395 }, { "epoch": 0.16, "grad_norm": 35.67652130126953, "learning_rate": 4.759330348225284e-05, "loss": 3.6382, "step": 400 }, { "epoch": 0.16, "grad_norm": 6.075866222381592, "learning_rate": 4.752422169756048e-05, "loss": 3.6166, "step": 405 }, { "epoch": 0.16, "grad_norm": 10.807673454284668, "learning_rate": 4.745421403033548e-05, "loss": 2.896, "step": 410 }, { "epoch": 0.17, "grad_norm": 17.647350311279297, "learning_rate": 4.738328335831883e-05, "loss": 2.9218, "step": 415 }, { "epoch": 0.17, "grad_norm": 35.758544921875, "learning_rate": 4.731143259719265e-05, "loss": 3.0896, "step": 420 }, { "epoch": 0.17, "grad_norm": 19.15152931213379, "learning_rate": 4.72386647004603e-05, "loss": 3.0875, "step": 425 }, { "epoch": 0.17, "grad_norm": 10.721521377563477, "learning_rate": 4.716498265932501e-05, "loss": 2.833, "step": 430 }, { "epoch": 0.17, "grad_norm": 36.97370147705078, "learning_rate": 4.709038950256688e-05, "loss": 3.3553, "step": 435 }, { "epoch": 0.18, "grad_norm": 18.96316146850586, "learning_rate": 4.701488829641845e-05, "loss": 2.8189, "step": 440 }, { "epoch": 0.18, "grad_norm": 8.662888526916504, "learning_rate": 4.693848214443858e-05, "loss": 3.4443, "step": 445 }, { "epoch": 0.18, "grad_norm": 18.23220443725586, "learning_rate": 4.686117418738489e-05, "loss": 3.5717, "step": 450 }, { "epoch": 0.18, "grad_norm": 39.92061996459961, "learning_rate": 4.678296760308474e-05, "loss": 3.2364, "step": 455 }, { "epoch": 0.18, "grad_norm": 9.42786693572998, "learning_rate": 4.6703865606304465e-05, "loss": 2.9595, "step": 460 }, { "epoch": 0.19, "grad_norm": 7.904202938079834, "learning_rate": 4.662387144861734e-05, "loss": 3.1582, "step": 465 }, { "epoch": 0.19, "grad_norm": 26.168556213378906, "learning_rate": 4.6542988418269876e-05, "loss": 3.3465, "step": 470 }, { "epoch": 0.19, "grad_norm": 10.387030601501465, "learning_rate": 4.6461219840046654e-05, "loss": 3.0016, "step": 475 }, { "epoch": 0.19, "grad_norm": 9.973118782043457, "learning_rate": 4.637856907513366e-05, "loss": 3.2299, "step": 480 }, { "epoch": 0.19, "grad_norm": 31.876556396484375, "learning_rate": 4.629503952098011e-05, "loss": 3.3682, "step": 485 }, { "epoch": 0.2, "grad_norm": 23.903682708740234, "learning_rate": 4.6210634611158816e-05, "loss": 2.7361, "step": 490 }, { "epoch": 0.2, "grad_norm": 7.048882007598877, "learning_rate": 4.612535781522504e-05, "loss": 2.8006, "step": 495 }, { "epoch": 0.2, "grad_norm": 16.777807235717773, "learning_rate": 4.6039212638573833e-05, "loss": 2.7768, "step": 500 }, { "epoch": 0.2, "grad_norm": 35.86275100708008, "learning_rate": 4.595220262229601e-05, "loss": 3.2371, "step": 505 }, { "epoch": 0.2, "grad_norm": 18.5112361907959, "learning_rate": 4.586433134303257e-05, "loss": 3.9903, "step": 510 }, { "epoch": 0.21, "grad_norm": 15.84546947479248, "learning_rate": 4.5775602412827604e-05, "loss": 3.2624, "step": 515 }, { "epoch": 0.21, "grad_norm": 9.097062110900879, "learning_rate": 4.5686019478979915e-05, "loss": 3.036, "step": 520 }, { "epoch": 0.21, "grad_norm": 12.135428428649902, "learning_rate": 4.559558622389304e-05, "loss": 3.0802, "step": 525 }, { "epoch": 0.21, "grad_norm": 7.016368865966797, "learning_rate": 4.55043063649239e-05, "loss": 2.718, "step": 530 }, { "epoch": 0.21, "grad_norm": 7.902190685272217, "learning_rate": 4.5412183654229965e-05, "loss": 2.9923, "step": 535 }, { "epoch": 0.22, "grad_norm": 9.023144721984863, "learning_rate": 4.531922187861507e-05, "loss": 3.2273, "step": 540 }, { "epoch": 0.22, "grad_norm": 12.821276664733887, "learning_rate": 4.522542485937369e-05, "loss": 3.0269, "step": 545 }, { "epoch": 0.22, "grad_norm": 11.519277572631836, "learning_rate": 4.51307964521339e-05, "loss": 2.9414, "step": 550 }, { "epoch": 0.22, "grad_norm": 6.272705554962158, "learning_rate": 4.503534054669892e-05, "loss": 3.1369, "step": 555 }, { "epoch": 0.22, "grad_norm": 15.541712760925293, "learning_rate": 4.493906106688712e-05, "loss": 3.3594, "step": 560 }, { "epoch": 0.23, "grad_norm": 29.176177978515625, "learning_rate": 4.484196197037082e-05, "loss": 3.7387, "step": 565 }, { "epoch": 0.23, "grad_norm": 5.822291374206543, "learning_rate": 4.474404724851356e-05, "loss": 3.0015, "step": 570 }, { "epoch": 0.23, "grad_norm": 13.846043586730957, "learning_rate": 4.4645320926206064e-05, "loss": 2.8736, "step": 575 }, { "epoch": 0.23, "grad_norm": 6.161740303039551, "learning_rate": 4.454578706170075e-05, "loss": 3.1031, "step": 580 }, { "epoch": 0.23, "grad_norm": 12.11698055267334, "learning_rate": 4.444544974644493e-05, "loss": 3.1832, "step": 585 }, { "epoch": 0.24, "grad_norm": 5.144786834716797, "learning_rate": 4.434431310491267e-05, "loss": 2.7414, "step": 590 }, { "epoch": 0.24, "grad_norm": 14.706486701965332, "learning_rate": 4.4242381294435154e-05, "loss": 2.7289, "step": 595 }, { "epoch": 0.24, "grad_norm": 11.251163482666016, "learning_rate": 4.413965850502987e-05, "loss": 3.4787, "step": 600 }, { "epoch": 0.24, "grad_norm": 11.005367279052734, "learning_rate": 4.4036148959228365e-05, "loss": 3.1987, "step": 605 }, { "epoch": 0.24, "grad_norm": 13.336194038391113, "learning_rate": 4.393185691190264e-05, "loss": 2.9382, "step": 610 }, { "epoch": 0.25, "grad_norm": 9.259475708007812, "learning_rate": 4.382678665009028e-05, "loss": 3.1349, "step": 615 }, { "epoch": 0.25, "grad_norm": 20.902462005615234, "learning_rate": 4.372094249281821e-05, "loss": 3.1365, "step": 620 }, { "epoch": 0.25, "grad_norm": 13.907161712646484, "learning_rate": 4.3614328790925177e-05, "loss": 3.3361, "step": 625 }, { "epoch": 0.25, "grad_norm": 6.845438480377197, "learning_rate": 4.350694992688289e-05, "loss": 3.1903, "step": 630 }, { "epoch": 0.25, "grad_norm": 14.879186630249023, "learning_rate": 4.3398810314615876e-05, "loss": 3.5133, "step": 635 }, { "epoch": 0.26, "grad_norm": 10.454853057861328, "learning_rate": 4.3289914399320034e-05, "loss": 3.0928, "step": 640 }, { "epoch": 0.26, "grad_norm": 10.692648887634277, "learning_rate": 4.318026665727993e-05, "loss": 3.1065, "step": 645 }, { "epoch": 0.26, "grad_norm": 5.677508354187012, "learning_rate": 4.306987159568479e-05, "loss": 2.6484, "step": 650 }, { "epoch": 0.26, "grad_norm": 7.058112621307373, "learning_rate": 4.2958733752443195e-05, "loss": 2.6531, "step": 655 }, { "epoch": 0.26, "grad_norm": 7.029059886932373, "learning_rate": 4.284685769599658e-05, "loss": 3.3165, "step": 660 }, { "epoch": 0.27, "grad_norm": 9.306894302368164, "learning_rate": 4.273424802513145e-05, "loss": 3.2707, "step": 665 }, { "epoch": 0.27, "grad_norm": 6.9906392097473145, "learning_rate": 4.262090936879029e-05, "loss": 3.0822, "step": 670 }, { "epoch": 0.27, "grad_norm": 10.101268768310547, "learning_rate": 4.250684638588138e-05, "loss": 3.3994, "step": 675 }, { "epoch": 0.27, "grad_norm": 13.360941886901855, "learning_rate": 4.239206376508717e-05, "loss": 3.2358, "step": 680 }, { "epoch": 0.27, "grad_norm": 25.73331642150879, "learning_rate": 4.227656622467162e-05, "loss": 3.1359, "step": 685 }, { "epoch": 0.28, "grad_norm": 6.391608715057373, "learning_rate": 4.216035851228626e-05, "loss": 3.1343, "step": 690 }, { "epoch": 0.28, "grad_norm": 21.201011657714844, "learning_rate": 4.204344540477499e-05, "loss": 3.0807, "step": 695 }, { "epoch": 0.28, "grad_norm": 12.530668258666992, "learning_rate": 4.192583170797774e-05, "loss": 2.9729, "step": 700 }, { "epoch": 0.28, "grad_norm": 17.209762573242188, "learning_rate": 4.180752225653292e-05, "loss": 2.9911, "step": 705 }, { "epoch": 0.28, "grad_norm": 16.309785842895508, "learning_rate": 4.16885219136787e-05, "loss": 3.0701, "step": 710 }, { "epoch": 0.29, "grad_norm": 7.244511127471924, "learning_rate": 4.1568835571053075e-05, "loss": 2.8169, "step": 715 }, { "epoch": 0.29, "grad_norm": 5.627272605895996, "learning_rate": 4.144846814849282e-05, "loss": 3.0518, "step": 720 }, { "epoch": 0.29, "grad_norm": 9.72758674621582, "learning_rate": 4.132742459383122e-05, "loss": 3.1135, "step": 725 }, { "epoch": 0.29, "grad_norm": 14.121967315673828, "learning_rate": 4.120570988269472e-05, "loss": 2.9481, "step": 730 }, { "epoch": 0.29, "grad_norm": 71.68965148925781, "learning_rate": 4.108332901829836e-05, "loss": 3.7509, "step": 735 }, { "epoch": 0.3, "grad_norm": 22.303184509277344, "learning_rate": 4.096028703124014e-05, "loss": 2.9864, "step": 740 }, { "epoch": 0.3, "grad_norm": 11.789464950561523, "learning_rate": 4.083658897929426e-05, "loss": 2.7573, "step": 745 }, { "epoch": 0.3, "grad_norm": 21.576683044433594, "learning_rate": 4.071223994720309e-05, "loss": 3.4365, "step": 750 }, { "epoch": 0.3, "grad_norm": 7.004244804382324, "learning_rate": 4.058724504646834e-05, "loss": 3.1543, "step": 755 }, { "epoch": 0.3, "grad_norm": 8.743468284606934, "learning_rate": 4.046160941514079e-05, "loss": 2.638, "step": 760 }, { "epoch": 0.31, "grad_norm": 9.981857299804688, "learning_rate": 4.033533821760917e-05, "loss": 2.4701, "step": 765 }, { "epoch": 0.31, "grad_norm": 11.454618453979492, "learning_rate": 4.0208436644387834e-05, "loss": 3.4152, "step": 770 }, { "epoch": 0.31, "grad_norm": 22.861881256103516, "learning_rate": 4.008090991190341e-05, "loss": 3.0526, "step": 775 }, { "epoch": 0.31, "grad_norm": 34.58321762084961, "learning_rate": 3.9952763262280405e-05, "loss": 3.1283, "step": 780 }, { "epoch": 0.31, "grad_norm": 12.163683891296387, "learning_rate": 3.982400196312564e-05, "loss": 2.8686, "step": 785 }, { "epoch": 0.32, "grad_norm": 34.78618240356445, "learning_rate": 3.969463130731183e-05, "loss": 2.782, "step": 790 }, { "epoch": 0.32, "grad_norm": 11.429190635681152, "learning_rate": 3.95646566127599e-05, "loss": 3.1518, "step": 795 }, { "epoch": 0.32, "grad_norm": 7.942728519439697, "learning_rate": 3.943408322222049e-05, "loss": 3.1183, "step": 800 }, { "epoch": 0.32, "grad_norm": 6.556036949157715, "learning_rate": 3.9302916503054246e-05, "loss": 2.9241, "step": 805 }, { "epoch": 0.32, "grad_norm": 7.161313533782959, "learning_rate": 3.917116184701125e-05, "loss": 2.8394, "step": 810 }, { "epoch": 0.33, "grad_norm": 17.556371688842773, "learning_rate": 3.903882467000937e-05, "loss": 2.667, "step": 815 }, { "epoch": 0.33, "grad_norm": 6.0064167976379395, "learning_rate": 3.8905910411911625e-05, "loss": 2.8605, "step": 820 }, { "epoch": 0.33, "grad_norm": 13.329277038574219, "learning_rate": 3.8772424536302564e-05, "loss": 2.8765, "step": 825 }, { "epoch": 0.33, "grad_norm": 7.7756876945495605, "learning_rate": 3.8638372530263715e-05, "loss": 2.6558, "step": 830 }, { "epoch": 0.33, "grad_norm": 6.505970478057861, "learning_rate": 3.850375990414801e-05, "loss": 2.6178, "step": 835 }, { "epoch": 0.34, "grad_norm": 29.54509925842285, "learning_rate": 3.836859219135324e-05, "loss": 2.9655, "step": 840 }, { "epoch": 0.34, "grad_norm": 12.92197322845459, "learning_rate": 3.823287494809469e-05, "loss": 2.7825, "step": 845 }, { "epoch": 0.34, "grad_norm": 10.073705673217773, "learning_rate": 3.8096613753176634e-05, "loss": 3.3055, "step": 850 }, { "epoch": 0.34, "grad_norm": 23.254878997802734, "learning_rate": 3.7959814207763135e-05, "loss": 2.906, "step": 855 }, { "epoch": 0.34, "grad_norm": 16.555452346801758, "learning_rate": 3.782248193514766e-05, "loss": 3.2928, "step": 860 }, { "epoch": 0.35, "grad_norm": 8.429189682006836, "learning_rate": 3.7684622580522055e-05, "loss": 2.6259, "step": 865 }, { "epoch": 0.35, "grad_norm": 9.762368202209473, "learning_rate": 3.7546241810744445e-05, "loss": 2.7576, "step": 870 }, { "epoch": 0.35, "grad_norm": 11.004511833190918, "learning_rate": 3.740734531410626e-05, "loss": 2.6714, "step": 875 }, { "epoch": 0.35, "grad_norm": 56.73052978515625, "learning_rate": 3.726793880009845e-05, "loss": 3.542, "step": 880 }, { "epoch": 0.35, "grad_norm": 10.330306053161621, "learning_rate": 3.7128027999176803e-05, "loss": 3.0011, "step": 885 }, { "epoch": 0.36, "grad_norm": 12.130693435668945, "learning_rate": 3.698761866252635e-05, "loss": 3.0935, "step": 890 }, { "epoch": 0.36, "grad_norm": 12.528786659240723, "learning_rate": 3.6846716561824965e-05, "loss": 3.3687, "step": 895 }, { "epoch": 0.36, "grad_norm": 14.122909545898438, "learning_rate": 3.670532748900615e-05, "loss": 2.5591, "step": 900 }, { "epoch": 0.36, "grad_norm": 8.046327590942383, "learning_rate": 3.656345725602089e-05, "loss": 2.3845, "step": 905 }, { "epoch": 0.36, "grad_norm": 22.46470832824707, "learning_rate": 3.642111169459879e-05, "loss": 2.8321, "step": 910 }, { "epoch": 0.37, "grad_norm": 10.554203987121582, "learning_rate": 3.6278296656008366e-05, "loss": 2.4067, "step": 915 }, { "epoch": 0.37, "grad_norm": 8.757329940795898, "learning_rate": 3.6135018010816477e-05, "loss": 2.7003, "step": 920 }, { "epoch": 0.37, "grad_norm": 8.956037521362305, "learning_rate": 3.599128164864706e-05, "loss": 2.7975, "step": 925 }, { "epoch": 0.37, "grad_norm": 15.920394897460938, "learning_rate": 3.5847093477938956e-05, "loss": 2.7154, "step": 930 }, { "epoch": 0.37, "grad_norm": 15.14090633392334, "learning_rate": 3.570245942570315e-05, "loss": 3.3547, "step": 935 }, { "epoch": 0.38, "grad_norm": 17.72629165649414, "learning_rate": 3.5557385437279e-05, "loss": 2.7852, "step": 940 }, { "epoch": 0.38, "grad_norm": 5.857475757598877, "learning_rate": 3.5411877476089975e-05, "loss": 2.73, "step": 945 }, { "epoch": 0.38, "grad_norm": 22.337295532226562, "learning_rate": 3.526594152339845e-05, "loss": 3.1076, "step": 950 }, { "epoch": 0.38, "grad_norm": 10.523070335388184, "learning_rate": 3.5119583578059846e-05, "loss": 3.1857, "step": 955 }, { "epoch": 0.38, "grad_norm": 11.635588645935059, "learning_rate": 3.497280965627605e-05, "loss": 2.8584, "step": 960 }, { "epoch": 0.39, "grad_norm": 27.29017448425293, "learning_rate": 3.4825625791348096e-05, "loss": 3.3567, "step": 965 }, { "epoch": 0.39, "grad_norm": 25.24538230895996, "learning_rate": 3.467803803342821e-05, "loss": 3.1713, "step": 970 }, { "epoch": 0.39, "grad_norm": 11.2062406539917, "learning_rate": 3.4530052449271044e-05, "loss": 2.8941, "step": 975 }, { "epoch": 0.39, "grad_norm": 20.25462532043457, "learning_rate": 3.438167512198436e-05, "loss": 3.1291, "step": 980 }, { "epoch": 0.39, "grad_norm": 13.0928373336792, "learning_rate": 3.4232912150778914e-05, "loss": 2.4672, "step": 985 }, { "epoch": 0.4, "grad_norm": 21.326560974121094, "learning_rate": 3.408376965071779e-05, "loss": 3.0489, "step": 990 }, { "epoch": 0.4, "grad_norm": 54.281578063964844, "learning_rate": 3.393425375246503e-05, "loss": 3.0251, "step": 995 }, { "epoch": 0.4, "grad_norm": 18.1798038482666, "learning_rate": 3.378437060203357e-05, "loss": 3.0152, "step": 1000 }, { "epoch": 0.4, "grad_norm": 8.757742881774902, "learning_rate": 3.363412636053269e-05, "loss": 2.5223, "step": 1005 }, { "epoch": 0.4, "grad_norm": 7.601003170013428, "learning_rate": 3.348352720391469e-05, "loss": 2.8158, "step": 1010 }, { "epoch": 0.41, "grad_norm": 16.87445068359375, "learning_rate": 3.3332579322721046e-05, "loss": 3.3828, "step": 1015 }, { "epoch": 0.41, "grad_norm": 11.726461410522461, "learning_rate": 3.318128892182792e-05, "loss": 2.5411, "step": 1020 }, { "epoch": 0.41, "grad_norm": 11.669478416442871, "learning_rate": 3.3029662220191144e-05, "loss": 2.6686, "step": 1025 }, { "epoch": 0.41, "grad_norm": 58.972007751464844, "learning_rate": 3.2877705450590526e-05, "loss": 3.3026, "step": 1030 }, { "epoch": 0.41, "grad_norm": 7.434502124786377, "learning_rate": 3.272542485937369e-05, "loss": 2.6652, "step": 1035 }, { "epoch": 0.42, "grad_norm": 22.020599365234375, "learning_rate": 3.2572826706199305e-05, "loss": 3.1553, "step": 1040 }, { "epoch": 0.42, "grad_norm": 17.60317611694336, "learning_rate": 3.2419917263779766e-05, "loss": 3.2016, "step": 1045 }, { "epoch": 0.42, "grad_norm": 15.09745979309082, "learning_rate": 3.2266702817623346e-05, "loss": 2.6242, "step": 1050 }, { "epoch": 0.42, "grad_norm": 11.325228691101074, "learning_rate": 3.211318966577581e-05, "loss": 2.9649, "step": 1055 }, { "epoch": 0.42, "grad_norm": 16.898090362548828, "learning_rate": 3.195938411856159e-05, "loss": 2.4623, "step": 1060 }, { "epoch": 0.43, "grad_norm": 8.761795043945312, "learning_rate": 3.180529249832428e-05, "loss": 2.4152, "step": 1065 }, { "epoch": 0.43, "grad_norm": 11.047560691833496, "learning_rate": 3.165092113916688e-05, "loss": 3.0119, "step": 1070 }, { "epoch": 0.43, "grad_norm": 11.61601734161377, "learning_rate": 3.149627638669132e-05, "loss": 2.4781, "step": 1075 }, { "epoch": 0.43, "grad_norm": 21.56399154663086, "learning_rate": 3.1341364597737686e-05, "loss": 3.4005, "step": 1080 }, { "epoch": 0.43, "grad_norm": 15.666382789611816, "learning_rate": 3.118619214012286e-05, "loss": 2.9385, "step": 1085 }, { "epoch": 0.44, "grad_norm": 11.866064071655273, "learning_rate": 3.1030765392378816e-05, "loss": 3.0096, "step": 1090 }, { "epoch": 0.44, "grad_norm": 11.25349235534668, "learning_rate": 3.0875090743490384e-05, "loss": 2.0702, "step": 1095 }, { "epoch": 0.44, "grad_norm": 10.070486068725586, "learning_rate": 3.071917459263264e-05, "loss": 3.0652, "step": 1100 }, { "epoch": 0.44, "grad_norm": 7.047021389007568, "learning_rate": 3.056302334890786e-05, "loss": 2.8617, "step": 1105 }, { "epoch": 0.44, "grad_norm": 11.59208869934082, "learning_rate": 3.040664343108209e-05, "loss": 2.8697, "step": 1110 }, { "epoch": 0.45, "grad_norm": 9.361946105957031, "learning_rate": 3.0250041267321232e-05, "loss": 2.7646, "step": 1115 }, { "epoch": 0.45, "grad_norm": 11.60400390625, "learning_rate": 3.0093223294926892e-05, "loss": 2.8238, "step": 1120 }, { "epoch": 0.45, "grad_norm": 19.50318145751953, "learning_rate": 2.993619596007168e-05, "loss": 2.8857, "step": 1125 }, { "epoch": 0.45, "grad_norm": 24.245025634765625, "learning_rate": 2.9778965717534313e-05, "loss": 2.949, "step": 1130 }, { "epoch": 0.45, "grad_norm": 14.323025703430176, "learning_rate": 2.962153903043422e-05, "loss": 2.6237, "step": 1135 }, { "epoch": 0.46, "grad_norm": 13.869138717651367, "learning_rate": 2.9463922369965917e-05, "loss": 2.5021, "step": 1140 }, { "epoch": 0.46, "grad_norm": 9.667675018310547, "learning_rate": 2.9306122215132976e-05, "loss": 2.9346, "step": 1145 }, { "epoch": 0.46, "grad_norm": 22.312841415405273, "learning_rate": 2.91481450524817e-05, "loss": 2.7172, "step": 1150 }, { "epoch": 0.46, "grad_norm": 10.218385696411133, "learning_rate": 2.8989997375834482e-05, "loss": 2.4864, "step": 1155 }, { "epoch": 0.46, "grad_norm": 7.388011455535889, "learning_rate": 2.8831685686022897e-05, "loss": 2.7519, "step": 1160 }, { "epoch": 0.47, "grad_norm": 8.172524452209473, "learning_rate": 2.8673216490620452e-05, "loss": 2.701, "step": 1165 }, { "epoch": 0.47, "grad_norm": 11.60141372680664, "learning_rate": 2.8514596303675073e-05, "loss": 3.1608, "step": 1170 }, { "epoch": 0.47, "grad_norm": 21.537139892578125, "learning_rate": 2.8355831645441388e-05, "loss": 2.9307, "step": 1175 }, { "epoch": 0.47, "grad_norm": 12.06334400177002, "learning_rate": 2.8196929042112652e-05, "loss": 2.9414, "step": 1180 }, { "epoch": 0.47, "grad_norm": 8.39703369140625, "learning_rate": 2.8037895025552512e-05, "loss": 2.938, "step": 1185 }, { "epoch": 0.48, "grad_norm": 8.78774642944336, "learning_rate": 2.787873613302649e-05, "loss": 2.5751, "step": 1190 }, { "epoch": 0.48, "grad_norm": 24.190916061401367, "learning_rate": 2.7719458906933277e-05, "loss": 2.328, "step": 1195 }, { "epoch": 0.48, "grad_norm": 17.80084228515625, "learning_rate": 2.7560069894535784e-05, "loss": 2.7258, "step": 1200 }, { "epoch": 0.48, "grad_norm": 17.03345489501953, "learning_rate": 2.7400575647692046e-05, "loss": 2.8104, "step": 1205 }, { "epoch": 0.48, "grad_norm": 36.040897369384766, "learning_rate": 2.724098272258584e-05, "loss": 2.9138, "step": 1210 }, { "epoch": 0.49, "grad_norm": 9.431939125061035, "learning_rate": 2.7081297679457236e-05, "loss": 2.3052, "step": 1215 }, { "epoch": 0.49, "grad_norm": 10.720209121704102, "learning_rate": 2.692152708233292e-05, "loss": 2.3984, "step": 1220 }, { "epoch": 0.49, "grad_norm": 13.637066841125488, "learning_rate": 2.676167749875635e-05, "loss": 2.8592, "step": 1225 }, { "epoch": 0.49, "grad_norm": 8.968180656433105, "learning_rate": 2.6601755499517826e-05, "loss": 2.4064, "step": 1230 }, { "epoch": 0.49, "grad_norm": 9.793657302856445, "learning_rate": 2.6441767658384366e-05, "loss": 3.0291, "step": 1235 }, { "epoch": 0.5, "grad_norm": 8.77736759185791, "learning_rate": 2.628172055182948e-05, "loss": 2.3978, "step": 1240 }, { "epoch": 0.5, "grad_norm": 9.85341739654541, "learning_rate": 2.6121620758762877e-05, "loss": 2.5431, "step": 1245 }, { "epoch": 0.5, "grad_norm": 11.554092407226562, "learning_rate": 2.596147486025996e-05, "loss": 2.8736, "step": 1250 }, { "epoch": 0.5, "grad_norm": 8.345808029174805, "learning_rate": 2.5801289439291388e-05, "loss": 2.6837, "step": 1255 }, { "epoch": 0.5, "grad_norm": 9.449942588806152, "learning_rate": 2.564107108045239e-05, "loss": 3.1686, "step": 1260 }, { "epoch": 0.51, "grad_norm": 32.239585876464844, "learning_rate": 2.5480826369692178e-05, "loss": 3.0723, "step": 1265 }, { "epoch": 0.51, "grad_norm": 36.09556579589844, "learning_rate": 2.5320561894043184e-05, "loss": 2.6983, "step": 1270 }, { "epoch": 0.51, "grad_norm": 42.91587829589844, "learning_rate": 2.5160284241350278e-05, "loss": 2.6336, "step": 1275 }, { "epoch": 0.51, "grad_norm": 20.82798194885254, "learning_rate": 2.5e-05, "loss": 2.8635, "step": 1280 }, { "epoch": 0.51, "grad_norm": 7.868647575378418, "learning_rate": 2.4839715758649724e-05, "loss": 2.8232, "step": 1285 }, { "epoch": 0.52, "grad_norm": 12.662830352783203, "learning_rate": 2.467943810595682e-05, "loss": 2.8228, "step": 1290 }, { "epoch": 0.52, "grad_norm": 15.0082368850708, "learning_rate": 2.4519173630307825e-05, "loss": 2.6371, "step": 1295 }, { "epoch": 0.52, "grad_norm": 15.185354232788086, "learning_rate": 2.4358928919547616e-05, "loss": 2.6336, "step": 1300 }, { "epoch": 0.52, "grad_norm": 7.559114456176758, "learning_rate": 2.419871056070862e-05, "loss": 2.4209, "step": 1305 }, { "epoch": 0.52, "grad_norm": 13.428425788879395, "learning_rate": 2.403852513974004e-05, "loss": 2.6897, "step": 1310 }, { "epoch": 0.53, "grad_norm": 38.91737747192383, "learning_rate": 2.3878379241237136e-05, "loss": 2.569, "step": 1315 }, { "epoch": 0.53, "grad_norm": 15.211456298828125, "learning_rate": 2.3718279448170525e-05, "loss": 2.4856, "step": 1320 }, { "epoch": 0.53, "grad_norm": 11.391417503356934, "learning_rate": 2.3558232341615643e-05, "loss": 2.5527, "step": 1325 }, { "epoch": 0.53, "grad_norm": 21.21587371826172, "learning_rate": 2.339824450048218e-05, "loss": 3.5899, "step": 1330 }, { "epoch": 0.53, "grad_norm": 17.66887664794922, "learning_rate": 2.323832250124365e-05, "loss": 2.4669, "step": 1335 }, { "epoch": 0.54, "grad_norm": 10.46285629272461, "learning_rate": 2.3078472917667092e-05, "loss": 2.4984, "step": 1340 }, { "epoch": 0.54, "grad_norm": 14.3226318359375, "learning_rate": 2.291870232054277e-05, "loss": 2.8999, "step": 1345 }, { "epoch": 0.54, "grad_norm": 47.95698547363281, "learning_rate": 2.2759017277414166e-05, "loss": 2.7827, "step": 1350 }, { "epoch": 0.54, "grad_norm": 10.691919326782227, "learning_rate": 2.2599424352307957e-05, "loss": 2.9201, "step": 1355 }, { "epoch": 0.54, "grad_norm": 15.748466491699219, "learning_rate": 2.243993010546422e-05, "loss": 2.3433, "step": 1360 }, { "epoch": 0.55, "grad_norm": 18.585126876831055, "learning_rate": 2.2280541093066732e-05, "loss": 2.7896, "step": 1365 }, { "epoch": 0.55, "grad_norm": 9.304043769836426, "learning_rate": 2.212126386697352e-05, "loss": 2.781, "step": 1370 }, { "epoch": 0.55, "grad_norm": 14.388626098632812, "learning_rate": 2.196210497444749e-05, "loss": 2.9678, "step": 1375 }, { "epoch": 0.55, "grad_norm": 9.608166694641113, "learning_rate": 2.1803070957887347e-05, "loss": 3.0607, "step": 1380 }, { "epoch": 0.55, "grad_norm": 6.090882301330566, "learning_rate": 2.164416835455862e-05, "loss": 3.6207, "step": 1385 }, { "epoch": 0.56, "grad_norm": 13.572842597961426, "learning_rate": 2.1485403696324936e-05, "loss": 2.5546, "step": 1390 }, { "epoch": 0.56, "grad_norm": 7.186910629272461, "learning_rate": 2.1326783509379554e-05, "loss": 2.9678, "step": 1395 }, { "epoch": 0.56, "grad_norm": 50.014766693115234, "learning_rate": 2.11683143139771e-05, "loss": 3.1836, "step": 1400 }, { "epoch": 0.56, "grad_norm": 6.37592077255249, "learning_rate": 2.1010002624165527e-05, "loss": 2.659, "step": 1405 }, { "epoch": 0.56, "grad_norm": 22.413625717163086, "learning_rate": 2.0851854947518313e-05, "loss": 2.6929, "step": 1410 }, { "epoch": 0.57, "grad_norm": 11.021233558654785, "learning_rate": 2.069387778486703e-05, "loss": 2.7117, "step": 1415 }, { "epoch": 0.57, "grad_norm": 56.379844665527344, "learning_rate": 2.0536077630034086e-05, "loss": 3.4033, "step": 1420 }, { "epoch": 0.57, "grad_norm": 10.377793312072754, "learning_rate": 2.0378460969565782e-05, "loss": 2.4798, "step": 1425 }, { "epoch": 0.57, "grad_norm": 6.888087749481201, "learning_rate": 2.02210342824657e-05, "loss": 2.6521, "step": 1430 }, { "epoch": 0.57, "grad_norm": 6.764181613922119, "learning_rate": 2.0063804039928324e-05, "loss": 2.6824, "step": 1435 }, { "epoch": 0.58, "grad_norm": 55.408729553222656, "learning_rate": 1.9906776705073114e-05, "loss": 3.5755, "step": 1440 }, { "epoch": 0.58, "grad_norm": 8.327556610107422, "learning_rate": 1.9749958732678767e-05, "loss": 2.3398, "step": 1445 }, { "epoch": 0.58, "grad_norm": 9.360859870910645, "learning_rate": 1.9593356568917913e-05, "loss": 2.5018, "step": 1450 }, { "epoch": 0.58, "grad_norm": 32.50775146484375, "learning_rate": 1.9436976651092144e-05, "loss": 2.8295, "step": 1455 }, { "epoch": 0.58, "grad_norm": 23.467443466186523, "learning_rate": 1.928082540736737e-05, "loss": 2.6034, "step": 1460 }, { "epoch": 0.59, "grad_norm": 10.357468605041504, "learning_rate": 1.9124909256509622e-05, "loss": 3.0481, "step": 1465 }, { "epoch": 0.59, "grad_norm": 16.312400817871094, "learning_rate": 1.8969234607621186e-05, "loss": 2.9409, "step": 1470 }, { "epoch": 0.59, "grad_norm": 15.405855178833008, "learning_rate": 1.8813807859877147e-05, "loss": 2.6141, "step": 1475 }, { "epoch": 0.59, "grad_norm": 10.774744987487793, "learning_rate": 1.865863540226232e-05, "loss": 2.5919, "step": 1480 }, { "epoch": 0.59, "grad_norm": 7.001652717590332, "learning_rate": 1.8503723613308683e-05, "loss": 3.043, "step": 1485 }, { "epoch": 0.6, "grad_norm": 32.264766693115234, "learning_rate": 1.8349078860833123e-05, "loss": 2.711, "step": 1490 }, { "epoch": 0.6, "grad_norm": 15.870736122131348, "learning_rate": 1.8194707501675724e-05, "loss": 2.4717, "step": 1495 }, { "epoch": 0.6, "grad_norm": 15.47707748413086, "learning_rate": 1.8040615881438425e-05, "loss": 2.9706, "step": 1500 }, { "epoch": 0.6, "grad_norm": 12.465653419494629, "learning_rate": 1.7886810334224192e-05, "loss": 3.0024, "step": 1505 }, { "epoch": 0.6, "grad_norm": 20.99846839904785, "learning_rate": 1.7733297182376663e-05, "loss": 3.3546, "step": 1510 }, { "epoch": 0.61, "grad_norm": 37.41334533691406, "learning_rate": 1.7580082736220237e-05, "loss": 2.924, "step": 1515 }, { "epoch": 0.61, "grad_norm": 6.230709552764893, "learning_rate": 1.74271732938007e-05, "loss": 2.7253, "step": 1520 }, { "epoch": 0.61, "grad_norm": 15.803852081298828, "learning_rate": 1.7274575140626318e-05, "loss": 2.8601, "step": 1525 }, { "epoch": 0.61, "grad_norm": 6.981109142303467, "learning_rate": 1.7122294549409484e-05, "loss": 2.6754, "step": 1530 }, { "epoch": 0.61, "grad_norm": 21.247787475585938, "learning_rate": 1.6970337779808862e-05, "loss": 2.5148, "step": 1535 }, { "epoch": 0.62, "grad_norm": 13.401627540588379, "learning_rate": 1.6818711078172077e-05, "loss": 2.5783, "step": 1540 }, { "epoch": 0.62, "grad_norm": 8.62679386138916, "learning_rate": 1.666742067727896e-05, "loss": 2.5611, "step": 1545 }, { "epoch": 0.62, "grad_norm": 6.567328929901123, "learning_rate": 1.6516472796085315e-05, "loss": 2.3179, "step": 1550 }, { "epoch": 0.62, "grad_norm": 9.707379341125488, "learning_rate": 1.6365873639467315e-05, "loss": 2.3441, "step": 1555 }, { "epoch": 0.62, "grad_norm": 38.05426788330078, "learning_rate": 1.621562939796643e-05, "loss": 2.562, "step": 1560 }, { "epoch": 0.63, "grad_norm": 18.448406219482422, "learning_rate": 1.6065746247534984e-05, "loss": 2.6616, "step": 1565 }, { "epoch": 0.63, "grad_norm": 12.998920440673828, "learning_rate": 1.5916230349282215e-05, "loss": 2.9305, "step": 1570 }, { "epoch": 0.63, "grad_norm": 7.341577053070068, "learning_rate": 1.5767087849221096e-05, "loss": 2.6997, "step": 1575 }, { "epoch": 0.63, "grad_norm": 11.726909637451172, "learning_rate": 1.561832487801565e-05, "loss": 2.9281, "step": 1580 }, { "epoch": 0.63, "grad_norm": 10.877985000610352, "learning_rate": 1.5469947550728958e-05, "loss": 3.7181, "step": 1585 }, { "epoch": 0.64, "grad_norm": 11.845093727111816, "learning_rate": 1.53219619665718e-05, "loss": 3.1277, "step": 1590 }, { "epoch": 0.64, "grad_norm": 11.064024925231934, "learning_rate": 1.5174374208651912e-05, "loss": 2.8159, "step": 1595 }, { "epoch": 0.64, "grad_norm": 12.611387252807617, "learning_rate": 1.502719034372396e-05, "loss": 2.9906, "step": 1600 }, { "epoch": 0.64, "grad_norm": 8.44251537322998, "learning_rate": 1.4880416421940155e-05, "loss": 2.7055, "step": 1605 }, { "epoch": 0.64, "grad_norm": 11.020173072814941, "learning_rate": 1.4734058476601553e-05, "loss": 2.7852, "step": 1610 }, { "epoch": 0.65, "grad_norm": 18.893342971801758, "learning_rate": 1.458812252391003e-05, "loss": 2.7677, "step": 1615 }, { "epoch": 0.65, "grad_norm": 9.38496208190918, "learning_rate": 1.444261456272101e-05, "loss": 2.7732, "step": 1620 }, { "epoch": 0.65, "grad_norm": 6.147873401641846, "learning_rate": 1.4297540574296869e-05, "loss": 2.2796, "step": 1625 }, { "epoch": 0.65, "grad_norm": 18.142108917236328, "learning_rate": 1.4152906522061048e-05, "loss": 3.1047, "step": 1630 }, { "epoch": 0.65, "grad_norm": 16.73578643798828, "learning_rate": 1.400871835135295e-05, "loss": 2.9558, "step": 1635 }, { "epoch": 0.66, "grad_norm": 8.667417526245117, "learning_rate": 1.386498198918352e-05, "loss": 2.5587, "step": 1640 }, { "epoch": 0.66, "grad_norm": 29.769210815429688, "learning_rate": 1.3721703343991633e-05, "loss": 3.2146, "step": 1645 }, { "epoch": 0.66, "grad_norm": 27.924272537231445, "learning_rate": 1.3578888305401207e-05, "loss": 2.8355, "step": 1650 }, { "epoch": 0.66, "grad_norm": 8.13482666015625, "learning_rate": 1.3436542743979125e-05, "loss": 2.8218, "step": 1655 }, { "epoch": 0.66, "grad_norm": 16.816307067871094, "learning_rate": 1.329467251099386e-05, "loss": 2.5636, "step": 1660 }, { "epoch": 0.67, "grad_norm": 10.0423583984375, "learning_rate": 1.3153283438175034e-05, "loss": 2.3591, "step": 1665 }, { "epoch": 0.67, "grad_norm": 10.223281860351562, "learning_rate": 1.3012381337473656e-05, "loss": 3.0024, "step": 1670 }, { "epoch": 0.67, "grad_norm": 16.49390983581543, "learning_rate": 1.2871972000823196e-05, "loss": 2.9023, "step": 1675 }, { "epoch": 0.67, "grad_norm": 7.545689582824707, "learning_rate": 1.2732061199901562e-05, "loss": 2.8308, "step": 1680 }, { "epoch": 0.67, "grad_norm": 16.10993003845215, "learning_rate": 1.2592654685893757e-05, "loss": 2.6319, "step": 1685 }, { "epoch": 0.68, "grad_norm": 17.161388397216797, "learning_rate": 1.2453758189255568e-05, "loss": 2.6535, "step": 1690 }, { "epoch": 0.68, "grad_norm": 9.898162841796875, "learning_rate": 1.231537741947795e-05, "loss": 3.3956, "step": 1695 }, { "epoch": 0.68, "grad_norm": 8.259682655334473, "learning_rate": 1.217751806485235e-05, "loss": 2.5027, "step": 1700 }, { "epoch": 0.68, "grad_norm": 13.10878849029541, "learning_rate": 1.2040185792236874e-05, "loss": 2.4352, "step": 1705 }, { "epoch": 0.68, "grad_norm": 5.2070722579956055, "learning_rate": 1.1903386246823361e-05, "loss": 2.8799, "step": 1710 }, { "epoch": 0.69, "grad_norm": 12.991480827331543, "learning_rate": 1.1767125051905315e-05, "loss": 3.0858, "step": 1715 }, { "epoch": 0.69, "grad_norm": 10.183439254760742, "learning_rate": 1.1631407808646758e-05, "loss": 2.767, "step": 1720 }, { "epoch": 0.69, "grad_norm": 44.539459228515625, "learning_rate": 1.1496240095852001e-05, "loss": 2.9168, "step": 1725 }, { "epoch": 0.69, "grad_norm": 9.825093269348145, "learning_rate": 1.1361627469736285e-05, "loss": 2.6493, "step": 1730 }, { "epoch": 0.69, "grad_norm": 13.887325286865234, "learning_rate": 1.122757546369744e-05, "loss": 2.8159, "step": 1735 }, { "epoch": 0.7, "grad_norm": 6.512905120849609, "learning_rate": 1.1094089588088383e-05, "loss": 3.4574, "step": 1740 }, { "epoch": 0.7, "grad_norm": 8.677343368530273, "learning_rate": 1.096117532999063e-05, "loss": 2.7936, "step": 1745 }, { "epoch": 0.7, "grad_norm": 29.443038940429688, "learning_rate": 1.082883815298876e-05, "loss": 2.7603, "step": 1750 }, { "epoch": 0.7, "grad_norm": 6.62708044052124, "learning_rate": 1.0697083496945765e-05, "loss": 2.6984, "step": 1755 }, { "epoch": 0.7, "grad_norm": 11.575033187866211, "learning_rate": 1.0565916777779519e-05, "loss": 2.7648, "step": 1760 }, { "epoch": 0.71, "grad_norm": 19.933650970458984, "learning_rate": 1.0435343387240098e-05, "loss": 2.9776, "step": 1765 }, { "epoch": 0.71, "grad_norm": 12.603243827819824, "learning_rate": 1.0305368692688174e-05, "loss": 2.4419, "step": 1770 }, { "epoch": 0.71, "grad_norm": 8.084098815917969, "learning_rate": 1.0175998036874356e-05, "loss": 2.7489, "step": 1775 }, { "epoch": 0.71, "grad_norm": 20.572294235229492, "learning_rate": 1.0047236737719601e-05, "loss": 3.0075, "step": 1780 }, { "epoch": 0.71, "grad_norm": 8.42055606842041, "learning_rate": 9.919090088096589e-06, "loss": 2.7706, "step": 1785 }, { "epoch": 0.72, "grad_norm": 33.90031051635742, "learning_rate": 9.791563355612172e-06, "loss": 3.6592, "step": 1790 }, { "epoch": 0.72, "grad_norm": 8.972926139831543, "learning_rate": 9.664661782390841e-06, "loss": 2.4741, "step": 1795 }, { "epoch": 0.72, "grad_norm": 34.49540328979492, "learning_rate": 9.538390584859214e-06, "loss": 2.8379, "step": 1800 }, { "epoch": 0.72, "grad_norm": 17.665390014648438, "learning_rate": 9.412754953531663e-06, "loss": 3.0774, "step": 1805 }, { "epoch": 0.72, "grad_norm": 24.203062057495117, "learning_rate": 9.287760052796909e-06, "loss": 2.6639, "step": 1810 }, { "epoch": 0.73, "grad_norm": 15.584200859069824, "learning_rate": 9.163411020705762e-06, "loss": 2.75, "step": 1815 }, { "epoch": 0.73, "grad_norm": 9.29335880279541, "learning_rate": 9.039712968759864e-06, "loss": 2.9486, "step": 1820 }, { "epoch": 0.73, "grad_norm": 10.088394165039062, "learning_rate": 8.916670981701655e-06, "loss": 2.251, "step": 1825 }, { "epoch": 0.73, "grad_norm": 12.336640357971191, "learning_rate": 8.794290117305296e-06, "loss": 2.9527, "step": 1830 }, { "epoch": 0.73, "grad_norm": 8.988898277282715, "learning_rate": 8.672575406168782e-06, "loss": 2.6327, "step": 1835 }, { "epoch": 0.74, "grad_norm": 11.5975980758667, "learning_rate": 8.551531851507186e-06, "loss": 2.6512, "step": 1840 }, { "epoch": 0.74, "grad_norm": 23.88221549987793, "learning_rate": 8.431164428946927e-06, "loss": 2.6542, "step": 1845 }, { "epoch": 0.74, "grad_norm": 8.243393898010254, "learning_rate": 8.3114780863213e-06, "loss": 3.0325, "step": 1850 }, { "epoch": 0.74, "grad_norm": 13.184893608093262, "learning_rate": 8.192477743467078e-06, "loss": 2.7061, "step": 1855 }, { "epoch": 0.74, "grad_norm": 7.766141414642334, "learning_rate": 8.07416829202227e-06, "loss": 2.2925, "step": 1860 }, { "epoch": 0.75, "grad_norm": 30.14096450805664, "learning_rate": 7.956554595225016e-06, "loss": 3.4068, "step": 1865 }, { "epoch": 0.75, "grad_norm": 16.042314529418945, "learning_rate": 7.839641487713745e-06, "loss": 2.388, "step": 1870 }, { "epoch": 0.75, "grad_norm": 16.486698150634766, "learning_rate": 7.723433775328384e-06, "loss": 3.549, "step": 1875 }, { "epoch": 0.75, "grad_norm": 10.384564399719238, "learning_rate": 7.607936234912841e-06, "loss": 3.186, "step": 1880 }, { "epoch": 0.75, "grad_norm": 13.724809646606445, "learning_rate": 7.493153614118634e-06, "loss": 2.6417, "step": 1885 }, { "epoch": 0.76, "grad_norm": 8.79148006439209, "learning_rate": 7.379090631209712e-06, "loss": 2.6296, "step": 1890 }, { "epoch": 0.76, "grad_norm": 14.533072471618652, "learning_rate": 7.265751974868554e-06, "loss": 2.5174, "step": 1895 }, { "epoch": 0.76, "grad_norm": 11.014676094055176, "learning_rate": 7.153142304003418e-06, "loss": 2.5473, "step": 1900 }, { "epoch": 0.76, "grad_norm": 14.28827953338623, "learning_rate": 7.041266247556813e-06, "loss": 2.8404, "step": 1905 }, { "epoch": 0.76, "grad_norm": 13.483999252319336, "learning_rate": 6.930128404315214e-06, "loss": 2.6145, "step": 1910 }, { "epoch": 0.77, "grad_norm": 22.0334529876709, "learning_rate": 6.819733342720066e-06, "loss": 2.7059, "step": 1915 }, { "epoch": 0.77, "grad_norm": 42.69429397583008, "learning_rate": 6.7100856006799665e-06, "loss": 3.0099, "step": 1920 }, { "epoch": 0.77, "grad_norm": 9.883504867553711, "learning_rate": 6.601189685384126e-06, "loss": 2.9242, "step": 1925 }, { "epoch": 0.77, "grad_norm": 17.86440658569336, "learning_rate": 6.493050073117116e-06, "loss": 3.022, "step": 1930 }, { "epoch": 0.77, "grad_norm": 27.365087509155273, "learning_rate": 6.385671209074828e-06, "loss": 2.9742, "step": 1935 }, { "epoch": 0.78, "grad_norm": 13.641215324401855, "learning_rate": 6.279057507181796e-06, "loss": 2.4529, "step": 1940 }, { "epoch": 0.78, "grad_norm": 8.5147123336792, "learning_rate": 6.173213349909729e-06, "loss": 2.5926, "step": 1945 }, { "epoch": 0.78, "grad_norm": 13.009632110595703, "learning_rate": 6.068143088097372e-06, "loss": 2.4284, "step": 1950 }, { "epoch": 0.78, "grad_norm": 11.745450973510742, "learning_rate": 5.9638510407716394e-06, "loss": 2.7295, "step": 1955 }, { "epoch": 0.78, "grad_norm": 7.102616310119629, "learning_rate": 5.860341494970131e-06, "loss": 2.3844, "step": 1960 }, { "epoch": 0.79, "grad_norm": 19.79552459716797, "learning_rate": 5.757618705564849e-06, "loss": 2.6637, "step": 1965 }, { "epoch": 0.79, "grad_norm": 10.288678169250488, "learning_rate": 5.655686895087329e-06, "loss": 2.6376, "step": 1970 }, { "epoch": 0.79, "grad_norm": 5.515965938568115, "learning_rate": 5.554550253555066e-06, "loss": 2.7672, "step": 1975 }, { "epoch": 0.79, "grad_norm": 22.45775604248047, "learning_rate": 5.454212938299255e-06, "loss": 2.9637, "step": 1980 }, { "epoch": 0.79, "grad_norm": 8.804770469665527, "learning_rate": 5.354679073793942e-06, "loss": 2.4708, "step": 1985 }, { "epoch": 0.8, "grad_norm": 14.247252464294434, "learning_rate": 5.255952751486443e-06, "loss": 2.6551, "step": 1990 }, { "epoch": 0.8, "grad_norm": 7.674140930175781, "learning_rate": 5.158038029629195e-06, "loss": 2.7633, "step": 1995 }, { "epoch": 0.8, "grad_norm": 9.93137264251709, "learning_rate": 5.060938933112891e-06, "loss": 2.7034, "step": 2000 }, { "epoch": 0.8, "grad_norm": 11.792275428771973, "learning_rate": 4.9646594533010875e-06, "loss": 2.8564, "step": 2005 }, { "epoch": 0.8, "grad_norm": 13.885851860046387, "learning_rate": 4.869203547866097e-06, "loss": 2.6809, "step": 2010 }, { "epoch": 0.81, "grad_norm": 21.813217163085938, "learning_rate": 4.7745751406263165e-06, "loss": 3.0319, "step": 2015 }, { "epoch": 0.81, "grad_norm": 12.183605194091797, "learning_rate": 4.680778121384935e-06, "loss": 2.5685, "step": 2020 }, { "epoch": 0.81, "grad_norm": 26.5369930267334, "learning_rate": 4.587816345770032e-06, "loss": 2.9539, "step": 2025 }, { "epoch": 0.81, "grad_norm": 5.17782735824585, "learning_rate": 4.495693635076101e-06, "loss": 2.3644, "step": 2030 }, { "epoch": 0.81, "grad_norm": 21.22880744934082, "learning_rate": 4.404413776106958e-06, "loss": 3.0965, "step": 2035 }, { "epoch": 0.82, "grad_norm": 5.561256408691406, "learning_rate": 4.313980521020092e-06, "loss": 3.032, "step": 2040 }, { "epoch": 0.82, "grad_norm": 7.701003074645996, "learning_rate": 4.224397587172402e-06, "loss": 2.7757, "step": 2045 }, { "epoch": 0.82, "grad_norm": 17.345849990844727, "learning_rate": 4.135668656967434e-06, "loss": 2.3689, "step": 2050 }, { "epoch": 0.82, "grad_norm": 8.013896942138672, "learning_rate": 4.047797377703985e-06, "loss": 2.512, "step": 2055 }, { "epoch": 0.82, "grad_norm": 6.709656715393066, "learning_rate": 3.9607873614261715e-06, "loss": 3.0421, "step": 2060 }, { "epoch": 0.83, "grad_norm": 13.18295669555664, "learning_rate": 3.8746421847749765e-06, "loss": 2.4467, "step": 2065 }, { "epoch": 0.83, "grad_norm": 9.89919662475586, "learning_rate": 3.789365388841193e-06, "loss": 2.322, "step": 2070 }, { "epoch": 0.83, "grad_norm": 20.111215591430664, "learning_rate": 3.7049604790198976e-06, "loss": 3.2308, "step": 2075 }, { "epoch": 0.83, "grad_norm": 9.88414192199707, "learning_rate": 3.621430924866348e-06, "loss": 2.5939, "step": 2080 }, { "epoch": 0.83, "grad_norm": 8.166717529296875, "learning_rate": 3.5387801599533475e-06, "loss": 2.9439, "step": 2085 }, { "epoch": 0.84, "grad_norm": 31.1898193359375, "learning_rate": 3.4570115817301243e-06, "loss": 3.2292, "step": 2090 }, { "epoch": 0.84, "grad_norm": 26.66686248779297, "learning_rate": 3.3761285513826625e-06, "loss": 2.9332, "step": 2095 }, { "epoch": 0.84, "grad_norm": 4.934303283691406, "learning_rate": 3.296134393695538e-06, "loss": 2.7759, "step": 2100 }, { "epoch": 0.84, "grad_norm": 11.340413093566895, "learning_rate": 3.217032396915265e-06, "loss": 2.3886, "step": 2105 }, { "epoch": 0.84, "grad_norm": 10.123784065246582, "learning_rate": 3.1388258126151093e-06, "loss": 2.6871, "step": 2110 }, { "epoch": 0.85, "grad_norm": 9.508092880249023, "learning_rate": 3.06151785556143e-06, "loss": 3.0332, "step": 2115 }, { "epoch": 0.85, "grad_norm": 6.981659412384033, "learning_rate": 2.98511170358155e-06, "loss": 2.6556, "step": 2120 }, { "epoch": 0.85, "grad_norm": 7.004157066345215, "learning_rate": 2.9096104974331184e-06, "loss": 2.7863, "step": 2125 }, { "epoch": 0.85, "grad_norm": 15.677950859069824, "learning_rate": 2.8350173406749973e-06, "loss": 2.7697, "step": 2130 }, { "epoch": 0.85, "grad_norm": 7.333318710327148, "learning_rate": 2.7613352995397078e-06, "loss": 2.6726, "step": 2135 }, { "epoch": 0.86, "grad_norm": 5.832099437713623, "learning_rate": 2.688567402807357e-06, "loss": 2.8863, "step": 2140 }, { "epoch": 0.86, "grad_norm": 10.78770637512207, "learning_rate": 2.6167166416811746e-06, "loss": 2.8584, "step": 2145 }, { "epoch": 0.86, "grad_norm": 16.921499252319336, "learning_rate": 2.545785969664524e-06, "loss": 2.9597, "step": 2150 }, { "epoch": 0.86, "grad_norm": 8.275824546813965, "learning_rate": 2.475778302439524e-06, "loss": 2.1545, "step": 2155 }, { "epoch": 0.86, "grad_norm": 20.255998611450195, "learning_rate": 2.4066965177471645e-06, "loss": 2.5138, "step": 2160 }, { "epoch": 0.87, "grad_norm": 6.863336086273193, "learning_rate": 2.338543455269046e-06, "loss": 2.5736, "step": 2165 }, { "epoch": 0.87, "grad_norm": 10.660894393920898, "learning_rate": 2.271321916510627e-06, "loss": 2.5524, "step": 2170 }, { "epoch": 0.87, "grad_norm": 13.18586254119873, "learning_rate": 2.205034664686076e-06, "loss": 2.5716, "step": 2175 }, { "epoch": 0.87, "grad_norm": 11.016261100769043, "learning_rate": 2.1396844246046903e-06, "loss": 3.3971, "step": 2180 }, { "epoch": 0.87, "grad_norm": 23.928808212280273, "learning_rate": 2.075273882558873e-06, "loss": 2.7627, "step": 2185 }, { "epoch": 0.88, "grad_norm": 8.036683082580566, "learning_rate": 2.0118056862137357e-06, "loss": 2.7452, "step": 2190 }, { "epoch": 0.88, "grad_norm": 15.620336532592773, "learning_rate": 1.949282444498238e-06, "loss": 2.7892, "step": 2195 }, { "epoch": 0.88, "grad_norm": 5.829482078552246, "learning_rate": 1.8877067274979648e-06, "loss": 2.7427, "step": 2200 }, { "epoch": 0.88, "grad_norm": 24.735841751098633, "learning_rate": 1.827081066349459e-06, "loss": 2.3911, "step": 2205 }, { "epoch": 0.88, "grad_norm": 15.522488594055176, "learning_rate": 1.767407953136202e-06, "loss": 2.7896, "step": 2210 }, { "epoch": 0.89, "grad_norm": 10.789400100708008, "learning_rate": 1.7086898407861485e-06, "loss": 2.6058, "step": 2215 }, { "epoch": 0.89, "grad_norm": 12.961084365844727, "learning_rate": 1.6509291429709223e-06, "loss": 2.1545, "step": 2220 }, { "epoch": 0.89, "grad_norm": 42.19524002075195, "learning_rate": 1.59412823400657e-06, "loss": 2.7638, "step": 2225 }, { "epoch": 0.89, "grad_norm": 11.88717269897461, "learning_rate": 1.538289448755989e-06, "loss": 3.258, "step": 2230 }, { "epoch": 0.89, "grad_norm": 9.746241569519043, "learning_rate": 1.483415082532938e-06, "loss": 2.4654, "step": 2235 }, { "epoch": 0.9, "grad_norm": 8.284168243408203, "learning_rate": 1.4295073910076757e-06, "loss": 2.8934, "step": 2240 }, { "epoch": 0.9, "grad_norm": 6.290571212768555, "learning_rate": 1.3765685901142716e-06, "loss": 2.8615, "step": 2245 }, { "epoch": 0.9, "grad_norm": 9.758255004882812, "learning_rate": 1.3246008559594709e-06, "loss": 2.4915, "step": 2250 }, { "epoch": 0.9, "grad_norm": 64.08142852783203, "learning_rate": 1.273606324733284e-06, "loss": 2.6644, "step": 2255 }, { "epoch": 0.9, "grad_norm": 6.796569347381592, "learning_rate": 1.2235870926211619e-06, "loss": 3.0118, "step": 2260 }, { "epoch": 0.91, "grad_norm": 48.14067459106445, "learning_rate": 1.1745452157178206e-06, "loss": 2.5428, "step": 2265 }, { "epoch": 0.91, "grad_norm": 78.56604766845703, "learning_rate": 1.1264827099427417e-06, "loss": 3.2121, "step": 2270 }, { "epoch": 0.91, "grad_norm": 8.145052909851074, "learning_rate": 1.0794015509572818e-06, "loss": 2.7715, "step": 2275 }, { "epoch": 0.91, "grad_norm": 9.410415649414062, "learning_rate": 1.0333036740834856e-06, "loss": 2.9311, "step": 2280 }, { "epoch": 0.91, "grad_norm": 26.370338439941406, "learning_rate": 9.881909742245177e-07, "loss": 2.8867, "step": 2285 }, { "epoch": 0.92, "grad_norm": 19.1849365234375, "learning_rate": 9.440653057867815e-07, "loss": 2.559, "step": 2290 }, { "epoch": 0.92, "grad_norm": 25.568355560302734, "learning_rate": 9.009284826036691e-07, "loss": 2.5787, "step": 2295 }, { "epoch": 0.92, "grad_norm": 14.307110786437988, "learning_rate": 8.587822778610283e-07, "loss": 2.8058, "step": 2300 }, { "epoch": 0.92, "grad_norm": 25.297014236450195, "learning_rate": 8.176284240242638e-07, "loss": 2.8921, "step": 2305 }, { "epoch": 0.92, "grad_norm": 19.239059448242188, "learning_rate": 7.774686127671183e-07, "loss": 2.6122, "step": 2310 }, { "epoch": 0.93, "grad_norm": 11.51448917388916, "learning_rate": 7.383044949021339e-07, "loss": 2.2361, "step": 2315 }, { "epoch": 0.93, "grad_norm": 6.253133773803711, "learning_rate": 7.00137680312804e-07, "loss": 2.5475, "step": 2320 }, { "epoch": 0.93, "grad_norm": 17.918170928955078, "learning_rate": 6.62969737887384e-07, "loss": 2.8589, "step": 2325 }, { "epoch": 0.93, "grad_norm": 15.065885543823242, "learning_rate": 6.268021954544096e-07, "loss": 2.9153, "step": 2330 }, { "epoch": 0.93, "grad_norm": 13.842984199523926, "learning_rate": 5.916365397198975e-07, "loss": 2.894, "step": 2335 }, { "epoch": 0.94, "grad_norm": 10.621698379516602, "learning_rate": 5.574742162062163e-07, "loss": 3.1294, "step": 2340 }, { "epoch": 0.94, "grad_norm": 13.219593048095703, "learning_rate": 5.243166291926782e-07, "loss": 2.6543, "step": 2345 }, { "epoch": 0.94, "grad_norm": 11.599324226379395, "learning_rate": 4.921651416578188e-07, "loss": 2.3677, "step": 2350 }, { "epoch": 0.94, "grad_norm": 8.731278419494629, "learning_rate": 4.6102107522336403e-07, "loss": 2.7634, "step": 2355 }, { "epoch": 0.94, "grad_norm": 43.908077239990234, "learning_rate": 4.308857100999042e-07, "loss": 2.8106, "step": 2360 }, { "epoch": 0.95, "grad_norm": 9.233963966369629, "learning_rate": 4.0176028503425835e-07, "loss": 2.5252, "step": 2365 }, { "epoch": 0.95, "grad_norm": 13.496291160583496, "learning_rate": 3.7364599725858153e-07, "loss": 2.2908, "step": 2370 }, { "epoch": 0.95, "grad_norm": 11.278969764709473, "learning_rate": 3.465440024411265e-07, "loss": 2.5045, "step": 2375 }, { "epoch": 0.95, "grad_norm": 13.121251106262207, "learning_rate": 3.204554146387456e-07, "loss": 2.4661, "step": 2380 }, { "epoch": 0.95, "grad_norm": 17.475078582763672, "learning_rate": 2.9538130625110796e-07, "loss": 2.9907, "step": 2385 }, { "epoch": 0.96, "grad_norm": 6.753970146179199, "learning_rate": 2.7132270797659563e-07, "loss": 2.4326, "step": 2390 }, { "epoch": 0.96, "grad_norm": 18.052387237548828, "learning_rate": 2.482806087699546e-07, "loss": 3.0056, "step": 2395 }, { "epoch": 0.96, "grad_norm": 11.094266891479492, "learning_rate": 2.262559558016325e-07, "loss": 2.7058, "step": 2400 } ], "logging_steps": 5, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "total_flos": 1.3202552777146368e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }