{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0012865413480094, "eval_steps": 500, "global_step": 7000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 9.371515465709024, "learning_rate": 4.036571428571429e-07, "loss": 0.4514, "step": 10 }, { "epoch": 0.01, "grad_norm": 8.455797488672111, "learning_rate": 6.299428571428571e-07, "loss": 0.4763, "step": 20 }, { "epoch": 0.01, "grad_norm": 7.8003218070916445, "learning_rate": 8.562285714285715e-07, "loss": 0.3688, "step": 30 }, { "epoch": 0.01, "grad_norm": 9.541799841427569, "learning_rate": 1.082514285714286e-06, "loss": 0.469, "step": 40 }, { "epoch": 0.01, "grad_norm": 6.601466865524005, "learning_rate": 1.3088e-06, "loss": 0.4116, "step": 50 }, { "epoch": 0.02, "grad_norm": 10.592066448199162, "learning_rate": 1.5350857142857146e-06, "loss": 0.4571, "step": 60 }, { "epoch": 0.02, "grad_norm": 8.856963406340235, "learning_rate": 1.7613714285714286e-06, "loss": 0.3603, "step": 70 }, { "epoch": 0.02, "grad_norm": 10.172045634489491, "learning_rate": 1.987657142857143e-06, "loss": 0.4502, "step": 80 }, { "epoch": 0.03, "grad_norm": 12.044668349295934, "learning_rate": 2.2139428571428572e-06, "loss": 0.4041, "step": 90 }, { "epoch": 0.03, "grad_norm": 9.210970631831845, "learning_rate": 2.4402285714285715e-06, "loss": 0.3747, "step": 100 }, { "epoch": 0.03, "grad_norm": 8.834495851805377, "learning_rate": 2.666514285714286e-06, "loss": 0.361, "step": 110 }, { "epoch": 0.03, "grad_norm": 11.3238205722943, "learning_rate": 2.8928000000000005e-06, "loss": 0.3222, "step": 120 }, { "epoch": 0.04, "grad_norm": 11.121942678675966, "learning_rate": 3.119085714285715e-06, "loss": 0.3948, "step": 130 }, { "epoch": 0.04, "grad_norm": 14.085567560484616, "learning_rate": 3.345371428571429e-06, "loss": 0.3692, "step": 140 }, { "epoch": 0.04, "grad_norm": 9.420475642350816, "learning_rate": 3.5716571428571433e-06, "loss": 0.3641, "step": 150 }, { "epoch": 0.05, "grad_norm": 10.015469601953582, "learning_rate": 3.7979428571428576e-06, "loss": 0.3473, "step": 160 }, { "epoch": 0.05, "grad_norm": 7.300818086534276, "learning_rate": 4.0242285714285715e-06, "loss": 0.2811, "step": 170 }, { "epoch": 0.05, "grad_norm": 4.6093932070614985, "learning_rate": 4.250514285714286e-06, "loss": 0.3099, "step": 180 }, { "epoch": 0.05, "grad_norm": 8.54101584831637, "learning_rate": 4.476800000000001e-06, "loss": 0.3204, "step": 190 }, { "epoch": 0.06, "grad_norm": 6.339057577468356, "learning_rate": 4.703085714285715e-06, "loss": 0.368, "step": 200 }, { "epoch": 0.06, "grad_norm": 9.276047195584283, "learning_rate": 4.929371428571429e-06, "loss": 0.3717, "step": 210 }, { "epoch": 0.06, "grad_norm": 36.896689860423116, "learning_rate": 5.155657142857143e-06, "loss": 0.369, "step": 220 }, { "epoch": 0.07, "grad_norm": 6.251253231286501, "learning_rate": 5.381942857142858e-06, "loss": 0.3515, "step": 230 }, { "epoch": 0.07, "grad_norm": 12.484764885573412, "learning_rate": 5.608228571428572e-06, "loss": 0.2748, "step": 240 }, { "epoch": 0.07, "grad_norm": 11.645571647165193, "learning_rate": 5.834514285714287e-06, "loss": 0.3782, "step": 250 }, { "epoch": 0.07, "grad_norm": 10.692599680491462, "learning_rate": 6.0608e-06, "loss": 0.438, "step": 260 }, { "epoch": 0.08, "grad_norm": 7.129048853453849, "learning_rate": 6.287085714285715e-06, "loss": 0.2768, "step": 270 }, { "epoch": 0.08, "grad_norm": 8.422586556066586, "learning_rate": 6.513371428571429e-06, "loss": 0.2694, "step": 280 }, { "epoch": 0.08, "grad_norm": 5.8492409107659205, "learning_rate": 6.739657142857144e-06, "loss": 0.2738, "step": 290 }, { "epoch": 0.09, "grad_norm": 6.2098389235782205, "learning_rate": 6.965942857142858e-06, "loss": 0.3143, "step": 300 }, { "epoch": 0.09, "grad_norm": 10.701812619129571, "learning_rate": 7.192228571428572e-06, "loss": 0.2848, "step": 310 }, { "epoch": 0.09, "grad_norm": 10.387394866227702, "learning_rate": 7.418514285714285e-06, "loss": 0.2988, "step": 320 }, { "epoch": 0.09, "grad_norm": 11.98648518233628, "learning_rate": 7.644800000000002e-06, "loss": 0.3174, "step": 330 }, { "epoch": 0.1, "grad_norm": 7.293199679575633, "learning_rate": 7.871085714285716e-06, "loss": 0.3183, "step": 340 }, { "epoch": 0.1, "grad_norm": 10.484247653147987, "learning_rate": 8.09737142857143e-06, "loss": 0.3022, "step": 350 }, { "epoch": 0.1, "grad_norm": 16.443272030479804, "learning_rate": 8.323657142857145e-06, "loss": 0.3653, "step": 360 }, { "epoch": 0.11, "grad_norm": 6.078106799257775, "learning_rate": 8.549942857142857e-06, "loss": 0.2418, "step": 370 }, { "epoch": 0.11, "grad_norm": 10.702087797600011, "learning_rate": 8.776228571428573e-06, "loss": 0.2554, "step": 380 }, { "epoch": 0.11, "grad_norm": 8.627718731946361, "learning_rate": 9.002514285714287e-06, "loss": 0.2065, "step": 390 }, { "epoch": 0.11, "grad_norm": 10.03683515843309, "learning_rate": 9.2288e-06, "loss": 0.3217, "step": 400 }, { "epoch": 0.12, "grad_norm": 8.103345919303372, "learning_rate": 9.455085714285714e-06, "loss": 0.2682, "step": 410 }, { "epoch": 0.12, "grad_norm": 9.210551505377033, "learning_rate": 9.68137142857143e-06, "loss": 0.2875, "step": 420 }, { "epoch": 0.12, "grad_norm": 9.32931045846214, "learning_rate": 9.907657142857144e-06, "loss": 0.3036, "step": 430 }, { "epoch": 0.13, "grad_norm": 9.811646248189412, "learning_rate": 1.0133942857142856e-05, "loss": 0.2939, "step": 440 }, { "epoch": 0.13, "grad_norm": 8.14950809047388, "learning_rate": 1.0360228571428571e-05, "loss": 0.2708, "step": 450 }, { "epoch": 0.13, "grad_norm": 13.482169915842496, "learning_rate": 1.0586514285714287e-05, "loss": 0.2902, "step": 460 }, { "epoch": 0.13, "grad_norm": 10.89787289924173, "learning_rate": 1.0812800000000001e-05, "loss": 0.2696, "step": 470 }, { "epoch": 0.14, "grad_norm": 7.990196420024129, "learning_rate": 1.1039085714285713e-05, "loss": 0.3017, "step": 480 }, { "epoch": 0.14, "grad_norm": 12.729985703477526, "learning_rate": 1.126537142857143e-05, "loss": 0.2506, "step": 490 }, { "epoch": 0.14, "grad_norm": 7.581057140542081, "learning_rate": 1.1491657142857144e-05, "loss": 0.3436, "step": 500 }, { "epoch": 0.15, "grad_norm": 7.896637270888684, "learning_rate": 1.1717942857142858e-05, "loss": 0.2504, "step": 510 }, { "epoch": 0.15, "grad_norm": 9.62086497076248, "learning_rate": 1.1944228571428574e-05, "loss": 0.3033, "step": 520 }, { "epoch": 0.15, "grad_norm": 11.13637449869111, "learning_rate": 1.2170514285714286e-05, "loss": 0.326, "step": 530 }, { "epoch": 0.15, "grad_norm": 7.842448457129596, "learning_rate": 1.23968e-05, "loss": 0.2394, "step": 540 }, { "epoch": 0.16, "grad_norm": 11.8255362455627, "learning_rate": 1.2623085714285717e-05, "loss": 0.2891, "step": 550 }, { "epoch": 0.16, "grad_norm": 12.35279287040865, "learning_rate": 1.284937142857143e-05, "loss": 0.2562, "step": 560 }, { "epoch": 0.16, "grad_norm": 8.285377251295747, "learning_rate": 1.3075657142857143e-05, "loss": 0.2909, "step": 570 }, { "epoch": 0.17, "grad_norm": 5.040282298462395, "learning_rate": 1.3301942857142858e-05, "loss": 0.2498, "step": 580 }, { "epoch": 0.17, "grad_norm": 13.085847714244219, "learning_rate": 1.3528228571428572e-05, "loss": 0.2309, "step": 590 }, { "epoch": 0.17, "grad_norm": 14.666708397083449, "learning_rate": 1.3754514285714286e-05, "loss": 0.2492, "step": 600 }, { "epoch": 0.17, "grad_norm": 9.654868017763354, "learning_rate": 1.3980800000000002e-05, "loss": 0.2733, "step": 610 }, { "epoch": 0.18, "grad_norm": 6.642454366588229, "learning_rate": 1.4207085714285716e-05, "loss": 0.2913, "step": 620 }, { "epoch": 0.18, "grad_norm": 9.20069280586577, "learning_rate": 1.443337142857143e-05, "loss": 0.242, "step": 630 }, { "epoch": 0.18, "grad_norm": 5.949416559929591, "learning_rate": 1.4659657142857145e-05, "loss": 0.2308, "step": 640 }, { "epoch": 0.19, "grad_norm": 11.359091564981798, "learning_rate": 1.4885942857142857e-05, "loss": 0.2677, "step": 650 }, { "epoch": 0.19, "grad_norm": 6.880379808721017, "learning_rate": 1.5112228571428571e-05, "loss": 0.2351, "step": 660 }, { "epoch": 0.19, "grad_norm": 10.707160935004477, "learning_rate": 1.5338514285714287e-05, "loss": 0.2678, "step": 670 }, { "epoch": 0.19, "grad_norm": 6.169276982362473, "learning_rate": 1.55648e-05, "loss": 0.2598, "step": 680 }, { "epoch": 0.2, "grad_norm": 15.858254559405284, "learning_rate": 1.5791085714285718e-05, "loss": 0.2626, "step": 690 }, { "epoch": 0.2, "grad_norm": 5.454785463081219, "learning_rate": 1.6017371428571428e-05, "loss": 0.2354, "step": 700 }, { "epoch": 0.2, "grad_norm": 13.73774311659306, "learning_rate": 1.6243657142857142e-05, "loss": 0.2349, "step": 710 }, { "epoch": 0.21, "grad_norm": 11.186423478997424, "learning_rate": 1.646994285714286e-05, "loss": 0.2746, "step": 720 }, { "epoch": 0.21, "grad_norm": 5.473687607748699, "learning_rate": 1.6696228571428573e-05, "loss": 0.2074, "step": 730 }, { "epoch": 0.21, "grad_norm": 11.168516281064829, "learning_rate": 1.6922514285714287e-05, "loss": 0.3105, "step": 740 }, { "epoch": 0.21, "grad_norm": 5.404004286554699, "learning_rate": 1.71488e-05, "loss": 0.2731, "step": 750 }, { "epoch": 0.22, "grad_norm": 9.620468856584267, "learning_rate": 1.7375085714285715e-05, "loss": 0.3306, "step": 760 }, { "epoch": 0.22, "grad_norm": 9.403860200466715, "learning_rate": 1.760137142857143e-05, "loss": 0.2664, "step": 770 }, { "epoch": 0.22, "grad_norm": 8.928021293040489, "learning_rate": 1.7827657142857146e-05, "loss": 0.1846, "step": 780 }, { "epoch": 0.23, "grad_norm": 9.747870579514325, "learning_rate": 1.805394285714286e-05, "loss": 0.2284, "step": 790 }, { "epoch": 0.23, "grad_norm": 10.011503469568964, "learning_rate": 1.8280228571428574e-05, "loss": 0.2189, "step": 800 }, { "epoch": 0.23, "grad_norm": 6.118483521812366, "learning_rate": 1.8506514285714287e-05, "loss": 0.2727, "step": 810 }, { "epoch": 0.23, "grad_norm": 8.207314778241457, "learning_rate": 1.87328e-05, "loss": 0.2791, "step": 820 }, { "epoch": 0.24, "grad_norm": 5.858610952789285, "learning_rate": 1.8959085714285715e-05, "loss": 0.2396, "step": 830 }, { "epoch": 0.24, "grad_norm": 10.83113071840425, "learning_rate": 1.9185371428571432e-05, "loss": 0.2925, "step": 840 }, { "epoch": 0.24, "grad_norm": 5.528793392886516, "learning_rate": 1.9411657142857146e-05, "loss": 0.2517, "step": 850 }, { "epoch": 0.25, "grad_norm": 20.177843175736335, "learning_rate": 1.963794285714286e-05, "loss": 0.1734, "step": 860 }, { "epoch": 0.25, "grad_norm": 11.587005545427413, "learning_rate": 1.986422857142857e-05, "loss": 0.2266, "step": 870 }, { "epoch": 0.25, "grad_norm": 20.058565223324408, "learning_rate": 1.9999995528769805e-05, "loss": 0.2119, "step": 880 }, { "epoch": 0.25, "grad_norm": 3.9780862882757804, "learning_rate": 1.999995975895224e-05, "loss": 0.2415, "step": 890 }, { "epoch": 0.26, "grad_norm": 8.131181918071052, "learning_rate": 1.999988821944506e-05, "loss": 0.1773, "step": 900 }, { "epoch": 0.26, "grad_norm": 6.262960804567659, "learning_rate": 1.9999780910504195e-05, "loss": 0.2974, "step": 910 }, { "epoch": 0.26, "grad_norm": 5.901669941581385, "learning_rate": 1.9999637832513524e-05, "loss": 0.2659, "step": 920 }, { "epoch": 0.27, "grad_norm": 6.216938924409966, "learning_rate": 1.9999458985984886e-05, "loss": 0.2359, "step": 930 }, { "epoch": 0.27, "grad_norm": 10.346253797939001, "learning_rate": 1.9999244371558077e-05, "loss": 0.243, "step": 940 }, { "epoch": 0.27, "grad_norm": 17.593493281874835, "learning_rate": 1.999899399000084e-05, "loss": 0.2309, "step": 950 }, { "epoch": 0.27, "grad_norm": 7.8447252860270655, "learning_rate": 1.999870784220888e-05, "loss": 0.2354, "step": 960 }, { "epoch": 0.28, "grad_norm": 5.275135902715139, "learning_rate": 1.9998385929205847e-05, "loss": 0.2714, "step": 970 }, { "epoch": 0.28, "grad_norm": 7.885648763475231, "learning_rate": 1.9998028252143332e-05, "loss": 0.3331, "step": 980 }, { "epoch": 0.28, "grad_norm": 5.652221931457679, "learning_rate": 1.9997634812300866e-05, "loss": 0.2806, "step": 990 }, { "epoch": 0.29, "grad_norm": 7.149741865713607, "learning_rate": 1.999720561108592e-05, "loss": 0.2655, "step": 1000 }, { "epoch": 0.29, "grad_norm": 7.241471634825913, "learning_rate": 1.999674065003389e-05, "loss": 0.2973, "step": 1010 }, { "epoch": 0.29, "grad_norm": 8.52695633711115, "learning_rate": 1.9996239930808104e-05, "loss": 0.2394, "step": 1020 }, { "epoch": 0.29, "grad_norm": 10.1463093629637, "learning_rate": 1.9995703455199803e-05, "loss": 0.2233, "step": 1030 }, { "epoch": 0.3, "grad_norm": 13.648598489319562, "learning_rate": 1.9995131225128146e-05, "loss": 0.2284, "step": 1040 }, { "epoch": 0.3, "grad_norm": 6.789267867200956, "learning_rate": 1.999452324264019e-05, "loss": 0.2608, "step": 1050 }, { "epoch": 0.3, "grad_norm": 10.519271966378506, "learning_rate": 1.9993879509910905e-05, "loss": 0.2056, "step": 1060 }, { "epoch": 0.31, "grad_norm": 11.293702752542806, "learning_rate": 1.999320002924313e-05, "loss": 0.1504, "step": 1070 }, { "epoch": 0.31, "grad_norm": 6.540696317135377, "learning_rate": 1.9992484803067606e-05, "loss": 0.2475, "step": 1080 }, { "epoch": 0.31, "grad_norm": 5.647577539513675, "learning_rate": 1.9991733833942945e-05, "loss": 0.2754, "step": 1090 }, { "epoch": 0.31, "grad_norm": 5.066069486627178, "learning_rate": 1.999094712455561e-05, "loss": 0.2842, "step": 1100 }, { "epoch": 0.32, "grad_norm": 7.894192272611739, "learning_rate": 1.9990124677719934e-05, "loss": 0.212, "step": 1110 }, { "epoch": 0.32, "grad_norm": 5.663160960040441, "learning_rate": 1.9989266496378094e-05, "loss": 0.2599, "step": 1120 }, { "epoch": 0.32, "grad_norm": 15.186474675233994, "learning_rate": 1.998837258360009e-05, "loss": 0.261, "step": 1130 }, { "epoch": 0.33, "grad_norm": 9.65719728236852, "learning_rate": 1.998744294258376e-05, "loss": 0.2192, "step": 1140 }, { "epoch": 0.33, "grad_norm": 9.9677760685831, "learning_rate": 1.9986477576654738e-05, "loss": 0.3029, "step": 1150 }, { "epoch": 0.33, "grad_norm": 5.994411727129336, "learning_rate": 1.9985476489266476e-05, "loss": 0.2137, "step": 1160 }, { "epoch": 0.33, "grad_norm": 12.235199999123623, "learning_rate": 1.9984439684000204e-05, "loss": 0.2542, "step": 1170 }, { "epoch": 0.34, "grad_norm": 5.622683238719617, "learning_rate": 1.998336716456492e-05, "loss": 0.2316, "step": 1180 }, { "epoch": 0.34, "grad_norm": 10.572770221366234, "learning_rate": 1.9982258934797397e-05, "loss": 0.2017, "step": 1190 }, { "epoch": 0.34, "grad_norm": 8.085887492297054, "learning_rate": 1.9981114998662157e-05, "loss": 0.2183, "step": 1200 }, { "epoch": 0.35, "grad_norm": 12.27546753779604, "learning_rate": 1.9979935360251438e-05, "loss": 0.2255, "step": 1210 }, { "epoch": 0.35, "grad_norm": 6.897638674285995, "learning_rate": 1.997872002378522e-05, "loss": 0.2705, "step": 1220 }, { "epoch": 0.35, "grad_norm": 7.809337006194467, "learning_rate": 1.9977468993611168e-05, "loss": 0.2572, "step": 1230 }, { "epoch": 0.35, "grad_norm": 6.421037382645887, "learning_rate": 1.9976182274204644e-05, "loss": 0.2019, "step": 1240 }, { "epoch": 0.36, "grad_norm": 10.192332613489608, "learning_rate": 1.9974859870168692e-05, "loss": 0.235, "step": 1250 }, { "epoch": 0.36, "grad_norm": 11.44316452753769, "learning_rate": 1.9973501786233993e-05, "loss": 0.2348, "step": 1260 }, { "epoch": 0.36, "grad_norm": 7.907164027035195, "learning_rate": 1.9972108027258875e-05, "loss": 0.2042, "step": 1270 }, { "epoch": 0.37, "grad_norm": 7.094461413769536, "learning_rate": 1.9970678598229296e-05, "loss": 0.2637, "step": 1280 }, { "epoch": 0.37, "grad_norm": 8.773372279644047, "learning_rate": 1.9969213504258806e-05, "loss": 0.2801, "step": 1290 }, { "epoch": 0.37, "grad_norm": 3.6516375016951805, "learning_rate": 1.9967712750588554e-05, "loss": 0.2354, "step": 1300 }, { "epoch": 0.37, "grad_norm": 6.447398159233043, "learning_rate": 1.9966176342587232e-05, "loss": 0.2914, "step": 1310 }, { "epoch": 0.38, "grad_norm": 8.830674497044601, "learning_rate": 1.996460428575111e-05, "loss": 0.2626, "step": 1320 }, { "epoch": 0.38, "grad_norm": 7.188889079001396, "learning_rate": 1.996299658570397e-05, "loss": 0.2217, "step": 1330 }, { "epoch": 0.38, "grad_norm": 17.24214856135159, "learning_rate": 1.9961353248197096e-05, "loss": 0.2397, "step": 1340 }, { "epoch": 0.39, "grad_norm": 5.826992876071087, "learning_rate": 1.9959674279109272e-05, "loss": 0.1537, "step": 1350 }, { "epoch": 0.39, "grad_norm": 10.913933525879852, "learning_rate": 1.995795968444674e-05, "loss": 0.227, "step": 1360 }, { "epoch": 0.39, "grad_norm": 5.4268271397350984, "learning_rate": 1.9956209470343194e-05, "loss": 0.2152, "step": 1370 }, { "epoch": 0.39, "grad_norm": 5.76952662173985, "learning_rate": 1.995442364305975e-05, "loss": 0.2558, "step": 1380 }, { "epoch": 0.4, "grad_norm": 9.179136868458949, "learning_rate": 1.9952602208984907e-05, "loss": 0.2096, "step": 1390 }, { "epoch": 0.4, "grad_norm": 9.129998079364487, "learning_rate": 1.995074517463457e-05, "loss": 0.2117, "step": 1400 }, { "epoch": 0.4, "grad_norm": 8.71664742442743, "learning_rate": 1.9948852546651977e-05, "loss": 0.2332, "step": 1410 }, { "epoch": 0.41, "grad_norm": 8.885439468360495, "learning_rate": 1.994692433180771e-05, "loss": 0.2544, "step": 1420 }, { "epoch": 0.41, "grad_norm": 7.019215232261128, "learning_rate": 1.9944960536999648e-05, "loss": 0.2334, "step": 1430 }, { "epoch": 0.41, "grad_norm": 9.896500371570356, "learning_rate": 1.994296116925295e-05, "loss": 0.2156, "step": 1440 }, { "epoch": 0.41, "grad_norm": 8.711469557041552, "learning_rate": 1.9940926235720042e-05, "loss": 0.333, "step": 1450 }, { "epoch": 0.42, "grad_norm": 9.798453804578006, "learning_rate": 1.9938855743680576e-05, "loss": 0.2166, "step": 1460 }, { "epoch": 0.42, "grad_norm": 14.342977665072942, "learning_rate": 1.9936749700541403e-05, "loss": 0.2516, "step": 1470 }, { "epoch": 0.42, "grad_norm": 7.3235895563076046, "learning_rate": 1.9934608113836562e-05, "loss": 0.3124, "step": 1480 }, { "epoch": 0.43, "grad_norm": 6.759392314435464, "learning_rate": 1.993243099122724e-05, "loss": 0.2179, "step": 1490 }, { "epoch": 0.43, "grad_norm": 15.762200368261366, "learning_rate": 1.9930218340501743e-05, "loss": 0.1941, "step": 1500 }, { "epoch": 0.43, "grad_norm": 5.025042952168924, "learning_rate": 1.9927970169575482e-05, "loss": 0.2238, "step": 1510 }, { "epoch": 0.43, "grad_norm": 9.731684378727454, "learning_rate": 1.9925686486490927e-05, "loss": 0.2307, "step": 1520 }, { "epoch": 0.44, "grad_norm": 7.258274749390965, "learning_rate": 1.9923367299417595e-05, "loss": 0.2653, "step": 1530 }, { "epoch": 0.44, "grad_norm": 12.122217232617373, "learning_rate": 1.9921012616652004e-05, "loss": 0.2675, "step": 1540 }, { "epoch": 0.44, "grad_norm": 4.783101041428015, "learning_rate": 1.9918622446617664e-05, "loss": 0.2004, "step": 1550 }, { "epoch": 0.45, "grad_norm": 10.242505566905294, "learning_rate": 1.9916196797865026e-05, "loss": 0.2696, "step": 1560 }, { "epoch": 0.45, "grad_norm": 6.4406312900829, "learning_rate": 1.9913735679071458e-05, "loss": 0.186, "step": 1570 }, { "epoch": 0.45, "grad_norm": 9.432405625231892, "learning_rate": 1.9911239099041228e-05, "loss": 0.2091, "step": 1580 }, { "epoch": 0.45, "grad_norm": 13.148668437885759, "learning_rate": 1.9908707066705454e-05, "loss": 0.2395, "step": 1590 }, { "epoch": 0.46, "grad_norm": 10.094266612529323, "learning_rate": 1.9906139591122074e-05, "loss": 0.2162, "step": 1600 }, { "epoch": 0.46, "grad_norm": 7.352172437088083, "learning_rate": 1.990353668147583e-05, "loss": 0.2283, "step": 1610 }, { "epoch": 0.46, "grad_norm": 11.311266415605491, "learning_rate": 1.9900898347078214e-05, "loss": 0.2211, "step": 1620 }, { "epoch": 0.47, "grad_norm": 6.451306989861931, "learning_rate": 1.989822459736745e-05, "loss": 0.2413, "step": 1630 }, { "epoch": 0.47, "grad_norm": 7.0906026302706024, "learning_rate": 1.989551544190845e-05, "loss": 0.2268, "step": 1640 }, { "epoch": 0.47, "grad_norm": 4.48545308728041, "learning_rate": 1.989277089039279e-05, "loss": 0.1713, "step": 1650 }, { "epoch": 0.47, "grad_norm": 9.435086983915038, "learning_rate": 1.988999095263866e-05, "loss": 0.2089, "step": 1660 }, { "epoch": 0.48, "grad_norm": 6.9513762943214275, "learning_rate": 1.9887175638590858e-05, "loss": 0.2723, "step": 1670 }, { "epoch": 0.48, "grad_norm": 10.405711483696518, "learning_rate": 1.988432495832071e-05, "loss": 0.2874, "step": 1680 }, { "epoch": 0.48, "grad_norm": 7.245018398559039, "learning_rate": 1.9881438922026083e-05, "loss": 0.2193, "step": 1690 }, { "epoch": 0.49, "grad_norm": 6.647625273994261, "learning_rate": 1.9878517540031306e-05, "loss": 0.1607, "step": 1700 }, { "epoch": 0.49, "grad_norm": 16.238495025805964, "learning_rate": 1.9875560822787158e-05, "loss": 0.1922, "step": 1710 }, { "epoch": 0.49, "grad_norm": 9.152449616598552, "learning_rate": 1.987256878087083e-05, "loss": 0.253, "step": 1720 }, { "epoch": 0.49, "grad_norm": 4.32757542835879, "learning_rate": 1.986954142498587e-05, "loss": 0.2676, "step": 1730 }, { "epoch": 0.5, "grad_norm": 9.015049114379131, "learning_rate": 1.9866478765962174e-05, "loss": 0.2097, "step": 1740 }, { "epoch": 0.5, "grad_norm": 5.59164201453394, "learning_rate": 1.9863380814755905e-05, "loss": 0.1427, "step": 1750 }, { "epoch": 0.5, "grad_norm": 9.176763164522356, "learning_rate": 1.9860247582449503e-05, "loss": 0.2113, "step": 1760 }, { "epoch": 0.51, "grad_norm": 9.091496663612249, "learning_rate": 1.9857079080251597e-05, "loss": 0.2457, "step": 1770 }, { "epoch": 0.51, "grad_norm": 6.163796074624837, "learning_rate": 1.9853875319497007e-05, "loss": 0.2469, "step": 1780 }, { "epoch": 0.51, "grad_norm": 5.475913451475631, "learning_rate": 1.9850636311646675e-05, "loss": 0.1875, "step": 1790 }, { "epoch": 0.51, "grad_norm": 6.905994255198308, "learning_rate": 1.9847362068287642e-05, "loss": 0.2447, "step": 1800 }, { "epoch": 0.52, "grad_norm": 8.227056794563689, "learning_rate": 1.9844052601132988e-05, "loss": 0.1935, "step": 1810 }, { "epoch": 0.52, "grad_norm": 9.053551195282736, "learning_rate": 1.9840707922021808e-05, "loss": 0.191, "step": 1820 }, { "epoch": 0.52, "grad_norm": 9.90691154509339, "learning_rate": 1.9837328042919163e-05, "loss": 0.2348, "step": 1830 }, { "epoch": 0.53, "grad_norm": 5.599345727574276, "learning_rate": 1.9833912975916023e-05, "loss": 0.2295, "step": 1840 }, { "epoch": 0.53, "grad_norm": 10.623392858690584, "learning_rate": 1.9830462733229258e-05, "loss": 0.2477, "step": 1850 }, { "epoch": 0.53, "grad_norm": 16.457789210085107, "learning_rate": 1.982697732720156e-05, "loss": 0.2485, "step": 1860 }, { "epoch": 0.53, "grad_norm": 6.212279974804062, "learning_rate": 1.9823456770301412e-05, "loss": 0.2441, "step": 1870 }, { "epoch": 0.54, "grad_norm": 8.302220922946942, "learning_rate": 1.9819901075123053e-05, "loss": 0.2333, "step": 1880 }, { "epoch": 0.54, "grad_norm": 17.8362950988024, "learning_rate": 1.9816310254386416e-05, "loss": 0.285, "step": 1890 }, { "epoch": 0.54, "grad_norm": 8.496670239062311, "learning_rate": 1.9812684320937084e-05, "loss": 0.2417, "step": 1900 }, { "epoch": 0.55, "grad_norm": 8.960310417656395, "learning_rate": 1.9809023287746266e-05, "loss": 0.2059, "step": 1910 }, { "epoch": 0.55, "grad_norm": 4.865264169669909, "learning_rate": 1.9805327167910725e-05, "loss": 0.1595, "step": 1920 }, { "epoch": 0.55, "grad_norm": 4.240132264287559, "learning_rate": 1.9801595974652738e-05, "loss": 0.2267, "step": 1930 }, { "epoch": 0.55, "grad_norm": 9.441788463250255, "learning_rate": 1.9797829721320063e-05, "loss": 0.2422, "step": 1940 }, { "epoch": 0.56, "grad_norm": 10.267240585865652, "learning_rate": 1.9794028421385868e-05, "loss": 0.2485, "step": 1950 }, { "epoch": 0.56, "grad_norm": 3.4680580617494754, "learning_rate": 1.9790192088448695e-05, "loss": 0.1758, "step": 1960 }, { "epoch": 0.56, "grad_norm": 4.055399631887886, "learning_rate": 1.9786320736232427e-05, "loss": 0.1838, "step": 1970 }, { "epoch": 0.57, "grad_norm": 7.9523467337005656, "learning_rate": 1.9782414378586204e-05, "loss": 0.1497, "step": 1980 }, { "epoch": 0.57, "grad_norm": 6.5361695082686095, "learning_rate": 1.97784730294844e-05, "loss": 0.2323, "step": 1990 }, { "epoch": 0.57, "grad_norm": 12.51354612689472, "learning_rate": 1.9774496703026566e-05, "loss": 0.1749, "step": 2000 }, { "epoch": 0.57, "grad_norm": 6.237931463268067, "learning_rate": 1.977048541343738e-05, "loss": 0.2528, "step": 2010 }, { "epoch": 0.58, "grad_norm": 9.14902333534693, "learning_rate": 1.9766439175066587e-05, "loss": 0.2284, "step": 2020 }, { "epoch": 0.58, "grad_norm": 9.579486196353791, "learning_rate": 1.976235800238897e-05, "loss": 0.2053, "step": 2030 }, { "epoch": 0.58, "grad_norm": 5.186325124038825, "learning_rate": 1.9758241910004272e-05, "loss": 0.1925, "step": 2040 }, { "epoch": 0.59, "grad_norm": 8.94956273550492, "learning_rate": 1.9754090912637157e-05, "loss": 0.2996, "step": 2050 }, { "epoch": 0.59, "grad_norm": 7.508528628616132, "learning_rate": 1.974990502513716e-05, "loss": 0.245, "step": 2060 }, { "epoch": 0.59, "grad_norm": 12.502308136509608, "learning_rate": 1.974568426247863e-05, "loss": 0.184, "step": 2070 }, { "epoch": 0.59, "grad_norm": 5.923026664894129, "learning_rate": 1.9741428639760672e-05, "loss": 0.1945, "step": 2080 }, { "epoch": 0.6, "grad_norm": 7.798299838887786, "learning_rate": 1.9737138172207102e-05, "loss": 0.1839, "step": 2090 }, { "epoch": 0.6, "grad_norm": 9.074511335683333, "learning_rate": 1.9732812875166386e-05, "loss": 0.1501, "step": 2100 }, { "epoch": 0.6, "grad_norm": 3.9580316662863733, "learning_rate": 1.9728452764111584e-05, "loss": 0.254, "step": 2110 }, { "epoch": 0.61, "grad_norm": 7.720958563634273, "learning_rate": 1.9724057854640305e-05, "loss": 0.1792, "step": 2120 }, { "epoch": 0.61, "grad_norm": 6.719137491097171, "learning_rate": 1.9719628162474638e-05, "loss": 0.2252, "step": 2130 }, { "epoch": 0.61, "grad_norm": 9.16598212258154, "learning_rate": 1.97151637034611e-05, "loss": 0.2121, "step": 2140 }, { "epoch": 0.61, "grad_norm": 9.329158757626542, "learning_rate": 1.9710664493570587e-05, "loss": 0.2302, "step": 2150 }, { "epoch": 0.62, "grad_norm": 13.653238413997359, "learning_rate": 1.9706130548898305e-05, "loss": 0.2381, "step": 2160 }, { "epoch": 0.62, "grad_norm": 7.285402350065763, "learning_rate": 1.970156188566372e-05, "loss": 0.2359, "step": 2170 }, { "epoch": 0.62, "grad_norm": 9.821497588472498, "learning_rate": 1.9696958520210502e-05, "loss": 0.2279, "step": 2180 }, { "epoch": 0.63, "grad_norm": 8.176451238673074, "learning_rate": 1.9692320469006455e-05, "loss": 0.2301, "step": 2190 }, { "epoch": 0.63, "grad_norm": 8.86823756516519, "learning_rate": 1.9687647748643477e-05, "loss": 0.2202, "step": 2200 }, { "epoch": 0.63, "grad_norm": 6.281784110823153, "learning_rate": 1.9682940375837477e-05, "loss": 0.1899, "step": 2210 }, { "epoch": 0.63, "grad_norm": 16.994804037052397, "learning_rate": 1.967819836742833e-05, "loss": 0.2065, "step": 2220 }, { "epoch": 0.64, "grad_norm": 5.062766550841094, "learning_rate": 1.9673421740379825e-05, "loss": 0.2067, "step": 2230 }, { "epoch": 0.64, "grad_norm": 9.06252651868426, "learning_rate": 1.9668610511779584e-05, "loss": 0.2088, "step": 2240 }, { "epoch": 0.64, "grad_norm": 7.679196222682739, "learning_rate": 1.9663764698839012e-05, "loss": 0.1933, "step": 2250 }, { "epoch": 0.65, "grad_norm": 7.723346777739477, "learning_rate": 1.9658884318893233e-05, "loss": 0.2151, "step": 2260 }, { "epoch": 0.65, "grad_norm": 9.61016058463137, "learning_rate": 1.9653969389401036e-05, "loss": 0.2396, "step": 2270 }, { "epoch": 0.65, "grad_norm": 9.762100145110377, "learning_rate": 1.9649019927944793e-05, "loss": 0.2208, "step": 2280 }, { "epoch": 0.65, "grad_norm": 6.309451131753409, "learning_rate": 1.9644035952230417e-05, "loss": 0.2102, "step": 2290 }, { "epoch": 0.66, "grad_norm": 5.311635384604668, "learning_rate": 1.9639017480087293e-05, "loss": 0.1887, "step": 2300 }, { "epoch": 0.66, "grad_norm": 12.038916743520652, "learning_rate": 1.963396452946821e-05, "loss": 0.2324, "step": 2310 }, { "epoch": 0.66, "grad_norm": 6.34140467392875, "learning_rate": 1.962887711844929e-05, "loss": 0.2441, "step": 2320 }, { "epoch": 0.67, "grad_norm": 8.422573874498621, "learning_rate": 1.962375526522994e-05, "loss": 0.2244, "step": 2330 }, { "epoch": 0.67, "grad_norm": 2.8219746687175964, "learning_rate": 1.9618598988132773e-05, "loss": 0.2162, "step": 2340 }, { "epoch": 0.67, "grad_norm": 9.005697777977103, "learning_rate": 1.9613408305603558e-05, "loss": 0.254, "step": 2350 }, { "epoch": 0.67, "grad_norm": 15.639587428902626, "learning_rate": 1.960818323621113e-05, "loss": 0.1869, "step": 2360 }, { "epoch": 0.68, "grad_norm": 10.424932714743706, "learning_rate": 1.9602923798647347e-05, "loss": 0.2145, "step": 2370 }, { "epoch": 0.68, "grad_norm": 7.899156607503315, "learning_rate": 1.9597630011727007e-05, "loss": 0.1622, "step": 2380 }, { "epoch": 0.68, "grad_norm": 11.888920705719043, "learning_rate": 1.9592301894387803e-05, "loss": 0.2623, "step": 2390 }, { "epoch": 0.69, "grad_norm": 7.8613286102484325, "learning_rate": 1.9586939465690214e-05, "loss": 0.2649, "step": 2400 }, { "epoch": 0.69, "grad_norm": 6.158772992384807, "learning_rate": 1.9581542744817484e-05, "loss": 0.1851, "step": 2410 }, { "epoch": 0.69, "grad_norm": 10.248536168219761, "learning_rate": 1.957611175107553e-05, "loss": 0.1997, "step": 2420 }, { "epoch": 0.69, "grad_norm": 7.471082251409378, "learning_rate": 1.957064650389285e-05, "loss": 0.1781, "step": 2430 }, { "epoch": 0.7, "grad_norm": 8.196311344337788, "learning_rate": 1.956514702282052e-05, "loss": 0.1973, "step": 2440 }, { "epoch": 0.7, "grad_norm": 13.23513363011614, "learning_rate": 1.9559613327532046e-05, "loss": 0.1926, "step": 2450 }, { "epoch": 0.7, "grad_norm": 7.014549665726194, "learning_rate": 1.9554045437823347e-05, "loss": 0.186, "step": 2460 }, { "epoch": 0.71, "grad_norm": 9.22782921554414, "learning_rate": 1.9548443373612665e-05, "loss": 0.1594, "step": 2470 }, { "epoch": 0.71, "grad_norm": 10.050952801684012, "learning_rate": 1.954280715494049e-05, "loss": 0.288, "step": 2480 }, { "epoch": 0.71, "grad_norm": 7.531389797068707, "learning_rate": 1.9537136801969502e-05, "loss": 0.2546, "step": 2490 }, { "epoch": 0.71, "grad_norm": 7.620218247219295, "learning_rate": 1.953143233498449e-05, "loss": 0.1802, "step": 2500 }, { "epoch": 0.72, "grad_norm": 8.5903437851844, "learning_rate": 1.9525693774392275e-05, "loss": 0.2491, "step": 2510 }, { "epoch": 0.72, "grad_norm": 17.352570251577646, "learning_rate": 1.9519921140721646e-05, "loss": 0.247, "step": 2520 }, { "epoch": 0.72, "grad_norm": 10.910110096572625, "learning_rate": 1.951411445462328e-05, "loss": 0.1839, "step": 2530 }, { "epoch": 0.73, "grad_norm": 8.03200471055702, "learning_rate": 1.950827373686968e-05, "loss": 0.239, "step": 2540 }, { "epoch": 0.73, "grad_norm": 9.476933634787365, "learning_rate": 1.9502399008355078e-05, "loss": 0.2518, "step": 2550 }, { "epoch": 0.73, "grad_norm": 8.51337502529338, "learning_rate": 1.949649029009539e-05, "loss": 0.1654, "step": 2560 }, { "epoch": 0.73, "grad_norm": 8.33040486060561, "learning_rate": 1.949054760322811e-05, "loss": 0.1545, "step": 2570 }, { "epoch": 0.74, "grad_norm": 7.465968424074491, "learning_rate": 1.948457096901226e-05, "loss": 0.2971, "step": 2580 }, { "epoch": 0.74, "grad_norm": 8.069179872723087, "learning_rate": 1.9478560408828296e-05, "loss": 0.2085, "step": 2590 }, { "epoch": 0.74, "grad_norm": 7.83088607977626, "learning_rate": 1.9472515944178043e-05, "loss": 0.2529, "step": 2600 }, { "epoch": 0.75, "grad_norm": 7.177465449894122, "learning_rate": 1.9466437596684613e-05, "loss": 0.1961, "step": 2610 }, { "epoch": 0.75, "grad_norm": 6.080926636805202, "learning_rate": 1.9460325388092332e-05, "loss": 0.2069, "step": 2620 }, { "epoch": 0.75, "grad_norm": 8.087372023264157, "learning_rate": 1.9454179340266647e-05, "loss": 0.1854, "step": 2630 }, { "epoch": 0.75, "grad_norm": 11.71382904491758, "learning_rate": 1.9447999475194074e-05, "loss": 0.2515, "step": 2640 }, { "epoch": 0.76, "grad_norm": 7.9387073950209155, "learning_rate": 1.9441785814982098e-05, "loss": 0.2231, "step": 2650 }, { "epoch": 0.76, "grad_norm": 8.066656889265877, "learning_rate": 1.9435538381859096e-05, "loss": 0.1969, "step": 2660 }, { "epoch": 0.76, "grad_norm": 10.145557021204297, "learning_rate": 1.942925719817427e-05, "loss": 0.2078, "step": 2670 }, { "epoch": 0.77, "grad_norm": 10.55319905886251, "learning_rate": 1.9422942286397562e-05, "loss": 0.2705, "step": 2680 }, { "epoch": 0.77, "grad_norm": 10.454340158489874, "learning_rate": 1.9416593669119556e-05, "loss": 0.2325, "step": 2690 }, { "epoch": 0.77, "grad_norm": 3.842728517844193, "learning_rate": 1.9410211369051424e-05, "loss": 0.173, "step": 2700 }, { "epoch": 0.77, "grad_norm": 8.797476874947407, "learning_rate": 1.9403795409024836e-05, "loss": 0.2379, "step": 2710 }, { "epoch": 0.78, "grad_norm": 6.895102224940745, "learning_rate": 1.9397345811991863e-05, "loss": 0.1811, "step": 2720 }, { "epoch": 0.78, "grad_norm": 9.389746296373183, "learning_rate": 1.9390862601024917e-05, "loss": 0.1511, "step": 2730 }, { "epoch": 0.78, "grad_norm": 10.448323559785127, "learning_rate": 1.9384345799316654e-05, "loss": 0.2404, "step": 2740 }, { "epoch": 0.79, "grad_norm": 5.461019403501057, "learning_rate": 1.9377795430179894e-05, "loss": 0.2177, "step": 2750 }, { "epoch": 0.79, "grad_norm": 6.499803099951165, "learning_rate": 1.9371211517047544e-05, "loss": 0.1899, "step": 2760 }, { "epoch": 0.79, "grad_norm": 6.2987278940668725, "learning_rate": 1.9364594083472513e-05, "loss": 0.2455, "step": 2770 }, { "epoch": 0.79, "grad_norm": 6.372898428588323, "learning_rate": 1.9357943153127606e-05, "loss": 0.1805, "step": 2780 }, { "epoch": 0.8, "grad_norm": 7.364008584767093, "learning_rate": 1.9351258749805478e-05, "loss": 0.2575, "step": 2790 }, { "epoch": 0.8, "grad_norm": 8.545851958310498, "learning_rate": 1.9344540897418516e-05, "loss": 0.1998, "step": 2800 }, { "epoch": 0.8, "grad_norm": 4.095885673387734, "learning_rate": 1.9337789619998772e-05, "loss": 0.1886, "step": 2810 }, { "epoch": 0.81, "grad_norm": 12.077362549713682, "learning_rate": 1.9331004941697867e-05, "loss": 0.1952, "step": 2820 }, { "epoch": 0.81, "grad_norm": 10.695381337091295, "learning_rate": 1.9324186886786903e-05, "loss": 0.1991, "step": 2830 }, { "epoch": 0.81, "grad_norm": 4.724877629385855, "learning_rate": 1.9317335479656406e-05, "loss": 0.2211, "step": 2840 }, { "epoch": 0.81, "grad_norm": 6.703967488303364, "learning_rate": 1.9310450744816173e-05, "loss": 0.2264, "step": 2850 }, { "epoch": 0.82, "grad_norm": 9.3026349344491, "learning_rate": 1.9303532706895266e-05, "loss": 0.169, "step": 2860 }, { "epoch": 0.82, "grad_norm": 4.489536199595186, "learning_rate": 1.9296581390641863e-05, "loss": 0.2521, "step": 2870 }, { "epoch": 0.82, "grad_norm": 4.691310910186008, "learning_rate": 1.9289596820923185e-05, "loss": 0.1613, "step": 2880 }, { "epoch": 0.83, "grad_norm": 14.228137461729936, "learning_rate": 1.9282579022725426e-05, "loss": 0.1901, "step": 2890 }, { "epoch": 0.83, "grad_norm": 10.319821250018055, "learning_rate": 1.927552802115365e-05, "loss": 0.1813, "step": 2900 }, { "epoch": 0.83, "grad_norm": 10.04843274702014, "learning_rate": 1.9268443841431682e-05, "loss": 0.1402, "step": 2910 }, { "epoch": 0.83, "grad_norm": 15.401766172913975, "learning_rate": 1.926132650890206e-05, "loss": 0.1806, "step": 2920 }, { "epoch": 0.84, "grad_norm": 9.980549015088204, "learning_rate": 1.9254176049025905e-05, "loss": 0.2084, "step": 2930 }, { "epoch": 0.84, "grad_norm": 8.256136374951474, "learning_rate": 1.924699248738285e-05, "loss": 0.1813, "step": 2940 }, { "epoch": 0.84, "grad_norm": 8.425898961857405, "learning_rate": 1.923977584967095e-05, "loss": 0.2243, "step": 2950 }, { "epoch": 0.85, "grad_norm": 8.782304259306173, "learning_rate": 1.9232526161706572e-05, "loss": 0.1919, "step": 2960 }, { "epoch": 0.85, "grad_norm": 9.790899325124663, "learning_rate": 1.9225243449424333e-05, "loss": 0.2148, "step": 2970 }, { "epoch": 0.85, "grad_norm": 5.691871723523029, "learning_rate": 1.921792773887697e-05, "loss": 0.2187, "step": 2980 }, { "epoch": 0.85, "grad_norm": 8.336918580973503, "learning_rate": 1.9210579056235278e-05, "loss": 0.203, "step": 2990 }, { "epoch": 0.86, "grad_norm": 8.057491195631291, "learning_rate": 1.9203197427788004e-05, "loss": 0.2079, "step": 3000 }, { "epoch": 0.86, "grad_norm": 8.740543159699087, "learning_rate": 1.9195782879941747e-05, "loss": 0.1823, "step": 3010 }, { "epoch": 0.86, "grad_norm": 8.169905299334728, "learning_rate": 1.9188335439220874e-05, "loss": 0.182, "step": 3020 }, { "epoch": 0.87, "grad_norm": 5.714451576278073, "learning_rate": 1.9180855132267422e-05, "loss": 0.1313, "step": 3030 }, { "epoch": 0.87, "grad_norm": 5.754805505616215, "learning_rate": 1.9173341985841004e-05, "loss": 0.1236, "step": 3040 }, { "epoch": 0.87, "grad_norm": 5.565509624881045, "learning_rate": 1.91657960268187e-05, "loss": 0.1269, "step": 3050 }, { "epoch": 0.87, "grad_norm": 4.1782748963680145, "learning_rate": 1.9158217282194985e-05, "loss": 0.1706, "step": 3060 }, { "epoch": 0.88, "grad_norm": 13.112009940794737, "learning_rate": 1.9150605779081606e-05, "loss": 0.1844, "step": 3070 }, { "epoch": 0.88, "grad_norm": 14.448730856523621, "learning_rate": 1.9142961544707518e-05, "loss": 0.1613, "step": 3080 }, { "epoch": 0.88, "grad_norm": 14.453441191642682, "learning_rate": 1.9135284606418743e-05, "loss": 0.1639, "step": 3090 }, { "epoch": 0.89, "grad_norm": 5.597098259490502, "learning_rate": 1.9127574991678316e-05, "loss": 0.2234, "step": 3100 }, { "epoch": 0.89, "grad_norm": 2.914197003326017, "learning_rate": 1.9119832728066154e-05, "loss": 0.0947, "step": 3110 }, { "epoch": 0.89, "grad_norm": 14.07652620812005, "learning_rate": 1.9112057843278974e-05, "loss": 0.2106, "step": 3120 }, { "epoch": 0.89, "grad_norm": 3.082726307131429, "learning_rate": 1.9104250365130197e-05, "loss": 0.1942, "step": 3130 }, { "epoch": 0.9, "grad_norm": 9.399463930468364, "learning_rate": 1.909641032154983e-05, "loss": 0.167, "step": 3140 }, { "epoch": 0.9, "grad_norm": 12.400300643721808, "learning_rate": 1.9088537740584385e-05, "loss": 0.1652, "step": 3150 }, { "epoch": 0.9, "grad_norm": 9.135748816222208, "learning_rate": 1.9080632650396765e-05, "loss": 0.2417, "step": 3160 }, { "epoch": 0.91, "grad_norm": 6.610105072894608, "learning_rate": 1.9072695079266184e-05, "loss": 0.1602, "step": 3170 }, { "epoch": 0.91, "grad_norm": 9.021109514362255, "learning_rate": 1.906472505558803e-05, "loss": 0.154, "step": 3180 }, { "epoch": 0.91, "grad_norm": 8.38552513921171, "learning_rate": 1.9056722607873805e-05, "loss": 0.1608, "step": 3190 }, { "epoch": 0.91, "grad_norm": 6.76321699533414, "learning_rate": 1.904868776475099e-05, "loss": 0.1297, "step": 3200 }, { "epoch": 0.92, "grad_norm": 9.848126119913903, "learning_rate": 1.904062055496296e-05, "loss": 0.1986, "step": 3210 }, { "epoch": 0.92, "grad_norm": 8.025689838742544, "learning_rate": 1.9032521007368873e-05, "loss": 0.1366, "step": 3220 }, { "epoch": 0.92, "grad_norm": 6.259622719215118, "learning_rate": 1.9024389150943575e-05, "loss": 0.1696, "step": 3230 }, { "epoch": 0.93, "grad_norm": 7.486526848882144, "learning_rate": 1.9016225014777494e-05, "loss": 0.1665, "step": 3240 }, { "epoch": 0.93, "grad_norm": 8.811341987440331, "learning_rate": 1.9008028628076528e-05, "loss": 0.1106, "step": 3250 }, { "epoch": 0.93, "grad_norm": 7.3041161624825675, "learning_rate": 1.8999800020161946e-05, "loss": 0.2302, "step": 3260 }, { "epoch": 0.93, "grad_norm": 8.923702660737083, "learning_rate": 1.899153922047029e-05, "loss": 0.1557, "step": 3270 }, { "epoch": 0.94, "grad_norm": 7.729740611630878, "learning_rate": 1.898324625855325e-05, "loss": 0.1832, "step": 3280 }, { "epoch": 0.94, "grad_norm": 4.238783452389376, "learning_rate": 1.8974921164077584e-05, "loss": 0.1473, "step": 3290 }, { "epoch": 0.94, "grad_norm": 7.174287012276654, "learning_rate": 1.8966563966824996e-05, "loss": 0.1355, "step": 3300 }, { "epoch": 0.95, "grad_norm": 7.063902859064168, "learning_rate": 1.8958174696692032e-05, "loss": 0.1345, "step": 3310 }, { "epoch": 0.95, "grad_norm": 10.397476667784781, "learning_rate": 1.8949753383689964e-05, "loss": 0.1708, "step": 3320 }, { "epoch": 0.95, "grad_norm": 6.160203622697525, "learning_rate": 1.8941300057944715e-05, "loss": 0.1389, "step": 3330 }, { "epoch": 0.95, "grad_norm": 15.884447545575043, "learning_rate": 1.8932814749696703e-05, "loss": 0.2247, "step": 3340 }, { "epoch": 0.96, "grad_norm": 7.327909486769679, "learning_rate": 1.8924297489300772e-05, "loss": 0.0958, "step": 3350 }, { "epoch": 0.96, "grad_norm": 8.513551568102137, "learning_rate": 1.891574830722607e-05, "loss": 0.1768, "step": 3360 }, { "epoch": 0.96, "grad_norm": 9.61437044634696, "learning_rate": 1.890716723405594e-05, "loss": 0.1784, "step": 3370 }, { "epoch": 0.97, "grad_norm": 9.640381647683263, "learning_rate": 1.8898554300487804e-05, "loss": 0.1713, "step": 3380 }, { "epoch": 0.97, "grad_norm": 6.75259519067166, "learning_rate": 1.888990953733306e-05, "loss": 0.1466, "step": 3390 }, { "epoch": 0.97, "grad_norm": 5.059040158874168, "learning_rate": 1.8881232975516982e-05, "loss": 0.1291, "step": 3400 }, { "epoch": 0.97, "grad_norm": 9.31678990627753, "learning_rate": 1.8872524646078583e-05, "loss": 0.1808, "step": 3410 }, { "epoch": 0.98, "grad_norm": 11.323661566794412, "learning_rate": 1.8863784580170535e-05, "loss": 0.1782, "step": 3420 }, { "epoch": 0.98, "grad_norm": 7.92763958122681, "learning_rate": 1.885501280905903e-05, "loss": 0.2064, "step": 3430 }, { "epoch": 0.98, "grad_norm": 9.386667544335989, "learning_rate": 1.8846209364123693e-05, "loss": 0.1644, "step": 3440 }, { "epoch": 0.99, "grad_norm": 5.060786958194394, "learning_rate": 1.8837374276857438e-05, "loss": 0.1332, "step": 3450 }, { "epoch": 0.99, "grad_norm": 10.08057859849181, "learning_rate": 1.8828507578866392e-05, "loss": 0.057, "step": 3460 }, { "epoch": 0.99, "grad_norm": 14.192509913559489, "learning_rate": 1.8819609301869755e-05, "loss": 0.1762, "step": 3470 }, { "epoch": 0.99, "grad_norm": 2.4739489317598222, "learning_rate": 1.8810679477699706e-05, "loss": 0.1156, "step": 3480 }, { "epoch": 1.0, "grad_norm": 4.342914308802732, "learning_rate": 1.8801718138301262e-05, "loss": 0.1704, "step": 3490 }, { "epoch": 1.0, "grad_norm": 1.7127920054946262, "learning_rate": 1.8792725315732197e-05, "loss": 0.1551, "step": 3500 }, { "epoch": 1.0, "grad_norm": 4.220665616879844, "learning_rate": 1.8783701042162902e-05, "loss": 0.1957, "step": 3510 }, { "epoch": 1.01, "grad_norm": 7.336124958084804, "learning_rate": 1.8774645349876288e-05, "loss": 0.1872, "step": 3520 }, { "epoch": 1.01, "grad_norm": 10.981190986453019, "learning_rate": 1.8765558271267645e-05, "loss": 0.1476, "step": 3530 }, { "epoch": 1.01, "grad_norm": 4.280920085586136, "learning_rate": 1.8756439838844557e-05, "loss": 0.1444, "step": 3540 }, { "epoch": 1.01, "grad_norm": 6.8769791962016455, "learning_rate": 1.8747290085226773e-05, "loss": 0.138, "step": 3550 }, { "epoch": 1.02, "grad_norm": 4.198980053172256, "learning_rate": 1.8738109043146073e-05, "loss": 0.1022, "step": 3560 }, { "epoch": 1.02, "grad_norm": 8.917846411502085, "learning_rate": 1.8728896745446184e-05, "loss": 0.1564, "step": 3570 }, { "epoch": 1.02, "grad_norm": 7.745708631059241, "learning_rate": 1.871965322508263e-05, "loss": 0.1645, "step": 3580 }, { "epoch": 1.03, "grad_norm": 12.74185518899138, "learning_rate": 1.871037851512264e-05, "loss": 0.1704, "step": 3590 }, { "epoch": 1.03, "grad_norm": 9.319945770236878, "learning_rate": 1.8701072648745017e-05, "loss": 0.1378, "step": 3600 }, { "epoch": 1.03, "grad_norm": 12.49459722107107, "learning_rate": 1.869173565924002e-05, "loss": 0.1339, "step": 3610 }, { "epoch": 1.03, "grad_norm": 4.33286309747492, "learning_rate": 1.8682367580009235e-05, "loss": 0.1381, "step": 3620 }, { "epoch": 1.04, "grad_norm": 8.718317274067777, "learning_rate": 1.8672968444565488e-05, "loss": 0.2005, "step": 3630 }, { "epoch": 1.04, "grad_norm": 10.275510722366493, "learning_rate": 1.866353828653269e-05, "loss": 0.1146, "step": 3640 }, { "epoch": 1.04, "grad_norm": 6.149645675748795, "learning_rate": 1.8654077139645726e-05, "loss": 0.1377, "step": 3650 }, { "epoch": 1.05, "grad_norm": 6.005314380760392, "learning_rate": 1.8644585037750354e-05, "loss": 0.1691, "step": 3660 }, { "epoch": 1.05, "grad_norm": 6.587720383028617, "learning_rate": 1.863506201480306e-05, "loss": 0.158, "step": 3670 }, { "epoch": 1.05, "grad_norm": 7.299171233489511, "learning_rate": 1.8625508104870937e-05, "loss": 0.1899, "step": 3680 }, { "epoch": 1.05, "grad_norm": 12.105844999598206, "learning_rate": 1.861592334213159e-05, "loss": 0.1838, "step": 3690 }, { "epoch": 1.06, "grad_norm": 7.89681624951867, "learning_rate": 1.8606307760872974e-05, "loss": 0.146, "step": 3700 }, { "epoch": 1.06, "grad_norm": 14.14342923679303, "learning_rate": 1.859666139549331e-05, "loss": 0.1774, "step": 3710 }, { "epoch": 1.06, "grad_norm": 9.74925894244059, "learning_rate": 1.8586984280500937e-05, "loss": 0.1487, "step": 3720 }, { "epoch": 1.07, "grad_norm": 3.196964797383676, "learning_rate": 1.857727645051419e-05, "loss": 0.1248, "step": 3730 }, { "epoch": 1.07, "grad_norm": 6.858134442114929, "learning_rate": 1.85675379402613e-05, "loss": 0.1755, "step": 3740 }, { "epoch": 1.07, "grad_norm": 8.941927153486642, "learning_rate": 1.855776878458023e-05, "loss": 0.1731, "step": 3750 }, { "epoch": 1.07, "grad_norm": 6.466322586239156, "learning_rate": 1.854796901841858e-05, "loss": 0.1235, "step": 3760 }, { "epoch": 1.08, "grad_norm": 3.653825396119742, "learning_rate": 1.8538138676833466e-05, "loss": 0.1005, "step": 3770 }, { "epoch": 1.08, "grad_norm": 4.436428746685115, "learning_rate": 1.852827779499137e-05, "loss": 0.2065, "step": 3780 }, { "epoch": 1.08, "grad_norm": 4.741666611245171, "learning_rate": 1.851838640816802e-05, "loss": 0.1208, "step": 3790 }, { "epoch": 1.09, "grad_norm": 9.128845658629684, "learning_rate": 1.8508464551748292e-05, "loss": 0.2178, "step": 3800 }, { "epoch": 1.09, "grad_norm": 4.103237427727728, "learning_rate": 1.8498512261226047e-05, "loss": 0.1273, "step": 3810 }, { "epoch": 1.09, "grad_norm": 12.555448916122097, "learning_rate": 1.848852957220402e-05, "loss": 0.174, "step": 3820 }, { "epoch": 1.09, "grad_norm": 6.398372812840747, "learning_rate": 1.8478516520393702e-05, "loss": 0.1266, "step": 3830 }, { "epoch": 1.1, "grad_norm": 7.260518829050179, "learning_rate": 1.846847314161519e-05, "loss": 0.2107, "step": 3840 }, { "epoch": 1.1, "grad_norm": 4.803642742419731, "learning_rate": 1.845839947179707e-05, "loss": 0.158, "step": 3850 }, { "epoch": 1.1, "grad_norm": 5.1157854561875675, "learning_rate": 1.8448295546976303e-05, "loss": 0.1569, "step": 3860 }, { "epoch": 1.11, "grad_norm": 7.678679577645648, "learning_rate": 1.8438161403298074e-05, "loss": 0.1691, "step": 3870 }, { "epoch": 1.11, "grad_norm": 11.176787804746237, "learning_rate": 1.842799707701567e-05, "loss": 0.1522, "step": 3880 }, { "epoch": 1.11, "grad_norm": 9.69256498766137, "learning_rate": 1.841780260449035e-05, "loss": 0.1721, "step": 3890 }, { "epoch": 1.12, "grad_norm": 8.497944751670335, "learning_rate": 1.8407578022191223e-05, "loss": 0.1461, "step": 3900 }, { "epoch": 1.12, "grad_norm": 5.6269866825839, "learning_rate": 1.8397323366695102e-05, "loss": 0.1318, "step": 3910 }, { "epoch": 1.12, "grad_norm": 4.843927786240773, "learning_rate": 1.8387038674686397e-05, "loss": 0.1363, "step": 3920 }, { "epoch": 1.12, "grad_norm": 4.512880963335217, "learning_rate": 1.8376723982956954e-05, "loss": 0.1386, "step": 3930 }, { "epoch": 1.13, "grad_norm": 7.093590285061875, "learning_rate": 1.8366379328405943e-05, "loss": 0.1361, "step": 3940 }, { "epoch": 1.13, "grad_norm": 11.300489256403106, "learning_rate": 1.835600474803972e-05, "loss": 0.1848, "step": 3950 }, { "epoch": 1.13, "grad_norm": 9.458818420428871, "learning_rate": 1.8345600278971703e-05, "loss": 0.1798, "step": 3960 }, { "epoch": 1.14, "grad_norm": 5.836747977523529, "learning_rate": 1.8335165958422226e-05, "loss": 0.149, "step": 3970 }, { "epoch": 1.14, "grad_norm": 8.81103157092006, "learning_rate": 1.8324701823718406e-05, "loss": 0.1285, "step": 3980 }, { "epoch": 1.14, "grad_norm": 5.578196634639384, "learning_rate": 1.8314207912294028e-05, "loss": 0.1749, "step": 3990 }, { "epoch": 1.14, "grad_norm": 7.178110109213304, "learning_rate": 1.8303684261689394e-05, "loss": 0.1352, "step": 4000 }, { "epoch": 1.15, "grad_norm": 10.147455627539872, "learning_rate": 1.8293130909551183e-05, "loss": 0.1595, "step": 4010 }, { "epoch": 1.15, "grad_norm": 3.9805553602804378, "learning_rate": 1.8282547893632348e-05, "loss": 0.1498, "step": 4020 }, { "epoch": 1.15, "grad_norm": 4.897027160968049, "learning_rate": 1.8271935251791937e-05, "loss": 0.1461, "step": 4030 }, { "epoch": 1.16, "grad_norm": 7.311881797285872, "learning_rate": 1.826129302199499e-05, "loss": 0.1649, "step": 4040 }, { "epoch": 1.16, "grad_norm": 9.267400921975526, "learning_rate": 1.82506212423124e-05, "loss": 0.1552, "step": 4050 }, { "epoch": 1.16, "grad_norm": 14.99365863271323, "learning_rate": 1.8239919950920756e-05, "loss": 0.1435, "step": 4060 }, { "epoch": 1.16, "grad_norm": 12.588457760934345, "learning_rate": 1.822918918610223e-05, "loss": 0.1658, "step": 4070 }, { "epoch": 1.17, "grad_norm": 6.125435015742154, "learning_rate": 1.8218428986244426e-05, "loss": 0.1582, "step": 4080 }, { "epoch": 1.17, "grad_norm": 4.350669563765579, "learning_rate": 1.8207639389840258e-05, "loss": 0.1444, "step": 4090 }, { "epoch": 1.17, "grad_norm": 4.071865847058376, "learning_rate": 1.819682043548778e-05, "loss": 0.1381, "step": 4100 }, { "epoch": 1.18, "grad_norm": 7.9432342722045535, "learning_rate": 1.8185972161890085e-05, "loss": 0.16, "step": 4110 }, { "epoch": 1.18, "grad_norm": 13.039231598102141, "learning_rate": 1.8175094607855155e-05, "loss": 0.1383, "step": 4120 }, { "epoch": 1.18, "grad_norm": 8.003865738519975, "learning_rate": 1.8164187812295705e-05, "loss": 0.1544, "step": 4130 }, { "epoch": 1.18, "grad_norm": 10.218300159130669, "learning_rate": 1.8153251814229066e-05, "loss": 0.2017, "step": 4140 }, { "epoch": 1.19, "grad_norm": 10.193158596712522, "learning_rate": 1.814228665277703e-05, "loss": 0.173, "step": 4150 }, { "epoch": 1.19, "grad_norm": 5.333351592191595, "learning_rate": 1.813129236716572e-05, "loss": 0.1412, "step": 4160 }, { "epoch": 1.19, "grad_norm": 6.135766667571132, "learning_rate": 1.812026899672545e-05, "loss": 0.1799, "step": 4170 }, { "epoch": 1.2, "grad_norm": 5.151732899488849, "learning_rate": 1.810921658089057e-05, "loss": 0.1597, "step": 4180 }, { "epoch": 1.2, "grad_norm": 5.056082154306695, "learning_rate": 1.8098135159199342e-05, "loss": 0.153, "step": 4190 }, { "epoch": 1.2, "grad_norm": 8.966088302092388, "learning_rate": 1.8087024771293796e-05, "loss": 0.1546, "step": 4200 }, { "epoch": 1.2, "grad_norm": 14.522472462390777, "learning_rate": 1.807588545691958e-05, "loss": 0.1516, "step": 4210 }, { "epoch": 1.21, "grad_norm": 15.002063863549429, "learning_rate": 1.806471725592581e-05, "loss": 0.2435, "step": 4220 }, { "epoch": 1.21, "grad_norm": 7.358739222056481, "learning_rate": 1.8053520208264965e-05, "loss": 0.1279, "step": 4230 }, { "epoch": 1.21, "grad_norm": 14.535708951296352, "learning_rate": 1.8042294353992692e-05, "loss": 0.1488, "step": 4240 }, { "epoch": 1.22, "grad_norm": 10.287145159362007, "learning_rate": 1.803103973326771e-05, "loss": 0.1617, "step": 4250 }, { "epoch": 1.22, "grad_norm": 12.124738867395829, "learning_rate": 1.801975638635163e-05, "loss": 0.1515, "step": 4260 }, { "epoch": 1.22, "grad_norm": 5.944629123627518, "learning_rate": 1.800844435360883e-05, "loss": 0.159, "step": 4270 }, { "epoch": 1.22, "grad_norm": 6.8753029496462235, "learning_rate": 1.7997103675506318e-05, "loss": 0.1151, "step": 4280 }, { "epoch": 1.23, "grad_norm": 11.897982873578233, "learning_rate": 1.7985734392613563e-05, "loss": 0.149, "step": 4290 }, { "epoch": 1.23, "grad_norm": 7.088979318427978, "learning_rate": 1.7974336545602368e-05, "loss": 0.12, "step": 4300 }, { "epoch": 1.23, "grad_norm": 7.029251560617347, "learning_rate": 1.7962910175246713e-05, "loss": 0.1856, "step": 4310 }, { "epoch": 1.24, "grad_norm": 9.528773051612712, "learning_rate": 1.795145532242264e-05, "loss": 0.2223, "step": 4320 }, { "epoch": 1.24, "grad_norm": 7.8395921850209405, "learning_rate": 1.7939972028108047e-05, "loss": 0.1519, "step": 4330 }, { "epoch": 1.24, "grad_norm": 6.891766996318358, "learning_rate": 1.7928460333382603e-05, "loss": 0.2239, "step": 4340 }, { "epoch": 1.24, "grad_norm": 5.779447697318071, "learning_rate": 1.7916920279427566e-05, "loss": 0.1465, "step": 4350 }, { "epoch": 1.25, "grad_norm": 10.366969298169993, "learning_rate": 1.790535190752564e-05, "loss": 0.0971, "step": 4360 }, { "epoch": 1.25, "grad_norm": 7.493231962113351, "learning_rate": 1.7893755259060844e-05, "loss": 0.1898, "step": 4370 }, { "epoch": 1.25, "grad_norm": 8.226497581059139, "learning_rate": 1.7882130375518344e-05, "loss": 0.1446, "step": 4380 }, { "epoch": 1.26, "grad_norm": 7.3611239483159565, "learning_rate": 1.787047729848431e-05, "loss": 0.1447, "step": 4390 }, { "epoch": 1.26, "grad_norm": 8.972681333633854, "learning_rate": 1.7858796069645775e-05, "loss": 0.1542, "step": 4400 }, { "epoch": 1.26, "grad_norm": 7.771738389996093, "learning_rate": 1.7847086730790478e-05, "loss": 0.1625, "step": 4410 }, { "epoch": 1.26, "grad_norm": 4.653350951663296, "learning_rate": 1.7835349323806722e-05, "loss": 0.1295, "step": 4420 }, { "epoch": 1.27, "grad_norm": 3.690740680738886, "learning_rate": 1.7823583890683208e-05, "loss": 0.1612, "step": 4430 }, { "epoch": 1.27, "grad_norm": 5.039029758553999, "learning_rate": 1.781179047350891e-05, "loss": 0.1135, "step": 4440 }, { "epoch": 1.27, "grad_norm": 6.816510900880039, "learning_rate": 1.7799969114472902e-05, "loss": 0.1479, "step": 4450 }, { "epoch": 1.28, "grad_norm": 8.204346077272952, "learning_rate": 1.7788119855864216e-05, "loss": 0.127, "step": 4460 }, { "epoch": 1.28, "grad_norm": 7.681549250187532, "learning_rate": 1.7776242740071693e-05, "loss": 0.1928, "step": 4470 }, { "epoch": 1.28, "grad_norm": 4.959327063902704, "learning_rate": 1.7764337809583828e-05, "loss": 0.1473, "step": 4480 }, { "epoch": 1.28, "grad_norm": 8.20118978289007, "learning_rate": 1.7752405106988616e-05, "loss": 0.1515, "step": 4490 }, { "epoch": 1.29, "grad_norm": 3.6137981313154346, "learning_rate": 1.7740444674973404e-05, "loss": 0.1202, "step": 4500 }, { "epoch": 1.29, "grad_norm": 7.886983805600509, "learning_rate": 1.772845655632474e-05, "loss": 0.1521, "step": 4510 }, { "epoch": 1.29, "grad_norm": 13.093460990247193, "learning_rate": 1.771644079392821e-05, "loss": 0.1368, "step": 4520 }, { "epoch": 1.3, "grad_norm": 24.690647050205317, "learning_rate": 1.7704397430768303e-05, "loss": 0.2097, "step": 4530 }, { "epoch": 1.3, "grad_norm": 6.197972846110496, "learning_rate": 1.7692326509928225e-05, "loss": 0.2123, "step": 4540 }, { "epoch": 1.3, "grad_norm": 8.405296073024157, "learning_rate": 1.768022807458978e-05, "loss": 0.1912, "step": 4550 }, { "epoch": 1.3, "grad_norm": 4.004511911127535, "learning_rate": 1.7668102168033203e-05, "loss": 0.1179, "step": 4560 }, { "epoch": 1.31, "grad_norm": 7.587478622582375, "learning_rate": 1.7655948833636992e-05, "loss": 0.1333, "step": 4570 }, { "epoch": 1.31, "grad_norm": 4.04146399759994, "learning_rate": 1.7643768114877777e-05, "loss": 0.1881, "step": 4580 }, { "epoch": 1.31, "grad_norm": 12.021182120743289, "learning_rate": 1.763156005533014e-05, "loss": 0.2361, "step": 4590 }, { "epoch": 1.32, "grad_norm": 4.971508675824064, "learning_rate": 1.7619324698666468e-05, "loss": 0.1336, "step": 4600 }, { "epoch": 1.32, "grad_norm": 10.471783098974724, "learning_rate": 1.7607062088656813e-05, "loss": 0.1807, "step": 4610 }, { "epoch": 1.32, "grad_norm": 7.635167081812126, "learning_rate": 1.759477226916871e-05, "loss": 0.1756, "step": 4620 }, { "epoch": 1.32, "grad_norm": 9.614385523608986, "learning_rate": 1.7582455284167038e-05, "loss": 0.1573, "step": 4630 }, { "epoch": 1.33, "grad_norm": 8.652576534678937, "learning_rate": 1.7570111177713855e-05, "loss": 0.1801, "step": 4640 }, { "epoch": 1.33, "grad_norm": 10.604374511920186, "learning_rate": 1.755773999396823e-05, "loss": 0.1665, "step": 4650 }, { "epoch": 1.33, "grad_norm": 7.869402637034754, "learning_rate": 1.7545341777186117e-05, "loss": 0.1354, "step": 4660 }, { "epoch": 1.34, "grad_norm": 8.408336281248193, "learning_rate": 1.7532916571720165e-05, "loss": 0.1351, "step": 4670 }, { "epoch": 1.34, "grad_norm": 10.68153437932853, "learning_rate": 1.7520464422019567e-05, "loss": 0.1472, "step": 4680 }, { "epoch": 1.34, "grad_norm": 12.227607658004322, "learning_rate": 1.7507985372629917e-05, "loss": 0.1364, "step": 4690 }, { "epoch": 1.34, "grad_norm": 5.8364447152408205, "learning_rate": 1.7495479468193028e-05, "loss": 0.2062, "step": 4700 }, { "epoch": 1.35, "grad_norm": 8.099363758860312, "learning_rate": 1.748294675344678e-05, "loss": 0.1399, "step": 4710 }, { "epoch": 1.35, "grad_norm": 16.162214996822545, "learning_rate": 1.7470387273224977e-05, "loss": 0.155, "step": 4720 }, { "epoch": 1.35, "grad_norm": 4.2391458921459, "learning_rate": 1.745780107245716e-05, "loss": 0.168, "step": 4730 }, { "epoch": 1.36, "grad_norm": 7.6831910998018405, "learning_rate": 1.7445188196168464e-05, "loss": 0.1385, "step": 4740 }, { "epoch": 1.36, "grad_norm": 13.912859428266467, "learning_rate": 1.743254868947945e-05, "loss": 0.1533, "step": 4750 }, { "epoch": 1.36, "grad_norm": 5.822396661767634, "learning_rate": 1.741988259760594e-05, "loss": 0.1721, "step": 4760 }, { "epoch": 1.36, "grad_norm": 10.550961664854189, "learning_rate": 1.7407189965858868e-05, "loss": 0.1285, "step": 4770 }, { "epoch": 1.37, "grad_norm": 8.290609116580384, "learning_rate": 1.7394470839644105e-05, "loss": 0.139, "step": 4780 }, { "epoch": 1.37, "grad_norm": 8.277193969120296, "learning_rate": 1.738172526446231e-05, "loss": 0.1398, "step": 4790 }, { "epoch": 1.37, "grad_norm": 9.639139861675897, "learning_rate": 1.7368953285908742e-05, "loss": 0.1231, "step": 4800 }, { "epoch": 1.38, "grad_norm": 9.095866058705468, "learning_rate": 1.7356154949673133e-05, "loss": 0.1581, "step": 4810 }, { "epoch": 1.38, "grad_norm": 6.665502287588656, "learning_rate": 1.7343330301539497e-05, "loss": 0.1555, "step": 4820 }, { "epoch": 1.38, "grad_norm": 9.162987219124123, "learning_rate": 1.7330479387385974e-05, "loss": 0.1635, "step": 4830 }, { "epoch": 1.38, "grad_norm": 4.4558668830616375, "learning_rate": 1.7317602253184667e-05, "loss": 0.1476, "step": 4840 }, { "epoch": 1.39, "grad_norm": 11.207406687572488, "learning_rate": 1.730469894500147e-05, "loss": 0.1579, "step": 4850 }, { "epoch": 1.39, "grad_norm": 10.410674383212706, "learning_rate": 1.7291769508995933e-05, "loss": 0.1745, "step": 4860 }, { "epoch": 1.39, "grad_norm": 9.675366093611661, "learning_rate": 1.7278813991421043e-05, "loss": 0.1311, "step": 4870 }, { "epoch": 1.4, "grad_norm": 6.67111566229274, "learning_rate": 1.7265832438623114e-05, "loss": 0.1271, "step": 4880 }, { "epoch": 1.4, "grad_norm": 5.816663993827548, "learning_rate": 1.725282489704159e-05, "loss": 0.145, "step": 4890 }, { "epoch": 1.4, "grad_norm": 11.004950363137363, "learning_rate": 1.7239791413208878e-05, "loss": 0.1712, "step": 4900 }, { "epoch": 1.4, "grad_norm": 4.854042782928574, "learning_rate": 1.7226732033750197e-05, "loss": 0.1515, "step": 4910 }, { "epoch": 1.41, "grad_norm": 8.049908882715352, "learning_rate": 1.7213646805383405e-05, "loss": 0.1651, "step": 4920 }, { "epoch": 1.41, "grad_norm": 11.24243825456537, "learning_rate": 1.720053577491882e-05, "loss": 0.1287, "step": 4930 }, { "epoch": 1.41, "grad_norm": 7.111776294326677, "learning_rate": 1.7187398989259075e-05, "loss": 0.1754, "step": 4940 }, { "epoch": 1.42, "grad_norm": 3.749840542264061, "learning_rate": 1.7174236495398936e-05, "loss": 0.185, "step": 4950 }, { "epoch": 1.42, "grad_norm": 8.098174899725429, "learning_rate": 1.7161048340425132e-05, "loss": 0.1438, "step": 4960 }, { "epoch": 1.42, "grad_norm": 6.464769991082255, "learning_rate": 1.7147834571516184e-05, "loss": 0.1212, "step": 4970 }, { "epoch": 1.42, "grad_norm": 9.658479007371701, "learning_rate": 1.713459523594226e-05, "loss": 0.1669, "step": 4980 }, { "epoch": 1.43, "grad_norm": 7.194162481895858, "learning_rate": 1.7121330381064975e-05, "loss": 0.1374, "step": 4990 }, { "epoch": 1.43, "grad_norm": 3.770033539033441, "learning_rate": 1.7108040054337237e-05, "loss": 0.1178, "step": 5000 }, { "epoch": 1.43, "grad_norm": 8.421782144519419, "learning_rate": 1.7094724303303084e-05, "loss": 0.1714, "step": 5010 }, { "epoch": 1.44, "grad_norm": 6.778397342456885, "learning_rate": 1.708138317559749e-05, "loss": 0.145, "step": 5020 }, { "epoch": 1.44, "grad_norm": 0.47542815362001, "learning_rate": 1.706801671894623e-05, "loss": 0.0989, "step": 5030 }, { "epoch": 1.44, "grad_norm": 9.166677532767588, "learning_rate": 1.7054624981165673e-05, "loss": 0.1668, "step": 5040 }, { "epoch": 1.44, "grad_norm": 8.98450152764166, "learning_rate": 1.7041208010162634e-05, "loss": 0.1183, "step": 5050 }, { "epoch": 1.45, "grad_norm": 8.669438530188154, "learning_rate": 1.7027765853934193e-05, "loss": 0.1056, "step": 5060 }, { "epoch": 1.45, "grad_norm": 13.527707523804962, "learning_rate": 1.7014298560567528e-05, "loss": 0.1603, "step": 5070 }, { "epoch": 1.45, "grad_norm": 11.381581844077628, "learning_rate": 1.7000806178239745e-05, "loss": 0.1587, "step": 5080 }, { "epoch": 1.46, "grad_norm": 4.012142110770331, "learning_rate": 1.6987288755217696e-05, "loss": 0.1823, "step": 5090 }, { "epoch": 1.46, "grad_norm": 9.676515319600789, "learning_rate": 1.6973746339857807e-05, "loss": 0.1727, "step": 5100 }, { "epoch": 1.46, "grad_norm": 10.14259373343927, "learning_rate": 1.6960178980605927e-05, "loss": 0.1592, "step": 5110 }, { "epoch": 1.46, "grad_norm": 2.7684255095524817, "learning_rate": 1.694658672599712e-05, "loss": 0.1421, "step": 5120 }, { "epoch": 1.47, "grad_norm": 8.983382672780085, "learning_rate": 1.6932969624655523e-05, "loss": 0.1508, "step": 5130 }, { "epoch": 1.47, "grad_norm": 6.6633412014391755, "learning_rate": 1.691932772529415e-05, "loss": 0.1714, "step": 5140 }, { "epoch": 1.47, "grad_norm": 7.19745404476906, "learning_rate": 1.6905661076714727e-05, "loss": 0.1532, "step": 5150 }, { "epoch": 1.48, "grad_norm": 6.572979216925368, "learning_rate": 1.6891969727807516e-05, "loss": 0.1214, "step": 5160 }, { "epoch": 1.48, "grad_norm": 9.079447717213275, "learning_rate": 1.6878253727551143e-05, "loss": 0.1203, "step": 5170 }, { "epoch": 1.48, "grad_norm": 4.067832615997426, "learning_rate": 1.6864513125012424e-05, "loss": 0.1564, "step": 5180 }, { "epoch": 1.48, "grad_norm": 9.87765892061237, "learning_rate": 1.6850747969346178e-05, "loss": 0.1608, "step": 5190 }, { "epoch": 1.49, "grad_norm": 6.7409174530720115, "learning_rate": 1.683695830979506e-05, "loss": 0.1716, "step": 5200 }, { "epoch": 1.49, "grad_norm": 13.571241242686224, "learning_rate": 1.6823144195689383e-05, "loss": 0.1301, "step": 5210 }, { "epoch": 1.49, "grad_norm": 2.423572954122206, "learning_rate": 1.6809305676446955e-05, "loss": 0.0974, "step": 5220 }, { "epoch": 1.5, "grad_norm": 12.40599417242528, "learning_rate": 1.679544280157287e-05, "loss": 0.129, "step": 5230 }, { "epoch": 1.5, "grad_norm": 5.185511380233981, "learning_rate": 1.6781555620659366e-05, "loss": 0.1325, "step": 5240 }, { "epoch": 1.5, "grad_norm": 8.238311810931936, "learning_rate": 1.6767644183385624e-05, "loss": 0.1678, "step": 5250 }, { "epoch": 1.5, "grad_norm": 12.349035903427467, "learning_rate": 1.67537085395176e-05, "loss": 0.1602, "step": 5260 }, { "epoch": 1.51, "grad_norm": 6.586958004046353, "learning_rate": 1.6739748738907843e-05, "loss": 0.1704, "step": 5270 }, { "epoch": 1.51, "grad_norm": 2.9802084392897368, "learning_rate": 1.672576483149532e-05, "loss": 0.1836, "step": 5280 }, { "epoch": 1.51, "grad_norm": 4.42584072251524, "learning_rate": 1.671175686730525e-05, "loss": 0.169, "step": 5290 }, { "epoch": 1.52, "grad_norm": 8.208573804195574, "learning_rate": 1.6697724896448887e-05, "loss": 0.1464, "step": 5300 }, { "epoch": 1.52, "grad_norm": 7.3467943739594475, "learning_rate": 1.6683668969123375e-05, "loss": 0.1252, "step": 5310 }, { "epoch": 1.52, "grad_norm": 9.92468779334696, "learning_rate": 1.6669589135611568e-05, "loss": 0.1754, "step": 5320 }, { "epoch": 1.52, "grad_norm": 8.747271303042076, "learning_rate": 1.665548544628183e-05, "loss": 0.1296, "step": 5330 }, { "epoch": 1.53, "grad_norm": 6.738531754854439, "learning_rate": 1.664135795158787e-05, "loss": 0.1269, "step": 5340 }, { "epoch": 1.53, "grad_norm": 7.068431692604632, "learning_rate": 1.6627206702068556e-05, "loss": 0.1796, "step": 5350 }, { "epoch": 1.53, "grad_norm": 8.097399974840014, "learning_rate": 1.6613031748347727e-05, "loss": 0.1157, "step": 5360 }, { "epoch": 1.54, "grad_norm": 2.906302831026024, "learning_rate": 1.6598833141134037e-05, "loss": 0.1334, "step": 5370 }, { "epoch": 1.54, "grad_norm": 7.352926291955581, "learning_rate": 1.658461093122074e-05, "loss": 0.154, "step": 5380 }, { "epoch": 1.54, "grad_norm": 3.692779511634212, "learning_rate": 1.6570365169485537e-05, "loss": 0.1673, "step": 5390 }, { "epoch": 1.54, "grad_norm": 3.5939453071928837, "learning_rate": 1.6556095906890372e-05, "loss": 0.1468, "step": 5400 }, { "epoch": 1.55, "grad_norm": 7.766919279560902, "learning_rate": 1.6541803194481265e-05, "loss": 0.1518, "step": 5410 }, { "epoch": 1.55, "grad_norm": 9.600423374377046, "learning_rate": 1.6527487083388132e-05, "loss": 0.1879, "step": 5420 }, { "epoch": 1.55, "grad_norm": 7.276385995248979, "learning_rate": 1.651314762482457e-05, "loss": 0.1586, "step": 5430 }, { "epoch": 1.56, "grad_norm": 7.432445956693525, "learning_rate": 1.6498784870087732e-05, "loss": 0.1727, "step": 5440 }, { "epoch": 1.56, "grad_norm": 4.482252090330209, "learning_rate": 1.6484398870558077e-05, "loss": 0.1434, "step": 5450 }, { "epoch": 1.56, "grad_norm": 6.981728549493911, "learning_rate": 1.6469989677699233e-05, "loss": 0.1074, "step": 5460 }, { "epoch": 1.56, "grad_norm": 13.93293009878738, "learning_rate": 1.64555573430578e-05, "loss": 0.1561, "step": 5470 }, { "epoch": 1.57, "grad_norm": 8.584184129146225, "learning_rate": 1.644110191826317e-05, "loss": 0.1403, "step": 5480 }, { "epoch": 1.57, "grad_norm": 6.69058299080401, "learning_rate": 1.642662345502731e-05, "loss": 0.1757, "step": 5490 }, { "epoch": 1.57, "grad_norm": 7.560516601754168, "learning_rate": 1.6412122005144643e-05, "loss": 0.1191, "step": 5500 }, { "epoch": 1.58, "grad_norm": 3.9509069681108038, "learning_rate": 1.6397597620491782e-05, "loss": 0.1606, "step": 5510 }, { "epoch": 1.58, "grad_norm": 11.307494009565833, "learning_rate": 1.6383050353027417e-05, "loss": 0.1291, "step": 5520 }, { "epoch": 1.58, "grad_norm": 7.708680786000236, "learning_rate": 1.6368480254792084e-05, "loss": 0.1303, "step": 5530 }, { "epoch": 1.58, "grad_norm": 10.697633456623274, "learning_rate": 1.6353887377907992e-05, "loss": 0.1672, "step": 5540 }, { "epoch": 1.59, "grad_norm": 8.940568730571576, "learning_rate": 1.6339271774578835e-05, "loss": 0.1694, "step": 5550 }, { "epoch": 1.59, "grad_norm": 3.3231734572216096, "learning_rate": 1.632463349708962e-05, "loss": 0.1367, "step": 5560 }, { "epoch": 1.59, "grad_norm": 6.106688862611726, "learning_rate": 1.630997259780645e-05, "loss": 0.097, "step": 5570 }, { "epoch": 1.6, "grad_norm": 9.094787748186887, "learning_rate": 1.6295289129176373e-05, "loss": 0.1929, "step": 5580 }, { "epoch": 1.6, "grad_norm": 7.306961440013205, "learning_rate": 1.6280583143727154e-05, "loss": 0.181, "step": 5590 }, { "epoch": 1.6, "grad_norm": 5.271885079655172, "learning_rate": 1.6265854694067125e-05, "loss": 0.2005, "step": 5600 }, { "epoch": 1.6, "grad_norm": 6.555665444496539, "learning_rate": 1.6251103832884972e-05, "loss": 0.1535, "step": 5610 }, { "epoch": 1.61, "grad_norm": 9.89239784972709, "learning_rate": 1.623633061294956e-05, "loss": 0.1273, "step": 5620 }, { "epoch": 1.61, "grad_norm": 3.0434662359604308, "learning_rate": 1.622153508710973e-05, "loss": 0.1274, "step": 5630 }, { "epoch": 1.61, "grad_norm": 10.973923378798053, "learning_rate": 1.620671730829413e-05, "loss": 0.1918, "step": 5640 }, { "epoch": 1.62, "grad_norm": 9.255226102320684, "learning_rate": 1.6191877329511002e-05, "loss": 0.1495, "step": 5650 }, { "epoch": 1.62, "grad_norm": 8.986431490181984, "learning_rate": 1.6177015203848022e-05, "loss": 0.1284, "step": 5660 }, { "epoch": 1.62, "grad_norm": 8.164251268750666, "learning_rate": 1.6162130984472074e-05, "loss": 0.2029, "step": 5670 }, { "epoch": 1.62, "grad_norm": 5.827883472500369, "learning_rate": 1.61472247246291e-05, "loss": 0.1542, "step": 5680 }, { "epoch": 1.63, "grad_norm": 15.015174819279183, "learning_rate": 1.6132296477643855e-05, "loss": 0.1701, "step": 5690 }, { "epoch": 1.63, "grad_norm": 11.060918145311678, "learning_rate": 1.6117346296919788e-05, "loss": 0.1437, "step": 5700 }, { "epoch": 1.63, "grad_norm": 2.8424215043233376, "learning_rate": 1.610237423593879e-05, "loss": 0.1185, "step": 5710 }, { "epoch": 1.64, "grad_norm": 7.8009568043278055, "learning_rate": 1.6087380348261025e-05, "loss": 0.1454, "step": 5720 }, { "epoch": 1.64, "grad_norm": 1.8405194833559004, "learning_rate": 1.6072364687524748e-05, "loss": 0.165, "step": 5730 }, { "epoch": 1.64, "grad_norm": 8.006291299872743, "learning_rate": 1.6057327307446102e-05, "loss": 0.1439, "step": 5740 }, { "epoch": 1.64, "grad_norm": 3.712151194411852, "learning_rate": 1.604226826181892e-05, "loss": 0.0907, "step": 5750 }, { "epoch": 1.65, "grad_norm": 1.7470808887108256, "learning_rate": 1.6027187604514545e-05, "loss": 0.1297, "step": 5760 }, { "epoch": 1.65, "grad_norm": 4.358911852007257, "learning_rate": 1.6012085389481634e-05, "loss": 0.1179, "step": 5770 }, { "epoch": 1.65, "grad_norm": 9.989934051265445, "learning_rate": 1.5996961670745963e-05, "loss": 0.1884, "step": 5780 }, { "epoch": 1.66, "grad_norm": 7.439428768583784, "learning_rate": 1.5981816502410227e-05, "loss": 0.1311, "step": 5790 }, { "epoch": 1.66, "grad_norm": 7.580023773478164, "learning_rate": 1.5966649938653863e-05, "loss": 0.1389, "step": 5800 }, { "epoch": 1.66, "grad_norm": 10.689445809254302, "learning_rate": 1.5951462033732837e-05, "loss": 0.1887, "step": 5810 }, { "epoch": 1.66, "grad_norm": 7.195735028480249, "learning_rate": 1.5936252841979475e-05, "loss": 0.139, "step": 5820 }, { "epoch": 1.67, "grad_norm": 5.296719979237353, "learning_rate": 1.592102241780224e-05, "loss": 0.138, "step": 5830 }, { "epoch": 1.67, "grad_norm": 4.217179295636819, "learning_rate": 1.5905770815685553e-05, "loss": 0.1467, "step": 5840 }, { "epoch": 1.67, "grad_norm": 12.403959048353965, "learning_rate": 1.589049809018959e-05, "loss": 0.176, "step": 5850 }, { "epoch": 1.68, "grad_norm": 6.785861227409256, "learning_rate": 1.5875204295950114e-05, "loss": 0.0902, "step": 5860 }, { "epoch": 1.68, "grad_norm": 8.66770293080466, "learning_rate": 1.5859889487678234e-05, "loss": 0.1905, "step": 5870 }, { "epoch": 1.68, "grad_norm": 5.630823469361762, "learning_rate": 1.584455372016024e-05, "loss": 0.1575, "step": 5880 }, { "epoch": 1.68, "grad_norm": 15.138361805986317, "learning_rate": 1.5829197048257414e-05, "loss": 0.1646, "step": 5890 }, { "epoch": 1.69, "grad_norm": 8.616459695579552, "learning_rate": 1.5813819526905803e-05, "loss": 0.141, "step": 5900 }, { "epoch": 1.69, "grad_norm": 6.908537196827645, "learning_rate": 1.5798421211116045e-05, "loss": 0.1076, "step": 5910 }, { "epoch": 1.69, "grad_norm": 3.3583710345730675, "learning_rate": 1.5783002155973165e-05, "loss": 0.1196, "step": 5920 }, { "epoch": 1.7, "grad_norm": 10.940344217244578, "learning_rate": 1.576756241663639e-05, "loss": 0.1906, "step": 5930 }, { "epoch": 1.7, "grad_norm": 3.930625970312318, "learning_rate": 1.5752102048338934e-05, "loss": 0.0802, "step": 5940 }, { "epoch": 1.7, "grad_norm": 13.317532013685828, "learning_rate": 1.57366211063878e-05, "loss": 0.1295, "step": 5950 }, { "epoch": 1.7, "grad_norm": 11.30798620723373, "learning_rate": 1.5721119646163607e-05, "loss": 0.1246, "step": 5960 }, { "epoch": 1.71, "grad_norm": 5.799830585504774, "learning_rate": 1.5705597723120358e-05, "loss": 0.1518, "step": 5970 }, { "epoch": 1.71, "grad_norm": 11.436229197459971, "learning_rate": 1.5690055392785272e-05, "loss": 0.1269, "step": 5980 }, { "epoch": 1.71, "grad_norm": 9.37775390867035, "learning_rate": 1.5674492710758558e-05, "loss": 0.2114, "step": 5990 }, { "epoch": 1.72, "grad_norm": 2.903640395816564, "learning_rate": 1.565890973271325e-05, "loss": 0.1958, "step": 6000 }, { "epoch": 1.72, "grad_norm": 7.978987277598217, "learning_rate": 1.5643306514394963e-05, "loss": 0.1308, "step": 6010 }, { "epoch": 1.72, "grad_norm": 10.691320260774955, "learning_rate": 1.5627683111621737e-05, "loss": 0.159, "step": 6020 }, { "epoch": 1.72, "grad_norm": 10.38634192739737, "learning_rate": 1.5612039580283813e-05, "loss": 0.1799, "step": 6030 }, { "epoch": 1.73, "grad_norm": 7.125463637944135, "learning_rate": 1.559637597634344e-05, "loss": 0.162, "step": 6040 }, { "epoch": 1.73, "grad_norm": 7.966013958527261, "learning_rate": 1.5580692355834668e-05, "loss": 0.2194, "step": 6050 }, { "epoch": 1.73, "grad_norm": 5.511951811990918, "learning_rate": 1.556498877486316e-05, "loss": 0.1178, "step": 6060 }, { "epoch": 1.74, "grad_norm": 3.467985189166446, "learning_rate": 1.5549265289605977e-05, "loss": 0.1379, "step": 6070 }, { "epoch": 1.74, "grad_norm": 8.062921557383982, "learning_rate": 1.5533521956311396e-05, "loss": 0.1303, "step": 6080 }, { "epoch": 1.74, "grad_norm": 5.160243112322469, "learning_rate": 1.5517758831298684e-05, "loss": 0.1899, "step": 6090 }, { "epoch": 1.74, "grad_norm": 7.401410674619078, "learning_rate": 1.550197597095792e-05, "loss": 0.1488, "step": 6100 }, { "epoch": 1.75, "grad_norm": 15.614681163003798, "learning_rate": 1.5486173431749777e-05, "loss": 0.2132, "step": 6110 }, { "epoch": 1.75, "grad_norm": 10.726450831310581, "learning_rate": 1.547035127020533e-05, "loss": 0.1313, "step": 6120 }, { "epoch": 1.75, "grad_norm": 6.187739820360311, "learning_rate": 1.5454509542925836e-05, "loss": 0.1492, "step": 6130 }, { "epoch": 1.76, "grad_norm": 4.10737729558547, "learning_rate": 1.5438648306582575e-05, "loss": 0.1128, "step": 6140 }, { "epoch": 1.76, "grad_norm": 4.736410074999796, "learning_rate": 1.5422767617916594e-05, "loss": 0.1489, "step": 6150 }, { "epoch": 1.76, "grad_norm": 4.919698373186092, "learning_rate": 1.5406867533738523e-05, "loss": 0.1848, "step": 6160 }, { "epoch": 1.76, "grad_norm": 9.465513173422266, "learning_rate": 1.5390948110928394e-05, "loss": 0.1661, "step": 6170 }, { "epoch": 1.77, "grad_norm": 7.768691871529942, "learning_rate": 1.5375009406435416e-05, "loss": 0.1237, "step": 6180 }, { "epoch": 1.77, "grad_norm": 3.597643923660683, "learning_rate": 1.5359051477277766e-05, "loss": 0.1422, "step": 6190 }, { "epoch": 1.77, "grad_norm": 4.772149094739379, "learning_rate": 1.5343074380542397e-05, "loss": 0.1316, "step": 6200 }, { "epoch": 1.78, "grad_norm": 5.686799058584262, "learning_rate": 1.532707817338484e-05, "loss": 0.1478, "step": 6210 }, { "epoch": 1.78, "grad_norm": 8.588505590828442, "learning_rate": 1.5311062913028987e-05, "loss": 0.1896, "step": 6220 }, { "epoch": 1.78, "grad_norm": 6.587771050741836, "learning_rate": 1.5295028656766876e-05, "loss": 0.1392, "step": 6230 }, { "epoch": 1.78, "grad_norm": 10.135663480716786, "learning_rate": 1.5278975461958517e-05, "loss": 0.1247, "step": 6240 }, { "epoch": 1.79, "grad_norm": 4.194064787692171, "learning_rate": 1.5262903386031668e-05, "loss": 0.1654, "step": 6250 }, { "epoch": 1.79, "grad_norm": 15.70958404553695, "learning_rate": 1.5246812486481617e-05, "loss": 0.172, "step": 6260 }, { "epoch": 1.79, "grad_norm": 8.178524309904281, "learning_rate": 1.5230702820871007e-05, "loss": 0.1472, "step": 6270 }, { "epoch": 1.8, "grad_norm": 5.344687931667672, "learning_rate": 1.5214574446829603e-05, "loss": 0.1279, "step": 6280 }, { "epoch": 1.8, "grad_norm": 10.549078184235311, "learning_rate": 1.51984274220541e-05, "loss": 0.1109, "step": 6290 }, { "epoch": 1.8, "grad_norm": 11.574600403975886, "learning_rate": 1.5182261804307921e-05, "loss": 0.1396, "step": 6300 }, { "epoch": 1.8, "grad_norm": 6.757216093172837, "learning_rate": 1.5166077651420984e-05, "loss": 0.0958, "step": 6310 }, { "epoch": 1.81, "grad_norm": 10.830087106905653, "learning_rate": 1.5149875021289533e-05, "loss": 0.1586, "step": 6320 }, { "epoch": 1.81, "grad_norm": 10.925897515134876, "learning_rate": 1.5133653971875892e-05, "loss": 0.1317, "step": 6330 }, { "epoch": 1.81, "grad_norm": 2.538780783650223, "learning_rate": 1.5117414561208302e-05, "loss": 0.1315, "step": 6340 }, { "epoch": 1.82, "grad_norm": 9.20234031764623, "learning_rate": 1.5101156847380656e-05, "loss": 0.1188, "step": 6350 }, { "epoch": 1.82, "grad_norm": 4.638793378342908, "learning_rate": 1.508488088855236e-05, "loss": 0.1353, "step": 6360 }, { "epoch": 1.82, "grad_norm": 11.806313484146417, "learning_rate": 1.506858674294806e-05, "loss": 0.0942, "step": 6370 }, { "epoch": 1.82, "grad_norm": 7.733959949801069, "learning_rate": 1.505227446885747e-05, "loss": 0.1373, "step": 6380 }, { "epoch": 1.83, "grad_norm": 7.017672437527253, "learning_rate": 1.5035944124635167e-05, "loss": 0.1541, "step": 6390 }, { "epoch": 1.83, "grad_norm": 7.7722365788740815, "learning_rate": 1.5019595768700357e-05, "loss": 0.1601, "step": 6400 }, { "epoch": 1.83, "grad_norm": 8.10721696131972, "learning_rate": 1.5003229459536689e-05, "loss": 0.1335, "step": 6410 }, { "epoch": 1.84, "grad_norm": 6.775557840614577, "learning_rate": 1.4986845255692028e-05, "loss": 0.1228, "step": 6420 }, { "epoch": 1.84, "grad_norm": 11.96668004395597, "learning_rate": 1.4970443215778268e-05, "loss": 0.1156, "step": 6430 }, { "epoch": 1.84, "grad_norm": 9.73665507694518, "learning_rate": 1.495402339847109e-05, "loss": 0.1337, "step": 6440 }, { "epoch": 1.84, "grad_norm": 5.9821374758958505, "learning_rate": 1.4937585862509787e-05, "loss": 0.2534, "step": 6450 }, { "epoch": 1.85, "grad_norm": 9.346219658204811, "learning_rate": 1.4921130666697036e-05, "loss": 0.1476, "step": 6460 }, { "epoch": 1.85, "grad_norm": 7.568787790293896, "learning_rate": 1.4904657869898675e-05, "loss": 0.1518, "step": 6470 }, { "epoch": 1.85, "grad_norm": 10.408221267166118, "learning_rate": 1.4888167531043524e-05, "loss": 0.1431, "step": 6480 }, { "epoch": 1.86, "grad_norm": 10.149438445995077, "learning_rate": 1.4871659709123142e-05, "loss": 0.1633, "step": 6490 }, { "epoch": 1.86, "grad_norm": 9.629694994951853, "learning_rate": 1.4855134463191654e-05, "loss": 0.1227, "step": 6500 }, { "epoch": 1.86, "grad_norm": 6.945314971614437, "learning_rate": 1.4838591852365485e-05, "loss": 0.1194, "step": 6510 }, { "epoch": 1.86, "grad_norm": 7.6463244814859825, "learning_rate": 1.4822031935823206e-05, "loss": 0.1407, "step": 6520 }, { "epoch": 1.87, "grad_norm": 11.81248966600077, "learning_rate": 1.4805454772805285e-05, "loss": 0.1819, "step": 6530 }, { "epoch": 1.87, "grad_norm": 1.2926846102685143, "learning_rate": 1.478886042261389e-05, "loss": 0.1133, "step": 6540 }, { "epoch": 1.87, "grad_norm": 15.337280649918844, "learning_rate": 1.4772248944612664e-05, "loss": 0.155, "step": 6550 }, { "epoch": 1.88, "grad_norm": 5.457448737217818, "learning_rate": 1.4755620398226536e-05, "loss": 0.1553, "step": 6560 }, { "epoch": 1.88, "grad_norm": 5.4782894965030025, "learning_rate": 1.4738974842941491e-05, "loss": 0.1299, "step": 6570 }, { "epoch": 1.88, "grad_norm": 11.773794552274968, "learning_rate": 1.4722312338304352e-05, "loss": 0.203, "step": 6580 }, { "epoch": 1.88, "grad_norm": 8.724569367982484, "learning_rate": 1.4705632943922582e-05, "loss": 0.1433, "step": 6590 }, { "epoch": 1.89, "grad_norm": 5.352212195393741, "learning_rate": 1.4688936719464069e-05, "loss": 0.1613, "step": 6600 }, { "epoch": 1.89, "grad_norm": 3.3212244554139585, "learning_rate": 1.4672223724656898e-05, "loss": 0.1034, "step": 6610 }, { "epoch": 1.89, "grad_norm": 11.206049879152019, "learning_rate": 1.4655494019289153e-05, "loss": 0.1079, "step": 6620 }, { "epoch": 1.9, "grad_norm": 13.415542902985177, "learning_rate": 1.4638747663208701e-05, "loss": 0.2044, "step": 6630 }, { "epoch": 1.9, "grad_norm": 13.288698508757005, "learning_rate": 1.4621984716322963e-05, "loss": 0.133, "step": 6640 }, { "epoch": 1.9, "grad_norm": 6.7195945031575555, "learning_rate": 1.4605205238598721e-05, "loss": 0.1486, "step": 6650 }, { "epoch": 1.9, "grad_norm": 11.158577299596676, "learning_rate": 1.4588409290061891e-05, "loss": 0.1195, "step": 6660 }, { "epoch": 1.91, "grad_norm": 7.229586307830694, "learning_rate": 1.457159693079731e-05, "loss": 0.1079, "step": 6670 }, { "epoch": 1.91, "grad_norm": 7.586285976966358, "learning_rate": 1.4554768220948515e-05, "loss": 0.1026, "step": 6680 }, { "epoch": 1.91, "grad_norm": 3.173722654497538, "learning_rate": 1.453792322071755e-05, "loss": 0.1063, "step": 6690 }, { "epoch": 1.92, "grad_norm": 8.77912806602167, "learning_rate": 1.452106199036472e-05, "loss": 0.1631, "step": 6700 }, { "epoch": 1.92, "grad_norm": 12.817060449815267, "learning_rate": 1.45041845902084e-05, "loss": 0.1715, "step": 6710 }, { "epoch": 1.92, "grad_norm": 5.6708394936503135, "learning_rate": 1.448729108062481e-05, "loss": 0.113, "step": 6720 }, { "epoch": 1.92, "grad_norm": 3.6354119301722294, "learning_rate": 1.4470381522047792e-05, "loss": 0.1298, "step": 6730 }, { "epoch": 1.93, "grad_norm": 6.369375103492051, "learning_rate": 1.4453455974968607e-05, "loss": 0.1408, "step": 6740 }, { "epoch": 1.93, "grad_norm": 2.437252521178252, "learning_rate": 1.4436514499935708e-05, "loss": 0.1185, "step": 6750 }, { "epoch": 1.93, "grad_norm": 10.604943947075132, "learning_rate": 1.441955715755453e-05, "loss": 0.102, "step": 6760 }, { "epoch": 1.94, "grad_norm": 6.097353451802753, "learning_rate": 1.4402584008487273e-05, "loss": 0.1596, "step": 6770 }, { "epoch": 1.94, "grad_norm": 5.772275485120495, "learning_rate": 1.4385595113452677e-05, "loss": 0.1519, "step": 6780 }, { "epoch": 1.94, "grad_norm": 5.312627095216832, "learning_rate": 1.4368590533225817e-05, "loss": 0.1597, "step": 6790 }, { "epoch": 1.94, "grad_norm": 8.310742644212961, "learning_rate": 1.4351570328637878e-05, "loss": 0.1782, "step": 6800 }, { "epoch": 1.95, "grad_norm": 10.428439107753427, "learning_rate": 1.4334534560575933e-05, "loss": 0.1821, "step": 6810 }, { "epoch": 1.95, "grad_norm": 2.386176264619301, "learning_rate": 1.4317483289982737e-05, "loss": 0.1024, "step": 6820 }, { "epoch": 1.95, "grad_norm": 5.7207347990118365, "learning_rate": 1.43004165778565e-05, "loss": 0.1635, "step": 6830 }, { "epoch": 1.96, "grad_norm": 6.89719539532476, "learning_rate": 1.4283334485250677e-05, "loss": 0.1352, "step": 6840 }, { "epoch": 1.96, "grad_norm": 6.242384276075554, "learning_rate": 1.4266237073273734e-05, "loss": 0.0974, "step": 6850 }, { "epoch": 1.96, "grad_norm": 7.349226317914276, "learning_rate": 1.4249124403088947e-05, "loss": 0.1024, "step": 6860 }, { "epoch": 1.96, "grad_norm": 5.315658707374481, "learning_rate": 1.4231996535914177e-05, "loss": 0.1845, "step": 6870 }, { "epoch": 1.97, "grad_norm": 11.135299886597268, "learning_rate": 1.4214853533021648e-05, "loss": 0.126, "step": 6880 }, { "epoch": 1.97, "grad_norm": 15.391495950134535, "learning_rate": 1.4197695455737721e-05, "loss": 0.1587, "step": 6890 }, { "epoch": 1.97, "grad_norm": 4.4030602996599315, "learning_rate": 1.4180522365442696e-05, "loss": 0.0737, "step": 6900 }, { "epoch": 1.98, "grad_norm": 2.4650565416765007, "learning_rate": 1.4163334323570579e-05, "loss": 0.0963, "step": 6910 }, { "epoch": 1.98, "grad_norm": 14.050252370502271, "learning_rate": 1.414613139160885e-05, "loss": 0.1211, "step": 6920 }, { "epoch": 1.98, "grad_norm": 3.9562912877054224, "learning_rate": 1.4128913631098267e-05, "loss": 0.155, "step": 6930 }, { "epoch": 1.98, "grad_norm": 10.622176928492356, "learning_rate": 1.4111681103632635e-05, "loss": 0.1605, "step": 6940 }, { "epoch": 1.99, "grad_norm": 9.277908288744586, "learning_rate": 1.4094433870858582e-05, "loss": 0.1941, "step": 6950 }, { "epoch": 1.99, "grad_norm": 4.923488969936356, "learning_rate": 1.407717199447534e-05, "loss": 0.1641, "step": 6960 }, { "epoch": 1.99, "grad_norm": 4.481118751210592, "learning_rate": 1.4059895536234531e-05, "loss": 0.155, "step": 6970 }, { "epoch": 2.0, "grad_norm": 9.125595700402634, "learning_rate": 1.4042604557939938e-05, "loss": 0.1692, "step": 6980 }, { "epoch": 2.0, "grad_norm": 7.279209358542866, "learning_rate": 1.402529912144729e-05, "loss": 0.1245, "step": 6990 }, { "epoch": 2.0, "grad_norm": 5.622891009530036, "learning_rate": 1.400797928866403e-05, "loss": 0.1288, "step": 7000 } ], "logging_steps": 10, "max_steps": 17485, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 12, "trial_name": null, "trial_params": null }