{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996477632969355, "eval_steps": 500, "global_step": 1419, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.4084507042253521e-06, "loss": 1.9425, "step": 1 }, { "epoch": 0.0, "learning_rate": 7.042253521126762e-06, "loss": 2.0234, "step": 5 }, { "epoch": 0.01, "learning_rate": 1.4084507042253523e-05, "loss": 1.9715, "step": 10 }, { "epoch": 0.01, "learning_rate": 2.112676056338028e-05, "loss": 1.9477, "step": 15 }, { "epoch": 0.01, "learning_rate": 2.8169014084507046e-05, "loss": 1.8815, "step": 20 }, { "epoch": 0.02, "learning_rate": 3.5211267605633805e-05, "loss": 1.8108, "step": 25 }, { "epoch": 0.02, "learning_rate": 4.225352112676056e-05, "loss": 1.7765, "step": 30 }, { "epoch": 0.02, "learning_rate": 4.929577464788733e-05, "loss": 1.7328, "step": 35 }, { "epoch": 0.03, "learning_rate": 5.633802816901409e-05, "loss": 1.7477, "step": 40 }, { "epoch": 0.03, "learning_rate": 6.338028169014085e-05, "loss": 1.6816, "step": 45 }, { "epoch": 0.04, "learning_rate": 7.042253521126761e-05, "loss": 1.6714, "step": 50 }, { "epoch": 0.04, "learning_rate": 7.746478873239437e-05, "loss": 1.642, "step": 55 }, { "epoch": 0.04, "learning_rate": 8.450704225352113e-05, "loss": 1.612, "step": 60 }, { "epoch": 0.05, "learning_rate": 9.15492957746479e-05, "loss": 1.592, "step": 65 }, { "epoch": 0.05, "learning_rate": 9.859154929577466e-05, "loss": 1.6068, "step": 70 }, { "epoch": 0.05, "learning_rate": 0.0001056338028169014, "loss": 1.5911, "step": 75 }, { "epoch": 0.06, "learning_rate": 0.00011267605633802819, "loss": 1.5727, "step": 80 }, { "epoch": 0.06, "learning_rate": 0.00011971830985915493, "loss": 1.5929, "step": 85 }, { "epoch": 0.06, "learning_rate": 0.0001267605633802817, "loss": 1.5441, "step": 90 }, { "epoch": 0.07, "learning_rate": 0.00013380281690140845, "loss": 1.5659, "step": 95 }, { "epoch": 0.07, "learning_rate": 0.00014084507042253522, "loss": 1.5422, "step": 100 }, { "epoch": 0.07, "learning_rate": 0.000147887323943662, "loss": 1.5782, "step": 105 }, { "epoch": 0.08, "learning_rate": 0.00015492957746478874, "loss": 1.5553, "step": 110 }, { "epoch": 0.08, "learning_rate": 0.0001619718309859155, "loss": 1.5324, "step": 115 }, { "epoch": 0.08, "learning_rate": 0.00016901408450704225, "loss": 1.4999, "step": 120 }, { "epoch": 0.09, "learning_rate": 0.00017605633802816902, "loss": 1.5137, "step": 125 }, { "epoch": 0.09, "learning_rate": 0.0001830985915492958, "loss": 1.5549, "step": 130 }, { "epoch": 0.1, "learning_rate": 0.00019014084507042254, "loss": 1.5369, "step": 135 }, { "epoch": 0.1, "learning_rate": 0.0001971830985915493, "loss": 1.5427, "step": 140 }, { "epoch": 0.1, "learning_rate": 0.00019999727649301603, "loss": 1.5195, "step": 145 }, { "epoch": 0.11, "learning_rate": 0.00019998063337645585, "loss": 1.5184, "step": 150 }, { "epoch": 0.11, "learning_rate": 0.0001999488627179007, "loss": 1.5382, "step": 155 }, { "epoch": 0.11, "learning_rate": 0.00019990196932440123, "loss": 1.5211, "step": 160 }, { "epoch": 0.12, "learning_rate": 0.00019983996029114938, "loss": 1.5237, "step": 165 }, { "epoch": 0.12, "learning_rate": 0.00019976284500040467, "loss": 1.5225, "step": 170 }, { "epoch": 0.12, "learning_rate": 0.00019967063512007482, "loss": 1.5393, "step": 175 }, { "epoch": 0.13, "learning_rate": 0.00019956334460195015, "loss": 1.4915, "step": 180 }, { "epoch": 0.13, "learning_rate": 0.00019944098967959278, "loss": 1.492, "step": 185 }, { "epoch": 0.13, "learning_rate": 0.00019930358886588025, "loss": 1.5237, "step": 190 }, { "epoch": 0.14, "learning_rate": 0.00019915116295020458, "loss": 1.5198, "step": 195 }, { "epoch": 0.14, "learning_rate": 0.00019898373499532666, "loss": 1.5046, "step": 200 }, { "epoch": 0.14, "learning_rate": 0.0001988013303338867, "loss": 1.4883, "step": 205 }, { "epoch": 0.15, "learning_rate": 0.00019860397656457147, "loss": 1.4738, "step": 210 }, { "epoch": 0.15, "learning_rate": 0.00019839170354793827, "loss": 1.5288, "step": 215 }, { "epoch": 0.15, "learning_rate": 0.00019816454340189693, "loss": 1.5271, "step": 220 }, { "epoch": 0.16, "learning_rate": 0.0001979225304968504, "loss": 1.4791, "step": 225 }, { "epoch": 0.16, "learning_rate": 0.00019766570145049407, "loss": 1.4587, "step": 230 }, { "epoch": 0.17, "learning_rate": 0.0001973940951222756, "loss": 1.4508, "step": 235 }, { "epoch": 0.17, "learning_rate": 0.0001971077526075151, "loss": 1.5056, "step": 240 }, { "epoch": 0.17, "learning_rate": 0.00019680671723118734, "loss": 1.4492, "step": 245 }, { "epoch": 0.18, "learning_rate": 0.0001964910345413664, "loss": 1.5118, "step": 250 }, { "epoch": 0.18, "learning_rate": 0.00019616075230233408, "loss": 1.4772, "step": 255 }, { "epoch": 0.18, "learning_rate": 0.00019581592048735296, "loss": 1.4862, "step": 260 }, { "epoch": 0.19, "learning_rate": 0.00019545659127110507, "loss": 1.4979, "step": 265 }, { "epoch": 0.19, "learning_rate": 0.00019508281902179782, "loss": 1.4564, "step": 270 }, { "epoch": 0.19, "learning_rate": 0.00019469466029293776, "loss": 1.4446, "step": 275 }, { "epoch": 0.2, "learning_rate": 0.00019429217381477357, "loss": 1.4828, "step": 280 }, { "epoch": 0.2, "learning_rate": 0.00019387542048541023, "loss": 1.4632, "step": 285 }, { "epoch": 0.2, "learning_rate": 0.0001934444633615946, "loss": 1.4543, "step": 290 }, { "epoch": 0.21, "learning_rate": 0.0001929993676491747, "loss": 1.4742, "step": 295 }, { "epoch": 0.21, "learning_rate": 0.00019254020069323386, "loss": 1.4811, "step": 300 }, { "epoch": 0.21, "learning_rate": 0.00019206703196790096, "loss": 1.4505, "step": 305 }, { "epoch": 0.22, "learning_rate": 0.00019157993306583867, "loss": 1.4237, "step": 310 }, { "epoch": 0.22, "learning_rate": 0.00019107897768741127, "loss": 1.4285, "step": 315 }, { "epoch": 0.23, "learning_rate": 0.0001905642416295333, "loss": 1.4454, "step": 320 }, { "epoch": 0.23, "learning_rate": 0.0001900358027742012, "loss": 1.42, "step": 325 }, { "epoch": 0.23, "learning_rate": 0.00018949374107670935, "loss": 1.4507, "step": 330 }, { "epoch": 0.24, "learning_rate": 0.0001889381385535525, "loss": 1.4222, "step": 335 }, { "epoch": 0.24, "learning_rate": 0.00018836907927001628, "loss": 1.4502, "step": 340 }, { "epoch": 0.24, "learning_rate": 0.00018778664932745772, "loss": 1.4255, "step": 345 }, { "epoch": 0.25, "learning_rate": 0.0001871909368502777, "loss": 1.4597, "step": 350 }, { "epoch": 0.25, "learning_rate": 0.00018658203197258728, "loss": 1.432, "step": 355 }, { "epoch": 0.25, "learning_rate": 0.0001859600268245701, "loss": 1.4676, "step": 360 }, { "epoch": 0.26, "learning_rate": 0.00018532501551854242, "loss": 1.4267, "step": 365 }, { "epoch": 0.26, "learning_rate": 0.00018467709413471378, "loss": 1.4392, "step": 370 }, { "epoch": 0.26, "learning_rate": 0.00018401636070664945, "loss": 1.4067, "step": 375 }, { "epoch": 0.27, "learning_rate": 0.0001833429152064374, "loss": 1.4427, "step": 380 }, { "epoch": 0.27, "learning_rate": 0.00018265685952956222, "loss": 1.4519, "step": 385 }, { "epoch": 0.27, "learning_rate": 0.00018195829747948773, "loss": 1.4449, "step": 390 }, { "epoch": 0.28, "learning_rate": 0.00018124733475195117, "loss": 1.424, "step": 395 }, { "epoch": 0.28, "learning_rate": 0.00018052407891897075, "loss": 1.447, "step": 400 }, { "epoch": 0.29, "learning_rate": 0.0001797886394125696, "loss": 1.4363, "step": 405 }, { "epoch": 0.29, "learning_rate": 0.00017904112750821824, "loss": 1.4152, "step": 410 }, { "epoch": 0.29, "learning_rate": 0.00017828165630799796, "loss": 1.41, "step": 415 }, { "epoch": 0.3, "learning_rate": 0.000177510340723488, "loss": 1.446, "step": 420 }, { "epoch": 0.3, "learning_rate": 0.0001767272974583789, "loss": 1.4449, "step": 425 }, { "epoch": 0.3, "learning_rate": 0.00017593264499081465, "loss": 1.4519, "step": 430 }, { "epoch": 0.31, "learning_rate": 0.00017512650355546634, "loss": 1.4323, "step": 435 }, { "epoch": 0.31, "learning_rate": 0.00017430899512534026, "loss": 1.424, "step": 440 }, { "epoch": 0.31, "learning_rate": 0.00017348024339332259, "loss": 1.3989, "step": 445 }, { "epoch": 0.32, "learning_rate": 0.0001726403737534642, "loss": 1.4005, "step": 450 }, { "epoch": 0.32, "learning_rate": 0.00017178951328200798, "loss": 1.4457, "step": 455 }, { "epoch": 0.32, "learning_rate": 0.0001709277907181615, "loss": 1.4335, "step": 460 }, { "epoch": 0.33, "learning_rate": 0.0001700553364446182, "loss": 1.4705, "step": 465 }, { "epoch": 0.33, "learning_rate": 0.00016917228246782987, "loss": 1.4012, "step": 470 }, { "epoch": 0.33, "learning_rate": 0.00016827876239803352, "loss": 1.4038, "step": 475 }, { "epoch": 0.34, "learning_rate": 0.0001673749114290354, "loss": 1.4353, "step": 480 }, { "epoch": 0.34, "learning_rate": 0.00016646086631775563, "loss": 1.4034, "step": 485 }, { "epoch": 0.35, "learning_rate": 0.00016553676536353612, "loss": 1.4265, "step": 490 }, { "epoch": 0.35, "learning_rate": 0.00016460274838721545, "loss": 1.4354, "step": 495 }, { "epoch": 0.35, "learning_rate": 0.00016365895670997305, "loss": 1.4099, "step": 500 }, { "epoch": 0.36, "learning_rate": 0.00016270553313194684, "loss": 1.4237, "step": 505 }, { "epoch": 0.36, "learning_rate": 0.00016174262191062674, "loss": 1.4616, "step": 510 }, { "epoch": 0.36, "learning_rate": 0.00016077036873902797, "loss": 1.4041, "step": 515 }, { "epoch": 0.37, "learning_rate": 0.00015978892072364694, "loss": 1.4019, "step": 520 }, { "epoch": 0.37, "learning_rate": 0.00015879842636220328, "loss": 1.3983, "step": 525 }, { "epoch": 0.37, "learning_rate": 0.00015779903552117153, "loss": 1.4043, "step": 530 }, { "epoch": 0.38, "learning_rate": 0.00015679089941310575, "loss": 1.373, "step": 535 }, { "epoch": 0.38, "learning_rate": 0.00015577417057376, "loss": 1.4002, "step": 540 }, { "epoch": 0.38, "learning_rate": 0.00015474900283900923, "loss": 1.3802, "step": 545 }, { "epoch": 0.39, "learning_rate": 0.00015371555132157318, "loss": 1.3933, "step": 550 }, { "epoch": 0.39, "learning_rate": 0.00015267397238754693, "loss": 1.4463, "step": 555 }, { "epoch": 0.39, "learning_rate": 0.00015162442363274214, "loss": 1.3943, "step": 560 }, { "epoch": 0.4, "learning_rate": 0.00015056706385884196, "loss": 1.4336, "step": 565 }, { "epoch": 0.4, "learning_rate": 0.00014950205304937368, "loss": 1.3881, "step": 570 }, { "epoch": 0.41, "learning_rate": 0.00014842955234550231, "loss": 1.3875, "step": 575 }, { "epoch": 0.41, "learning_rate": 0.00014734972402164932, "loss": 1.4053, "step": 580 }, { "epoch": 0.41, "learning_rate": 0.00014626273146093967, "loss": 1.404, "step": 585 }, { "epoch": 0.42, "learning_rate": 0.00014516873913048117, "loss": 1.4101, "step": 590 }, { "epoch": 0.42, "learning_rate": 0.0001440679125564799, "loss": 1.3958, "step": 595 }, { "epoch": 0.42, "learning_rate": 0.00014296041829919522, "loss": 1.3972, "step": 600 }, { "epoch": 0.43, "learning_rate": 0.00014184642392773845, "loss": 1.4385, "step": 605 }, { "epoch": 0.43, "learning_rate": 0.00014072609799471896, "loss": 1.4154, "step": 610 }, { "epoch": 0.43, "learning_rate": 0.00013959961001074113, "loss": 1.392, "step": 615 }, { "epoch": 0.44, "learning_rate": 0.00013846713041875693, "loss": 1.3893, "step": 620 }, { "epoch": 0.44, "learning_rate": 0.00013732883056827684, "loss": 1.3978, "step": 625 }, { "epoch": 0.44, "learning_rate": 0.0001361848826894439, "loss": 1.414, "step": 630 }, { "epoch": 0.45, "learning_rate": 0.00013503545986697456, "loss": 1.3606, "step": 635 }, { "epoch": 0.45, "learning_rate": 0.00013388073601397008, "loss": 1.4059, "step": 640 }, { "epoch": 0.45, "learning_rate": 0.0001327208858456027, "loss": 1.3912, "step": 645 }, { "epoch": 0.46, "learning_rate": 0.00013155608485268031, "loss": 1.4211, "step": 650 }, { "epoch": 0.46, "learning_rate": 0.00013038650927509404, "loss": 1.384, "step": 655 }, { "epoch": 0.46, "learning_rate": 0.00012921233607515216, "loss": 1.3631, "step": 660 }, { "epoch": 0.47, "learning_rate": 0.0001280337429108049, "loss": 1.3803, "step": 665 }, { "epoch": 0.47, "learning_rate": 0.00012685090810876407, "loss": 1.3631, "step": 670 }, { "epoch": 0.48, "learning_rate": 0.0001256640106375212, "loss": 1.4074, "step": 675 }, { "epoch": 0.48, "learning_rate": 0.0001244732300802689, "loss": 1.386, "step": 680 }, { "epoch": 0.48, "learning_rate": 0.00012327874660772898, "loss": 1.4198, "step": 685 }, { "epoch": 0.49, "learning_rate": 0.00012208074095089192, "loss": 1.3607, "step": 690 }, { "epoch": 0.49, "learning_rate": 0.00012087939437367126, "loss": 1.3762, "step": 695 }, { "epoch": 0.49, "learning_rate": 0.0001196748886454775, "loss": 1.389, "step": 700 }, { "epoch": 0.5, "learning_rate": 0.00011846740601371576, "loss": 1.4058, "step": 705 }, { "epoch": 0.5, "learning_rate": 0.00011725712917621059, "loss": 1.3868, "step": 710 }, { "epoch": 0.5, "learning_rate": 0.00011604424125356312, "loss": 1.3644, "step": 715 }, { "epoch": 0.51, "learning_rate": 0.00011482892576144405, "loss": 1.3894, "step": 720 }, { "epoch": 0.51, "learning_rate": 0.00011361136658282663, "loss": 1.363, "step": 725 }, { "epoch": 0.51, "learning_rate": 0.00011239174794016469, "loss": 1.3882, "step": 730 }, { "epoch": 0.52, "learning_rate": 0.00011117025436751855, "loss": 1.409, "step": 735 }, { "epoch": 0.52, "learning_rate": 0.00010994707068263434, "loss": 1.3615, "step": 740 }, { "epoch": 0.52, "learning_rate": 0.00010872238195898019, "loss": 1.3623, "step": 745 }, { "epoch": 0.53, "learning_rate": 0.00010749637349774357, "loss": 1.3658, "step": 750 }, { "epoch": 0.53, "learning_rate": 0.00010626923079979465, "loss": 1.3687, "step": 755 }, { "epoch": 0.54, "learning_rate": 0.00010504113953761885, "loss": 1.3753, "step": 760 }, { "epoch": 0.54, "learning_rate": 0.00010381228552722392, "loss": 1.3861, "step": 765 }, { "epoch": 0.54, "learning_rate": 0.00010258285470002494, "loss": 1.3645, "step": 770 }, { "epoch": 0.55, "learning_rate": 0.00010135303307471213, "loss": 1.3761, "step": 775 }, { "epoch": 0.55, "learning_rate": 0.00010012300672910535, "loss": 1.3553, "step": 780 }, { "epoch": 0.55, "learning_rate": 9.889296177199954e-05, "loss": 1.3739, "step": 785 }, { "epoch": 0.56, "learning_rate": 9.766308431500566e-05, "loss": 1.3452, "step": 790 }, { "epoch": 0.56, "learning_rate": 9.64335604443912e-05, "loss": 1.3559, "step": 795 }, { "epoch": 0.56, "learning_rate": 9.520457619292423e-05, "loss": 1.3802, "step": 800 }, { "epoch": 0.57, "learning_rate": 9.397631751172601e-05, "loss": 1.3469, "step": 805 }, { "epoch": 0.57, "learning_rate": 9.274897024213544e-05, "loss": 1.3818, "step": 810 }, { "epoch": 0.57, "learning_rate": 9.152272008759076e-05, "loss": 1.3351, "step": 815 }, { "epoch": 0.58, "learning_rate": 9.029775258553128e-05, "loss": 1.3855, "step": 820 }, { "epoch": 0.58, "learning_rate": 8.907425307932514e-05, "loss": 1.3522, "step": 825 }, { "epoch": 0.58, "learning_rate": 8.785240669022568e-05, "loss": 1.344, "step": 830 }, { "epoch": 0.59, "learning_rate": 8.663239828936174e-05, "loss": 1.3704, "step": 835 }, { "epoch": 0.59, "learning_rate": 8.541441246976607e-05, "loss": 1.3571, "step": 840 }, { "epoch": 0.6, "learning_rate": 8.419863351844508e-05, "loss": 1.3621, "step": 845 }, { "epoch": 0.6, "learning_rate": 8.298524538849576e-05, "loss": 1.3889, "step": 850 }, { "epoch": 0.6, "learning_rate": 8.177443167127244e-05, "loss": 1.3597, "step": 855 }, { "epoch": 0.61, "learning_rate": 8.056637556860872e-05, "loss": 1.3719, "step": 860 }, { "epoch": 0.61, "learning_rate": 7.936125986509803e-05, "loss": 1.3565, "step": 865 }, { "epoch": 0.61, "learning_rate": 7.815926690043756e-05, "loss": 1.3909, "step": 870 }, { "epoch": 0.62, "learning_rate": 7.696057854183935e-05, "loss": 1.3711, "step": 875 }, { "epoch": 0.62, "learning_rate": 7.576537615651295e-05, "loss": 1.3712, "step": 880 }, { "epoch": 0.62, "learning_rate": 7.457384058422368e-05, "loss": 1.3663, "step": 885 }, { "epoch": 0.63, "learning_rate": 7.338615210993074e-05, "loss": 1.3771, "step": 890 }, { "epoch": 0.63, "learning_rate": 7.220249043650918e-05, "loss": 1.3454, "step": 895 }, { "epoch": 0.63, "learning_rate": 7.102303465756019e-05, "loss": 1.3879, "step": 900 }, { "epoch": 0.64, "learning_rate": 6.98479632303131e-05, "loss": 1.3803, "step": 905 }, { "epoch": 0.64, "learning_rate": 6.867745394862422e-05, "loss": 1.3628, "step": 910 }, { "epoch": 0.64, "learning_rate": 6.751168391607576e-05, "loss": 1.4021, "step": 915 }, { "epoch": 0.65, "learning_rate": 6.635082951917897e-05, "loss": 1.3821, "step": 920 }, { "epoch": 0.65, "learning_rate": 6.519506640068638e-05, "loss": 1.3592, "step": 925 }, { "epoch": 0.66, "learning_rate": 6.404456943301592e-05, "loss": 1.3659, "step": 930 }, { "epoch": 0.66, "learning_rate": 6.289951269179215e-05, "loss": 1.3887, "step": 935 }, { "epoch": 0.66, "learning_rate": 6.17600694295076e-05, "loss": 1.3603, "step": 940 }, { "epoch": 0.67, "learning_rate": 6.0626412049308965e-05, "loss": 1.3654, "step": 945 }, { "epoch": 0.67, "learning_rate": 5.949871207891162e-05, "loss": 1.3842, "step": 950 }, { "epoch": 0.67, "learning_rate": 5.837714014464677e-05, "loss": 1.3333, "step": 955 }, { "epoch": 0.68, "learning_rate": 5.7261865945644664e-05, "loss": 1.3426, "step": 960 }, { "epoch": 0.68, "learning_rate": 5.6153058228158686e-05, "loss": 1.3652, "step": 965 }, { "epoch": 0.68, "learning_rate": 5.505088476003284e-05, "loss": 1.3833, "step": 970 }, { "epoch": 0.69, "learning_rate": 5.395551230531797e-05, "loss": 1.387, "step": 975 }, { "epoch": 0.69, "learning_rate": 5.286710659903937e-05, "loss": 1.3584, "step": 980 }, { "epoch": 0.69, "learning_rate": 5.1785832322120574e-05, "loss": 1.3588, "step": 985 }, { "epoch": 0.7, "learning_rate": 5.0711853076466045e-05, "loss": 1.403, "step": 990 }, { "epoch": 0.7, "learning_rate": 4.9645331360207505e-05, "loss": 1.3297, "step": 995 }, { "epoch": 0.7, "learning_rate": 4.8586428543117434e-05, "loss": 1.3314, "step": 1000 }, { "epoch": 0.71, "learning_rate": 4.753530484219279e-05, "loss": 1.3619, "step": 1005 }, { "epoch": 0.71, "learning_rate": 4.6492119297413475e-05, "loss": 1.3421, "step": 1010 }, { "epoch": 0.72, "learning_rate": 4.545702974767891e-05, "loss": 1.356, "step": 1015 }, { "epoch": 0.72, "learning_rate": 4.443019280692638e-05, "loss": 1.3057, "step": 1020 }, { "epoch": 0.72, "learning_rate": 4.341176384043416e-05, "loss": 1.3334, "step": 1025 }, { "epoch": 0.73, "learning_rate": 4.240189694131458e-05, "loss": 1.3574, "step": 1030 }, { "epoch": 0.73, "learning_rate": 4.140074490719861e-05, "loss": 1.3476, "step": 1035 }, { "epoch": 0.73, "learning_rate": 4.040845921711702e-05, "loss": 1.36, "step": 1040 }, { "epoch": 0.74, "learning_rate": 3.9425190008580826e-05, "loss": 1.3517, "step": 1045 }, { "epoch": 0.74, "learning_rate": 3.8451086054864896e-05, "loss": 1.3709, "step": 1050 }, { "epoch": 0.74, "learning_rate": 3.748629474249766e-05, "loss": 1.3551, "step": 1055 }, { "epoch": 0.75, "learning_rate": 3.6530962048960973e-05, "loss": 1.3444, "step": 1060 }, { "epoch": 0.75, "learning_rate": 3.558523252060295e-05, "loss": 1.3683, "step": 1065 }, { "epoch": 0.75, "learning_rate": 3.464924925076758e-05, "loss": 1.3673, "step": 1070 }, { "epoch": 0.76, "learning_rate": 3.3723153858143784e-05, "loss": 1.3692, "step": 1075 }, { "epoch": 0.76, "learning_rate": 3.280708646533799e-05, "loss": 1.3456, "step": 1080 }, { "epoch": 0.76, "learning_rate": 3.1901185677672976e-05, "loss": 1.3424, "step": 1085 }, { "epoch": 0.77, "learning_rate": 3.100558856221606e-05, "loss": 1.3521, "step": 1090 }, { "epoch": 0.77, "learning_rate": 3.012043062704034e-05, "loss": 1.3721, "step": 1095 }, { "epoch": 0.77, "learning_rate": 2.9245845800721574e-05, "loss": 1.3416, "step": 1100 }, { "epoch": 0.78, "learning_rate": 2.838196641207438e-05, "loss": 1.3671, "step": 1105 }, { "epoch": 0.78, "learning_rate": 2.752892317012985e-05, "loss": 1.3855, "step": 1110 }, { "epoch": 0.79, "learning_rate": 2.668684514435914e-05, "loss": 1.388, "step": 1115 }, { "epoch": 0.79, "learning_rate": 2.5855859745144284e-05, "loss": 1.3535, "step": 1120 }, { "epoch": 0.79, "learning_rate": 2.503609270450058e-05, "loss": 1.3519, "step": 1125 }, { "epoch": 0.8, "learning_rate": 2.4227668057052634e-05, "loss": 1.3591, "step": 1130 }, { "epoch": 0.8, "learning_rate": 2.343070812126752e-05, "loss": 1.3823, "step": 1135 }, { "epoch": 0.8, "learning_rate": 2.2645333480947173e-05, "loss": 1.3482, "step": 1140 }, { "epoch": 0.81, "learning_rate": 2.1871662966983663e-05, "loss": 1.3656, "step": 1145 }, { "epoch": 0.81, "learning_rate": 2.1109813639379427e-05, "loss": 1.3696, "step": 1150 }, { "epoch": 0.81, "learning_rate": 2.035990076953569e-05, "loss": 1.3406, "step": 1155 }, { "epoch": 0.82, "learning_rate": 1.9622037822811123e-05, "loss": 1.3501, "step": 1160 }, { "epoch": 0.82, "learning_rate": 1.8896336441354167e-05, "loss": 1.3447, "step": 1165 }, { "epoch": 0.82, "learning_rate": 1.8182906427211043e-05, "loss": 1.3734, "step": 1170 }, { "epoch": 0.83, "learning_rate": 1.7481855725712093e-05, "loss": 1.3903, "step": 1175 }, { "epoch": 0.83, "learning_rate": 1.6793290409139207e-05, "loss": 1.3574, "step": 1180 }, { "epoch": 0.83, "learning_rate": 1.611731466067655e-05, "loss": 1.3495, "step": 1185 }, { "epoch": 0.84, "learning_rate": 1.54540307586473e-05, "loss": 1.342, "step": 1190 }, { "epoch": 0.84, "learning_rate": 1.4803539061038119e-05, "loss": 1.3346, "step": 1195 }, { "epoch": 0.85, "learning_rate": 1.4165937990314915e-05, "loss": 1.3269, "step": 1200 }, { "epoch": 0.85, "learning_rate": 1.3541324018530788e-05, "loss": 1.37, "step": 1205 }, { "epoch": 0.85, "learning_rate": 1.2929791652729439e-05, "loss": 1.3472, "step": 1210 }, { "epoch": 0.86, "learning_rate": 1.233143342064581e-05, "loss": 1.3459, "step": 1215 }, { "epoch": 0.86, "learning_rate": 1.1746339856706234e-05, "loss": 1.3591, "step": 1220 }, { "epoch": 0.86, "learning_rate": 1.1174599488330051e-05, "loss": 1.361, "step": 1225 }, { "epoch": 0.87, "learning_rate": 1.0616298822535087e-05, "loss": 1.3585, "step": 1230 }, { "epoch": 0.87, "learning_rate": 1.007152233284866e-05, "loss": 1.3768, "step": 1235 }, { "epoch": 0.87, "learning_rate": 9.540352446526479e-06, "loss": 1.3655, "step": 1240 }, { "epoch": 0.88, "learning_rate": 9.022869532080735e-06, "loss": 1.3734, "step": 1245 }, { "epoch": 0.88, "learning_rate": 8.51915188712027e-06, "loss": 1.3404, "step": 1250 }, { "epoch": 0.88, "learning_rate": 8.029275726503626e-06, "loss": 1.3352, "step": 1255 }, { "epoch": 0.89, "learning_rate": 7.553315170807373e-06, "loss": 1.3394, "step": 1260 }, { "epoch": 0.89, "learning_rate": 7.09134223511132e-06, "loss": 1.3762, "step": 1265 }, { "epoch": 0.89, "learning_rate": 6.643426818102339e-06, "loss": 1.373, "step": 1270 }, { "epoch": 0.9, "learning_rate": 6.209636691498366e-06, "loss": 1.3304, "step": 1275 }, { "epoch": 0.9, "learning_rate": 5.790037489794109e-06, "loss": 1.3483, "step": 1280 }, { "epoch": 0.91, "learning_rate": 5.384692700330407e-06, "loss": 1.3342, "step": 1285 }, { "epoch": 0.91, "learning_rate": 4.993663653688152e-06, "loss": 1.3695, "step": 1290 }, { "epoch": 0.91, "learning_rate": 4.617009514408788e-06, "loss": 1.3326, "step": 1295 }, { "epoch": 0.92, "learning_rate": 4.254787272042315e-06, "loss": 1.352, "step": 1300 }, { "epoch": 0.92, "learning_rate": 3.907051732524669e-06, "loss": 1.3532, "step": 1305 }, { "epoch": 0.92, "learning_rate": 3.573855509885171e-06, "loss": 1.3466, "step": 1310 }, { "epoch": 0.93, "learning_rate": 3.2552490182858443e-06, "loss": 1.3799, "step": 1315 }, { "epoch": 0.93, "learning_rate": 2.95128046439348e-06, "loss": 1.3706, "step": 1320 }, { "epoch": 0.93, "learning_rate": 2.6619958400858513e-06, "loss": 1.3794, "step": 1325 }, { "epoch": 0.94, "learning_rate": 2.3874389154927233e-06, "loss": 1.3892, "step": 1330 }, { "epoch": 0.94, "learning_rate": 2.127651232373429e-06, "loss": 1.3396, "step": 1335 }, { "epoch": 0.94, "learning_rate": 1.8826720978313062e-06, "loss": 1.352, "step": 1340 }, { "epoch": 0.95, "learning_rate": 1.6525385783663028e-06, "loss": 1.3655, "step": 1345 }, { "epoch": 0.95, "learning_rate": 1.4372854942667602e-06, "loss": 1.3788, "step": 1350 }, { "epoch": 0.95, "learning_rate": 1.2369454143408289e-06, "loss": 1.339, "step": 1355 }, { "epoch": 0.96, "learning_rate": 1.051548650988754e-06, "loss": 1.3178, "step": 1360 }, { "epoch": 0.96, "learning_rate": 8.811232556163451e-07, "loss": 1.3495, "step": 1365 }, { "epoch": 0.97, "learning_rate": 7.256950143907815e-07, "loss": 1.3702, "step": 1370 }, { "epoch": 0.97, "learning_rate": 5.852874443389888e-07, "loss": 1.3459, "step": 1375 }, { "epoch": 0.97, "learning_rate": 4.5992178978938593e-07, "loss": 1.3747, "step": 1380 }, { "epoch": 0.98, "learning_rate": 3.496170191575554e-07, "loss": 1.3402, "step": 1385 }, { "epoch": 0.98, "learning_rate": 2.5438982207622866e-07, "loss": 1.3601, "step": 1390 }, { "epoch": 0.98, "learning_rate": 1.742546068700168e-07, "loss": 1.3837, "step": 1395 }, { "epoch": 0.99, "learning_rate": 1.0922349837545476e-07, "loss": 1.4011, "step": 1400 }, { "epoch": 0.99, "learning_rate": 5.9306336106346174e-08, "loss": 1.3838, "step": 1405 }, { "epoch": 0.99, "learning_rate": 2.4510672765087894e-08, "loss": 1.3608, "step": 1410 }, { "epoch": 1.0, "learning_rate": 4.8417730998728814e-09, "loss": 1.3504, "step": 1415 }, { "epoch": 1.0, "eval_loss": 1.355061411857605, "eval_runtime": 1858.7192, "eval_samples_per_second": 0.709, "eval_steps_per_second": 0.089, "step": 1419 }, { "epoch": 1.0, "step": 1419, "total_flos": 9.977407964782264e+17, "train_loss": 1.4210990094399267, "train_runtime": 56404.0081, "train_samples_per_second": 0.201, "train_steps_per_second": 0.025 } ], "logging_steps": 5, "max_steps": 1419, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 9.977407964782264e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }