{ "best_metric": 1.4109047651290894, "best_model_checkpoint": "./results/checkpoint-1500", "epoch": 2.8728752693320567, "eval_steps": 100, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019152501795547044, "grad_norm": 0.16222093999385834, "learning_rate": 4.000000000000001e-06, "loss": 1.9964, "step": 10 }, { "epoch": 0.03830500359109409, "grad_norm": 0.16309529542922974, "learning_rate": 8.000000000000001e-06, "loss": 1.9937, "step": 20 }, { "epoch": 0.05745750538664113, "grad_norm": 0.18897925317287445, "learning_rate": 1.2e-05, "loss": 1.9954, "step": 30 }, { "epoch": 0.07661000718218818, "grad_norm": 0.2183838188648224, "learning_rate": 1.6000000000000003e-05, "loss": 2.005, "step": 40 }, { "epoch": 0.09576250897773522, "grad_norm": 0.28962740302085876, "learning_rate": 2e-05, "loss": 1.9914, "step": 50 }, { "epoch": 0.11491501077328226, "grad_norm": 0.3513906002044678, "learning_rate": 2.4e-05, "loss": 1.9941, "step": 60 }, { "epoch": 0.13406751256882932, "grad_norm": 0.45517614483833313, "learning_rate": 2.8000000000000003e-05, "loss": 1.9874, "step": 70 }, { "epoch": 0.15322001436437635, "grad_norm": 0.5223982334136963, "learning_rate": 3.2000000000000005e-05, "loss": 1.9738, "step": 80 }, { "epoch": 0.17237251615992338, "grad_norm": 0.6227847933769226, "learning_rate": 3.6e-05, "loss": 1.9676, "step": 90 }, { "epoch": 0.19152501795547044, "grad_norm": 0.7356652021408081, "learning_rate": 4e-05, "loss": 1.9455, "step": 100 }, { "epoch": 0.19152501795547044, "eval_loss": 1.938758134841919, "eval_runtime": 54.5397, "eval_samples_per_second": 5.501, "eval_steps_per_second": 0.697, "step": 100 }, { "epoch": 0.21067751975101748, "grad_norm": 0.7847080230712891, "learning_rate": 4.4000000000000006e-05, "loss": 1.9324, "step": 110 }, { "epoch": 0.2298300215465645, "grad_norm": 0.8326261043548584, "learning_rate": 4.8e-05, "loss": 1.9078, "step": 120 }, { "epoch": 0.24898252334211157, "grad_norm": 0.8586732149124146, "learning_rate": 5.2000000000000004e-05, "loss": 1.8753, "step": 130 }, { "epoch": 0.26813502513765863, "grad_norm": 0.8781760931015015, "learning_rate": 5.6000000000000006e-05, "loss": 1.8631, "step": 140 }, { "epoch": 0.28728752693320564, "grad_norm": 0.8517449498176575, "learning_rate": 6e-05, "loss": 1.8474, "step": 150 }, { "epoch": 0.3064400287287527, "grad_norm": 0.8484007716178894, "learning_rate": 6.400000000000001e-05, "loss": 1.8256, "step": 160 }, { "epoch": 0.32559253052429976, "grad_norm": 0.7468522787094116, "learning_rate": 6.800000000000001e-05, "loss": 1.8098, "step": 170 }, { "epoch": 0.34474503231984677, "grad_norm": 0.6709467768669128, "learning_rate": 7.2e-05, "loss": 1.7933, "step": 180 }, { "epoch": 0.36389753411539383, "grad_norm": 0.5786746144294739, "learning_rate": 7.6e-05, "loss": 1.7697, "step": 190 }, { "epoch": 0.3830500359109409, "grad_norm": 0.5029782652854919, "learning_rate": 8e-05, "loss": 1.7615, "step": 200 }, { "epoch": 0.3830500359109409, "eval_loss": 1.7498724460601807, "eval_runtime": 54.4734, "eval_samples_per_second": 5.507, "eval_steps_per_second": 0.698, "step": 200 }, { "epoch": 0.4022025377064879, "grad_norm": 0.3556228578090668, "learning_rate": 8.4e-05, "loss": 1.743, "step": 210 }, { "epoch": 0.42135503950203496, "grad_norm": 0.2803383767604828, "learning_rate": 8.800000000000001e-05, "loss": 1.7251, "step": 220 }, { "epoch": 0.440507541297582, "grad_norm": 0.2616518437862396, "learning_rate": 9.200000000000001e-05, "loss": 1.7162, "step": 230 }, { "epoch": 0.459660043093129, "grad_norm": 0.27756568789482117, "learning_rate": 9.6e-05, "loss": 1.7092, "step": 240 }, { "epoch": 0.4788125448886761, "grad_norm": 0.2593691945075989, "learning_rate": 0.0001, "loss": 1.6953, "step": 250 }, { "epoch": 0.49796504668422314, "grad_norm": 0.28326255083084106, "learning_rate": 0.00010400000000000001, "loss": 1.6713, "step": 260 }, { "epoch": 0.5171175484797702, "grad_norm": 0.3027288019657135, "learning_rate": 0.00010800000000000001, "loss": 1.6653, "step": 270 }, { "epoch": 0.5362700502753173, "grad_norm": 0.2939159870147705, "learning_rate": 0.00011200000000000001, "loss": 1.6383, "step": 280 }, { "epoch": 0.5554225520708642, "grad_norm": 0.2954121530056, "learning_rate": 0.000116, "loss": 1.6316, "step": 290 }, { "epoch": 0.5745750538664113, "grad_norm": 0.28846076130867004, "learning_rate": 0.00012, "loss": 1.611, "step": 300 }, { "epoch": 0.5745750538664113, "eval_loss": 1.6042068004608154, "eval_runtime": 54.3935, "eval_samples_per_second": 5.515, "eval_steps_per_second": 0.699, "step": 300 }, { "epoch": 0.5937275556619583, "grad_norm": 0.2778375744819641, "learning_rate": 0.000124, "loss": 1.5971, "step": 310 }, { "epoch": 0.6128800574575054, "grad_norm": 0.25998637080192566, "learning_rate": 0.00012800000000000002, "loss": 1.5911, "step": 320 }, { "epoch": 0.6320325592530525, "grad_norm": 0.2551552951335907, "learning_rate": 0.000132, "loss": 1.581, "step": 330 }, { "epoch": 0.6511850610485995, "grad_norm": 0.27175173163414, "learning_rate": 0.00013600000000000003, "loss": 1.5584, "step": 340 }, { "epoch": 0.6703375628441465, "grad_norm": 0.2791101932525635, "learning_rate": 0.00014, "loss": 1.549, "step": 350 }, { "epoch": 0.6894900646396935, "grad_norm": 0.2629423439502716, "learning_rate": 0.000144, "loss": 1.5311, "step": 360 }, { "epoch": 0.7086425664352406, "grad_norm": 0.25314760208129883, "learning_rate": 0.000148, "loss": 1.5318, "step": 370 }, { "epoch": 0.7277950682307877, "grad_norm": 0.264098584651947, "learning_rate": 0.000152, "loss": 1.5345, "step": 380 }, { "epoch": 0.7469475700263347, "grad_norm": 0.2715175747871399, "learning_rate": 0.00015600000000000002, "loss": 1.5119, "step": 390 }, { "epoch": 0.7661000718218818, "grad_norm": 0.27246925234794617, "learning_rate": 0.00016, "loss": 1.5051, "step": 400 }, { "epoch": 0.7661000718218818, "eval_loss": 1.5016202926635742, "eval_runtime": 54.4497, "eval_samples_per_second": 5.51, "eval_steps_per_second": 0.698, "step": 400 }, { "epoch": 0.7852525736174287, "grad_norm": 0.30923280119895935, "learning_rate": 0.000164, "loss": 1.4888, "step": 410 }, { "epoch": 0.8044050754129758, "grad_norm": 0.27323517203330994, "learning_rate": 0.000168, "loss": 1.4983, "step": 420 }, { "epoch": 0.8235575772085229, "grad_norm": 0.2511342167854309, "learning_rate": 0.000172, "loss": 1.4961, "step": 430 }, { "epoch": 0.8427100790040699, "grad_norm": 0.2792860269546509, "learning_rate": 0.00017600000000000002, "loss": 1.4746, "step": 440 }, { "epoch": 0.861862580799617, "grad_norm": 0.2971252202987671, "learning_rate": 0.00018, "loss": 1.492, "step": 450 }, { "epoch": 0.881015082595164, "grad_norm": 0.2936541736125946, "learning_rate": 0.00018400000000000003, "loss": 1.4793, "step": 460 }, { "epoch": 0.900167584390711, "grad_norm": 0.29754728078842163, "learning_rate": 0.000188, "loss": 1.4743, "step": 470 }, { "epoch": 0.919320086186258, "grad_norm": 0.26322704553604126, "learning_rate": 0.000192, "loss": 1.468, "step": 480 }, { "epoch": 0.9384725879818051, "grad_norm": 0.25653937458992004, "learning_rate": 0.000196, "loss": 1.4586, "step": 490 }, { "epoch": 0.9576250897773522, "grad_norm": 0.29916030168533325, "learning_rate": 0.0002, "loss": 1.4695, "step": 500 }, { "epoch": 0.9576250897773522, "eval_loss": 1.4585037231445312, "eval_runtime": 54.3714, "eval_samples_per_second": 5.518, "eval_steps_per_second": 0.699, "step": 500 }, { "epoch": 0.9767775915728992, "grad_norm": 0.3299102783203125, "learning_rate": 0.0001990521327014218, "loss": 1.4606, "step": 510 }, { "epoch": 0.9959300933684463, "grad_norm": 0.2592889368534088, "learning_rate": 0.0001981042654028436, "loss": 1.4456, "step": 520 }, { "epoch": 1.0150825951639932, "grad_norm": 0.2642243504524231, "learning_rate": 0.0001971563981042654, "loss": 1.468, "step": 530 }, { "epoch": 1.0342350969595404, "grad_norm": 0.3317527174949646, "learning_rate": 0.0001962085308056872, "loss": 1.4533, "step": 540 }, { "epoch": 1.0533875987550874, "grad_norm": 0.2916889786720276, "learning_rate": 0.000195260663507109, "loss": 1.4585, "step": 550 }, { "epoch": 1.0725401005506345, "grad_norm": 0.29687121510505676, "learning_rate": 0.00019431279620853083, "loss": 1.4591, "step": 560 }, { "epoch": 1.0916926023461815, "grad_norm": 0.2740378975868225, "learning_rate": 0.0001933649289099526, "loss": 1.4539, "step": 570 }, { "epoch": 1.1108451041417284, "grad_norm": 0.26880577206611633, "learning_rate": 0.00019241706161137443, "loss": 1.4382, "step": 580 }, { "epoch": 1.1299976059372756, "grad_norm": 0.3242061734199524, "learning_rate": 0.0001914691943127962, "loss": 1.4503, "step": 590 }, { "epoch": 1.1491501077328226, "grad_norm": 0.25346624851226807, "learning_rate": 0.000190521327014218, "loss": 1.4413, "step": 600 }, { "epoch": 1.1491501077328226, "eval_loss": 1.4430006742477417, "eval_runtime": 54.3849, "eval_samples_per_second": 5.516, "eval_steps_per_second": 0.699, "step": 600 }, { "epoch": 1.1683026095283697, "grad_norm": 0.31677117943763733, "learning_rate": 0.00018957345971563983, "loss": 1.4451, "step": 610 }, { "epoch": 1.1874551113239167, "grad_norm": 0.2845339775085449, "learning_rate": 0.0001886255924170616, "loss": 1.4517, "step": 620 }, { "epoch": 1.2066076131194636, "grad_norm": 0.26891258358955383, "learning_rate": 0.00018767772511848343, "loss": 1.4477, "step": 630 }, { "epoch": 1.2257601149150108, "grad_norm": 0.28971320390701294, "learning_rate": 0.00018672985781990523, "loss": 1.4356, "step": 640 }, { "epoch": 1.2449126167105578, "grad_norm": 0.2949787378311157, "learning_rate": 0.00018578199052132703, "loss": 1.4416, "step": 650 }, { "epoch": 1.264065118506105, "grad_norm": 0.29537567496299744, "learning_rate": 0.00018483412322274883, "loss": 1.4456, "step": 660 }, { "epoch": 1.2832176203016519, "grad_norm": 0.35687655210494995, "learning_rate": 0.0001838862559241706, "loss": 1.4359, "step": 670 }, { "epoch": 1.3023701220971988, "grad_norm": 0.27780023217201233, "learning_rate": 0.00018293838862559243, "loss": 1.4277, "step": 680 }, { "epoch": 1.321522623892746, "grad_norm": 0.2748234272003174, "learning_rate": 0.00018199052132701423, "loss": 1.4395, "step": 690 }, { "epoch": 1.3406751256882932, "grad_norm": 0.27264848351478577, "learning_rate": 0.00018104265402843603, "loss": 1.444, "step": 700 }, { "epoch": 1.3406751256882932, "eval_loss": 1.4343491792678833, "eval_runtime": 54.4906, "eval_samples_per_second": 5.506, "eval_steps_per_second": 0.697, "step": 700 }, { "epoch": 1.3598276274838401, "grad_norm": 0.2756544351577759, "learning_rate": 0.00018009478672985783, "loss": 1.4513, "step": 710 }, { "epoch": 1.378980129279387, "grad_norm": 0.27677226066589355, "learning_rate": 0.00017914691943127963, "loss": 1.439, "step": 720 }, { "epoch": 1.3981326310749342, "grad_norm": 0.2960895895957947, "learning_rate": 0.00017819905213270143, "loss": 1.4193, "step": 730 }, { "epoch": 1.4172851328704812, "grad_norm": 0.2897762954235077, "learning_rate": 0.00017725118483412323, "loss": 1.436, "step": 740 }, { "epoch": 1.4364376346660284, "grad_norm": 0.25498512387275696, "learning_rate": 0.00017630331753554503, "loss": 1.4442, "step": 750 }, { "epoch": 1.4555901364615753, "grad_norm": 0.2929374575614929, "learning_rate": 0.00017535545023696683, "loss": 1.4312, "step": 760 }, { "epoch": 1.4747426382571223, "grad_norm": 0.24873781204223633, "learning_rate": 0.00017440758293838863, "loss": 1.4382, "step": 770 }, { "epoch": 1.4938951400526694, "grad_norm": 0.24335262179374695, "learning_rate": 0.00017345971563981043, "loss": 1.435, "step": 780 }, { "epoch": 1.5130476418482164, "grad_norm": 0.26835963129997253, "learning_rate": 0.00017251184834123225, "loss": 1.4363, "step": 790 }, { "epoch": 1.5322001436437636, "grad_norm": 0.26550954580307007, "learning_rate": 0.00017156398104265403, "loss": 1.442, "step": 800 }, { "epoch": 1.5322001436437636, "eval_loss": 1.428343415260315, "eval_runtime": 54.3845, "eval_samples_per_second": 5.516, "eval_steps_per_second": 0.699, "step": 800 }, { "epoch": 1.5513526454393105, "grad_norm": 0.271453320980072, "learning_rate": 0.00017061611374407585, "loss": 1.428, "step": 810 }, { "epoch": 1.5705051472348575, "grad_norm": 0.26026639342308044, "learning_rate": 0.00016966824644549762, "loss": 1.4298, "step": 820 }, { "epoch": 1.5896576490304046, "grad_norm": 0.24006444215774536, "learning_rate": 0.00016872037914691945, "loss": 1.425, "step": 830 }, { "epoch": 1.6088101508259518, "grad_norm": 0.27471840381622314, "learning_rate": 0.00016777251184834125, "loss": 1.4245, "step": 840 }, { "epoch": 1.6279626526214988, "grad_norm": 0.2577083110809326, "learning_rate": 0.00016682464454976302, "loss": 1.4183, "step": 850 }, { "epoch": 1.6471151544170457, "grad_norm": 0.2574625313282013, "learning_rate": 0.00016587677725118485, "loss": 1.427, "step": 860 }, { "epoch": 1.6662676562125927, "grad_norm": 0.24562470614910126, "learning_rate": 0.00016492890995260665, "loss": 1.4287, "step": 870 }, { "epoch": 1.6854201580081398, "grad_norm": 0.2509622871875763, "learning_rate": 0.00016398104265402845, "loss": 1.4364, "step": 880 }, { "epoch": 1.704572659803687, "grad_norm": 0.25581684708595276, "learning_rate": 0.00016303317535545025, "loss": 1.4359, "step": 890 }, { "epoch": 1.723725161599234, "grad_norm": 0.2766726315021515, "learning_rate": 0.00016208530805687205, "loss": 1.4183, "step": 900 }, { "epoch": 1.723725161599234, "eval_loss": 1.424310564994812, "eval_runtime": 54.5403, "eval_samples_per_second": 5.501, "eval_steps_per_second": 0.697, "step": 900 }, { "epoch": 1.742877663394781, "grad_norm": 0.24907958507537842, "learning_rate": 0.00016113744075829385, "loss": 1.4306, "step": 910 }, { "epoch": 1.7620301651903278, "grad_norm": 0.25425609946250916, "learning_rate": 0.00016018957345971565, "loss": 1.4208, "step": 920 }, { "epoch": 1.781182666985875, "grad_norm": 0.23228569328784943, "learning_rate": 0.00015924170616113745, "loss": 1.4271, "step": 930 }, { "epoch": 1.8003351687814222, "grad_norm": 0.24616825580596924, "learning_rate": 0.00015829383886255925, "loss": 1.4304, "step": 940 }, { "epoch": 1.8194876705769691, "grad_norm": 0.25094714760780334, "learning_rate": 0.00015734597156398105, "loss": 1.4248, "step": 950 }, { "epoch": 1.838640172372516, "grad_norm": 0.2325587123632431, "learning_rate": 0.00015639810426540285, "loss": 1.4302, "step": 960 }, { "epoch": 1.857792674168063, "grad_norm": 0.27836284041404724, "learning_rate": 0.00015545023696682465, "loss": 1.4281, "step": 970 }, { "epoch": 1.8769451759636102, "grad_norm": 0.25468766689300537, "learning_rate": 0.00015450236966824645, "loss": 1.4224, "step": 980 }, { "epoch": 1.8960976777591574, "grad_norm": 0.25568103790283203, "learning_rate": 0.00015355450236966827, "loss": 1.4331, "step": 990 }, { "epoch": 1.9152501795547043, "grad_norm": 0.2426796555519104, "learning_rate": 0.00015260663507109004, "loss": 1.4116, "step": 1000 }, { "epoch": 1.9152501795547043, "eval_loss": 1.4207836389541626, "eval_runtime": 54.4341, "eval_samples_per_second": 5.511, "eval_steps_per_second": 0.698, "step": 1000 }, { "epoch": 1.9344026813502513, "grad_norm": 0.25107625126838684, "learning_rate": 0.00015165876777251184, "loss": 1.4315, "step": 1010 }, { "epoch": 1.9535551831457985, "grad_norm": 0.25296613574028015, "learning_rate": 0.00015071090047393367, "loss": 1.4285, "step": 1020 }, { "epoch": 1.9727076849413456, "grad_norm": 0.23212146759033203, "learning_rate": 0.00014976303317535544, "loss": 1.4146, "step": 1030 }, { "epoch": 1.9918601867368926, "grad_norm": 0.2573173940181732, "learning_rate": 0.00014881516587677727, "loss": 1.4264, "step": 1040 }, { "epoch": 2.0110126885324395, "grad_norm": 0.2510049045085907, "learning_rate": 0.00014786729857819904, "loss": 1.4338, "step": 1050 }, { "epoch": 2.0301651903279865, "grad_norm": 0.24541810154914856, "learning_rate": 0.00014691943127962087, "loss": 1.4249, "step": 1060 }, { "epoch": 2.0493176921235334, "grad_norm": 0.2721598744392395, "learning_rate": 0.00014597156398104267, "loss": 1.4225, "step": 1070 }, { "epoch": 2.068470193919081, "grad_norm": 0.2560027539730072, "learning_rate": 0.00014502369668246447, "loss": 1.4365, "step": 1080 }, { "epoch": 2.0876226957146278, "grad_norm": 0.26836445927619934, "learning_rate": 0.00014407582938388627, "loss": 1.4231, "step": 1090 }, { "epoch": 2.1067751975101747, "grad_norm": 0.27337607741355896, "learning_rate": 0.00014312796208530804, "loss": 1.4378, "step": 1100 }, { "epoch": 2.1067751975101747, "eval_loss": 1.4182133674621582, "eval_runtime": 54.3304, "eval_samples_per_second": 5.522, "eval_steps_per_second": 0.699, "step": 1100 }, { "epoch": 2.1259276993057217, "grad_norm": 0.24439986050128937, "learning_rate": 0.00014218009478672987, "loss": 1.4219, "step": 1110 }, { "epoch": 2.145080201101269, "grad_norm": 0.2794075906276703, "learning_rate": 0.00014123222748815167, "loss": 1.4271, "step": 1120 }, { "epoch": 2.164232702896816, "grad_norm": 0.2690628468990326, "learning_rate": 0.00014028436018957347, "loss": 1.4199, "step": 1130 }, { "epoch": 2.183385204692363, "grad_norm": 0.23879903554916382, "learning_rate": 0.00013933649289099527, "loss": 1.4245, "step": 1140 }, { "epoch": 2.20253770648791, "grad_norm": 0.24499297142028809, "learning_rate": 0.00013838862559241707, "loss": 1.4211, "step": 1150 }, { "epoch": 2.221690208283457, "grad_norm": 0.2482273280620575, "learning_rate": 0.00013744075829383887, "loss": 1.4107, "step": 1160 }, { "epoch": 2.2408427100790043, "grad_norm": 0.26195070147514343, "learning_rate": 0.0001364928909952607, "loss": 1.4177, "step": 1170 }, { "epoch": 2.259995211874551, "grad_norm": 0.2543138265609741, "learning_rate": 0.00013554502369668246, "loss": 1.4218, "step": 1180 }, { "epoch": 2.279147713670098, "grad_norm": 0.2674494683742523, "learning_rate": 0.00013459715639810426, "loss": 1.4267, "step": 1190 }, { "epoch": 2.298300215465645, "grad_norm": 0.24894660711288452, "learning_rate": 0.00013364928909952606, "loss": 1.4192, "step": 1200 }, { "epoch": 2.298300215465645, "eval_loss": 1.4160188436508179, "eval_runtime": 54.5297, "eval_samples_per_second": 5.502, "eval_steps_per_second": 0.697, "step": 1200 }, { "epoch": 2.317452717261192, "grad_norm": 0.24203181266784668, "learning_rate": 0.00013270142180094786, "loss": 1.4217, "step": 1210 }, { "epoch": 2.3366052190567395, "grad_norm": 0.24441660940647125, "learning_rate": 0.0001317535545023697, "loss": 1.4228, "step": 1220 }, { "epoch": 2.3557577208522864, "grad_norm": 0.2262783795595169, "learning_rate": 0.00013080568720379146, "loss": 1.4307, "step": 1230 }, { "epoch": 2.3749102226478334, "grad_norm": 0.23150447010993958, "learning_rate": 0.0001298578199052133, "loss": 1.4279, "step": 1240 }, { "epoch": 2.3940627244433803, "grad_norm": 0.24665914475917816, "learning_rate": 0.0001289099526066351, "loss": 1.4192, "step": 1250 }, { "epoch": 2.4132152262389273, "grad_norm": 0.22649230062961578, "learning_rate": 0.00012796208530805686, "loss": 1.4223, "step": 1260 }, { "epoch": 2.4323677280344747, "grad_norm": 0.2241869419813156, "learning_rate": 0.0001270142180094787, "loss": 1.4188, "step": 1270 }, { "epoch": 2.4515202298300216, "grad_norm": 0.23246009647846222, "learning_rate": 0.00012606635071090046, "loss": 1.4223, "step": 1280 }, { "epoch": 2.4706727316255686, "grad_norm": 0.22931689023971558, "learning_rate": 0.0001251184834123223, "loss": 1.4152, "step": 1290 }, { "epoch": 2.4898252334211155, "grad_norm": 0.23769351840019226, "learning_rate": 0.0001241706161137441, "loss": 1.4243, "step": 1300 }, { "epoch": 2.4898252334211155, "eval_loss": 1.4141488075256348, "eval_runtime": 54.3608, "eval_samples_per_second": 5.519, "eval_steps_per_second": 0.699, "step": 1300 }, { "epoch": 2.5089777352166625, "grad_norm": 0.2524212598800659, "learning_rate": 0.0001232227488151659, "loss": 1.4099, "step": 1310 }, { "epoch": 2.52813023701221, "grad_norm": 0.25845396518707275, "learning_rate": 0.0001222748815165877, "loss": 1.4137, "step": 1320 }, { "epoch": 2.547282738807757, "grad_norm": 0.22692696750164032, "learning_rate": 0.0001213270142180095, "loss": 1.4133, "step": 1330 }, { "epoch": 2.5664352406033037, "grad_norm": 0.2297840416431427, "learning_rate": 0.00012037914691943129, "loss": 1.417, "step": 1340 }, { "epoch": 2.5855877423988507, "grad_norm": 0.2325245440006256, "learning_rate": 0.00011943127962085307, "loss": 1.4249, "step": 1350 }, { "epoch": 2.6047402441943976, "grad_norm": 0.22750811278820038, "learning_rate": 0.00011848341232227489, "loss": 1.4105, "step": 1360 }, { "epoch": 2.623892745989945, "grad_norm": 0.23942901194095612, "learning_rate": 0.00011753554502369668, "loss": 1.4162, "step": 1370 }, { "epoch": 2.643045247785492, "grad_norm": 0.24744774401187897, "learning_rate": 0.0001165876777251185, "loss": 1.4193, "step": 1380 }, { "epoch": 2.662197749581039, "grad_norm": 0.22848005592823029, "learning_rate": 0.00011563981042654028, "loss": 1.4144, "step": 1390 }, { "epoch": 2.6813502513765863, "grad_norm": 0.23857158422470093, "learning_rate": 0.0001146919431279621, "loss": 1.421, "step": 1400 }, { "epoch": 2.6813502513765863, "eval_loss": 1.4123934507369995, "eval_runtime": 54.5068, "eval_samples_per_second": 5.504, "eval_steps_per_second": 0.697, "step": 1400 }, { "epoch": 2.700502753172133, "grad_norm": 0.23743149638175964, "learning_rate": 0.00011374407582938388, "loss": 1.4173, "step": 1410 }, { "epoch": 2.7196552549676802, "grad_norm": 0.2245146483182907, "learning_rate": 0.0001127962085308057, "loss": 1.4111, "step": 1420 }, { "epoch": 2.738807756763227, "grad_norm": 0.2418992519378662, "learning_rate": 0.0001118483412322275, "loss": 1.4199, "step": 1430 }, { "epoch": 2.757960258558774, "grad_norm": 0.22437739372253418, "learning_rate": 0.00011090047393364928, "loss": 1.421, "step": 1440 }, { "epoch": 2.7771127603543215, "grad_norm": 0.24725133180618286, "learning_rate": 0.0001099526066350711, "loss": 1.3966, "step": 1450 }, { "epoch": 2.7962652621498685, "grad_norm": 0.22570419311523438, "learning_rate": 0.0001090047393364929, "loss": 1.4032, "step": 1460 }, { "epoch": 2.8154177639454154, "grad_norm": 0.23828823864459991, "learning_rate": 0.00010805687203791471, "loss": 1.4154, "step": 1470 }, { "epoch": 2.8345702657409624, "grad_norm": 0.2168547809123993, "learning_rate": 0.0001071090047393365, "loss": 1.4138, "step": 1480 }, { "epoch": 2.8537227675365093, "grad_norm": 0.26562485098838806, "learning_rate": 0.00010616113744075831, "loss": 1.4048, "step": 1490 }, { "epoch": 2.8728752693320567, "grad_norm": 0.23477376997470856, "learning_rate": 0.0001052132701421801, "loss": 1.3943, "step": 1500 }, { "epoch": 2.8728752693320567, "eval_loss": 1.4109047651290894, "eval_runtime": 54.4664, "eval_samples_per_second": 5.508, "eval_steps_per_second": 0.698, "step": 1500 } ], "logging_steps": 10, "max_steps": 2610, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.944124994125824e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }