{ "best_metric": 1.4095008373260498, "best_model_checkpoint": "./results/checkpoint-2000", "epoch": 1.9038553069966682, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00951927653498334, "grad_norm": 0.19429340958595276, "learning_rate": 5e-06, "loss": 1.9923, "step": 10 }, { "epoch": 0.01903855306996668, "grad_norm": 0.22915661334991455, "learning_rate": 1e-05, "loss": 2.0345, "step": 20 }, { "epoch": 0.028557829604950024, "grad_norm": 0.24586208164691925, "learning_rate": 1.5e-05, "loss": 1.9754, "step": 30 }, { "epoch": 0.03807710613993336, "grad_norm": 0.2809593677520752, "learning_rate": 2e-05, "loss": 1.9887, "step": 40 }, { "epoch": 0.047596382674916705, "grad_norm": 0.3456374406814575, "learning_rate": 2.5e-05, "loss": 1.9913, "step": 50 }, { "epoch": 0.05711565920990005, "grad_norm": 0.43767085671424866, "learning_rate": 3e-05, "loss": 1.9953, "step": 60 }, { "epoch": 0.06663493574488338, "grad_norm": 0.5941727757453918, "learning_rate": 3.5e-05, "loss": 1.9795, "step": 70 }, { "epoch": 0.07615421227986673, "grad_norm": 0.6205775141716003, "learning_rate": 4e-05, "loss": 1.9804, "step": 80 }, { "epoch": 0.08567348881485007, "grad_norm": 0.7349818348884583, "learning_rate": 4.5e-05, "loss": 1.9511, "step": 90 }, { "epoch": 0.09519276534983341, "grad_norm": 0.7896203398704529, "learning_rate": 5e-05, "loss": 1.9337, "step": 100 }, { "epoch": 0.09519276534983341, "eval_loss": 1.9389821290969849, "eval_runtime": 39.6995, "eval_samples_per_second": 2.519, "eval_steps_per_second": 0.63, "step": 100 }, { "epoch": 0.10471204188481675, "grad_norm": 0.8368595242500305, "learning_rate": 5.500000000000001e-05, "loss": 1.9052, "step": 110 }, { "epoch": 0.1142313184198001, "grad_norm": 0.9555522799491882, "learning_rate": 6e-05, "loss": 1.8766, "step": 120 }, { "epoch": 0.12375059495478344, "grad_norm": 0.9343065023422241, "learning_rate": 6.500000000000001e-05, "loss": 1.8416, "step": 130 }, { "epoch": 0.13326987148976677, "grad_norm": 0.82816481590271, "learning_rate": 7e-05, "loss": 1.8246, "step": 140 }, { "epoch": 0.14278914802475012, "grad_norm": 0.7882226705551147, "learning_rate": 7.500000000000001e-05, "loss": 1.7909, "step": 150 }, { "epoch": 0.15230842455973345, "grad_norm": 0.6617611646652222, "learning_rate": 8e-05, "loss": 1.7938, "step": 160 }, { "epoch": 0.1618277010947168, "grad_norm": 0.598513126373291, "learning_rate": 8.5e-05, "loss": 1.759, "step": 170 }, { "epoch": 0.17134697762970014, "grad_norm": 0.45931604504585266, "learning_rate": 9e-05, "loss": 1.7542, "step": 180 }, { "epoch": 0.1808662541646835, "grad_norm": 0.38422173261642456, "learning_rate": 9.5e-05, "loss": 1.7525, "step": 190 }, { "epoch": 0.19038553069966682, "grad_norm": 0.31998202204704285, "learning_rate": 0.0001, "loss": 1.732, "step": 200 }, { "epoch": 0.19038553069966682, "eval_loss": 1.7440533638000488, "eval_runtime": 39.6462, "eval_samples_per_second": 2.522, "eval_steps_per_second": 0.631, "step": 200 }, { "epoch": 0.19990480723465018, "grad_norm": 0.3325157165527344, "learning_rate": 0.000105, "loss": 1.7215, "step": 210 }, { "epoch": 0.2094240837696335, "grad_norm": 0.31313714385032654, "learning_rate": 0.00011000000000000002, "loss": 1.7033, "step": 220 }, { "epoch": 0.21894336030461684, "grad_norm": 0.3179369270801544, "learning_rate": 0.00011499999999999999, "loss": 1.7025, "step": 230 }, { "epoch": 0.2284626368396002, "grad_norm": 0.3547224700450897, "learning_rate": 0.00012, "loss": 1.6833, "step": 240 }, { "epoch": 0.23798191337458352, "grad_norm": 0.3367106020450592, "learning_rate": 0.000125, "loss": 1.6524, "step": 250 }, { "epoch": 0.24750118990956688, "grad_norm": 0.38071829080581665, "learning_rate": 0.00013000000000000002, "loss": 1.6425, "step": 260 }, { "epoch": 0.25702046644455023, "grad_norm": 0.34949544072151184, "learning_rate": 0.00013500000000000003, "loss": 1.6218, "step": 270 }, { "epoch": 0.26653974297953353, "grad_norm": 0.32234707474708557, "learning_rate": 0.00014, "loss": 1.6284, "step": 280 }, { "epoch": 0.2760590195145169, "grad_norm": 0.3492746949195862, "learning_rate": 0.000145, "loss": 1.6053, "step": 290 }, { "epoch": 0.28557829604950025, "grad_norm": 0.3380492031574249, "learning_rate": 0.00015000000000000001, "loss": 1.5728, "step": 300 }, { "epoch": 0.28557829604950025, "eval_loss": 1.5842629671096802, "eval_runtime": 39.8763, "eval_samples_per_second": 2.508, "eval_steps_per_second": 0.627, "step": 300 }, { "epoch": 0.2950975725844836, "grad_norm": 0.3693602383136749, "learning_rate": 0.000155, "loss": 1.5769, "step": 310 }, { "epoch": 0.3046168491194669, "grad_norm": 0.3339674174785614, "learning_rate": 0.00016, "loss": 1.5691, "step": 320 }, { "epoch": 0.31413612565445026, "grad_norm": 0.33094632625579834, "learning_rate": 0.000165, "loss": 1.5501, "step": 330 }, { "epoch": 0.3236554021894336, "grad_norm": 0.3607189953327179, "learning_rate": 0.00017, "loss": 1.5373, "step": 340 }, { "epoch": 0.3331746787244169, "grad_norm": 0.34884127974510193, "learning_rate": 0.000175, "loss": 1.5084, "step": 350 }, { "epoch": 0.3426939552594003, "grad_norm": 0.33757245540618896, "learning_rate": 0.00018, "loss": 1.5158, "step": 360 }, { "epoch": 0.35221323179438363, "grad_norm": 0.34877315163612366, "learning_rate": 0.00018500000000000002, "loss": 1.5002, "step": 370 }, { "epoch": 0.361732508329367, "grad_norm": 0.38293707370758057, "learning_rate": 0.00019, "loss": 1.4845, "step": 380 }, { "epoch": 0.3712517848643503, "grad_norm": 0.3441324830055237, "learning_rate": 0.000195, "loss": 1.4849, "step": 390 }, { "epoch": 0.38077106139933364, "grad_norm": 0.41154617071151733, "learning_rate": 0.0002, "loss": 1.4862, "step": 400 }, { "epoch": 0.38077106139933364, "eval_loss": 1.4930579662322998, "eval_runtime": 39.6754, "eval_samples_per_second": 2.52, "eval_steps_per_second": 0.63, "step": 400 }, { "epoch": 0.390290337934317, "grad_norm": 0.36156630516052246, "learning_rate": 0.0001995876288659794, "loss": 1.4872, "step": 410 }, { "epoch": 0.39980961446930036, "grad_norm": 0.32177475094795227, "learning_rate": 0.00019917525773195877, "loss": 1.457, "step": 420 }, { "epoch": 0.40932889100428366, "grad_norm": 0.41120991110801697, "learning_rate": 0.00019876288659793816, "loss": 1.4794, "step": 430 }, { "epoch": 0.418848167539267, "grad_norm": 0.3899654448032379, "learning_rate": 0.00019835051546391753, "loss": 1.4728, "step": 440 }, { "epoch": 0.42836744407425037, "grad_norm": 0.3392334580421448, "learning_rate": 0.00019793814432989693, "loss": 1.4837, "step": 450 }, { "epoch": 0.43788672060923367, "grad_norm": 0.35381370782852173, "learning_rate": 0.00019752577319587632, "loss": 1.4722, "step": 460 }, { "epoch": 0.447405997144217, "grad_norm": 0.3540886342525482, "learning_rate": 0.0001971134020618557, "loss": 1.4608, "step": 470 }, { "epoch": 0.4569252736792004, "grad_norm": 0.38124117255210876, "learning_rate": 0.00019670103092783505, "loss": 1.4648, "step": 480 }, { "epoch": 0.46644455021418374, "grad_norm": 0.34540703892707825, "learning_rate": 0.00019628865979381442, "loss": 1.4651, "step": 490 }, { "epoch": 0.47596382674916704, "grad_norm": 0.34259673953056335, "learning_rate": 0.00019587628865979381, "loss": 1.442, "step": 500 }, { "epoch": 0.47596382674916704, "eval_loss": 1.4635019302368164, "eval_runtime": 39.7119, "eval_samples_per_second": 2.518, "eval_steps_per_second": 0.63, "step": 500 }, { "epoch": 0.4854831032841504, "grad_norm": 0.39079272747039795, "learning_rate": 0.0001954639175257732, "loss": 1.4676, "step": 510 }, { "epoch": 0.49500237981913375, "grad_norm": 0.3873017132282257, "learning_rate": 0.00019505154639175258, "loss": 1.4491, "step": 520 }, { "epoch": 0.504521656354117, "grad_norm": 0.46681663393974304, "learning_rate": 0.00019463917525773197, "loss": 1.4483, "step": 530 }, { "epoch": 0.5140409328891005, "grad_norm": 0.3657790720462799, "learning_rate": 0.00019422680412371134, "loss": 1.4402, "step": 540 }, { "epoch": 0.5235602094240838, "grad_norm": 0.3777405321598053, "learning_rate": 0.00019381443298969073, "loss": 1.4295, "step": 550 }, { "epoch": 0.5330794859590671, "grad_norm": 0.3857463002204895, "learning_rate": 0.00019340206185567012, "loss": 1.4548, "step": 560 }, { "epoch": 0.5425987624940505, "grad_norm": 0.36316171288490295, "learning_rate": 0.0001929896907216495, "loss": 1.4567, "step": 570 }, { "epoch": 0.5521180390290338, "grad_norm": 0.37083303928375244, "learning_rate": 0.00019257731958762889, "loss": 1.4458, "step": 580 }, { "epoch": 0.5616373155640171, "grad_norm": 0.39506301283836365, "learning_rate": 0.00019216494845360825, "loss": 1.4364, "step": 590 }, { "epoch": 0.5711565920990005, "grad_norm": 0.3553004264831543, "learning_rate": 0.00019175257731958765, "loss": 1.4676, "step": 600 }, { "epoch": 0.5711565920990005, "eval_loss": 1.449947714805603, "eval_runtime": 39.6196, "eval_samples_per_second": 2.524, "eval_steps_per_second": 0.631, "step": 600 }, { "epoch": 0.5806758686339838, "grad_norm": 0.4231698513031006, "learning_rate": 0.00019134020618556704, "loss": 1.4348, "step": 610 }, { "epoch": 0.5901951451689672, "grad_norm": 0.4203357994556427, "learning_rate": 0.0001909278350515464, "loss": 1.4455, "step": 620 }, { "epoch": 0.5997144217039505, "grad_norm": 0.39158034324645996, "learning_rate": 0.00019051546391752577, "loss": 1.4594, "step": 630 }, { "epoch": 0.6092336982389338, "grad_norm": 0.39505264163017273, "learning_rate": 0.00019010309278350514, "loss": 1.4653, "step": 640 }, { "epoch": 0.6187529747739172, "grad_norm": 0.3803844153881073, "learning_rate": 0.00018969072164948454, "loss": 1.4335, "step": 650 }, { "epoch": 0.6282722513089005, "grad_norm": 0.3700083792209625, "learning_rate": 0.00018927835051546393, "loss": 1.4409, "step": 660 }, { "epoch": 0.6377915278438838, "grad_norm": 0.35323163866996765, "learning_rate": 0.0001888659793814433, "loss": 1.4456, "step": 670 }, { "epoch": 0.6473108043788672, "grad_norm": 0.34639179706573486, "learning_rate": 0.0001884536082474227, "loss": 1.441, "step": 680 }, { "epoch": 0.6568300809138505, "grad_norm": 0.39847052097320557, "learning_rate": 0.00018804123711340206, "loss": 1.4214, "step": 690 }, { "epoch": 0.6663493574488338, "grad_norm": 0.3468664884567261, "learning_rate": 0.00018762886597938145, "loss": 1.4398, "step": 700 }, { "epoch": 0.6663493574488338, "eval_loss": 1.4422180652618408, "eval_runtime": 39.7751, "eval_samples_per_second": 2.514, "eval_steps_per_second": 0.629, "step": 700 }, { "epoch": 0.6758686339838172, "grad_norm": 0.35168835520744324, "learning_rate": 0.00018721649484536085, "loss": 1.436, "step": 710 }, { "epoch": 0.6853879105188005, "grad_norm": 0.3769698143005371, "learning_rate": 0.0001868041237113402, "loss": 1.4424, "step": 720 }, { "epoch": 0.694907187053784, "grad_norm": 0.4488052725791931, "learning_rate": 0.0001863917525773196, "loss": 1.4509, "step": 730 }, { "epoch": 0.7044264635887673, "grad_norm": 0.34030118584632874, "learning_rate": 0.00018597938144329897, "loss": 1.4355, "step": 740 }, { "epoch": 0.7139457401237506, "grad_norm": 0.3737122714519501, "learning_rate": 0.00018556701030927837, "loss": 1.4336, "step": 750 }, { "epoch": 0.723465016658734, "grad_norm": 0.36228156089782715, "learning_rate": 0.00018515463917525776, "loss": 1.4371, "step": 760 }, { "epoch": 0.7329842931937173, "grad_norm": 0.37088775634765625, "learning_rate": 0.00018474226804123713, "loss": 1.4445, "step": 770 }, { "epoch": 0.7425035697287006, "grad_norm": 0.39574727416038513, "learning_rate": 0.0001843298969072165, "loss": 1.4252, "step": 780 }, { "epoch": 0.752022846263684, "grad_norm": 0.41419413685798645, "learning_rate": 0.00018391752577319586, "loss": 1.4173, "step": 790 }, { "epoch": 0.7615421227986673, "grad_norm": 0.3290116786956787, "learning_rate": 0.00018350515463917526, "loss": 1.4544, "step": 800 }, { "epoch": 0.7615421227986673, "eval_loss": 1.436285138130188, "eval_runtime": 39.7262, "eval_samples_per_second": 2.517, "eval_steps_per_second": 0.629, "step": 800 }, { "epoch": 0.7710613993336506, "grad_norm": 0.38003116846084595, "learning_rate": 0.00018309278350515465, "loss": 1.425, "step": 810 }, { "epoch": 0.780580675868634, "grad_norm": 0.3706168532371521, "learning_rate": 0.00018268041237113402, "loss": 1.4537, "step": 820 }, { "epoch": 0.7900999524036173, "grad_norm": 0.36038973927497864, "learning_rate": 0.0001822680412371134, "loss": 1.4188, "step": 830 }, { "epoch": 0.7996192289386007, "grad_norm": 0.36866557598114014, "learning_rate": 0.00018185567010309278, "loss": 1.4246, "step": 840 }, { "epoch": 0.809138505473584, "grad_norm": 0.37158384919166565, "learning_rate": 0.00018144329896907217, "loss": 1.4323, "step": 850 }, { "epoch": 0.8186577820085673, "grad_norm": 0.3713236153125763, "learning_rate": 0.00018103092783505157, "loss": 1.4407, "step": 860 }, { "epoch": 0.8281770585435507, "grad_norm": 0.3830552101135254, "learning_rate": 0.00018061855670103093, "loss": 1.4427, "step": 870 }, { "epoch": 0.837696335078534, "grad_norm": 0.3613452613353729, "learning_rate": 0.00018020618556701033, "loss": 1.445, "step": 880 }, { "epoch": 0.8472156116135173, "grad_norm": 0.3281649351119995, "learning_rate": 0.0001797938144329897, "loss": 1.4311, "step": 890 }, { "epoch": 0.8567348881485007, "grad_norm": 0.3342822194099426, "learning_rate": 0.0001793814432989691, "loss": 1.4455, "step": 900 }, { "epoch": 0.8567348881485007, "eval_loss": 1.4309983253479004, "eval_runtime": 39.6671, "eval_samples_per_second": 2.521, "eval_steps_per_second": 0.63, "step": 900 }, { "epoch": 0.866254164683484, "grad_norm": 0.3413979411125183, "learning_rate": 0.00017896907216494848, "loss": 1.4416, "step": 910 }, { "epoch": 0.8757734412184673, "grad_norm": 0.36158689856529236, "learning_rate": 0.00017855670103092785, "loss": 1.4328, "step": 920 }, { "epoch": 0.8852927177534508, "grad_norm": 0.3876676857471466, "learning_rate": 0.00017814432989690724, "loss": 1.4321, "step": 930 }, { "epoch": 0.894811994288434, "grad_norm": 0.35580044984817505, "learning_rate": 0.00017773195876288658, "loss": 1.4377, "step": 940 }, { "epoch": 0.9043312708234175, "grad_norm": 0.3638615906238556, "learning_rate": 0.00017731958762886598, "loss": 1.4222, "step": 950 }, { "epoch": 0.9138505473584008, "grad_norm": 0.32744646072387695, "learning_rate": 0.00017690721649484537, "loss": 1.4235, "step": 960 }, { "epoch": 0.9233698238933841, "grad_norm": 0.3513677716255188, "learning_rate": 0.00017649484536082474, "loss": 1.4511, "step": 970 }, { "epoch": 0.9328891004283675, "grad_norm": 0.35235506296157837, "learning_rate": 0.00017608247422680413, "loss": 1.4274, "step": 980 }, { "epoch": 0.9424083769633508, "grad_norm": 0.343514621257782, "learning_rate": 0.0001756701030927835, "loss": 1.417, "step": 990 }, { "epoch": 0.9519276534983341, "grad_norm": 0.38501816987991333, "learning_rate": 0.0001752577319587629, "loss": 1.4422, "step": 1000 }, { "epoch": 0.9519276534983341, "eval_loss": 1.4280465841293335, "eval_runtime": 39.8059, "eval_samples_per_second": 2.512, "eval_steps_per_second": 0.628, "step": 1000 }, { "epoch": 0.9614469300333175, "grad_norm": 0.3453533351421356, "learning_rate": 0.0001748453608247423, "loss": 1.4134, "step": 1010 }, { "epoch": 0.9709662065683008, "grad_norm": 0.3687838017940521, "learning_rate": 0.00017443298969072165, "loss": 1.4357, "step": 1020 }, { "epoch": 0.9804854831032842, "grad_norm": 0.3397800624370575, "learning_rate": 0.00017402061855670105, "loss": 1.4221, "step": 1030 }, { "epoch": 0.9900047596382675, "grad_norm": 0.3534553647041321, "learning_rate": 0.00017360824742268042, "loss": 1.4232, "step": 1040 }, { "epoch": 0.9995240361732508, "grad_norm": 0.31195327639579773, "learning_rate": 0.0001731958762886598, "loss": 1.4316, "step": 1050 }, { "epoch": 1.009043312708234, "grad_norm": 0.3326282799243927, "learning_rate": 0.0001727835051546392, "loss": 1.4361, "step": 1060 }, { "epoch": 1.0185625892432175, "grad_norm": 0.38880521059036255, "learning_rate": 0.00017237113402061857, "loss": 1.4282, "step": 1070 }, { "epoch": 1.028081865778201, "grad_norm": 0.3405047357082367, "learning_rate": 0.00017195876288659796, "loss": 1.4295, "step": 1080 }, { "epoch": 1.0376011423131841, "grad_norm": 0.3320964574813843, "learning_rate": 0.0001715463917525773, "loss": 1.426, "step": 1090 }, { "epoch": 1.0471204188481675, "grad_norm": 0.3246203660964966, "learning_rate": 0.0001711340206185567, "loss": 1.4115, "step": 1100 }, { "epoch": 1.0471204188481675, "eval_loss": 1.4249218702316284, "eval_runtime": 39.6935, "eval_samples_per_second": 2.519, "eval_steps_per_second": 0.63, "step": 1100 }, { "epoch": 1.056639695383151, "grad_norm": 0.33676016330718994, "learning_rate": 0.0001707216494845361, "loss": 1.4429, "step": 1110 }, { "epoch": 1.0661589719181341, "grad_norm": 0.304193913936615, "learning_rate": 0.00017030927835051546, "loss": 1.4145, "step": 1120 }, { "epoch": 1.0756782484531175, "grad_norm": 0.3132353723049164, "learning_rate": 0.00016989690721649485, "loss": 1.3993, "step": 1130 }, { "epoch": 1.085197524988101, "grad_norm": 0.3249862492084503, "learning_rate": 0.00016948453608247422, "loss": 1.4079, "step": 1140 }, { "epoch": 1.0947168015230841, "grad_norm": 0.31127285957336426, "learning_rate": 0.00016907216494845361, "loss": 1.4189, "step": 1150 }, { "epoch": 1.1042360780580676, "grad_norm": 0.3150855302810669, "learning_rate": 0.000168659793814433, "loss": 1.4211, "step": 1160 }, { "epoch": 1.113755354593051, "grad_norm": 0.3354848325252533, "learning_rate": 0.00016824742268041238, "loss": 1.4146, "step": 1170 }, { "epoch": 1.1232746311280342, "grad_norm": 0.36233270168304443, "learning_rate": 0.00016783505154639177, "loss": 1.4289, "step": 1180 }, { "epoch": 1.1327939076630176, "grad_norm": 0.3264828622341156, "learning_rate": 0.00016742268041237114, "loss": 1.4021, "step": 1190 }, { "epoch": 1.142313184198001, "grad_norm": 0.3206409215927124, "learning_rate": 0.00016701030927835053, "loss": 1.4014, "step": 1200 }, { "epoch": 1.142313184198001, "eval_loss": 1.4221289157867432, "eval_runtime": 39.6868, "eval_samples_per_second": 2.52, "eval_steps_per_second": 0.63, "step": 1200 }, { "epoch": 1.1518324607329844, "grad_norm": 0.31770697236061096, "learning_rate": 0.00016659793814432993, "loss": 1.424, "step": 1210 }, { "epoch": 1.1613517372679676, "grad_norm": 0.313504695892334, "learning_rate": 0.0001661855670103093, "loss": 1.4229, "step": 1220 }, { "epoch": 1.170871013802951, "grad_norm": 0.35023483633995056, "learning_rate": 0.00016577319587628869, "loss": 1.4059, "step": 1230 }, { "epoch": 1.1803902903379344, "grad_norm": 0.3138754963874817, "learning_rate": 0.00016536082474226803, "loss": 1.4185, "step": 1240 }, { "epoch": 1.1899095668729176, "grad_norm": 0.31875547766685486, "learning_rate": 0.00016494845360824742, "loss": 1.4115, "step": 1250 }, { "epoch": 1.199428843407901, "grad_norm": 0.3276744484901428, "learning_rate": 0.00016453608247422681, "loss": 1.405, "step": 1260 }, { "epoch": 1.2089481199428844, "grad_norm": 0.3124449849128723, "learning_rate": 0.00016412371134020618, "loss": 1.4164, "step": 1270 }, { "epoch": 1.2184673964778676, "grad_norm": 0.31706124544143677, "learning_rate": 0.00016371134020618558, "loss": 1.413, "step": 1280 }, { "epoch": 1.227986673012851, "grad_norm": 0.3277345895767212, "learning_rate": 0.00016329896907216494, "loss": 1.4339, "step": 1290 }, { "epoch": 1.2375059495478344, "grad_norm": 0.3391578793525696, "learning_rate": 0.00016288659793814434, "loss": 1.414, "step": 1300 }, { "epoch": 1.2375059495478344, "eval_loss": 1.4196603298187256, "eval_runtime": 39.6612, "eval_samples_per_second": 2.521, "eval_steps_per_second": 0.63, "step": 1300 }, { "epoch": 1.2470252260828176, "grad_norm": 0.3561215400695801, "learning_rate": 0.00016247422680412373, "loss": 1.431, "step": 1310 }, { "epoch": 1.256544502617801, "grad_norm": 0.3222928047180176, "learning_rate": 0.0001620618556701031, "loss": 1.4207, "step": 1320 }, { "epoch": 1.2660637791527845, "grad_norm": 0.29916074872016907, "learning_rate": 0.0001616494845360825, "loss": 1.413, "step": 1330 }, { "epoch": 1.2755830556877679, "grad_norm": 0.31502220034599304, "learning_rate": 0.00016123711340206186, "loss": 1.4041, "step": 1340 }, { "epoch": 1.285102332222751, "grad_norm": 0.30248329043388367, "learning_rate": 0.00016082474226804125, "loss": 1.4136, "step": 1350 }, { "epoch": 1.2946216087577345, "grad_norm": 0.29750174283981323, "learning_rate": 0.00016041237113402065, "loss": 1.4149, "step": 1360 }, { "epoch": 1.3041408852927177, "grad_norm": 0.35403352975845337, "learning_rate": 0.00016, "loss": 1.434, "step": 1370 }, { "epoch": 1.313660161827701, "grad_norm": 0.3174794614315033, "learning_rate": 0.0001595876288659794, "loss": 1.4129, "step": 1380 }, { "epoch": 1.3231794383626845, "grad_norm": 0.31973496079444885, "learning_rate": 0.00015917525773195875, "loss": 1.4312, "step": 1390 }, { "epoch": 1.332698714897668, "grad_norm": 0.29147669672966003, "learning_rate": 0.00015876288659793814, "loss": 1.4203, "step": 1400 }, { "epoch": 1.332698714897668, "eval_loss": 1.4181098937988281, "eval_runtime": 39.59, "eval_samples_per_second": 2.526, "eval_steps_per_second": 0.631, "step": 1400 }, { "epoch": 1.342217991432651, "grad_norm": 0.3290899097919464, "learning_rate": 0.00015835051546391754, "loss": 1.4159, "step": 1410 }, { "epoch": 1.3517372679676345, "grad_norm": 0.2933942377567291, "learning_rate": 0.0001579381443298969, "loss": 1.4399, "step": 1420 }, { "epoch": 1.3612565445026177, "grad_norm": 0.3046482801437378, "learning_rate": 0.0001575257731958763, "loss": 1.4004, "step": 1430 }, { "epoch": 1.370775821037601, "grad_norm": 0.29072305560112, "learning_rate": 0.00015711340206185566, "loss": 1.4026, "step": 1440 }, { "epoch": 1.3802950975725845, "grad_norm": 0.32136955857276917, "learning_rate": 0.00015670103092783506, "loss": 1.4189, "step": 1450 }, { "epoch": 1.389814374107568, "grad_norm": 0.3227519094944, "learning_rate": 0.00015628865979381445, "loss": 1.417, "step": 1460 }, { "epoch": 1.399333650642551, "grad_norm": 0.32664725184440613, "learning_rate": 0.00015587628865979382, "loss": 1.41, "step": 1470 }, { "epoch": 1.4088529271775345, "grad_norm": 0.3103736340999603, "learning_rate": 0.0001554639175257732, "loss": 1.4175, "step": 1480 }, { "epoch": 1.418372203712518, "grad_norm": 0.30146801471710205, "learning_rate": 0.00015505154639175258, "loss": 1.4281, "step": 1490 }, { "epoch": 1.4278914802475011, "grad_norm": 0.3465326726436615, "learning_rate": 0.00015463917525773197, "loss": 1.3968, "step": 1500 }, { "epoch": 1.4278914802475011, "eval_loss": 1.4155893325805664, "eval_runtime": 39.6723, "eval_samples_per_second": 2.521, "eval_steps_per_second": 0.63, "step": 1500 }, { "epoch": 1.4374107567824845, "grad_norm": 0.33584079146385193, "learning_rate": 0.00015422680412371137, "loss": 1.4215, "step": 1510 }, { "epoch": 1.446930033317468, "grad_norm": 0.3028492331504822, "learning_rate": 0.00015381443298969073, "loss": 1.4143, "step": 1520 }, { "epoch": 1.4564493098524511, "grad_norm": 0.29686522483825684, "learning_rate": 0.00015340206185567013, "loss": 1.4249, "step": 1530 }, { "epoch": 1.4659685863874345, "grad_norm": 0.30677148699760437, "learning_rate": 0.00015298969072164947, "loss": 1.4225, "step": 1540 }, { "epoch": 1.475487862922418, "grad_norm": 0.3218235969543457, "learning_rate": 0.00015257731958762886, "loss": 1.3925, "step": 1550 }, { "epoch": 1.4850071394574011, "grad_norm": 0.29073500633239746, "learning_rate": 0.00015216494845360826, "loss": 1.4068, "step": 1560 }, { "epoch": 1.4945264159923846, "grad_norm": 0.3078315258026123, "learning_rate": 0.00015175257731958762, "loss": 1.404, "step": 1570 }, { "epoch": 1.504045692527368, "grad_norm": 0.30224424600601196, "learning_rate": 0.00015134020618556702, "loss": 1.4244, "step": 1580 }, { "epoch": 1.5135649690623514, "grad_norm": 0.3163105845451355, "learning_rate": 0.00015092783505154638, "loss": 1.4099, "step": 1590 }, { "epoch": 1.5230842455973346, "grad_norm": 0.289919376373291, "learning_rate": 0.00015051546391752578, "loss": 1.4235, "step": 1600 }, { "epoch": 1.5230842455973346, "eval_loss": 1.4139596223831177, "eval_runtime": 39.6382, "eval_samples_per_second": 2.523, "eval_steps_per_second": 0.631, "step": 1600 }, { "epoch": 1.532603522132318, "grad_norm": 0.2937719225883484, "learning_rate": 0.00015010309278350517, "loss": 1.4112, "step": 1610 }, { "epoch": 1.5421227986673012, "grad_norm": 0.30618056654930115, "learning_rate": 0.00014969072164948454, "loss": 1.41, "step": 1620 }, { "epoch": 1.5516420752022846, "grad_norm": 0.3118852972984314, "learning_rate": 0.00014927835051546393, "loss": 1.4234, "step": 1630 }, { "epoch": 1.561161351737268, "grad_norm": 0.30171871185302734, "learning_rate": 0.0001488659793814433, "loss": 1.4151, "step": 1640 }, { "epoch": 1.5706806282722514, "grad_norm": 0.3201466500759125, "learning_rate": 0.0001484536082474227, "loss": 1.4054, "step": 1650 }, { "epoch": 1.5801999048072346, "grad_norm": 0.3177640736103058, "learning_rate": 0.0001480412371134021, "loss": 1.4043, "step": 1660 }, { "epoch": 1.589719181342218, "grad_norm": 0.3199518918991089, "learning_rate": 0.00014762886597938146, "loss": 1.4049, "step": 1670 }, { "epoch": 1.5992384578772012, "grad_norm": 0.30514276027679443, "learning_rate": 0.00014721649484536085, "loss": 1.4103, "step": 1680 }, { "epoch": 1.6087577344121846, "grad_norm": 0.31192877888679504, "learning_rate": 0.0001468041237113402, "loss": 1.4173, "step": 1690 }, { "epoch": 1.618277010947168, "grad_norm": 0.2868764400482178, "learning_rate": 0.00014639175257731958, "loss": 1.4154, "step": 1700 }, { "epoch": 1.618277010947168, "eval_loss": 1.4129725694656372, "eval_runtime": 39.6482, "eval_samples_per_second": 2.522, "eval_steps_per_second": 0.631, "step": 1700 }, { "epoch": 1.6277962874821514, "grad_norm": 0.2957026958465576, "learning_rate": 0.00014597938144329898, "loss": 1.413, "step": 1710 }, { "epoch": 1.6373155640171349, "grad_norm": 0.2958736717700958, "learning_rate": 0.00014556701030927834, "loss": 1.4209, "step": 1720 }, { "epoch": 1.646834840552118, "grad_norm": 0.31275510787963867, "learning_rate": 0.00014515463917525774, "loss": 1.4022, "step": 1730 }, { "epoch": 1.6563541170871012, "grad_norm": 0.32319965958595276, "learning_rate": 0.0001447422680412371, "loss": 1.3997, "step": 1740 }, { "epoch": 1.6658733936220846, "grad_norm": 0.29642221331596375, "learning_rate": 0.0001443298969072165, "loss": 1.4032, "step": 1750 }, { "epoch": 1.675392670157068, "grad_norm": 0.32867589592933655, "learning_rate": 0.0001439175257731959, "loss": 1.4203, "step": 1760 }, { "epoch": 1.6849119466920515, "grad_norm": 0.31527167558670044, "learning_rate": 0.00014350515463917526, "loss": 1.4157, "step": 1770 }, { "epoch": 1.6944312232270349, "grad_norm": 0.3072745203971863, "learning_rate": 0.00014309278350515465, "loss": 1.3823, "step": 1780 }, { "epoch": 1.703950499762018, "grad_norm": 0.28791388869285583, "learning_rate": 0.00014268041237113402, "loss": 1.4317, "step": 1790 }, { "epoch": 1.7134697762970015, "grad_norm": 0.28491660952568054, "learning_rate": 0.00014226804123711342, "loss": 1.4128, "step": 1800 }, { "epoch": 1.7134697762970015, "eval_loss": 1.4114233255386353, "eval_runtime": 39.5879, "eval_samples_per_second": 2.526, "eval_steps_per_second": 0.632, "step": 1800 }, { "epoch": 1.7229890528319847, "grad_norm": 0.2969178259372711, "learning_rate": 0.0001418556701030928, "loss": 1.4014, "step": 1810 }, { "epoch": 1.732508329366968, "grad_norm": 0.29975786805152893, "learning_rate": 0.00014144329896907218, "loss": 1.3926, "step": 1820 }, { "epoch": 1.7420276059019515, "grad_norm": 0.31133607029914856, "learning_rate": 0.00014103092783505157, "loss": 1.4063, "step": 1830 }, { "epoch": 1.751546882436935, "grad_norm": 0.29973700642585754, "learning_rate": 0.0001406185567010309, "loss": 1.4148, "step": 1840 }, { "epoch": 1.761066158971918, "grad_norm": 0.2864433228969574, "learning_rate": 0.0001402061855670103, "loss": 1.4032, "step": 1850 }, { "epoch": 1.7705854355069015, "grad_norm": 0.2933174669742584, "learning_rate": 0.0001397938144329897, "loss": 1.4284, "step": 1860 }, { "epoch": 1.7801047120418847, "grad_norm": 0.2991732060909271, "learning_rate": 0.00013938144329896907, "loss": 1.4039, "step": 1870 }, { "epoch": 1.789623988576868, "grad_norm": 0.2981884479522705, "learning_rate": 0.00013896907216494846, "loss": 1.3952, "step": 1880 }, { "epoch": 1.7991432651118515, "grad_norm": 0.29072457551956177, "learning_rate": 0.00013855670103092783, "loss": 1.4121, "step": 1890 }, { "epoch": 1.808662541646835, "grad_norm": 0.3064673840999603, "learning_rate": 0.00013814432989690722, "loss": 1.4029, "step": 1900 }, { "epoch": 1.808662541646835, "eval_loss": 1.4104112386703491, "eval_runtime": 39.5819, "eval_samples_per_second": 2.526, "eval_steps_per_second": 0.632, "step": 1900 }, { "epoch": 1.8181818181818183, "grad_norm": 0.28513726592063904, "learning_rate": 0.00013773195876288661, "loss": 1.4098, "step": 1910 }, { "epoch": 1.8277010947168015, "grad_norm": 0.3132310211658478, "learning_rate": 0.00013731958762886598, "loss": 1.4157, "step": 1920 }, { "epoch": 1.8372203712517847, "grad_norm": 0.2846241295337677, "learning_rate": 0.00013690721649484538, "loss": 1.4045, "step": 1930 }, { "epoch": 1.8467396477867681, "grad_norm": 0.2918606996536255, "learning_rate": 0.00013649484536082474, "loss": 1.4107, "step": 1940 }, { "epoch": 1.8562589243217515, "grad_norm": 0.28303080797195435, "learning_rate": 0.00013608247422680414, "loss": 1.4221, "step": 1950 }, { "epoch": 1.865778200856735, "grad_norm": 0.2898513674736023, "learning_rate": 0.00013567010309278353, "loss": 1.3937, "step": 1960 }, { "epoch": 1.8752974773917184, "grad_norm": 0.2937197685241699, "learning_rate": 0.0001352577319587629, "loss": 1.4134, "step": 1970 }, { "epoch": 1.8848167539267016, "grad_norm": 0.2756626605987549, "learning_rate": 0.0001348453608247423, "loss": 1.4085, "step": 1980 }, { "epoch": 1.8943360304616848, "grad_norm": 0.28634968400001526, "learning_rate": 0.00013443298969072166, "loss": 1.3999, "step": 1990 }, { "epoch": 1.9038553069966682, "grad_norm": 0.28319478034973145, "learning_rate": 0.00013402061855670103, "loss": 1.3935, "step": 2000 }, { "epoch": 1.9038553069966682, "eval_loss": 1.4095008373260498, "eval_runtime": 39.6055, "eval_samples_per_second": 2.525, "eval_steps_per_second": 0.631, "step": 2000 } ], "logging_steps": 10, "max_steps": 5250, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.296083329417216e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }