{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 28120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.17780938833570412, "grad_norm": 1.8023470640182495, "learning_rate": 0.0002988, "loss": 4.6629, "step": 500 }, { "epoch": 0.17780938833570412, "eval_loss": 0.3540094494819641, "eval_runtime": 77.5992, "eval_samples_per_second": 19.059, "eval_steps_per_second": 2.384, "eval_wer": 0.5421303656597775, "step": 500 }, { "epoch": 0.35561877667140823, "grad_norm": 1.411382794380188, "learning_rate": 0.0002945908761766835, "loss": 0.6579, "step": 1000 }, { "epoch": 0.35561877667140823, "eval_loss": 0.25881391763687134, "eval_runtime": 77.5785, "eval_samples_per_second": 19.065, "eval_steps_per_second": 2.385, "eval_wer": 0.48825295884119413, "step": 1000 }, { "epoch": 0.5334281650071123, "grad_norm": 1.561378836631775, "learning_rate": 0.00028916002896451845, "loss": 0.591, "step": 1500 }, { "epoch": 0.5334281650071123, "eval_loss": 0.25524866580963135, "eval_runtime": 77.3789, "eval_samples_per_second": 19.114, "eval_steps_per_second": 2.391, "eval_wer": 0.47200141317788374, "step": 1500 }, { "epoch": 0.7112375533428165, "grad_norm": 1.4678888320922852, "learning_rate": 0.00028372918175235333, "loss": 0.5467, "step": 2000 }, { "epoch": 0.7112375533428165, "eval_loss": 0.23701371252536774, "eval_runtime": 77.4845, "eval_samples_per_second": 19.088, "eval_steps_per_second": 2.388, "eval_wer": 0.4541600423953365, "step": 2000 }, { "epoch": 0.8890469416785206, "grad_norm": 2.583188056945801, "learning_rate": 0.00027829833454018826, "loss": 0.5405, "step": 2500 }, { "epoch": 0.8890469416785206, "eval_loss": 0.23755988478660583, "eval_runtime": 78.0759, "eval_samples_per_second": 18.943, "eval_steps_per_second": 2.369, "eval_wer": 0.45557322027910263, "step": 2500 }, { "epoch": 1.0668563300142249, "grad_norm": 1.422250509262085, "learning_rate": 0.00027286748732802314, "loss": 0.5027, "step": 3000 }, { "epoch": 1.0668563300142249, "eval_loss": 0.22338581085205078, "eval_runtime": 77.6024, "eval_samples_per_second": 19.059, "eval_steps_per_second": 2.384, "eval_wer": 0.4306659600777248, "step": 3000 }, { "epoch": 1.2446657183499288, "grad_norm": 1.006721019744873, "learning_rate": 0.00026743664011585807, "loss": 0.5001, "step": 3500 }, { "epoch": 1.2446657183499288, "eval_loss": 0.21763387322425842, "eval_runtime": 77.5866, "eval_samples_per_second": 19.063, "eval_steps_per_second": 2.384, "eval_wer": 0.42130365659777425, "step": 3500 }, { "epoch": 1.422475106685633, "grad_norm": 1.3477824926376343, "learning_rate": 0.00026200579290369295, "loss": 0.4962, "step": 4000 }, { "epoch": 1.422475106685633, "eval_loss": 0.21994474530220032, "eval_runtime": 77.7546, "eval_samples_per_second": 19.021, "eval_steps_per_second": 2.379, "eval_wer": 0.4205087440381558, "step": 4000 }, { "epoch": 1.600284495021337, "grad_norm": 1.08402419090271, "learning_rate": 0.00025657494569152783, "loss": 0.486, "step": 4500 }, { "epoch": 1.600284495021337, "eval_loss": 0.21454988420009613, "eval_runtime": 77.6412, "eval_samples_per_second": 19.049, "eval_steps_per_second": 2.383, "eval_wer": 0.41671082847553437, "step": 4500 }, { "epoch": 1.7780938833570412, "grad_norm": 0.5935032963752747, "learning_rate": 0.00025114409847936276, "loss": 0.47, "step": 5000 }, { "epoch": 1.7780938833570412, "eval_loss": 0.21590569615364075, "eval_runtime": 77.5792, "eval_samples_per_second": 19.064, "eval_steps_per_second": 2.385, "eval_wer": 0.4168874757110051, "step": 5000 }, { "epoch": 1.9559032716927454, "grad_norm": 1.4301828145980835, "learning_rate": 0.00024571325126719764, "loss": 0.4557, "step": 5500 }, { "epoch": 1.9559032716927454, "eval_loss": 0.2098698765039444, "eval_runtime": 78.1464, "eval_samples_per_second": 18.926, "eval_steps_per_second": 2.367, "eval_wer": 0.4135311782370606, "step": 5500 }, { "epoch": 2.1337126600284497, "grad_norm": 1.2725244760513306, "learning_rate": 0.00024028240405503257, "loss": 0.4514, "step": 6000 }, { "epoch": 2.1337126600284497, "eval_loss": 0.20907503366470337, "eval_runtime": 78.1315, "eval_samples_per_second": 18.93, "eval_steps_per_second": 2.368, "eval_wer": 0.4099982335276453, "step": 6000 }, { "epoch": 2.3115220483641536, "grad_norm": 1.0271036624908447, "learning_rate": 0.00023485155684286748, "loss": 0.4539, "step": 6500 }, { "epoch": 2.3115220483641536, "eval_loss": 0.2038286179304123, "eval_runtime": 77.8532, "eval_samples_per_second": 18.997, "eval_steps_per_second": 2.376, "eval_wer": 0.40160748984278394, "step": 6500 }, { "epoch": 2.4893314366998576, "grad_norm": 1.0436575412750244, "learning_rate": 0.0002294315713251267, "loss": 0.439, "step": 7000 }, { "epoch": 2.4893314366998576, "eval_loss": 0.20407754182815552, "eval_runtime": 78.101, "eval_samples_per_second": 18.937, "eval_steps_per_second": 2.369, "eval_wer": 0.4024907260201378, "step": 7000 }, { "epoch": 2.667140825035562, "grad_norm": 1.0521398782730103, "learning_rate": 0.00022400072411296162, "loss": 0.4378, "step": 7500 }, { "epoch": 2.667140825035562, "eval_loss": 0.20021408796310425, "eval_runtime": 78.1969, "eval_samples_per_second": 18.914, "eval_steps_per_second": 2.366, "eval_wer": 0.39162692103868574, "step": 7500 }, { "epoch": 2.844950213371266, "grad_norm": 0.716386616230011, "learning_rate": 0.0002185698769007965, "loss": 0.4347, "step": 8000 }, { "epoch": 2.844950213371266, "eval_loss": 0.19606797397136688, "eval_runtime": 78.0563, "eval_samples_per_second": 18.948, "eval_steps_per_second": 2.37, "eval_wer": 0.39109697933227344, "step": 8000 }, { "epoch": 3.0227596017069702, "grad_norm": 0.6210708022117615, "learning_rate": 0.0002131390296886314, "loss": 0.4278, "step": 8500 }, { "epoch": 3.0227596017069702, "eval_loss": 0.1994515061378479, "eval_runtime": 78.0396, "eval_samples_per_second": 18.952, "eval_steps_per_second": 2.371, "eval_wer": 0.3923335099805688, "step": 8500 }, { "epoch": 3.200568990042674, "grad_norm": 0.4629976153373718, "learning_rate": 0.0002077081824764663, "loss": 0.4117, "step": 9000 }, { "epoch": 3.200568990042674, "eval_loss": 0.19594013690948486, "eval_runtime": 77.8452, "eval_samples_per_second": 18.999, "eval_steps_per_second": 2.377, "eval_wer": 0.38915385974209504, "step": 9000 }, { "epoch": 3.3783783783783785, "grad_norm": 0.9285233020782471, "learning_rate": 0.0002022773352643012, "loss": 0.4149, "step": 9500 }, { "epoch": 3.3783783783783785, "eval_loss": 0.1925920844078064, "eval_runtime": 77.8068, "eval_samples_per_second": 19.009, "eval_steps_per_second": 2.378, "eval_wer": 0.3858858858858859, "step": 9500 }, { "epoch": 3.5561877667140824, "grad_norm": 1.0625221729278564, "learning_rate": 0.00019684648805213612, "loss": 0.4148, "step": 10000 }, { "epoch": 3.5561877667140824, "eval_loss": 0.1958448737859726, "eval_runtime": 77.9401, "eval_samples_per_second": 18.976, "eval_steps_per_second": 2.374, "eval_wer": 0.38040982158629216, "step": 10000 }, { "epoch": 3.733997155049787, "grad_norm": 0.7951219081878662, "learning_rate": 0.00019142650253439536, "loss": 0.4009, "step": 10500 }, { "epoch": 3.733997155049787, "eval_loss": 0.19297942519187927, "eval_runtime": 77.4746, "eval_samples_per_second": 19.09, "eval_steps_per_second": 2.388, "eval_wer": 0.37899664370252606, "step": 10500 }, { "epoch": 3.9118065433854907, "grad_norm": 0.689900279045105, "learning_rate": 0.00018599565532223026, "loss": 0.4174, "step": 11000 }, { "epoch": 3.9118065433854907, "eval_loss": 0.19552023708820343, "eval_runtime": 77.7104, "eval_samples_per_second": 19.032, "eval_steps_per_second": 2.381, "eval_wer": 0.3822646175587352, "step": 11000 }, { "epoch": 4.089615931721195, "grad_norm": 0.5333609580993652, "learning_rate": 0.00018056480811006514, "loss": 0.4012, "step": 11500 }, { "epoch": 4.089615931721195, "eval_loss": 0.19501054286956787, "eval_runtime": 77.674, "eval_samples_per_second": 19.041, "eval_steps_per_second": 2.382, "eval_wer": 0.3812047341459106, "step": 11500 }, { "epoch": 4.2674253200568995, "grad_norm": 1.5428721904754639, "learning_rate": 0.00017513396089790005, "loss": 0.3974, "step": 12000 }, { "epoch": 4.2674253200568995, "eval_loss": 0.19340351223945618, "eval_runtime": 78.0603, "eval_samples_per_second": 18.947, "eval_steps_per_second": 2.37, "eval_wer": 0.3773184949655538, "step": 12000 }, { "epoch": 4.445234708392603, "grad_norm": 0.44063669443130493, "learning_rate": 0.00016970311368573495, "loss": 0.3943, "step": 12500 }, { "epoch": 4.445234708392603, "eval_loss": 0.18450064957141876, "eval_runtime": 78.009, "eval_samples_per_second": 18.959, "eval_steps_per_second": 2.372, "eval_wer": 0.37201907790143085, "step": 12500 }, { "epoch": 4.623044096728307, "grad_norm": 0.9376386404037476, "learning_rate": 0.0001642831281679942, "loss": 0.4071, "step": 13000 }, { "epoch": 4.623044096728307, "eval_loss": 0.1920066624879837, "eval_runtime": 77.3116, "eval_samples_per_second": 19.13, "eval_steps_per_second": 2.393, "eval_wer": 0.3838544426779721, "step": 13000 }, { "epoch": 4.800853485064011, "grad_norm": 0.6557429432868958, "learning_rate": 0.0001588522809558291, "loss": 0.3968, "step": 13500 }, { "epoch": 4.800853485064011, "eval_loss": 0.1866944283246994, "eval_runtime": 77.5081, "eval_samples_per_second": 19.082, "eval_steps_per_second": 2.387, "eval_wer": 0.37431549196255076, "step": 13500 }, { "epoch": 4.978662873399715, "grad_norm": 0.5748176574707031, "learning_rate": 0.000153421433743664, "loss": 0.3795, "step": 14000 }, { "epoch": 4.978662873399715, "eval_loss": 0.18717192113399506, "eval_runtime": 77.4836, "eval_samples_per_second": 19.088, "eval_steps_per_second": 2.388, "eval_wer": 0.37131248895954777, "step": 14000 }, { "epoch": 5.15647226173542, "grad_norm": 0.8843936920166016, "learning_rate": 0.0001479905865314989, "loss": 0.3856, "step": 14500 }, { "epoch": 5.15647226173542, "eval_loss": 0.18692350387573242, "eval_runtime": 77.4161, "eval_samples_per_second": 19.105, "eval_steps_per_second": 2.39, "eval_wer": 0.3736972266384031, "step": 14500 }, { "epoch": 5.334281650071124, "grad_norm": 1.5647237300872803, "learning_rate": 0.0001425597393193338, "loss": 0.3706, "step": 15000 }, { "epoch": 5.334281650071124, "eval_loss": 0.19033658504486084, "eval_runtime": 77.561, "eval_samples_per_second": 19.069, "eval_steps_per_second": 2.385, "eval_wer": 0.37661190602367073, "step": 15000 }, { "epoch": 5.512091038406828, "grad_norm": 2.1427793502807617, "learning_rate": 0.00013713975380159305, "loss": 0.3784, "step": 15500 }, { "epoch": 5.512091038406828, "eval_loss": 0.18607346713542938, "eval_runtime": 77.668, "eval_samples_per_second": 19.043, "eval_steps_per_second": 2.382, "eval_wer": 0.3683094859565448, "step": 15500 }, { "epoch": 5.689900426742532, "grad_norm": 2.697434186935425, "learning_rate": 0.00013170890658942793, "loss": 0.3777, "step": 16000 }, { "epoch": 5.689900426742532, "eval_loss": 0.1866033524274826, "eval_runtime": 77.9873, "eval_samples_per_second": 18.965, "eval_steps_per_second": 2.372, "eval_wer": 0.37131248895954777, "step": 16000 }, { "epoch": 5.867709815078236, "grad_norm": 1.5033948421478271, "learning_rate": 0.00012627805937726286, "loss": 0.3861, "step": 16500 }, { "epoch": 5.867709815078236, "eval_loss": 0.18121445178985596, "eval_runtime": 77.6152, "eval_samples_per_second": 19.056, "eval_steps_per_second": 2.384, "eval_wer": 0.3637166578343049, "step": 16500 }, { "epoch": 6.0455192034139404, "grad_norm": 0.5671353936195374, "learning_rate": 0.00012084721216509774, "loss": 0.3711, "step": 17000 }, { "epoch": 6.0455192034139404, "eval_loss": 0.18417000770568848, "eval_runtime": 77.4756, "eval_samples_per_second": 19.09, "eval_steps_per_second": 2.388, "eval_wer": 0.3667196608373079, "step": 17000 }, { "epoch": 6.223328591749644, "grad_norm": 1.0996285676956177, "learning_rate": 0.00011542722664735697, "loss": 0.374, "step": 17500 }, { "epoch": 6.223328591749644, "eval_loss": 0.18148785829544067, "eval_runtime": 77.7259, "eval_samples_per_second": 19.028, "eval_steps_per_second": 2.38, "eval_wer": 0.3617735382441265, "step": 17500 }, { "epoch": 6.401137980085348, "grad_norm": 0.6861454844474792, "learning_rate": 0.00010999637943519187, "loss": 0.3539, "step": 18000 }, { "epoch": 6.401137980085348, "eval_loss": 0.18153779208660126, "eval_runtime": 77.7594, "eval_samples_per_second": 19.02, "eval_steps_per_second": 2.379, "eval_wer": 0.3646882176293941, "step": 18000 }, { "epoch": 6.578947368421053, "grad_norm": 1.423963189125061, "learning_rate": 0.00010456553222302678, "loss": 0.3625, "step": 18500 }, { "epoch": 6.578947368421053, "eval_loss": 0.17849859595298767, "eval_runtime": 77.8982, "eval_samples_per_second": 18.986, "eval_steps_per_second": 2.375, "eval_wer": 0.3588588588588589, "step": 18500 }, { "epoch": 6.756756756756757, "grad_norm": 0.9670858383178711, "learning_rate": 9.913468501086169e-05, "loss": 0.3599, "step": 19000 }, { "epoch": 6.756756756756757, "eval_loss": 0.17952215671539307, "eval_runtime": 77.3124, "eval_samples_per_second": 19.13, "eval_steps_per_second": 2.393, "eval_wer": 0.362126832715068, "step": 19000 }, { "epoch": 6.934566145092461, "grad_norm": 0.523705005645752, "learning_rate": 9.370383779869658e-05, "loss": 0.3654, "step": 19500 }, { "epoch": 6.934566145092461, "eval_loss": 0.1822131723165512, "eval_runtime": 77.744, "eval_samples_per_second": 19.024, "eval_steps_per_second": 2.38, "eval_wer": 0.36239180356827416, "step": 19500 }, { "epoch": 7.112375533428165, "grad_norm": 1.0047301054000854, "learning_rate": 8.82729905865315e-05, "loss": 0.3693, "step": 20000 }, { "epoch": 7.112375533428165, "eval_loss": 0.17921391129493713, "eval_runtime": 77.3373, "eval_samples_per_second": 19.124, "eval_steps_per_second": 2.392, "eval_wer": 0.3611552729199788, "step": 20000 }, { "epoch": 7.290184921763869, "grad_norm": 0.6277859807014465, "learning_rate": 8.285300506879071e-05, "loss": 0.3519, "step": 20500 }, { "epoch": 7.290184921763869, "eval_loss": 0.18002206087112427, "eval_runtime": 77.6246, "eval_samples_per_second": 19.053, "eval_steps_per_second": 2.383, "eval_wer": 0.36751457339692634, "step": 20500 }, { "epoch": 7.467994310099574, "grad_norm": 0.8712663650512695, "learning_rate": 7.743301955104996e-05, "loss": 0.3553, "step": 21000 }, { "epoch": 7.467994310099574, "eval_loss": 0.1808168739080429, "eval_runtime": 77.4403, "eval_samples_per_second": 19.099, "eval_steps_per_second": 2.389, "eval_wer": 0.36398162868751105, "step": 21000 }, { "epoch": 7.6458036984352775, "grad_norm": 1.000291109085083, "learning_rate": 7.200217233888485e-05, "loss": 0.3451, "step": 21500 }, { "epoch": 7.6458036984352775, "eval_loss": 0.18079166114330292, "eval_runtime": 77.717, "eval_samples_per_second": 19.031, "eval_steps_per_second": 2.38, "eval_wer": 0.36195018547959723, "step": 21500 }, { "epoch": 7.823613086770981, "grad_norm": 0.9150896668434143, "learning_rate": 6.657132512671976e-05, "loss": 0.3558, "step": 22000 }, { "epoch": 7.823613086770981, "eval_loss": 0.17938227951526642, "eval_runtime": 77.6629, "eval_samples_per_second": 19.044, "eval_steps_per_second": 2.382, "eval_wer": 0.36097862568450806, "step": 22000 }, { "epoch": 8.001422475106686, "grad_norm": 0.4225611686706543, "learning_rate": 6.114047791455467e-05, "loss": 0.3595, "step": 22500 }, { "epoch": 8.001422475106686, "eval_loss": 0.17718034982681274, "eval_runtime": 78.0056, "eval_samples_per_second": 18.96, "eval_steps_per_second": 2.372, "eval_wer": 0.3576223282105635, "step": 22500 }, { "epoch": 8.17923186344239, "grad_norm": 0.7559336423873901, "learning_rate": 5.570963070238957e-05, "loss": 0.3404, "step": 23000 }, { "epoch": 8.17923186344239, "eval_loss": 0.17881204187870026, "eval_runtime": 78.2489, "eval_samples_per_second": 18.901, "eval_steps_per_second": 2.364, "eval_wer": 0.35806394629924043, "step": 23000 }, { "epoch": 8.357041251778094, "grad_norm": 1.2298369407653809, "learning_rate": 5.027878349022447e-05, "loss": 0.3593, "step": 23500 }, { "epoch": 8.357041251778094, "eval_loss": 0.1782107651233673, "eval_runtime": 77.8969, "eval_samples_per_second": 18.987, "eval_steps_per_second": 2.375, "eval_wer": 0.357975622681505, "step": 23500 }, { "epoch": 8.534850640113799, "grad_norm": 0.5965376496315002, "learning_rate": 4.4847936278059375e-05, "loss": 0.3471, "step": 24000 }, { "epoch": 8.534850640113799, "eval_loss": 0.17967215180397034, "eval_runtime": 77.691, "eval_samples_per_second": 19.037, "eval_steps_per_second": 2.381, "eval_wer": 0.3606253312135665, "step": 24000 }, { "epoch": 8.712660028449502, "grad_norm": 0.3367222249507904, "learning_rate": 3.941708906589427e-05, "loss": 0.3497, "step": 24500 }, { "epoch": 8.712660028449502, "eval_loss": 0.17775095999240875, "eval_runtime": 78.061, "eval_samples_per_second": 18.947, "eval_steps_per_second": 2.37, "eval_wer": 0.35877053524112346, "step": 24500 }, { "epoch": 8.890469416785207, "grad_norm": 1.0665998458862305, "learning_rate": 3.399710354815351e-05, "loss": 0.3398, "step": 25000 }, { "epoch": 8.890469416785207, "eval_loss": 0.1774686574935913, "eval_runtime": 77.6438, "eval_samples_per_second": 19.049, "eval_steps_per_second": 2.383, "eval_wer": 0.3583289171524466, "step": 25000 }, { "epoch": 9.06827880512091, "grad_norm": 1.8358111381530762, "learning_rate": 2.8566256335988413e-05, "loss": 0.3444, "step": 25500 }, { "epoch": 9.06827880512091, "eval_loss": 0.1796201765537262, "eval_runtime": 77.7985, "eval_samples_per_second": 19.011, "eval_steps_per_second": 2.378, "eval_wer": 0.35859388800565273, "step": 25500 }, { "epoch": 9.246088193456615, "grad_norm": 1.32257878780365, "learning_rate": 2.3135409123823315e-05, "loss": 0.3366, "step": 26000 }, { "epoch": 9.246088193456615, "eval_loss": 0.1784891039133072, "eval_runtime": 77.5756, "eval_samples_per_second": 19.065, "eval_steps_per_second": 2.385, "eval_wer": 0.35735735735735735, "step": 26000 }, { "epoch": 9.42389758179232, "grad_norm": 1.8535629510879517, "learning_rate": 1.7704561911658217e-05, "loss": 0.3434, "step": 26500 }, { "epoch": 9.42389758179232, "eval_loss": 0.17805208265781403, "eval_runtime": 77.8845, "eval_samples_per_second": 18.99, "eval_steps_per_second": 2.375, "eval_wer": 0.3592121533298004, "step": 26500 }, { "epoch": 9.601706970128022, "grad_norm": 1.9332554340362549, "learning_rate": 1.228457639391745e-05, "loss": 0.3426, "step": 27000 }, { "epoch": 9.601706970128022, "eval_loss": 0.17857009172439575, "eval_runtime": 77.3945, "eval_samples_per_second": 19.11, "eval_steps_per_second": 2.39, "eval_wer": 0.35930047694753575, "step": 27000 }, { "epoch": 9.779516358463727, "grad_norm": 1.1602191925048828, "learning_rate": 6.8537291817523524e-06, "loss": 0.3496, "step": 27500 }, { "epoch": 9.779516358463727, "eval_loss": 0.17868547141551971, "eval_runtime": 77.5791, "eval_samples_per_second": 19.064, "eval_steps_per_second": 2.385, "eval_wer": 0.3590355060943296, "step": 27500 }, { "epoch": 9.95732574679943, "grad_norm": 1.2770110368728638, "learning_rate": 1.4228819695872554e-06, "loss": 0.334, "step": 28000 }, { "epoch": 9.95732574679943, "eval_loss": 0.17876511812210083, "eval_runtime": 78.0274, "eval_samples_per_second": 18.955, "eval_steps_per_second": 2.371, "eval_wer": 0.35877053524112346, "step": 28000 }, { "epoch": 10.0, "step": 28120, "total_flos": 2.1329860467760157e+19, "train_loss": 0.48163046043254915, "train_runtime": 15300.4436, "train_samples_per_second": 7.349, "train_steps_per_second": 1.838 } ], "logging_steps": 500, "max_steps": 28120, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1329860467760157e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }