{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.6, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 1.2292424440383911, "learning_rate": 9.949748743718594e-05, "loss": 2.6316, "step": 100 }, { "epoch": 0.032, "grad_norm": 1.361342191696167, "learning_rate": 9.84924623115578e-05, "loss": 1.9567, "step": 200 }, { "epoch": 0.048, "grad_norm": 1.3794206380844116, "learning_rate": 9.748743718592965e-05, "loss": 1.8415, "step": 300 }, { "epoch": 0.064, "grad_norm": 1.2263352870941162, "learning_rate": 9.64824120603015e-05, "loss": 1.8072, "step": 400 }, { "epoch": 0.08, "grad_norm": 1.1767858266830444, "learning_rate": 9.547738693467337e-05, "loss": 1.7898, "step": 500 }, { "epoch": 0.096, "grad_norm": 1.173898696899414, "learning_rate": 9.447236180904523e-05, "loss": 1.7314, "step": 600 }, { "epoch": 0.112, "grad_norm": 1.2088936567306519, "learning_rate": 9.34673366834171e-05, "loss": 1.7011, "step": 700 }, { "epoch": 0.128, "grad_norm": 0.9866878986358643, "learning_rate": 9.246231155778895e-05, "loss": 1.6663, "step": 800 }, { "epoch": 0.144, "grad_norm": 1.2384347915649414, "learning_rate": 9.14572864321608e-05, "loss": 1.6507, "step": 900 }, { "epoch": 0.16, "grad_norm": 1.065297245979309, "learning_rate": 9.045226130653267e-05, "loss": 1.6705, "step": 1000 }, { "epoch": 0.176, "grad_norm": 1.1226449012756348, "learning_rate": 8.944723618090453e-05, "loss": 1.6577, "step": 1100 }, { "epoch": 0.192, "grad_norm": 0.9142518639564514, "learning_rate": 8.84422110552764e-05, "loss": 1.612, "step": 1200 }, { "epoch": 0.208, "grad_norm": 1.1804460287094116, "learning_rate": 8.743718592964825e-05, "loss": 1.6083, "step": 1300 }, { "epoch": 0.224, "grad_norm": 1.1100006103515625, "learning_rate": 8.64321608040201e-05, "loss": 1.6242, "step": 1400 }, { "epoch": 0.24, "grad_norm": 1.1566694974899292, "learning_rate": 8.542713567839196e-05, "loss": 1.6111, "step": 1500 }, { "epoch": 0.256, "grad_norm": 1.094859004020691, "learning_rate": 8.442211055276383e-05, "loss": 1.5816, "step": 1600 }, { "epoch": 0.272, "grad_norm": 1.2286021709442139, "learning_rate": 8.341708542713568e-05, "loss": 1.578, "step": 1700 }, { "epoch": 0.288, "grad_norm": 1.0682488679885864, "learning_rate": 8.241206030150754e-05, "loss": 1.5795, "step": 1800 }, { "epoch": 0.304, "grad_norm": 1.1403608322143555, "learning_rate": 8.14070351758794e-05, "loss": 1.5676, "step": 1900 }, { "epoch": 0.32, "grad_norm": 1.0942330360412598, "learning_rate": 8.040201005025126e-05, "loss": 1.5749, "step": 2000 }, { "epoch": 0.336, "grad_norm": 1.060088872909546, "learning_rate": 7.939698492462313e-05, "loss": 1.5184, "step": 2100 }, { "epoch": 0.352, "grad_norm": 1.085312008857727, "learning_rate": 7.839195979899498e-05, "loss": 1.5339, "step": 2200 }, { "epoch": 0.368, "grad_norm": 1.303536295890808, "learning_rate": 7.738693467336684e-05, "loss": 1.515, "step": 2300 }, { "epoch": 0.384, "grad_norm": 0.9337490797042847, "learning_rate": 7.638190954773869e-05, "loss": 1.5243, "step": 2400 }, { "epoch": 0.4, "grad_norm": 1.2959569692611694, "learning_rate": 7.537688442211056e-05, "loss": 1.4846, "step": 2500 }, { "epoch": 0.416, "grad_norm": 1.1419408321380615, "learning_rate": 7.437185929648241e-05, "loss": 1.5158, "step": 2600 }, { "epoch": 0.432, "grad_norm": 0.9983295202255249, "learning_rate": 7.336683417085427e-05, "loss": 1.4873, "step": 2700 }, { "epoch": 0.448, "grad_norm": 1.1773889064788818, "learning_rate": 7.236180904522614e-05, "loss": 1.4894, "step": 2800 }, { "epoch": 0.464, "grad_norm": 1.2258810997009277, "learning_rate": 7.135678391959799e-05, "loss": 1.5015, "step": 2900 }, { "epoch": 0.48, "grad_norm": 1.1287764310836792, "learning_rate": 7.035175879396985e-05, "loss": 1.5166, "step": 3000 }, { "epoch": 0.496, "grad_norm": 1.1293085813522339, "learning_rate": 6.93467336683417e-05, "loss": 1.5117, "step": 3100 }, { "epoch": 0.512, "grad_norm": 1.0602566003799438, "learning_rate": 6.834170854271357e-05, "loss": 1.455, "step": 3200 }, { "epoch": 0.528, "grad_norm": 1.2482367753982544, "learning_rate": 6.733668341708544e-05, "loss": 1.4356, "step": 3300 }, { "epoch": 0.544, "grad_norm": 1.35064697265625, "learning_rate": 6.633165829145729e-05, "loss": 1.4528, "step": 3400 }, { "epoch": 0.56, "grad_norm": 1.065523386001587, "learning_rate": 6.532663316582915e-05, "loss": 1.4706, "step": 3500 }, { "epoch": 0.576, "grad_norm": 1.4030505418777466, "learning_rate": 6.4321608040201e-05, "loss": 1.4325, "step": 3600 }, { "epoch": 0.592, "grad_norm": 1.1023573875427246, "learning_rate": 6.331658291457287e-05, "loss": 1.455, "step": 3700 }, { "epoch": 0.608, "grad_norm": 1.179084062576294, "learning_rate": 6.231155778894473e-05, "loss": 1.4552, "step": 3800 }, { "epoch": 0.624, "grad_norm": 1.0885223150253296, "learning_rate": 6.130653266331658e-05, "loss": 1.4178, "step": 3900 }, { "epoch": 0.64, "grad_norm": 1.3725833892822266, "learning_rate": 6.030150753768844e-05, "loss": 1.456, "step": 4000 }, { "epoch": 0.656, "grad_norm": 1.1671427488327026, "learning_rate": 5.929648241206031e-05, "loss": 1.4552, "step": 4100 }, { "epoch": 0.672, "grad_norm": 1.0521718263626099, "learning_rate": 5.829145728643216e-05, "loss": 1.4236, "step": 4200 }, { "epoch": 0.688, "grad_norm": 1.1262151002883911, "learning_rate": 5.728643216080403e-05, "loss": 1.456, "step": 4300 }, { "epoch": 0.704, "grad_norm": 1.090331792831421, "learning_rate": 5.628140703517588e-05, "loss": 1.4021, "step": 4400 }, { "epoch": 0.72, "grad_norm": 1.1581507921218872, "learning_rate": 5.527638190954774e-05, "loss": 1.4708, "step": 4500 }, { "epoch": 0.736, "grad_norm": 1.1916351318359375, "learning_rate": 5.4271356783919604e-05, "loss": 1.4283, "step": 4600 }, { "epoch": 0.752, "grad_norm": 1.2623261213302612, "learning_rate": 5.3266331658291455e-05, "loss": 1.4593, "step": 4700 }, { "epoch": 0.768, "grad_norm": 1.2002214193344116, "learning_rate": 5.226130653266332e-05, "loss": 1.4387, "step": 4800 }, { "epoch": 0.784, "grad_norm": 1.0627392530441284, "learning_rate": 5.125628140703518e-05, "loss": 1.4313, "step": 4900 }, { "epoch": 0.8, "grad_norm": 1.2739390134811401, "learning_rate": 5.0251256281407036e-05, "loss": 1.4024, "step": 5000 }, { "epoch": 0.816, "grad_norm": 1.3108317852020264, "learning_rate": 4.92462311557789e-05, "loss": 1.4385, "step": 5100 }, { "epoch": 0.832, "grad_norm": 1.4682525396347046, "learning_rate": 4.824120603015075e-05, "loss": 1.4015, "step": 5200 }, { "epoch": 0.848, "grad_norm": 1.301832675933838, "learning_rate": 4.723618090452262e-05, "loss": 1.3995, "step": 5300 }, { "epoch": 0.864, "grad_norm": 1.3100578784942627, "learning_rate": 4.6231155778894475e-05, "loss": 1.4203, "step": 5400 }, { "epoch": 0.88, "grad_norm": 1.2472883462905884, "learning_rate": 4.522613065326633e-05, "loss": 1.3984, "step": 5500 }, { "epoch": 0.896, "grad_norm": 1.1501699686050415, "learning_rate": 4.42211055276382e-05, "loss": 1.4177, "step": 5600 }, { "epoch": 0.912, "grad_norm": 1.306634783744812, "learning_rate": 4.321608040201005e-05, "loss": 1.4013, "step": 5700 }, { "epoch": 0.928, "grad_norm": 1.199546217918396, "learning_rate": 4.2211055276381914e-05, "loss": 1.3998, "step": 5800 }, { "epoch": 0.944, "grad_norm": 1.4669443368911743, "learning_rate": 4.120603015075377e-05, "loss": 1.3858, "step": 5900 }, { "epoch": 0.96, "grad_norm": 1.1618568897247314, "learning_rate": 4.020100502512563e-05, "loss": 1.3952, "step": 6000 }, { "epoch": 0.976, "grad_norm": 1.3658894300460815, "learning_rate": 3.919597989949749e-05, "loss": 1.34, "step": 6100 }, { "epoch": 0.992, "grad_norm": 1.1548917293548584, "learning_rate": 3.8190954773869346e-05, "loss": 1.3753, "step": 6200 }, { "epoch": 1.008, "grad_norm": 1.250981092453003, "learning_rate": 3.7185929648241204e-05, "loss": 1.363, "step": 6300 }, { "epoch": 1.024, "grad_norm": 1.1988142728805542, "learning_rate": 3.618090452261307e-05, "loss": 1.2739, "step": 6400 }, { "epoch": 1.04, "grad_norm": 1.3094350099563599, "learning_rate": 3.517587939698493e-05, "loss": 1.3268, "step": 6500 }, { "epoch": 1.056, "grad_norm": 1.4513778686523438, "learning_rate": 3.4170854271356785e-05, "loss": 1.3114, "step": 6600 }, { "epoch": 1.072, "grad_norm": 1.2981783151626587, "learning_rate": 3.3165829145728643e-05, "loss": 1.2866, "step": 6700 }, { "epoch": 1.088, "grad_norm": 1.350372314453125, "learning_rate": 3.21608040201005e-05, "loss": 1.2909, "step": 6800 }, { "epoch": 1.104, "grad_norm": 1.1077184677124023, "learning_rate": 3.1155778894472366e-05, "loss": 1.2278, "step": 6900 }, { "epoch": 1.12, "grad_norm": 1.3056607246398926, "learning_rate": 3.015075376884422e-05, "loss": 1.2573, "step": 7000 }, { "epoch": 1.1360000000000001, "grad_norm": 1.38368558883667, "learning_rate": 2.914572864321608e-05, "loss": 1.3041, "step": 7100 }, { "epoch": 1.152, "grad_norm": 1.7526077032089233, "learning_rate": 2.814070351758794e-05, "loss": 1.3056, "step": 7200 }, { "epoch": 1.168, "grad_norm": 1.1916877031326294, "learning_rate": 2.7135678391959802e-05, "loss": 1.2359, "step": 7300 }, { "epoch": 1.184, "grad_norm": 1.326968789100647, "learning_rate": 2.613065326633166e-05, "loss": 1.2529, "step": 7400 }, { "epoch": 1.2, "grad_norm": 1.502866506576538, "learning_rate": 2.5125628140703518e-05, "loss": 1.3043, "step": 7500 }, { "epoch": 1.216, "grad_norm": 1.7037489414215088, "learning_rate": 2.4120603015075376e-05, "loss": 1.3254, "step": 7600 }, { "epoch": 1.232, "grad_norm": 1.3369475603103638, "learning_rate": 2.3115577889447238e-05, "loss": 1.3274, "step": 7700 }, { "epoch": 1.248, "grad_norm": 1.3407210111618042, "learning_rate": 2.21105527638191e-05, "loss": 1.2879, "step": 7800 }, { "epoch": 1.264, "grad_norm": 1.5996978282928467, "learning_rate": 2.1105527638190957e-05, "loss": 1.2853, "step": 7900 }, { "epoch": 1.28, "grad_norm": 1.3061344623565674, "learning_rate": 2.0100502512562815e-05, "loss": 1.274, "step": 8000 }, { "epoch": 1.296, "grad_norm": 1.335577130317688, "learning_rate": 1.9105527638190956e-05, "loss": 1.2482, "step": 8100 }, { "epoch": 1.312, "grad_norm": 1.632110834121704, "learning_rate": 1.8100502512562814e-05, "loss": 1.2849, "step": 8200 }, { "epoch": 1.328, "grad_norm": 1.457372784614563, "learning_rate": 1.7095477386934675e-05, "loss": 1.27, "step": 8300 }, { "epoch": 1.3439999999999999, "grad_norm": 1.3104965686798096, "learning_rate": 1.6090452261306533e-05, "loss": 1.2698, "step": 8400 }, { "epoch": 1.3599999999999999, "grad_norm": 1.350401520729065, "learning_rate": 1.5085427135678393e-05, "loss": 1.2337, "step": 8500 }, { "epoch": 1.376, "grad_norm": 1.3079415559768677, "learning_rate": 1.4080402010050253e-05, "loss": 1.2904, "step": 8600 }, { "epoch": 1.392, "grad_norm": 1.3506203889846802, "learning_rate": 1.3075376884422111e-05, "loss": 1.2847, "step": 8700 }, { "epoch": 1.408, "grad_norm": 1.4178451299667358, "learning_rate": 1.2070351758793969e-05, "loss": 1.2713, "step": 8800 }, { "epoch": 1.424, "grad_norm": 1.2672168016433716, "learning_rate": 1.106532663316583e-05, "loss": 1.2634, "step": 8900 }, { "epoch": 1.44, "grad_norm": 1.4467307329177856, "learning_rate": 1.0070351758793971e-05, "loss": 1.2868, "step": 9000 }, { "epoch": 1.456, "grad_norm": 1.5032036304473877, "learning_rate": 9.06532663316583e-06, "loss": 1.2323, "step": 9100 }, { "epoch": 1.472, "grad_norm": 1.1872940063476562, "learning_rate": 8.060301507537689e-06, "loss": 1.2868, "step": 9200 }, { "epoch": 1.488, "grad_norm": 1.6626771688461304, "learning_rate": 7.055276381909548e-06, "loss": 1.27, "step": 9300 }, { "epoch": 1.504, "grad_norm": 1.3130452632904053, "learning_rate": 6.050251256281407e-06, "loss": 1.2542, "step": 9400 }, { "epoch": 1.52, "grad_norm": 1.4746296405792236, "learning_rate": 5.045226130653267e-06, "loss": 1.257, "step": 9500 }, { "epoch": 1.536, "grad_norm": 1.3648103475570679, "learning_rate": 4.0402010050251256e-06, "loss": 1.2487, "step": 9600 }, { "epoch": 1.552, "grad_norm": 1.3191380500793457, "learning_rate": 3.035175879396985e-06, "loss": 1.2557, "step": 9700 }, { "epoch": 1.568, "grad_norm": 1.806413173675537, "learning_rate": 2.0301507537688442e-06, "loss": 1.2323, "step": 9800 }, { "epoch": 1.584, "grad_norm": 1.6092606782913208, "learning_rate": 1.0251256281407035e-06, "loss": 1.2321, "step": 9900 }, { "epoch": 1.6, "grad_norm": 1.3367202281951904, "learning_rate": 2.0100502512562817e-08, "loss": 1.2361, "step": 10000 } ], "logging_steps": 100, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.995709021001564e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }