{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.4296788482834994, "eval_steps": 20, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017718715393133997, "grad_norm": 14.5, "learning_rate": 5e-09, "loss": 0.5108, "step": 1 }, { "epoch": 0.035437430786267994, "grad_norm": 19.375, "learning_rate": 1e-08, "loss": 0.6139, "step": 2 }, { "epoch": 0.053156146179401995, "grad_norm": 17.0, "learning_rate": 1.5e-08, "loss": 0.5292, "step": 3 }, { "epoch": 0.07087486157253599, "grad_norm": 18.25, "learning_rate": 2e-08, "loss": 0.5878, "step": 4 }, { "epoch": 0.08859357696567, "grad_norm": 16.25, "learning_rate": 2.5e-08, "loss": 0.5313, "step": 5 }, { "epoch": 0.10631229235880399, "grad_norm": 15.5625, "learning_rate": 3e-08, "loss": 0.5137, "step": 6 }, { "epoch": 0.12403100775193798, "grad_norm": 16.125, "learning_rate": 3.5e-08, "loss": 0.52, "step": 7 }, { "epoch": 0.14174972314507198, "grad_norm": 16.5, "learning_rate": 4e-08, "loss": 0.5432, "step": 8 }, { "epoch": 0.15946843853820597, "grad_norm": 13.75, "learning_rate": 4.5e-08, "loss": 0.4886, "step": 9 }, { "epoch": 0.17718715393134, "grad_norm": 16.875, "learning_rate": 5e-08, "loss": 0.5696, "step": 10 }, { "epoch": 0.19490586932447398, "grad_norm": 16.0, "learning_rate": 5.4999999999999996e-08, "loss": 0.5412, "step": 11 }, { "epoch": 0.21262458471760798, "grad_norm": 15.9375, "learning_rate": 6e-08, "loss": 0.5447, "step": 12 }, { "epoch": 0.23034330011074197, "grad_norm": 17.875, "learning_rate": 6.5e-08, "loss": 0.5685, "step": 13 }, { "epoch": 0.24806201550387597, "grad_norm": 17.875, "learning_rate": 7e-08, "loss": 0.5531, "step": 14 }, { "epoch": 0.26578073089701, "grad_norm": 19.125, "learning_rate": 7.5e-08, "loss": 0.5978, "step": 15 }, { "epoch": 0.28349944629014395, "grad_norm": 15.125, "learning_rate": 8e-08, "loss": 0.4974, "step": 16 }, { "epoch": 0.301218161683278, "grad_norm": 17.25, "learning_rate": 8.500000000000001e-08, "loss": 0.5525, "step": 17 }, { "epoch": 0.31893687707641194, "grad_norm": 16.125, "learning_rate": 9e-08, "loss": 0.517, "step": 18 }, { "epoch": 0.33665559246954596, "grad_norm": 16.875, "learning_rate": 9.499999999999999e-08, "loss": 0.5459, "step": 19 }, { "epoch": 0.35437430786268, "grad_norm": 16.75, "learning_rate": 1e-07, "loss": 0.5251, "step": 20 }, { "epoch": 0.35437430786268, "eval_loss": 1.2212821245193481, "eval_runtime": 158.7537, "eval_samples_per_second": 1.72, "eval_steps_per_second": 1.72, "step": 20 }, { "epoch": 0.37209302325581395, "grad_norm": 19.25, "learning_rate": 1.0499999999999999e-07, "loss": 0.6009, "step": 21 }, { "epoch": 0.38981173864894797, "grad_norm": 15.9375, "learning_rate": 1.0999999999999999e-07, "loss": 0.5199, "step": 22 }, { "epoch": 0.40753045404208194, "grad_norm": 15.8125, "learning_rate": 1.15e-07, "loss": 0.5464, "step": 23 }, { "epoch": 0.42524916943521596, "grad_norm": 14.5625, "learning_rate": 1.2e-07, "loss": 0.505, "step": 24 }, { "epoch": 0.4429678848283499, "grad_norm": 17.25, "learning_rate": 1.25e-07, "loss": 0.52, "step": 25 }, { "epoch": 0.46068660022148394, "grad_norm": 16.75, "learning_rate": 1.3e-07, "loss": 0.5414, "step": 26 }, { "epoch": 0.47840531561461797, "grad_norm": 16.375, "learning_rate": 1.35e-07, "loss": 0.5295, "step": 27 }, { "epoch": 0.49612403100775193, "grad_norm": 17.75, "learning_rate": 1.4e-07, "loss": 0.5881, "step": 28 }, { "epoch": 0.5138427464008859, "grad_norm": 15.3125, "learning_rate": 1.45e-07, "loss": 0.5256, "step": 29 }, { "epoch": 0.53156146179402, "grad_norm": 14.375, "learning_rate": 1.5e-07, "loss": 0.4898, "step": 30 }, { "epoch": 0.5492801771871539, "grad_norm": 17.5, "learning_rate": 1.55e-07, "loss": 0.579, "step": 31 }, { "epoch": 0.5669988925802879, "grad_norm": 17.125, "learning_rate": 1.6e-07, "loss": 0.5493, "step": 32 }, { "epoch": 0.584717607973422, "grad_norm": 17.75, "learning_rate": 1.65e-07, "loss": 0.5534, "step": 33 }, { "epoch": 0.602436323366556, "grad_norm": 16.125, "learning_rate": 1.7000000000000001e-07, "loss": 0.5236, "step": 34 }, { "epoch": 0.6201550387596899, "grad_norm": 14.625, "learning_rate": 1.75e-07, "loss": 0.4911, "step": 35 }, { "epoch": 0.6378737541528239, "grad_norm": 15.4375, "learning_rate": 1.8e-07, "loss": 0.5075, "step": 36 }, { "epoch": 0.655592469545958, "grad_norm": 14.1875, "learning_rate": 1.85e-07, "loss": 0.4666, "step": 37 }, { "epoch": 0.6733111849390919, "grad_norm": 16.25, "learning_rate": 1.8999999999999998e-07, "loss": 0.5636, "step": 38 }, { "epoch": 0.6910299003322259, "grad_norm": 16.875, "learning_rate": 1.9499999999999999e-07, "loss": 0.5495, "step": 39 }, { "epoch": 0.70874861572536, "grad_norm": 18.0, "learning_rate": 2e-07, "loss": 0.5918, "step": 40 }, { "epoch": 0.70874861572536, "eval_loss": 1.1992905139923096, "eval_runtime": 158.6959, "eval_samples_per_second": 1.72, "eval_steps_per_second": 1.72, "step": 40 }, { "epoch": 0.7264673311184939, "grad_norm": 17.25, "learning_rate": 2.0499999999999997e-07, "loss": 0.5951, "step": 41 }, { "epoch": 0.7441860465116279, "grad_norm": 19.375, "learning_rate": 2.0999999999999997e-07, "loss": 0.5839, "step": 42 }, { "epoch": 0.7619047619047619, "grad_norm": 16.125, "learning_rate": 2.1499999999999998e-07, "loss": 0.5395, "step": 43 }, { "epoch": 0.7796234772978959, "grad_norm": 14.5, "learning_rate": 2.1999999999999998e-07, "loss": 0.4897, "step": 44 }, { "epoch": 0.7973421926910299, "grad_norm": 18.375, "learning_rate": 2.25e-07, "loss": 0.5672, "step": 45 }, { "epoch": 0.8150609080841639, "grad_norm": 14.5, "learning_rate": 2.3e-07, "loss": 0.4998, "step": 46 }, { "epoch": 0.832779623477298, "grad_norm": 14.375, "learning_rate": 2.3499999999999997e-07, "loss": 0.5193, "step": 47 }, { "epoch": 0.8504983388704319, "grad_norm": 15.8125, "learning_rate": 2.4e-07, "loss": 0.5346, "step": 48 }, { "epoch": 0.8682170542635659, "grad_norm": 14.125, "learning_rate": 2.45e-07, "loss": 0.4607, "step": 49 }, { "epoch": 0.8859357696566998, "grad_norm": 14.625, "learning_rate": 2.5e-07, "loss": 0.5139, "step": 50 }, { "epoch": 0.9036544850498339, "grad_norm": 14.5, "learning_rate": 2.55e-07, "loss": 0.4855, "step": 51 }, { "epoch": 0.9213732004429679, "grad_norm": 14.9375, "learning_rate": 2.6e-07, "loss": 0.5002, "step": 52 }, { "epoch": 0.9390919158361019, "grad_norm": 16.375, "learning_rate": 2.65e-07, "loss": 0.5471, "step": 53 }, { "epoch": 0.9568106312292359, "grad_norm": 15.3125, "learning_rate": 2.7e-07, "loss": 0.5188, "step": 54 }, { "epoch": 0.9745293466223699, "grad_norm": 15.0, "learning_rate": 2.75e-07, "loss": 0.5227, "step": 55 }, { "epoch": 0.9922480620155039, "grad_norm": 13.5625, "learning_rate": 2.8e-07, "loss": 0.4944, "step": 56 }, { "epoch": 1.0099667774086378, "grad_norm": 13.3125, "learning_rate": 2.8499999999999997e-07, "loss": 0.4779, "step": 57 }, { "epoch": 1.0276854928017718, "grad_norm": 14.25, "learning_rate": 2.9e-07, "loss": 0.5235, "step": 58 }, { "epoch": 1.0454042081949058, "grad_norm": 13.4375, "learning_rate": 2.95e-07, "loss": 0.5077, "step": 59 }, { "epoch": 1.06312292358804, "grad_norm": 12.0, "learning_rate": 3e-07, "loss": 0.4494, "step": 60 }, { "epoch": 1.06312292358804, "eval_loss": 1.1168979406356812, "eval_runtime": 158.9495, "eval_samples_per_second": 1.718, "eval_steps_per_second": 1.718, "step": 60 }, { "epoch": 1.080841638981174, "grad_norm": 14.5625, "learning_rate": 3.05e-07, "loss": 0.5247, "step": 61 }, { "epoch": 1.0985603543743079, "grad_norm": 13.5625, "learning_rate": 3.1e-07, "loss": 0.4839, "step": 62 }, { "epoch": 1.1162790697674418, "grad_norm": 13.5625, "learning_rate": 3.15e-07, "loss": 0.5015, "step": 63 }, { "epoch": 1.1339977851605758, "grad_norm": 15.5, "learning_rate": 3.2e-07, "loss": 0.5636, "step": 64 }, { "epoch": 1.1517165005537098, "grad_norm": 14.625, "learning_rate": 3.25e-07, "loss": 0.5375, "step": 65 }, { "epoch": 1.169435215946844, "grad_norm": 14.5625, "learning_rate": 3.3e-07, "loss": 0.5469, "step": 66 }, { "epoch": 1.187153931339978, "grad_norm": 12.125, "learning_rate": 3.35e-07, "loss": 0.4589, "step": 67 }, { "epoch": 1.204872646733112, "grad_norm": 11.8125, "learning_rate": 3.4000000000000003e-07, "loss": 0.4577, "step": 68 }, { "epoch": 1.2225913621262459, "grad_norm": 12.75, "learning_rate": 3.45e-07, "loss": 0.4597, "step": 69 }, { "epoch": 1.2403100775193798, "grad_norm": 10.4375, "learning_rate": 3.5e-07, "loss": 0.4009, "step": 70 }, { "epoch": 1.2580287929125138, "grad_norm": 12.125, "learning_rate": 3.55e-07, "loss": 0.4883, "step": 71 }, { "epoch": 1.2757475083056478, "grad_norm": 12.8125, "learning_rate": 3.6e-07, "loss": 0.488, "step": 72 }, { "epoch": 1.2934662236987817, "grad_norm": 14.0625, "learning_rate": 3.65e-07, "loss": 0.5073, "step": 73 }, { "epoch": 1.311184939091916, "grad_norm": 10.8125, "learning_rate": 3.7e-07, "loss": 0.4076, "step": 74 }, { "epoch": 1.3289036544850499, "grad_norm": 13.3125, "learning_rate": 3.75e-07, "loss": 0.4999, "step": 75 }, { "epoch": 1.3466223698781838, "grad_norm": 12.3125, "learning_rate": 3.7999999999999996e-07, "loss": 0.4513, "step": 76 }, { "epoch": 1.3643410852713178, "grad_norm": 10.5, "learning_rate": 3.8499999999999997e-07, "loss": 0.4267, "step": 77 }, { "epoch": 1.3820598006644518, "grad_norm": 12.0625, "learning_rate": 3.8999999999999997e-07, "loss": 0.4675, "step": 78 }, { "epoch": 1.3997785160575857, "grad_norm": 10.375, "learning_rate": 3.95e-07, "loss": 0.4157, "step": 79 }, { "epoch": 1.41749723145072, "grad_norm": 12.375, "learning_rate": 4e-07, "loss": 0.4751, "step": 80 }, { "epoch": 1.41749723145072, "eval_loss": 1.0955771207809448, "eval_runtime": 159.794, "eval_samples_per_second": 1.708, "eval_steps_per_second": 1.708, "step": 80 }, { "epoch": 1.435215946843854, "grad_norm": 9.25, "learning_rate": 4.05e-07, "loss": 0.4139, "step": 81 }, { "epoch": 1.4529346622369879, "grad_norm": 11.8125, "learning_rate": 4.0999999999999994e-07, "loss": 0.4691, "step": 82 }, { "epoch": 1.4706533776301218, "grad_norm": 9.625, "learning_rate": 4.1499999999999994e-07, "loss": 0.4047, "step": 83 }, { "epoch": 1.4883720930232558, "grad_norm": 11.5, "learning_rate": 4.1999999999999995e-07, "loss": 0.4555, "step": 84 }, { "epoch": 1.5060908084163898, "grad_norm": 8.375, "learning_rate": 4.2499999999999995e-07, "loss": 0.3627, "step": 85 }, { "epoch": 1.5238095238095237, "grad_norm": 10.5, "learning_rate": 4.2999999999999996e-07, "loss": 0.4305, "step": 86 }, { "epoch": 1.5415282392026577, "grad_norm": 11.0, "learning_rate": 4.3499999999999996e-07, "loss": 0.4732, "step": 87 }, { "epoch": 1.5592469545957917, "grad_norm": 10.1875, "learning_rate": 4.3999999999999997e-07, "loss": 0.4001, "step": 88 }, { "epoch": 1.5769656699889258, "grad_norm": 9.0, "learning_rate": 4.45e-07, "loss": 0.3758, "step": 89 }, { "epoch": 1.5946843853820598, "grad_norm": 10.25, "learning_rate": 4.5e-07, "loss": 0.4344, "step": 90 }, { "epoch": 1.6124031007751938, "grad_norm": 10.125, "learning_rate": 4.55e-07, "loss": 0.4525, "step": 91 }, { "epoch": 1.6301218161683277, "grad_norm": 10.125, "learning_rate": 4.6e-07, "loss": 0.4031, "step": 92 }, { "epoch": 1.647840531561462, "grad_norm": 9.875, "learning_rate": 4.65e-07, "loss": 0.4037, "step": 93 }, { "epoch": 1.665559246954596, "grad_norm": 9.5625, "learning_rate": 4.6999999999999995e-07, "loss": 0.4184, "step": 94 }, { "epoch": 1.6832779623477299, "grad_norm": 9.3125, "learning_rate": 4.7499999999999995e-07, "loss": 0.3953, "step": 95 }, { "epoch": 1.7009966777408638, "grad_norm": 8.3125, "learning_rate": 4.8e-07, "loss": 0.3842, "step": 96 }, { "epoch": 1.7187153931339978, "grad_norm": 8.5625, "learning_rate": 4.85e-07, "loss": 0.3609, "step": 97 }, { "epoch": 1.7364341085271318, "grad_norm": 8.1875, "learning_rate": 4.9e-07, "loss": 0.3448, "step": 98 }, { "epoch": 1.7541528239202657, "grad_norm": 8.9375, "learning_rate": 4.95e-07, "loss": 0.3931, "step": 99 }, { "epoch": 1.7718715393133997, "grad_norm": 9.0625, "learning_rate": 5e-07, "loss": 0.3817, "step": 100 }, { "epoch": 1.7718715393133997, "eval_loss": 1.021960735321045, "eval_runtime": 159.9796, "eval_samples_per_second": 1.706, "eval_steps_per_second": 1.706, "step": 100 }, { "epoch": 1.7895902547065337, "grad_norm": 8.0, "learning_rate": 4.999778497139454e-07, "loss": 0.3562, "step": 101 }, { "epoch": 1.8073089700996676, "grad_norm": 8.1875, "learning_rate": 4.999114027808631e-07, "loss": 0.3657, "step": 102 }, { "epoch": 1.8250276854928018, "grad_norm": 8.0625, "learning_rate": 4.998006709753016e-07, "loss": 0.3418, "step": 103 }, { "epoch": 1.8427464008859358, "grad_norm": 9.0625, "learning_rate": 4.996456739191904e-07, "loss": 0.373, "step": 104 }, { "epoch": 1.8604651162790697, "grad_norm": 7.875, "learning_rate": 4.994464390783624e-07, "loss": 0.3421, "step": 105 }, { "epoch": 1.878183831672204, "grad_norm": 9.375, "learning_rate": 4.992030017576875e-07, "loss": 0.3889, "step": 106 }, { "epoch": 1.895902547065338, "grad_norm": 8.5625, "learning_rate": 4.989154050948158e-07, "loss": 0.3617, "step": 107 }, { "epoch": 1.9136212624584719, "grad_norm": 9.0, "learning_rate": 4.985837000525343e-07, "loss": 0.3364, "step": 108 }, { "epoch": 1.9313399778516058, "grad_norm": 7.90625, "learning_rate": 4.982079454097353e-07, "loss": 0.3174, "step": 109 }, { "epoch": 1.9490586932447398, "grad_norm": 8.5, "learning_rate": 4.977882077510018e-07, "loss": 0.3359, "step": 110 }, { "epoch": 1.9667774086378738, "grad_norm": 7.90625, "learning_rate": 4.973245614548071e-07, "loss": 0.3314, "step": 111 }, { "epoch": 1.9844961240310077, "grad_norm": 8.875, "learning_rate": 4.968170886803361e-07, "loss": 0.3506, "step": 112 }, { "epoch": 2.0022148394241417, "grad_norm": 8.3125, "learning_rate": 4.962658793529257e-07, "loss": 0.2992, "step": 113 }, { "epoch": 2.0199335548172757, "grad_norm": 8.75, "learning_rate": 4.956710311481302e-07, "loss": 0.3275, "step": 114 }, { "epoch": 2.0376522702104096, "grad_norm": 8.5, "learning_rate": 4.950326494744127e-07, "loss": 0.3139, "step": 115 }, { "epoch": 2.0553709856035436, "grad_norm": 10.125, "learning_rate": 4.943508474544666e-07, "loss": 0.3305, "step": 116 }, { "epoch": 2.0730897009966776, "grad_norm": 9.0625, "learning_rate": 4.936257459051702e-07, "loss": 0.3196, "step": 117 }, { "epoch": 2.0908084163898115, "grad_norm": 7.59375, "learning_rate": 4.928574733161775e-07, "loss": 0.2779, "step": 118 }, { "epoch": 2.108527131782946, "grad_norm": 8.9375, "learning_rate": 4.920461658271491e-07, "loss": 0.2939, "step": 119 }, { "epoch": 2.12624584717608, "grad_norm": 9.3125, "learning_rate": 4.91191967203629e-07, "loss": 0.2925, "step": 120 }, { "epoch": 2.12624584717608, "eval_loss": 0.9208351969718933, "eval_runtime": 159.0286, "eval_samples_per_second": 1.717, "eval_steps_per_second": 1.717, "step": 120 }, { "epoch": 2.143964562569214, "grad_norm": 8.4375, "learning_rate": 4.902950288115678e-07, "loss": 0.2788, "step": 121 }, { "epoch": 2.161683277962348, "grad_norm": 8.125, "learning_rate": 4.893555095905013e-07, "loss": 0.2756, "step": 122 }, { "epoch": 2.179401993355482, "grad_norm": 8.5625, "learning_rate": 4.883735760253855e-07, "loss": 0.2651, "step": 123 }, { "epoch": 2.1971207087486158, "grad_norm": 7.8125, "learning_rate": 4.873494021170954e-07, "loss": 0.2578, "step": 124 }, { "epoch": 2.2148394241417497, "grad_norm": 7.75, "learning_rate": 4.862831693515908e-07, "loss": 0.2618, "step": 125 }, { "epoch": 2.2325581395348837, "grad_norm": 8.0, "learning_rate": 4.851750666677583e-07, "loss": 0.2672, "step": 126 }, { "epoch": 2.2502768549280177, "grad_norm": 7.4375, "learning_rate": 4.840252904239291e-07, "loss": 0.2548, "step": 127 }, { "epoch": 2.2679955703211516, "grad_norm": 7.1875, "learning_rate": 4.828340443630846e-07, "loss": 0.2538, "step": 128 }, { "epoch": 2.2857142857142856, "grad_norm": 7.3125, "learning_rate": 4.81601539576753e-07, "loss": 0.2433, "step": 129 }, { "epoch": 2.3034330011074196, "grad_norm": 6.3125, "learning_rate": 4.803279944676032e-07, "loss": 0.2461, "step": 130 }, { "epoch": 2.3211517165005535, "grad_norm": 5.96875, "learning_rate": 4.790136347107427e-07, "loss": 0.2473, "step": 131 }, { "epoch": 2.338870431893688, "grad_norm": 5.34375, "learning_rate": 4.776586932137283e-07, "loss": 0.2277, "step": 132 }, { "epoch": 2.356589147286822, "grad_norm": 5.3125, "learning_rate": 4.762634100752939e-07, "loss": 0.2352, "step": 133 }, { "epoch": 2.374307862679956, "grad_norm": 5.15625, "learning_rate": 4.748280325428048e-07, "loss": 0.2375, "step": 134 }, { "epoch": 2.39202657807309, "grad_norm": 5.625, "learning_rate": 4.7335281496844435e-07, "loss": 0.2583, "step": 135 }, { "epoch": 2.409745293466224, "grad_norm": 4.4375, "learning_rate": 4.7183801876414286e-07, "loss": 0.2133, "step": 136 }, { "epoch": 2.4274640088593578, "grad_norm": 5.125, "learning_rate": 4.702839123552541e-07, "loss": 0.2503, "step": 137 }, { "epoch": 2.4451827242524917, "grad_norm": 5.28125, "learning_rate": 4.6869077113299025e-07, "loss": 0.212, "step": 138 }, { "epoch": 2.4629014396456257, "grad_norm": 4.5625, "learning_rate": 4.670588774056218e-07, "loss": 0.2278, "step": 139 }, { "epoch": 2.4806201550387597, "grad_norm": 4.9375, "learning_rate": 4.653885203484515e-07, "loss": 0.2512, "step": 140 }, { "epoch": 2.4806201550387597, "eval_loss": 0.8999977111816406, "eval_runtime": 159.9391, "eval_samples_per_second": 1.707, "eval_steps_per_second": 1.707, "step": 140 }, { "epoch": 2.4983388704318936, "grad_norm": 4.875, "learning_rate": 4.636799959525726e-07, "loss": 0.2177, "step": 141 }, { "epoch": 2.5160575858250276, "grad_norm": 4.625, "learning_rate": 4.6193360697241766e-07, "loss": 0.2191, "step": 142 }, { "epoch": 2.5337763012181616, "grad_norm": 3.921875, "learning_rate": 4.601496628721108e-07, "loss": 0.2055, "step": 143 }, { "epoch": 2.5514950166112955, "grad_norm": 5.03125, "learning_rate": 4.583284797706287e-07, "loss": 0.255, "step": 144 }, { "epoch": 2.56921373200443, "grad_norm": 4.3125, "learning_rate": 4.564703803857848e-07, "loss": 0.2316, "step": 145 }, { "epoch": 2.5869324473975635, "grad_norm": 4.34375, "learning_rate": 4.545756939770422e-07, "loss": 0.2044, "step": 146 }, { "epoch": 2.604651162790698, "grad_norm": 4.125, "learning_rate": 4.5264475628716847e-07, "loss": 0.2089, "step": 147 }, { "epoch": 2.622369878183832, "grad_norm": 4.1875, "learning_rate": 4.5067790948274085e-07, "loss": 0.2115, "step": 148 }, { "epoch": 2.640088593576966, "grad_norm": 4.6875, "learning_rate": 4.4867550209351435e-07, "loss": 0.2485, "step": 149 }, { "epoch": 2.6578073089700998, "grad_norm": 4.09375, "learning_rate": 4.4663788895066065e-07, "loss": 0.1996, "step": 150 }, { "epoch": 2.6755260243632337, "grad_norm": 4.34375, "learning_rate": 4.4456543112389145e-07, "loss": 0.2082, "step": 151 }, { "epoch": 2.6932447397563677, "grad_norm": 4.1875, "learning_rate": 4.4245849585747655e-07, "loss": 0.1998, "step": 152 }, { "epoch": 2.7109634551495017, "grad_norm": 4.40625, "learning_rate": 4.403174565051666e-07, "loss": 0.214, "step": 153 }, { "epoch": 2.7286821705426356, "grad_norm": 4.59375, "learning_rate": 4.3814269246403456e-07, "loss": 0.2169, "step": 154 }, { "epoch": 2.7464008859357696, "grad_norm": 4.15625, "learning_rate": 4.3593458910724537e-07, "loss": 0.207, "step": 155 }, { "epoch": 2.7641196013289036, "grad_norm": 4.5, "learning_rate": 4.336935377157668e-07, "loss": 0.2221, "step": 156 }, { "epoch": 2.7818383167220375, "grad_norm": 4.15625, "learning_rate": 4.3141993540903397e-07, "loss": 0.2073, "step": 157 }, { "epoch": 2.7995570321151715, "grad_norm": 4.71875, "learning_rate": 4.2911418507457876e-07, "loss": 0.2317, "step": 158 }, { "epoch": 2.8172757475083055, "grad_norm": 4.5, "learning_rate": 4.2677669529663686e-07, "loss": 0.1929, "step": 159 }, { "epoch": 2.83499446290144, "grad_norm": 4.53125, "learning_rate": 4.244078802837462e-07, "loss": 0.226, "step": 160 }, { "epoch": 2.83499446290144, "eval_loss": 0.8948097825050354, "eval_runtime": 160.2748, "eval_samples_per_second": 1.703, "eval_steps_per_second": 1.703, "step": 160 }, { "epoch": 2.8527131782945734, "grad_norm": 4.0, "learning_rate": 4.220081597953479e-07, "loss": 0.2011, "step": 161 }, { "epoch": 2.870431893687708, "grad_norm": 4.03125, "learning_rate": 4.1957795906740403e-07, "loss": 0.199, "step": 162 }, { "epoch": 2.8881506090808418, "grad_norm": 4.0, "learning_rate": 4.171177087370451e-07, "loss": 0.1994, "step": 163 }, { "epoch": 2.9058693244739757, "grad_norm": 4.53125, "learning_rate": 4.146278447662597e-07, "loss": 0.2173, "step": 164 }, { "epoch": 2.9235880398671097, "grad_norm": 3.96875, "learning_rate": 4.121088083646413e-07, "loss": 0.2259, "step": 165 }, { "epoch": 2.9413067552602437, "grad_norm": 3.734375, "learning_rate": 4.09561045911205e-07, "loss": 0.1816, "step": 166 }, { "epoch": 2.9590254706533776, "grad_norm": 4.4375, "learning_rate": 4.0698500887528797e-07, "loss": 0.2038, "step": 167 }, { "epoch": 2.9767441860465116, "grad_norm": 4.09375, "learning_rate": 4.0438115373654795e-07, "loss": 0.2074, "step": 168 }, { "epoch": 2.9944629014396456, "grad_norm": 4.15625, "learning_rate": 4.0174994190407443e-07, "loss": 0.1812, "step": 169 }, { "epoch": 3.0121816168327795, "grad_norm": 3.65625, "learning_rate": 3.9909183963462536e-07, "loss": 0.1991, "step": 170 }, { "epoch": 3.0299003322259135, "grad_norm": 4.6875, "learning_rate": 3.9640731795000584e-07, "loss": 0.2092, "step": 171 }, { "epoch": 3.0476190476190474, "grad_norm": 4.8125, "learning_rate": 3.9369685255360173e-07, "loss": 0.1816, "step": 172 }, { "epoch": 3.0653377630121814, "grad_norm": 4.1875, "learning_rate": 3.9096092374608395e-07, "loss": 0.1895, "step": 173 }, { "epoch": 3.083056478405316, "grad_norm": 4.125, "learning_rate": 3.882000163402983e-07, "loss": 0.2041, "step": 174 }, { "epoch": 3.10077519379845, "grad_norm": 5.78125, "learning_rate": 3.8541461957535526e-07, "loss": 0.1944, "step": 175 }, { "epoch": 3.1184939091915838, "grad_norm": 3.90625, "learning_rate": 3.826052270299356e-07, "loss": 0.195, "step": 176 }, { "epoch": 3.1362126245847177, "grad_norm": 4.1875, "learning_rate": 3.7977233653482764e-07, "loss": 0.2008, "step": 177 }, { "epoch": 3.1539313399778517, "grad_norm": 4.5625, "learning_rate": 3.7691645008470997e-07, "loss": 0.2174, "step": 178 }, { "epoch": 3.1716500553709857, "grad_norm": 4.03125, "learning_rate": 3.740380737491971e-07, "loss": 0.1876, "step": 179 }, { "epoch": 3.1893687707641196, "grad_norm": 4.09375, "learning_rate": 3.7113771758316255e-07, "loss": 0.1859, "step": 180 }, { "epoch": 3.1893687707641196, "eval_loss": 0.9389155507087708, "eval_runtime": 159.3135, "eval_samples_per_second": 1.714, "eval_steps_per_second": 1.714, "step": 180 }, { "epoch": 3.2070874861572536, "grad_norm": 4.78125, "learning_rate": 3.6821589553635633e-07, "loss": 0.2219, "step": 181 }, { "epoch": 3.2248062015503876, "grad_norm": 4.34375, "learning_rate": 3.6527312536233147e-07, "loss": 0.2054, "step": 182 }, { "epoch": 3.2425249169435215, "grad_norm": 4.21875, "learning_rate": 3.623099285266972e-07, "loss": 0.1951, "step": 183 }, { "epoch": 3.2602436323366555, "grad_norm": 4.40625, "learning_rate": 3.593268301147139e-07, "loss": 0.2061, "step": 184 }, { "epoch": 3.2779623477297894, "grad_norm": 5.0625, "learning_rate": 3.5632435873824674e-07, "loss": 0.2058, "step": 185 }, { "epoch": 3.2956810631229234, "grad_norm": 4.65625, "learning_rate": 3.533030464420945e-07, "loss": 0.2071, "step": 186 }, { "epoch": 3.3133997785160574, "grad_norm": 5.0, "learning_rate": 3.5026342860971036e-07, "loss": 0.1898, "step": 187 }, { "epoch": 3.331118493909192, "grad_norm": 4.40625, "learning_rate": 3.472060438683302e-07, "loss": 0.2048, "step": 188 }, { "epoch": 3.3488372093023258, "grad_norm": 4.03125, "learning_rate": 3.4413143399352656e-07, "loss": 0.2041, "step": 189 }, { "epoch": 3.3665559246954597, "grad_norm": 3.765625, "learning_rate": 3.4104014381320555e-07, "loss": 0.1948, "step": 190 }, { "epoch": 3.3842746400885937, "grad_norm": 4.875, "learning_rate": 3.379327211110612e-07, "loss": 0.2116, "step": 191 }, { "epoch": 3.4019933554817277, "grad_norm": 4.09375, "learning_rate": 3.348097165295075e-07, "loss": 0.2208, "step": 192 }, { "epoch": 3.4197120708748616, "grad_norm": 4.4375, "learning_rate": 3.316716834721032e-07, "loss": 0.2111, "step": 193 }, { "epoch": 3.4374307862679956, "grad_norm": 4.59375, "learning_rate": 3.2851917800548725e-07, "loss": 0.2107, "step": 194 }, { "epoch": 3.4551495016611296, "grad_norm": 3.890625, "learning_rate": 3.2535275876084246e-07, "loss": 0.1875, "step": 195 }, { "epoch": 3.4728682170542635, "grad_norm": 4.09375, "learning_rate": 3.2217298683490525e-07, "loss": 0.2047, "step": 196 }, { "epoch": 3.4905869324473975, "grad_norm": 3.875, "learning_rate": 3.1898042569053765e-07, "loss": 0.186, "step": 197 }, { "epoch": 3.5083056478405314, "grad_norm": 4.1875, "learning_rate": 3.157756410568803e-07, "loss": 0.207, "step": 198 }, { "epoch": 3.5260243632336654, "grad_norm": 4.1875, "learning_rate": 3.125592008291044e-07, "loss": 0.1971, "step": 199 }, { "epoch": 3.5437430786267994, "grad_norm": 4.0625, "learning_rate": 3.0933167496777873e-07, "loss": 0.2053, "step": 200 }, { "epoch": 3.5437430786267994, "eval_loss": 0.8853705525398254, "eval_runtime": 160.263, "eval_samples_per_second": 1.703, "eval_steps_per_second": 1.703, "step": 200 }, { "epoch": 3.561461794019934, "grad_norm": 5.75, "learning_rate": 3.0609363539787204e-07, "loss": 0.187, "step": 201 }, { "epoch": 3.5791805094130673, "grad_norm": 4.375, "learning_rate": 3.0284565590740607e-07, "loss": 0.2093, "step": 202 }, { "epoch": 3.5968992248062017, "grad_norm": 4.0625, "learning_rate": 2.9958831204577945e-07, "loss": 0.1965, "step": 203 }, { "epoch": 3.6146179401993357, "grad_norm": 3.875, "learning_rate": 2.9632218102177856e-07, "loss": 0.204, "step": 204 }, { "epoch": 3.6323366555924697, "grad_norm": 4.96875, "learning_rate": 2.930478416012953e-07, "loss": 0.1965, "step": 205 }, { "epoch": 3.6500553709856036, "grad_norm": 4.03125, "learning_rate": 2.89765874004768e-07, "loss": 0.1767, "step": 206 }, { "epoch": 3.6677740863787376, "grad_norm": 6.5625, "learning_rate": 2.864768598043654e-07, "loss": 0.194, "step": 207 }, { "epoch": 3.6854928017718716, "grad_norm": 4.09375, "learning_rate": 2.8318138182093047e-07, "loss": 0.2015, "step": 208 }, { "epoch": 3.7032115171650055, "grad_norm": 3.671875, "learning_rate": 2.798800240207034e-07, "loss": 0.1805, "step": 209 }, { "epoch": 3.7209302325581395, "grad_norm": 4.625, "learning_rate": 2.7657337141184134e-07, "loss": 0.2159, "step": 210 }, { "epoch": 3.7386489479512734, "grad_norm": 3.96875, "learning_rate": 2.732620099407536e-07, "loss": 0.1664, "step": 211 }, { "epoch": 3.7563676633444074, "grad_norm": 4.34375, "learning_rate": 2.6994652638827075e-07, "loss": 0.1906, "step": 212 }, { "epoch": 3.7740863787375414, "grad_norm": 3.625, "learning_rate": 2.666275082656656e-07, "loss": 0.1926, "step": 213 }, { "epoch": 3.791805094130676, "grad_norm": 4.1875, "learning_rate": 2.633055437105446e-07, "loss": 0.2067, "step": 214 }, { "epoch": 3.8095238095238093, "grad_norm": 4.0625, "learning_rate": 2.599812213826294e-07, "loss": 0.2013, "step": 215 }, { "epoch": 3.8272425249169437, "grad_norm": 4.125, "learning_rate": 2.566551303594437e-07, "loss": 0.1913, "step": 216 }, { "epoch": 3.8449612403100772, "grad_norm": 4.375, "learning_rate": 2.5332786003192846e-07, "loss": 0.1929, "step": 217 }, { "epoch": 3.8626799557032117, "grad_norm": 3.96875, "learning_rate": 2.5e-07, "loss": 0.1859, "step": 218 }, { "epoch": 3.8803986710963456, "grad_norm": 4.34375, "learning_rate": 2.466721399680716e-07, "loss": 0.1978, "step": 219 }, { "epoch": 3.8981173864894796, "grad_norm": 4.09375, "learning_rate": 2.433448696405563e-07, "loss": 0.1826, "step": 220 }, { "epoch": 3.8981173864894796, "eval_loss": 0.9020561575889587, "eval_runtime": 158.9544, "eval_samples_per_second": 1.717, "eval_steps_per_second": 1.717, "step": 220 }, { "epoch": 3.9158361018826136, "grad_norm": 3.640625, "learning_rate": 2.4001877861737067e-07, "loss": 0.1973, "step": 221 }, { "epoch": 3.9335548172757475, "grad_norm": 4.75, "learning_rate": 2.3669445628945538e-07, "loss": 0.1839, "step": 222 }, { "epoch": 3.9512735326688815, "grad_norm": 4.78125, "learning_rate": 2.3337249173433443e-07, "loss": 0.1919, "step": 223 }, { "epoch": 3.9689922480620154, "grad_norm": 4.34375, "learning_rate": 2.300534736117292e-07, "loss": 0.21, "step": 224 }, { "epoch": 3.9867109634551494, "grad_norm": 5.125, "learning_rate": 2.2673799005924637e-07, "loss": 0.1828, "step": 225 }, { "epoch": 4.004429678848283, "grad_norm": 4.34375, "learning_rate": 2.2342662858815867e-07, "loss": 0.1934, "step": 226 }, { "epoch": 4.022148394241418, "grad_norm": 6.25, "learning_rate": 2.2011997597929656e-07, "loss": 0.1774, "step": 227 }, { "epoch": 4.039867109634551, "grad_norm": 3.875, "learning_rate": 2.168186181790695e-07, "loss": 0.1919, "step": 228 }, { "epoch": 4.057585825027686, "grad_norm": 8.75, "learning_rate": 2.1352314019563457e-07, "loss": 0.2059, "step": 229 }, { "epoch": 4.075304540420819, "grad_norm": 3.953125, "learning_rate": 2.10234125995232e-07, "loss": 0.1821, "step": 230 }, { "epoch": 4.093023255813954, "grad_norm": 4.75, "learning_rate": 2.0695215839870476e-07, "loss": 0.2125, "step": 231 }, { "epoch": 4.110741971207087, "grad_norm": 4.375, "learning_rate": 2.0367781897822144e-07, "loss": 0.2207, "step": 232 }, { "epoch": 4.128460686600222, "grad_norm": 3.578125, "learning_rate": 2.004116879542205e-07, "loss": 0.1892, "step": 233 }, { "epoch": 4.146179401993355, "grad_norm": 4.375, "learning_rate": 1.971543440925939e-07, "loss": 0.1904, "step": 234 }, { "epoch": 4.1638981173864895, "grad_norm": 3.953125, "learning_rate": 1.93906364602128e-07, "loss": 0.192, "step": 235 }, { "epoch": 4.181616832779623, "grad_norm": 4.0625, "learning_rate": 1.9066832503222128e-07, "loss": 0.2023, "step": 236 }, { "epoch": 4.1993355481727574, "grad_norm": 3.78125, "learning_rate": 1.8744079917089568e-07, "loss": 0.178, "step": 237 }, { "epoch": 4.217054263565892, "grad_norm": 4.15625, "learning_rate": 1.8422435894311973e-07, "loss": 0.1786, "step": 238 }, { "epoch": 4.234772978959025, "grad_norm": 3.953125, "learning_rate": 1.8101957430946238e-07, "loss": 0.1732, "step": 239 }, { "epoch": 4.25249169435216, "grad_norm": 3.796875, "learning_rate": 1.7782701316509478e-07, "loss": 0.173, "step": 240 }, { "epoch": 4.25249169435216, "eval_loss": 0.8932636976242065, "eval_runtime": 158.3728, "eval_samples_per_second": 1.724, "eval_steps_per_second": 1.724, "step": 240 }, { "epoch": 4.270210409745293, "grad_norm": 3.984375, "learning_rate": 1.7464724123915757e-07, "loss": 0.1949, "step": 241 }, { "epoch": 4.287929125138428, "grad_norm": 3.84375, "learning_rate": 1.7148082199451286e-07, "loss": 0.1821, "step": 242 }, { "epoch": 4.305647840531561, "grad_norm": 4.09375, "learning_rate": 1.6832831652789672e-07, "loss": 0.1916, "step": 243 }, { "epoch": 4.323366555924696, "grad_norm": 4.4375, "learning_rate": 1.651902834704924e-07, "loss": 0.201, "step": 244 }, { "epoch": 4.341085271317829, "grad_norm": 3.875, "learning_rate": 1.6206727888893873e-07, "loss": 0.1821, "step": 245 }, { "epoch": 4.358803986710964, "grad_norm": 3.65625, "learning_rate": 1.5895985618679445e-07, "loss": 0.1713, "step": 246 }, { "epoch": 4.376522702104097, "grad_norm": 3.953125, "learning_rate": 1.5586856600647344e-07, "loss": 0.1907, "step": 247 }, { "epoch": 4.3942414174972315, "grad_norm": 3.875, "learning_rate": 1.5279395613166985e-07, "loss": 0.1828, "step": 248 }, { "epoch": 4.411960132890365, "grad_norm": 3.984375, "learning_rate": 1.497365713902896e-07, "loss": 0.1827, "step": 249 }, { "epoch": 4.4296788482834994, "grad_norm": 4.3125, "learning_rate": 1.4669695355790552e-07, "loss": 0.193, "step": 250 } ], "logging_steps": 1, "max_steps": 336, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.11806271012864e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }