{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.4696485623003195, "eval_steps": 39, "global_step": 117, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 0.06435385346412659, "learning_rate": 1e-05, "loss": 0.6574, "step": 1 }, { "epoch": 0.01, "eval_loss": 0.7461566925048828, "eval_runtime": 151.7973, "eval_samples_per_second": 1.482, "eval_steps_per_second": 0.376, "step": 1 }, { "epoch": 0.03, "grad_norm": 0.062134526669979095, "learning_rate": 2e-05, "loss": 0.6897, "step": 2 }, { "epoch": 0.04, "grad_norm": 0.06962593644857407, "learning_rate": 3e-05, "loss": 0.6974, "step": 3 }, { "epoch": 0.05, "grad_norm": 0.0730314701795578, "learning_rate": 4e-05, "loss": 0.7454, "step": 4 }, { "epoch": 0.06, "grad_norm": 0.07659862190485, "learning_rate": 5e-05, "loss": 0.6962, "step": 5 }, { "epoch": 0.08, "grad_norm": 0.07314729690551758, "learning_rate": 6e-05, "loss": 0.6602, "step": 6 }, { "epoch": 0.09, "grad_norm": 0.0892895832657814, "learning_rate": 7e-05, "loss": 0.6594, "step": 7 }, { "epoch": 0.1, "grad_norm": 0.11520648747682571, "learning_rate": 8e-05, "loss": 0.7406, "step": 8 }, { "epoch": 0.12, "grad_norm": 0.1333557516336441, "learning_rate": 9e-05, "loss": 0.663, "step": 9 }, { "epoch": 0.13, "grad_norm": 0.14017663896083832, "learning_rate": 0.0001, "loss": 0.6185, "step": 10 }, { "epoch": 0.14, "grad_norm": 0.1453818678855896, "learning_rate": 0.00011000000000000002, "loss": 0.602, "step": 11 }, { "epoch": 0.15, "grad_norm": 0.14279265701770782, "learning_rate": 0.00012, "loss": 0.6151, "step": 12 }, { "epoch": 0.17, "grad_norm": 0.11934962123632431, "learning_rate": 0.00013000000000000002, "loss": 0.5085, "step": 13 }, { "epoch": 0.18, "grad_norm": 0.11375121772289276, "learning_rate": 0.00014, "loss": 0.4823, "step": 14 }, { "epoch": 0.19, "grad_norm": 0.09707039594650269, "learning_rate": 0.00015000000000000001, "loss": 0.4281, "step": 15 }, { "epoch": 0.2, "grad_norm": 0.11333765089511871, "learning_rate": 0.00016, "loss": 0.4417, "step": 16 }, { "epoch": 0.22, "grad_norm": 0.09926053136587143, "learning_rate": 0.00017, "loss": 0.3654, "step": 17 }, { "epoch": 0.23, "grad_norm": 0.1027929037809372, "learning_rate": 0.00018, "loss": 0.3748, "step": 18 }, { "epoch": 0.24, "grad_norm": 0.09768980741500854, "learning_rate": 0.00019, "loss": 0.3306, "step": 19 }, { "epoch": 0.26, "grad_norm": 0.08072198927402496, "learning_rate": 0.0002, "loss": 0.355, "step": 20 }, { "epoch": 0.27, "grad_norm": 0.06444709748029709, "learning_rate": 0.00019997332081116373, "loss": 0.3152, "step": 21 }, { "epoch": 0.28, "grad_norm": 0.0576409213244915, "learning_rate": 0.00019989329748023725, "loss": 0.2674, "step": 22 }, { "epoch": 0.29, "grad_norm": 0.08624427020549774, "learning_rate": 0.0001997599727063717, "loss": 0.2807, "step": 23 }, { "epoch": 0.31, "grad_norm": 0.08828990906476974, "learning_rate": 0.00019957341762950344, "loss": 0.2736, "step": 24 }, { "epoch": 0.32, "grad_norm": 0.0641990676522255, "learning_rate": 0.00019933373179239502, "loss": 0.2427, "step": 25 }, { "epoch": 0.33, "grad_norm": 0.05487390235066414, "learning_rate": 0.0001990410430875205, "loss": 0.2737, "step": 26 }, { "epoch": 0.35, "grad_norm": 0.0666637197136879, "learning_rate": 0.00019869550768882455, "loss": 0.2806, "step": 27 }, { "epoch": 0.36, "grad_norm": 0.053027719259262085, "learning_rate": 0.0001982973099683902, "loss": 0.2664, "step": 28 }, { "epoch": 0.37, "grad_norm": 0.055217448621988297, "learning_rate": 0.0001978466623980609, "loss": 0.309, "step": 29 }, { "epoch": 0.38, "grad_norm": 0.05493360385298729, "learning_rate": 0.0001973438054360693, "loss": 0.2318, "step": 30 }, { "epoch": 0.4, "grad_norm": 0.0376153439283371, "learning_rate": 0.00019678900739873226, "loss": 0.2049, "step": 31 }, { "epoch": 0.41, "grad_norm": 0.03811287507414818, "learning_rate": 0.00019618256431728194, "loss": 0.2771, "step": 32 }, { "epoch": 0.42, "grad_norm": 0.04185184836387634, "learning_rate": 0.000195524799779908, "loss": 0.1843, "step": 33 }, { "epoch": 0.43, "grad_norm": 0.04743755981326103, "learning_rate": 0.0001948160647590966, "loss": 0.2414, "step": 34 }, { "epoch": 0.45, "grad_norm": 0.03343382850289345, "learning_rate": 0.00019405673742435678, "loss": 0.2007, "step": 35 }, { "epoch": 0.46, "grad_norm": 0.03178093209862709, "learning_rate": 0.00019324722294043558, "loss": 0.1898, "step": 36 }, { "epoch": 0.47, "grad_norm": 0.03482227399945259, "learning_rate": 0.0001923879532511287, "loss": 0.2657, "step": 37 }, { "epoch": 0.49, "grad_norm": 0.03727172687649727, "learning_rate": 0.0001914793868488021, "loss": 0.2185, "step": 38 }, { "epoch": 0.5, "grad_norm": 0.0382206104695797, "learning_rate": 0.00019052200852974819, "loss": 0.1778, "step": 39 }, { "epoch": 0.5, "eval_loss": 0.2122737020254135, "eval_runtime": 154.5774, "eval_samples_per_second": 1.456, "eval_steps_per_second": 0.369, "step": 39 }, { "epoch": 0.51, "grad_norm": 0.03256627917289734, "learning_rate": 0.00018951632913550626, "loss": 0.2528, "step": 40 }, { "epoch": 0.52, "grad_norm": 0.032382771372795105, "learning_rate": 0.00018846288528028555, "loss": 0.2165, "step": 41 }, { "epoch": 0.54, "grad_norm": 0.030948858708143234, "learning_rate": 0.00018736223906463696, "loss": 0.1754, "step": 42 }, { "epoch": 0.55, "grad_norm": 0.041598908603191376, "learning_rate": 0.00018621497777552507, "loss": 0.2676, "step": 43 }, { "epoch": 0.56, "grad_norm": 0.03000044636428356, "learning_rate": 0.00018502171357296144, "loss": 0.1501, "step": 44 }, { "epoch": 0.58, "grad_norm": 0.03305242955684662, "learning_rate": 0.00018378308316336584, "loss": 0.1937, "step": 45 }, { "epoch": 0.59, "grad_norm": 0.02798221819102764, "learning_rate": 0.00018249974745983023, "loss": 0.2241, "step": 46 }, { "epoch": 0.6, "grad_norm": 0.03340066224336624, "learning_rate": 0.00018117239122946615, "loss": 0.2078, "step": 47 }, { "epoch": 0.61, "grad_norm": 0.032081808894872665, "learning_rate": 0.000179801722728024, "loss": 0.2061, "step": 48 }, { "epoch": 0.63, "grad_norm": 0.033740073442459106, "learning_rate": 0.00017838847332197938, "loss": 0.2411, "step": 49 }, { "epoch": 0.64, "grad_norm": 0.02972455322742462, "learning_rate": 0.00017693339709828792, "loss": 0.2346, "step": 50 }, { "epoch": 0.65, "grad_norm": 0.03134370595216751, "learning_rate": 0.0001754372704620164, "loss": 0.2573, "step": 51 }, { "epoch": 0.66, "grad_norm": 0.035204820334911346, "learning_rate": 0.00017390089172206592, "loss": 0.2596, "step": 52 }, { "epoch": 0.68, "grad_norm": 0.027202850207686424, "learning_rate": 0.00017232508066520702, "loss": 0.1738, "step": 53 }, { "epoch": 0.69, "grad_norm": 0.02930806204676628, "learning_rate": 0.00017071067811865476, "loss": 0.2178, "step": 54 }, { "epoch": 0.7, "grad_norm": 0.03558573126792908, "learning_rate": 0.00016905854550141716, "loss": 0.2677, "step": 55 }, { "epoch": 0.72, "grad_norm": 0.03179259970784187, "learning_rate": 0.00016736956436465573, "loss": 0.1704, "step": 56 }, { "epoch": 0.73, "grad_norm": 0.0318523608148098, "learning_rate": 0.00016564463592130428, "loss": 0.2117, "step": 57 }, { "epoch": 0.74, "grad_norm": 0.034657321870326996, "learning_rate": 0.00016388468056519612, "loss": 0.206, "step": 58 }, { "epoch": 0.75, "grad_norm": 0.032808613032102585, "learning_rate": 0.00016209063737995715, "loss": 0.2235, "step": 59 }, { "epoch": 0.77, "grad_norm": 0.029593750834465027, "learning_rate": 0.00016026346363792567, "loss": 0.1976, "step": 60 }, { "epoch": 0.78, "grad_norm": 0.033486686646938324, "learning_rate": 0.00015840413428936767, "loss": 0.184, "step": 61 }, { "epoch": 0.79, "grad_norm": 0.029341375455260277, "learning_rate": 0.0001565136414422592, "loss": 0.2114, "step": 62 }, { "epoch": 0.81, "grad_norm": 0.03653452917933464, "learning_rate": 0.00015459299383291345, "loss": 0.2183, "step": 63 }, { "epoch": 0.82, "grad_norm": 0.03146987408399582, "learning_rate": 0.0001526432162877356, "loss": 0.2444, "step": 64 }, { "epoch": 0.83, "grad_norm": 0.03224271163344383, "learning_rate": 0.00015066534917639195, "loss": 0.2157, "step": 65 }, { "epoch": 0.84, "grad_norm": 0.028182610869407654, "learning_rate": 0.00014866044785668563, "loss": 0.16, "step": 66 }, { "epoch": 0.86, "grad_norm": 0.03271663188934326, "learning_rate": 0.0001466295821114348, "loss": 0.2102, "step": 67 }, { "epoch": 0.87, "grad_norm": 0.032980792224407196, "learning_rate": 0.00014457383557765386, "loss": 0.2605, "step": 68 }, { "epoch": 0.88, "grad_norm": 0.03456718847155571, "learning_rate": 0.0001424943051683422, "loss": 0.212, "step": 69 }, { "epoch": 0.89, "grad_norm": 0.036501459777355194, "learning_rate": 0.00014039210048718949, "loss": 0.1798, "step": 70 }, { "epoch": 0.91, "grad_norm": 0.033730726689100266, "learning_rate": 0.000138268343236509, "loss": 0.2018, "step": 71 }, { "epoch": 0.92, "grad_norm": 0.03012872487306595, "learning_rate": 0.00013612416661871533, "loss": 0.2309, "step": 72 }, { "epoch": 0.93, "grad_norm": 0.032879915088415146, "learning_rate": 0.00013396071473166613, "loss": 0.1912, "step": 73 }, { "epoch": 0.95, "grad_norm": 0.02916533872485161, "learning_rate": 0.00013177914195819016, "loss": 0.2313, "step": 74 }, { "epoch": 0.96, "grad_norm": 0.032934658229351044, "learning_rate": 0.00012958061235012706, "loss": 0.2118, "step": 75 }, { "epoch": 0.97, "grad_norm": 0.03474392369389534, "learning_rate": 0.0001273662990072083, "loss": 0.1465, "step": 76 }, { "epoch": 0.98, "grad_norm": 0.033331647515296936, "learning_rate": 0.0001251373834511103, "loss": 0.2207, "step": 77 }, { "epoch": 1.0, "grad_norm": 0.030515702441334724, "learning_rate": 0.0001228950549950134, "loss": 0.2121, "step": 78 }, { "epoch": 1.0, "eval_loss": 0.19000689685344696, "eval_runtime": 153.8331, "eval_samples_per_second": 1.463, "eval_steps_per_second": 0.371, "step": 78 }, { "epoch": 1.01, "grad_norm": 0.03746604546904564, "learning_rate": 0.00012064051010900397, "loss": 0.1755, "step": 79 }, { "epoch": 1.02, "grad_norm": 0.032529812306165695, "learning_rate": 0.00011837495178165706, "loss": 0.2279, "step": 80 }, { "epoch": 1.01, "grad_norm": 0.02698327600955963, "learning_rate": 0.00011609958887814129, "loss": 0.1718, "step": 81 }, { "epoch": 1.02, "grad_norm": 0.030606260523200035, "learning_rate": 0.00011381563549518823, "loss": 0.2253, "step": 82 }, { "epoch": 1.04, "grad_norm": 0.03295598179101944, "learning_rate": 0.00011152431031326978, "loss": 0.1592, "step": 83 }, { "epoch": 1.05, "grad_norm": 0.04062403365969658, "learning_rate": 0.00010922683594633021, "loss": 0.2458, "step": 84 }, { "epoch": 1.06, "grad_norm": 0.04118546470999718, "learning_rate": 0.00010692443828941918, "loss": 0.1938, "step": 85 }, { "epoch": 1.07, "grad_norm": 0.03897158056497574, "learning_rate": 0.00010461834586457398, "loss": 0.1816, "step": 86 }, { "epoch": 1.09, "grad_norm": 0.031059524044394493, "learning_rate": 0.00010230978916530012, "loss": 0.2024, "step": 87 }, { "epoch": 1.1, "grad_norm": 0.0294091384857893, "learning_rate": 0.0001, "loss": 0.199, "step": 88 }, { "epoch": 1.11, "grad_norm": 0.03456791117787361, "learning_rate": 9.76902108346999e-05, "loss": 0.2616, "step": 89 }, { "epoch": 1.12, "grad_norm": 0.03276889771223068, "learning_rate": 9.538165413542607e-05, "loss": 0.2115, "step": 90 }, { "epoch": 1.14, "grad_norm": 0.03038157895207405, "learning_rate": 9.307556171058085e-05, "loss": 0.1721, "step": 91 }, { "epoch": 1.15, "grad_norm": 0.02885863371193409, "learning_rate": 9.077316405366981e-05, "loss": 0.2199, "step": 92 }, { "epoch": 1.16, "grad_norm": 0.03282390534877777, "learning_rate": 8.847568968673026e-05, "loss": 0.1885, "step": 93 }, { "epoch": 1.18, "grad_norm": 0.03559669107198715, "learning_rate": 8.61843645048118e-05, "loss": 0.2253, "step": 94 }, { "epoch": 1.19, "grad_norm": 0.027744270861148834, "learning_rate": 8.39004111218587e-05, "loss": 0.1825, "step": 95 }, { "epoch": 1.2, "grad_norm": 0.035874996334314346, "learning_rate": 8.162504821834295e-05, "loss": 0.1983, "step": 96 }, { "epoch": 1.21, "grad_norm": 0.03325138986110687, "learning_rate": 7.935948989099605e-05, "loss": 0.1812, "step": 97 }, { "epoch": 1.23, "grad_norm": 0.028679154813289642, "learning_rate": 7.710494500498662e-05, "loss": 0.1875, "step": 98 }, { "epoch": 1.24, "grad_norm": 0.036103084683418274, "learning_rate": 7.486261654888973e-05, "loss": 0.2025, "step": 99 }, { "epoch": 1.25, "grad_norm": 0.0321945995092392, "learning_rate": 7.263370099279172e-05, "loss": 0.2342, "step": 100 }, { "epoch": 1.27, "grad_norm": 0.0304876621812582, "learning_rate": 7.041938764987297e-05, "loss": 0.1642, "step": 101 }, { "epoch": 1.28, "grad_norm": 0.03619467467069626, "learning_rate": 6.822085804180984e-05, "loss": 0.187, "step": 102 }, { "epoch": 1.29, "grad_norm": 0.03597356379032135, "learning_rate": 6.603928526833387e-05, "loss": 0.1343, "step": 103 }, { "epoch": 1.3, "grad_norm": 0.036540765315294266, "learning_rate": 6.387583338128471e-05, "loss": 0.2062, "step": 104 }, { "epoch": 1.32, "grad_norm": 0.03327038511633873, "learning_rate": 6.173165676349103e-05, "loss": 0.22, "step": 105 }, { "epoch": 1.33, "grad_norm": 0.03374246507883072, "learning_rate": 5.960789951281052e-05, "loss": 0.1765, "step": 106 }, { "epoch": 1.34, "grad_norm": 0.032783444970846176, "learning_rate": 5.750569483165784e-05, "loss": 0.2397, "step": 107 }, { "epoch": 1.35, "grad_norm": 0.03667570650577545, "learning_rate": 5.542616442234618e-05, "loss": 0.1841, "step": 108 }, { "epoch": 1.37, "grad_norm": 0.029467754065990448, "learning_rate": 5.337041788856518e-05, "loss": 0.2282, "step": 109 }, { "epoch": 1.38, "grad_norm": 0.03702455386519432, "learning_rate": 5.1339552143314384e-05, "loss": 0.2601, "step": 110 }, { "epoch": 1.39, "grad_norm": 0.033595018088817596, "learning_rate": 4.933465082360807e-05, "loss": 0.2077, "step": 111 }, { "epoch": 1.41, "grad_norm": 0.03349510580301285, "learning_rate": 4.735678371226441e-05, "loss": 0.1686, "step": 112 }, { "epoch": 1.42, "grad_norm": 0.03149344027042389, "learning_rate": 4.540700616708658e-05, "loss": 0.2075, "step": 113 }, { "epoch": 1.43, "grad_norm": 0.035017482936382294, "learning_rate": 4.3486358557740814e-05, "loss": 0.1462, "step": 114 }, { "epoch": 1.44, "grad_norm": 0.029670318588614464, "learning_rate": 4.159586571063236e-05, "loss": 0.1602, "step": 115 }, { "epoch": 1.46, "grad_norm": 0.037266362458467484, "learning_rate": 3.973653636207437e-05, "loss": 0.1798, "step": 116 }, { "epoch": 1.47, "grad_norm": 0.03251258283853531, "learning_rate": 3.7909362620042865e-05, "loss": 0.2512, "step": 117 }, { "epoch": 1.47, "eval_loss": 0.18417498469352722, "eval_runtime": 155.2041, "eval_samples_per_second": 1.45, "eval_steps_per_second": 0.367, "step": 117 } ], "logging_steps": 1, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 39, "total_flos": 1.5385293522940723e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }