{ "best_metric": 3.8748562335968018, "best_model_checkpoint": "/kaggle/working/checkpoint-669", "epoch": 50.0, "eval_steps": 500, "global_step": 11150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "eval_loss": 3.875725507736206, "eval_mean_perplexity": 366.3703887939453, "eval_perplexities": [ 335.80645751953125, 377.47796630859375, 475.9695129394531, 276.7535705566406, 294.2838134765625, 264.73394775390625, 394.9538269042969, 325.2862854003906, 525.1426391601562, 393.2958679199219 ], "eval_runtime": 2.3638, "eval_samples_per_second": 4.231, "eval_steps_per_second": 0.846, "step": 223 }, { "epoch": 2.0, "eval_loss": 3.877554416656494, "eval_mean_perplexity": 373.7104461669922, "eval_perplexities": [ 356.83929443359375, 425.78094482421875, 495.7661437988281, 287.1143798828125, 270.86737060546875, 256.76934814453125, 391.27044677734375, 345.6097412109375, 498.4945373535156, 408.5922546386719 ], "eval_runtime": 2.2583, "eval_samples_per_second": 4.428, "eval_steps_per_second": 0.886, "step": 446 }, { "epoch": 3.0, "eval_loss": 3.8748562335968018, "eval_mean_perplexity": 378.68709869384764, "eval_perplexities": [ 404.7926940917969, 400.53558349609375, 487.69427490234375, 229.20298767089844, 278.294189453125, 259.6055908203125, 387.93963623046875, 365.1226501464844, 508.79669189453125, 464.8866882324219 ], "eval_runtime": 2.4538, "eval_samples_per_second": 4.075, "eval_steps_per_second": 0.815, "step": 669 }, { "epoch": 4.0, "eval_loss": 3.889125108718872, "eval_mean_perplexity": 395.6389587402344, "eval_perplexities": [ 386.92559814453125, 444.32073974609375, 480.409912109375, 278.6746520996094, 279.8287353515625, 286.8155212402344, 426.2447204589844, 352.7798767089844, 545.115234375, 475.27459716796875 ], "eval_runtime": 2.2494, "eval_samples_per_second": 4.446, "eval_steps_per_second": 0.889, "step": 892 }, { "epoch": 5.0, "eval_loss": 3.9018893241882324, "eval_mean_perplexity": 418.4004302978516, "eval_perplexities": [ 426.7615051269531, 474.9723815917969, 516.2012329101562, 285.6946105957031, 314.9617919921875, 305.4250793457031, 411.5309143066406, 412.8928527832031, 537.16943359375, 498.3945007324219 ], "eval_runtime": 2.2536, "eval_samples_per_second": 4.437, "eval_steps_per_second": 0.887, "step": 1115 }, { "epoch": 6.0, "eval_loss": 3.9105305671691895, "eval_mean_perplexity": 423.3804229736328, "eval_perplexities": [ 421.5846862792969, 457.750244140625, 521.7881469726562, 293.4595642089844, 283.9613037109375, 287.46807861328125, 451.5904846191406, 458.28509521484375, 608.7252197265625, 449.19140625 ], "eval_runtime": 2.261, "eval_samples_per_second": 4.423, "eval_steps_per_second": 0.885, "step": 1338 }, { "epoch": 7.0, "eval_loss": 3.910905122756958, "eval_mean_perplexity": 437.6560882568359, "eval_perplexities": [ 459.6704406738281, 471.1756591796875, 508.8789367675781, 278.22161865234375, 305.4996337890625, 335.0756530761719, 449.5406494140625, 402.190673828125, 618.786376953125, 547.521240234375 ], "eval_runtime": 2.4541, "eval_samples_per_second": 4.075, "eval_steps_per_second": 0.815, "step": 1561 }, { "epoch": 8.0, "eval_loss": 3.9314892292022705, "eval_mean_perplexity": 451.49346618652345, "eval_perplexities": [ 432.6178283691406, 494.5645446777344, 518.4781494140625, 289.39727783203125, 317.63031005859375, 327.0579528808594, 491.4974670410156, 445.9523620605469, 667.2817993164062, 530.4569702148438 ], "eval_runtime": 3.1442, "eval_samples_per_second": 3.18, "eval_steps_per_second": 0.636, "step": 1784 }, { "epoch": 9.0, "eval_loss": 3.9446353912353516, "eval_mean_perplexity": 451.08067626953124, "eval_perplexities": [ 452.6179504394531, 445.2970275878906, 576.3162841796875, 262.3009033203125, 326.6243591308594, 311.2583923339844, 440.389404296875, 448.5811767578125, 677.4688110351562, 569.9524536132812 ], "eval_runtime": 2.2555, "eval_samples_per_second": 4.434, "eval_steps_per_second": 0.887, "step": 2007 }, { "epoch": 10.0, "eval_loss": 3.979356288909912, "eval_mean_perplexity": 466.0081390380859, "eval_perplexities": [ 479.5843505859375, 542.103759765625, 561.8331298828125, 283.5645751953125, 342.3445739746094, 315.0624694824219, 494.7655334472656, 397.63812255859375, 683.822265625, 559.3626098632812 ], "eval_runtime": 2.255, "eval_samples_per_second": 4.435, "eval_steps_per_second": 0.887, "step": 2230 }, { "epoch": 11.0, "eval_loss": 3.985069751739502, "eval_mean_perplexity": 471.23480529785155, "eval_perplexities": [ 442.65118408203125, 578.3666381835938, 530.2559204101562, 292.97174072265625, 325.9568786621094, 331.6105651855469, 466.1777648925781, 453.5052490234375, 693.37646484375, 597.4756469726562 ], "eval_runtime": 2.2568, "eval_samples_per_second": 4.431, "eval_steps_per_second": 0.886, "step": 2453 }, { "epoch": 12.0, "eval_loss": 4.011897087097168, "eval_mean_perplexity": 481.7092010498047, "eval_perplexities": [ 490.259765625, 590.09716796875, 524.1170043945312, 292.7325134277344, 362.09210205078125, 320.8348388671875, 483.432861328125, 492.00469970703125, 676.3094482421875, 585.2116088867188 ], "eval_runtime": 2.2714, "eval_samples_per_second": 4.402, "eval_steps_per_second": 0.88, "step": 2676 }, { "epoch": 13.0, "eval_loss": 4.028537750244141, "eval_mean_perplexity": 491.83470153808594, "eval_perplexities": [ 434.7430725097656, 579.150390625, 531.6643676757812, 323.02447509765625, 381.5349426269531, 329.8095397949219, 569.4808349609375, 454.2198486328125, 723.43798828125, 591.2815551757812 ], "eval_runtime": 2.268, "eval_samples_per_second": 4.409, "eval_steps_per_second": 0.882, "step": 2899 }, { "epoch": 14.0, "eval_loss": 4.046214580535889, "eval_mean_perplexity": 488.7431976318359, "eval_perplexities": [ 465.3315734863281, 577.7365112304688, 532.9083251953125, 298.2411193847656, 371.57415771484375, 335.2172546386719, 547.2592163085938, 480.532470703125, 732.7112426757812, 545.9201049804688 ], "eval_runtime": 2.2446, "eval_samples_per_second": 4.455, "eval_steps_per_second": 0.891, "step": 3122 }, { "epoch": 15.0, "eval_loss": 4.064830780029297, "eval_mean_perplexity": 493.6821044921875, "eval_perplexities": [ 508.1759948730469, 588.5740356445312, 494.7876892089844, 310.3844299316406, 357.8840026855469, 349.0634460449219, 574.1718139648438, 489.5400085449219, 730.5029907226562, 533.7366333007812 ], "eval_runtime": 2.2496, "eval_samples_per_second": 4.445, "eval_steps_per_second": 0.889, "step": 3345 }, { "epoch": 16.0, "eval_loss": 4.0865068435668945, "eval_mean_perplexity": 527.1822723388672, "eval_perplexities": [ 516.6371459960938, 603.1266479492188, 558.2211303710938, 320.877685546875, 440.86993408203125, 384.0972595214844, 586.449951171875, 441.53643798828125, 805.31201171875, 614.6945190429688 ], "eval_runtime": 2.2935, "eval_samples_per_second": 4.36, "eval_steps_per_second": 0.872, "step": 3568 }, { "epoch": 17.0, "eval_loss": 4.112179756164551, "eval_mean_perplexity": 517.5738647460937, "eval_perplexities": [ 484.16485595703125, 623.9596557617188, 544.3837890625, 327.7882995605469, 432.2279052734375, 325.2957458496094, 587.2503051757812, 500.35589599609375, 768.452392578125, 581.8598022460938 ], "eval_runtime": 2.6951, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.742, "step": 3791 }, { "epoch": 18.0, "eval_loss": 4.133630275726318, "eval_mean_perplexity": 532.7662231445313, "eval_perplexities": [ 459.6676025390625, 644.7689819335938, 559.6870727539062, 313.52886962890625, 407.5067443847656, 358.4698486328125, 635.401123046875, 511.8723449707031, 728.2614135742188, 708.4982299804688 ], "eval_runtime": 2.2702, "eval_samples_per_second": 4.405, "eval_steps_per_second": 0.881, "step": 4014 }, { "epoch": 19.0, "eval_loss": 4.14639949798584, "eval_mean_perplexity": 546.8367309570312, "eval_perplexities": [ 513.874267578125, 633.9168090820312, 623.620849609375, 317.8645324707031, 417.3320007324219, 351.769287109375, 557.8528442382812, 530.8079223632812, 834.164306640625, 687.1644897460938 ], "eval_runtime": 2.28, "eval_samples_per_second": 4.386, "eval_steps_per_second": 0.877, "step": 4237 }, { "epoch": 20.0, "eval_loss": 4.1683173179626465, "eval_mean_perplexity": 530.1545104980469, "eval_perplexities": [ 518.4563598632812, 616.3810424804688, 584.39404296875, 357.87530517578125, 424.8235778808594, 363.0649719238281, 572.78369140625, 489.98748779296875, 762.8850708007812, 610.8935546875 ], "eval_runtime": 2.5117, "eval_samples_per_second": 3.981, "eval_steps_per_second": 0.796, "step": 4460 }, { "epoch": 21.0, "eval_loss": 4.197434425354004, "eval_mean_perplexity": 552.9715087890625, "eval_perplexities": [ 496.9158630371094, 622.5189208984375, 620.3365478515625, 315.29119873046875, 508.8257751464844, 386.271728515625, 597.5241088867188, 540.0391845703125, 771.0007934570312, 670.990966796875 ], "eval_runtime": 2.2574, "eval_samples_per_second": 4.43, "eval_steps_per_second": 0.886, "step": 4683 }, { "epoch": 22.0, "eval_loss": 4.2334303855896, "eval_mean_perplexity": 591.004769897461, "eval_perplexities": [ 542.1947631835938, 699.9281616210938, 655.9802856445312, 351.9259948730469, 517.2227783203125, 364.0205078125, 738.9130859375, 537.7424926757812, 836.2189331054688, 665.9006958007812 ], "eval_runtime": 2.2478, "eval_samples_per_second": 4.449, "eval_steps_per_second": 0.89, "step": 4906 }, { "epoch": 22.42, "grad_norm": 1.9367769956588745, "learning_rate": 2.7578475336322873e-05, "loss": 3.112, "step": 5000 }, { "epoch": 23.0, "eval_loss": 4.230788707733154, "eval_mean_perplexity": 571.9811889648438, "eval_perplexities": [ 529.1312255859375, 677.629638671875, 639.4375, 351.9476623535156, 471.0734558105469, 388.02935791015625, 630.8019409179688, 550.748046875, 854.4290771484375, 626.583984375 ], "eval_runtime": 2.3322, "eval_samples_per_second": 4.288, "eval_steps_per_second": 0.858, "step": 5129 }, { "epoch": 24.0, "eval_loss": 4.264792442321777, "eval_mean_perplexity": 588.0643798828125, "eval_perplexities": [ 532.3706665039062, 650.5995483398438, 640.096435546875, 367.4383850097656, 510.7347106933594, 402.79925537109375, 740.1875610351562, 580.5010375976562, 837.0123901367188, 618.90380859375 ], "eval_runtime": 2.414, "eval_samples_per_second": 4.142, "eval_steps_per_second": 0.828, "step": 5352 }, { "epoch": 25.0, "eval_loss": 4.280625820159912, "eval_mean_perplexity": 568.3704132080078, "eval_perplexities": [ 493.6273193359375, 687.0193481445312, 617.8013305664062, 356.9241943359375, 462.1817321777344, 377.38800048828125, 662.7510375976562, 553.1047973632812, 769.9375, 702.9688720703125 ], "eval_runtime": 2.2617, "eval_samples_per_second": 4.421, "eval_steps_per_second": 0.884, "step": 5575 }, { "epoch": 26.0, "eval_loss": 4.299654006958008, "eval_mean_perplexity": 585.3176055908203, "eval_perplexities": [ 507.7140808105469, 719.7584228515625, 610.5079956054688, 355.23577880859375, 474.7140197753906, 343.5024719238281, 644.6328125, 599.3197631835938, 868.6974487304688, 729.09326171875 ], "eval_runtime": 2.2752, "eval_samples_per_second": 4.395, "eval_steps_per_second": 0.879, "step": 5798 }, { "epoch": 27.0, "eval_loss": 4.309383869171143, "eval_mean_perplexity": 572.4837829589844, "eval_perplexities": [ 532.3777465820312, 701.5357055664062, 574.2848510742188, 366.85614013671875, 481.5206298828125, 389.685546875, 579.2503662109375, 525.9729614257812, 906.0205078125, 667.3333740234375 ], "eval_runtime": 2.4436, "eval_samples_per_second": 4.092, "eval_steps_per_second": 0.818, "step": 6021 }, { "epoch": 28.0, "eval_loss": 4.338665962219238, "eval_mean_perplexity": 617.5483917236328, "eval_perplexities": [ 530.51416015625, 770.9551391601562, 616.0216674804688, 386.1516418457031, 501.05426025390625, 419.87841796875, 735.8825073242188, 594.3335571289062, 911.9882202148438, 708.704345703125 ], "eval_runtime": 2.2597, "eval_samples_per_second": 4.425, "eval_steps_per_second": 0.885, "step": 6244 }, { "epoch": 29.0, "eval_loss": 4.354172229766846, "eval_mean_perplexity": 609.77958984375, "eval_perplexities": [ 571.1878662109375, 744.3867797851562, 607.4262084960938, 374.7521057128906, 501.3298034667969, 401.453369140625, 706.0652465820312, 600.0166625976562, 898.824951171875, 692.3529052734375 ], "eval_runtime": 2.4521, "eval_samples_per_second": 4.078, "eval_steps_per_second": 0.816, "step": 6467 }, { "epoch": 30.0, "eval_loss": 4.371840000152588, "eval_mean_perplexity": 639.5644592285156, "eval_perplexities": [ 518.4427490234375, 768.5755615234375, 643.536376953125, 434.2029724121094, 553.5997924804688, 401.9341735839844, 738.0169677734375, 627.0206909179688, 923.0927734375, 787.2225341796875 ], "eval_runtime": 2.2759, "eval_samples_per_second": 4.394, "eval_steps_per_second": 0.879, "step": 6690 }, { "epoch": 31.0, "eval_loss": 4.397173881530762, "eval_mean_perplexity": 666.7775909423829, "eval_perplexities": [ 549.1520385742188, 851.6005249023438, 712.2999267578125, 395.8175048828125, 540.697021484375, 451.8854064941406, 723.3524780273438, 630.9887084960938, 1021.4713745117188, 790.5109252929688 ], "eval_runtime": 2.2555, "eval_samples_per_second": 4.434, "eval_steps_per_second": 0.887, "step": 6913 }, { "epoch": 32.0, "eval_loss": 4.415882110595703, "eval_mean_perplexity": 660.8353820800781, "eval_perplexities": [ 512.7852783203125, 785.50439453125, 668.9815063476562, 457.69830322265625, 539.0671997070312, 459.60736083984375, 751.9481201171875, 625.0054321289062, 999.4456787109375, 808.310546875 ], "eval_runtime": 2.2544, "eval_samples_per_second": 4.436, "eval_steps_per_second": 0.887, "step": 7136 }, { "epoch": 33.0, "eval_loss": 4.42371940612793, "eval_mean_perplexity": 662.7573822021484, "eval_perplexities": [ 559.4247436523438, 747.0921020507812, 678.9531860351562, 437.0193786621094, 544.0770874023438, 431.26568603515625, 756.7093505859375, 626.279052734375, 1022.3748168945312, 824.37841796875 ], "eval_runtime": 2.2901, "eval_samples_per_second": 4.367, "eval_steps_per_second": 0.873, "step": 7359 }, { "epoch": 34.0, "eval_loss": 4.434357166290283, "eval_mean_perplexity": 650.7052612304688, "eval_perplexities": [ 549.6817626953125, 811.8685302734375, 639.683349609375, 410.87249755859375, 510.8334045410156, 438.6333923339844, 779.5153198242188, 584.6727294921875, 1027.2081298828125, 754.08349609375 ], "eval_runtime": 2.3566, "eval_samples_per_second": 4.243, "eval_steps_per_second": 0.849, "step": 7582 }, { "epoch": 35.0, "eval_loss": 4.456236839294434, "eval_mean_perplexity": 683.213168334961, "eval_perplexities": [ 584.2586059570312, 837.6990966796875, 639.265869140625, 431.7611389160156, 558.5822143554688, 446.50067138671875, 781.8605346679688, 649.52392578125, 1031.22900390625, 871.4506225585938 ], "eval_runtime": 2.2758, "eval_samples_per_second": 4.394, "eval_steps_per_second": 0.879, "step": 7805 }, { "epoch": 36.0, "eval_loss": 4.476213455200195, "eval_mean_perplexity": 673.9735595703125, "eval_perplexities": [ 570.093505859375, 876.3085327148438, 637.6167602539062, 410.8495788574219, 565.6197509765625, 431.1340637207031, 784.1497192382812, 635.3842163085938, 1020.8198852539062, 807.7595825195312 ], "eval_runtime": 2.2666, "eval_samples_per_second": 4.412, "eval_steps_per_second": 0.882, "step": 8028 }, { "epoch": 37.0, "eval_loss": 4.485024452209473, "eval_mean_perplexity": 678.1103912353516, "eval_perplexities": [ 524.7376708984375, 897.5795288085938, 604.8717651367188, 421.68621826171875, 562.4842529296875, 447.8833312988281, 743.5353393554688, 641.0125122070312, 1038.24658203125, 899.0667114257812 ], "eval_runtime": 2.2714, "eval_samples_per_second": 4.403, "eval_steps_per_second": 0.881, "step": 8251 }, { "epoch": 38.0, "eval_loss": 4.500662326812744, "eval_mean_perplexity": 667.2204986572266, "eval_perplexities": [ 517.3828125, 849.9619140625, 632.2133178710938, 421.2346496582031, 562.2227783203125, 412.367919921875, 801.9597778320312, 634.914794921875, 955.3936767578125, 884.5533447265625 ], "eval_runtime": 2.2586, "eval_samples_per_second": 4.428, "eval_steps_per_second": 0.886, "step": 8474 }, { "epoch": 39.0, "eval_loss": 4.507510662078857, "eval_mean_perplexity": 674.6702362060547, "eval_perplexities": [ 523.6810913085938, 871.532958984375, 702.7888793945312, 412.113525390625, 559.32421875, 436.3467712402344, 741.5978393554688, 612.4104614257812, 969.3262329101562, 917.5803833007812 ], "eval_runtime": 2.2926, "eval_samples_per_second": 4.362, "eval_steps_per_second": 0.872, "step": 8697 }, { "epoch": 40.0, "eval_loss": 4.5184006690979, "eval_mean_perplexity": 690.1411804199219, "eval_perplexities": [ 533.463623046875, 880.5369262695312, 677.1242065429688, 468.8255920410156, 524.9594116210938, 481.3450012207031, 724.6446533203125, 644.5559692382812, 1083.451416015625, 882.5050048828125 ], "eval_runtime": 2.2507, "eval_samples_per_second": 4.443, "eval_steps_per_second": 0.889, "step": 8920 }, { "epoch": 41.0, "eval_loss": 4.528339385986328, "eval_mean_perplexity": 686.3480163574219, "eval_perplexities": [ 540.8236083984375, 941.376708984375, 641.9276733398438, 446.772216796875, 562.5877685546875, 455.73175048828125, 764.4960327148438, 647.960205078125, 983.826416015625, 877.977783203125 ], "eval_runtime": 2.2901, "eval_samples_per_second": 4.367, "eval_steps_per_second": 0.873, "step": 9143 }, { "epoch": 42.0, "eval_loss": 4.525776386260986, "eval_mean_perplexity": 692.94345703125, "eval_perplexities": [ 512.2537841796875, 996.9277954101562, 626.796142578125, 407.5407409667969, 572.6943969726562, 439.5016174316406, 776.8684692382812, 660.894775390625, 1015.8809204101562, 920.075927734375 ], "eval_runtime": 2.2958, "eval_samples_per_second": 4.356, "eval_steps_per_second": 0.871, "step": 9366 }, { "epoch": 43.0, "eval_loss": 4.5432844161987305, "eval_mean_perplexity": 698.305697631836, "eval_perplexities": [ 520.623779296875, 898.4200439453125, 635.4287719726562, 451.1359558105469, 590.0744018554688, 448.103759765625, 752.5787353515625, 682.3783569335938, 1070.27294921875, 934.0402221679688 ], "eval_runtime": 2.8416, "eval_samples_per_second": 3.519, "eval_steps_per_second": 0.704, "step": 9589 }, { "epoch": 44.0, "eval_loss": 4.546627998352051, "eval_mean_perplexity": 698.2709533691407, "eval_perplexities": [ 496.5508728027344, 900.9185180664062, 651.6212768554688, 429.4538879394531, 572.0066528320312, 429.689208984375, 786.3671264648438, 686.788818359375, 1078.3935546875, 950.9196166992188 ], "eval_runtime": 2.2997, "eval_samples_per_second": 4.348, "eval_steps_per_second": 0.87, "step": 9812 }, { "epoch": 44.84, "grad_norm": 2.2081024646759033, "learning_rate": 5.15695067264574e-06, "loss": 2.4732, "step": 10000 }, { "epoch": 45.0, "eval_loss": 4.5482940673828125, "eval_mean_perplexity": 683.0953948974609, "eval_perplexities": [ 513.7635498046875, 881.8916625976562, 650.1325073242188, 426.90740966796875, 583.7874145507812, 441.8877258300781, 793.0048828125, 660.0620727539062, 992.5838012695312, 886.9329223632812 ], "eval_runtime": 2.6736, "eval_samples_per_second": 3.74, "eval_steps_per_second": 0.748, "step": 10035 }, { "epoch": 46.0, "eval_loss": 4.556254863739014, "eval_mean_perplexity": 694.8246520996094, "eval_perplexities": [ 484.3752136230469, 864.9951171875, 721.4644775390625, 440.7280578613281, 583.1522216796875, 458.59246826171875, 795.0460815429688, 658.3065795898438, 1043.74267578125, 897.8436279296875 ], "eval_runtime": 2.6592, "eval_samples_per_second": 3.761, "eval_steps_per_second": 0.752, "step": 10258 }, { "epoch": 47.0, "eval_loss": 4.565934181213379, "eval_mean_perplexity": 697.1294586181641, "eval_perplexities": [ 521.9780883789062, 891.4306030273438, 694.7803344726562, 465.2270812988281, 585.0350341796875, 454.22308349609375, 784.2135620117188, 672.1557006835938, 1016.0956420898438, 886.1554565429688 ], "eval_runtime": 2.3829, "eval_samples_per_second": 4.197, "eval_steps_per_second": 0.839, "step": 10481 }, { "epoch": 48.0, "eval_loss": 4.569299221038818, "eval_mean_perplexity": 698.8598907470703, "eval_perplexities": [ 523.372802734375, 930.4688720703125, 683.875732421875, 451.5999755859375, 597.7974243164062, 447.0304870605469, 766.9913940429688, 671.3782348632812, 1039.4984130859375, 876.5855712890625 ], "eval_runtime": 2.2797, "eval_samples_per_second": 4.387, "eval_steps_per_second": 0.877, "step": 10704 }, { "epoch": 49.0, "eval_loss": 4.569689750671387, "eval_mean_perplexity": 694.5519165039062, "eval_perplexities": [ 508.9573059082031, 927.446533203125, 672.3489379882812, 473.5368347167969, 568.0377807617188, 454.00933837890625, 733.6597900390625, 695.4989624023438, 1047.673828125, 864.349853515625 ], "eval_runtime": 2.3973, "eval_samples_per_second": 4.171, "eval_steps_per_second": 0.834, "step": 10927 }, { "epoch": 50.0, "eval_loss": 4.569555759429932, "eval_mean_perplexity": 698.6035186767579, "eval_perplexities": [ 517.4149169921875, 924.535888671875, 704.73291015625, 465.9677429199219, 577.629150390625, 443.994140625, 770.1861572265625, 683.028076171875, 1017.7510375976562, 880.795166015625 ], "eval_runtime": 2.2771, "eval_samples_per_second": 4.391, "eval_steps_per_second": 0.878, "step": 11150 } ], "logging_steps": 5000, "max_steps": 11150, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "total_flos": 1.1355181056e+16, "train_batch_size": 10, "trial_name": null, "trial_params": null }