{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9921671018276762, "eval_steps": 500, "global_step": 95, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 6.531418223090284, "learning_rate": 1e-07, "logits/chosen": -2.851747512817383, "logits/rejected": -2.833996534347534, "logps/chosen": -165.70089721679688, "logps/rejected": -198.857666015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05, "grad_norm": 5.924500505195678, "learning_rate": 5e-07, "logits/chosen": -2.7712948322296143, "logits/rejected": -2.7740774154663086, "logps/chosen": -171.32467651367188, "logps/rejected": -172.57489013671875, "loss": 0.693, "rewards/accuracies": 0.375, "rewards/chosen": 0.0003833131631836295, "rewards/margins": -8.183407771866769e-06, "rewards/rejected": 0.00039149660733528435, "step": 5 }, { "epoch": 0.1, "grad_norm": 7.115083584877395, "learning_rate": 1e-06, "logits/chosen": -2.787468433380127, "logits/rejected": -2.796309471130371, "logps/chosen": -188.18690490722656, "logps/rejected": -193.19281005859375, "loss": 0.6903, "rewards/accuracies": 0.625, "rewards/chosen": 0.021850628778338432, "rewards/margins": 0.00575407687574625, "rewards/rejected": 0.016096554696559906, "step": 10 }, { "epoch": 0.16, "grad_norm": 7.576406474777614, "learning_rate": 9.91486549841951e-07, "logits/chosen": -2.84570574760437, "logits/rejected": -2.855926513671875, "logps/chosen": -193.6645050048828, "logps/rejected": -185.57162475585938, "loss": 0.681, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.06298746168613434, "rewards/margins": 0.029651161283254623, "rewards/rejected": 0.03333630412817001, "step": 15 }, { "epoch": 0.21, "grad_norm": 9.329171862378226, "learning_rate": 9.66236114702178e-07, "logits/chosen": -2.831686496734619, "logits/rejected": -2.8417012691497803, "logps/chosen": -170.6667938232422, "logps/rejected": -189.2131805419922, "loss": 0.6564, "rewards/accuracies": 0.6875, "rewards/chosen": -0.019239652901887894, "rewards/margins": 0.0761820375919342, "rewards/rejected": -0.0954216942191124, "step": 20 }, { "epoch": 0.26, "grad_norm": 11.230824794807015, "learning_rate": 9.251085678648071e-07, "logits/chosen": -2.7868192195892334, "logits/rejected": -2.7850959300994873, "logps/chosen": -136.22763061523438, "logps/rejected": -169.01756286621094, "loss": 0.6363, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04755619913339615, "rewards/margins": 0.1481182873249054, "rewards/rejected": -0.19567449390888214, "step": 25 }, { "epoch": 0.31, "grad_norm": 10.213782257588267, "learning_rate": 8.695044586103295e-07, "logits/chosen": -2.74267840385437, "logits/rejected": -2.7413182258605957, "logps/chosen": -163.45603942871094, "logps/rejected": -190.14236450195312, "loss": 0.6341, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.039765361696481705, "rewards/margins": 0.20882920920848846, "rewards/rejected": -0.24859456717967987, "step": 30 }, { "epoch": 0.37, "grad_norm": 9.724737450156574, "learning_rate": 8.013173181896282e-07, "logits/chosen": -2.7809696197509766, "logits/rejected": -2.774864435195923, "logps/chosen": -203.69174194335938, "logps/rejected": -228.673828125, "loss": 0.623, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.12420445680618286, "rewards/margins": 0.1601138412952423, "rewards/rejected": -0.28431832790374756, "step": 35 }, { "epoch": 0.42, "grad_norm": 12.552857274756276, "learning_rate": 7.228691778882692e-07, "logits/chosen": -2.6985127925872803, "logits/rejected": -2.7104218006134033, "logps/chosen": -158.4896240234375, "logps/rejected": -199.87753295898438, "loss": 0.6049, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.25630590319633484, "rewards/margins": 0.26447391510009766, "rewards/rejected": -0.5207797884941101, "step": 40 }, { "epoch": 0.47, "grad_norm": 12.147016576418242, "learning_rate": 6.368314950360415e-07, "logits/chosen": -2.6584348678588867, "logits/rejected": -2.6736438274383545, "logps/chosen": -232.7853240966797, "logps/rejected": -262.41015625, "loss": 0.6157, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4413759112358093, "rewards/margins": 0.3659912645816803, "rewards/rejected": -0.807367205619812, "step": 45 }, { "epoch": 0.52, "grad_norm": 13.949220920878545, "learning_rate": 5.46134179731651e-07, "logits/chosen": -2.61830735206604, "logits/rejected": -2.6202476024627686, "logps/chosen": -231.3226776123047, "logps/rejected": -291.72406005859375, "loss": 0.588, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.5699625015258789, "rewards/margins": 0.44822096824645996, "rewards/rejected": -1.0181834697723389, "step": 50 }, { "epoch": 0.57, "grad_norm": 15.029123695933512, "learning_rate": 4.5386582026834904e-07, "logits/chosen": -2.4635961055755615, "logits/rejected": -2.4716153144836426, "logps/chosen": -202.64266967773438, "logps/rejected": -248.4220733642578, "loss": 0.5839, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5452982783317566, "rewards/margins": 0.4111364483833313, "rewards/rejected": -0.9564347267150879, "step": 55 }, { "epoch": 0.63, "grad_norm": 17.205678814880788, "learning_rate": 3.6316850496395855e-07, "logits/chosen": -2.5053164958953857, "logits/rejected": -2.5017011165618896, "logps/chosen": -228.46469116210938, "logps/rejected": -310.7025451660156, "loss": 0.5648, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5650664567947388, "rewards/margins": 0.5835850238800049, "rewards/rejected": -1.148651361465454, "step": 60 }, { "epoch": 0.68, "grad_norm": 19.89901816550521, "learning_rate": 2.771308221117309e-07, "logits/chosen": -2.4981369972229004, "logits/rejected": -2.5108680725097656, "logps/chosen": -236.54653930664062, "logps/rejected": -310.23162841796875, "loss": 0.563, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7446938753128052, "rewards/margins": 0.6594551801681519, "rewards/rejected": -1.4041489362716675, "step": 65 }, { "epoch": 0.73, "grad_norm": 23.0785172466263, "learning_rate": 1.9868268181037184e-07, "logits/chosen": -2.4181766510009766, "logits/rejected": -2.4266486167907715, "logps/chosen": -264.23138427734375, "logps/rejected": -309.7364807128906, "loss": 0.5533, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8709458112716675, "rewards/margins": 0.48874396085739136, "rewards/rejected": -1.3596898317337036, "step": 70 }, { "epoch": 0.78, "grad_norm": 20.36959281359639, "learning_rate": 1.304955413896705e-07, "logits/chosen": -2.3758950233459473, "logits/rejected": -2.3715245723724365, "logps/chosen": -251.191650390625, "logps/rejected": -353.01251220703125, "loss": 0.5328, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.7933205366134644, "rewards/margins": 0.9006916880607605, "rewards/rejected": -1.6940120458602905, "step": 75 }, { "epoch": 0.84, "grad_norm": 32.00854391073857, "learning_rate": 7.4891432135193e-08, "logits/chosen": -2.346649646759033, "logits/rejected": -2.3456640243530273, "logps/chosen": -242.99374389648438, "logps/rejected": -310.0765075683594, "loss": 0.5629, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7500452995300293, "rewards/margins": 0.5974142551422119, "rewards/rejected": -1.3474594354629517, "step": 80 }, { "epoch": 0.89, "grad_norm": 24.468901281465246, "learning_rate": 3.376388529782215e-08, "logits/chosen": -2.396498918533325, "logits/rejected": -2.398857593536377, "logps/chosen": -232.2967987060547, "logps/rejected": -311.627197265625, "loss": 0.5814, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7211315035820007, "rewards/margins": 0.70616614818573, "rewards/rejected": -1.427297830581665, "step": 85 }, { "epoch": 0.94, "grad_norm": 21.76700663596853, "learning_rate": 8.513450158049106e-09, "logits/chosen": -2.337101459503174, "logits/rejected": -2.347583770751953, "logps/chosen": -246.55062866210938, "logps/rejected": -331.9063415527344, "loss": 0.5556, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7761114835739136, "rewards/margins": 0.7871293425559998, "rewards/rejected": -1.5632407665252686, "step": 90 }, { "epoch": 0.99, "grad_norm": 19.722062441035273, "learning_rate": 0.0, "logits/chosen": -2.403937339782715, "logits/rejected": -2.4172253608703613, "logps/chosen": -263.65972900390625, "logps/rejected": -332.18292236328125, "loss": 0.5584, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7805107831954956, "rewards/margins": 0.713537335395813, "rewards/rejected": -1.4940482378005981, "step": 95 }, { "epoch": 0.99, "step": 95, "total_flos": 0.0, "train_loss": 0.6041482373287803, "train_runtime": 2420.5057, "train_samples_per_second": 5.051, "train_steps_per_second": 0.039 } ], "logging_steps": 5, "max_steps": 95, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }