{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.005925925925925926, "eval_steps": 500, "global_step": 10, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005925925925925926, "grad_norm": 10.865681648254395, "learning_rate": 5e-07, "logits/chosen": -4.444676399230957, "logits/rejected": -4.0909342765808105, "logps/chosen": -186.1875, "logps/rejected": -228.68560791015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0011851851851851852, "grad_norm": 7.5645222663879395, "learning_rate": 4.849231551964771e-07, "logits/chosen": -4.530362606048584, "logits/rejected": -3.981240749359131, "logps/chosen": -192.2100830078125, "logps/rejected": -213.73086547851562, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0017777777777777779, "grad_norm": 12.264541625976562, "learning_rate": 4.415111107797445e-07, "logits/chosen": -4.541451454162598, "logits/rejected": -4.319692134857178, "logps/chosen": -200.88140869140625, "logps/rejected": -224.45669555664062, "loss": 0.6763, "rewards/accuracies": 1.0, "rewards/chosen": 0.03173255920410156, "rewards/margins": 0.034035492688417435, "rewards/rejected": -0.0023029325529932976, "step": 3 }, { "epoch": 0.0023703703703703703, "grad_norm": 12.956914901733398, "learning_rate": 3.75e-07, "logits/chosen": -3.9681053161621094, "logits/rejected": -4.464873313903809, "logps/chosen": -239.55850219726562, "logps/rejected": -202.18539428710938, "loss": 0.7201, "rewards/accuracies": 0.25, "rewards/chosen": 0.0021869614720344543, "rewards/margins": -0.05291424319148064, "rewards/rejected": 0.05510121211409569, "step": 4 }, { "epoch": 0.002962962962962963, "grad_norm": 6.8053297996521, "learning_rate": 2.934120444167326e-07, "logits/chosen": -4.139642715454102, "logits/rejected": -4.70045280456543, "logps/chosen": -169.15768432617188, "logps/rejected": -149.37445068359375, "loss": 0.6838, "rewards/accuracies": 0.5, "rewards/chosen": 0.014175796881318092, "rewards/margins": 0.0190824493765831, "rewards/rejected": -0.004906653426587582, "step": 5 }, { "epoch": 0.0035555555555555557, "grad_norm": 12.668423652648926, "learning_rate": 2.065879555832674e-07, "logits/chosen": -5.052216529846191, "logits/rejected": -5.141656875610352, "logps/chosen": -193.5230712890625, "logps/rejected": -202.5981903076172, "loss": 0.6645, "rewards/accuracies": 0.75, "rewards/chosen": 0.024214554578065872, "rewards/margins": 0.05957336351275444, "rewards/rejected": -0.03535880893468857, "step": 6 }, { "epoch": 0.004148148148148148, "grad_norm": 6.801946640014648, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -4.241697311401367, "logits/rejected": -3.6782054901123047, "logps/chosen": -176.48068237304688, "logps/rejected": -223.32989501953125, "loss": 0.702, "rewards/accuracies": 0.25, "rewards/chosen": -0.007834245450794697, "rewards/margins": -0.016704559326171875, "rewards/rejected": 0.008870314806699753, "step": 7 }, { "epoch": 0.004740740740740741, "grad_norm": 10.848843574523926, "learning_rate": 5.848888922025552e-08, "logits/chosen": -4.412553310394287, "logits/rejected": -4.416936874389648, "logps/chosen": -182.169677734375, "logps/rejected": -196.12384033203125, "loss": 0.7098, "rewards/accuracies": 0.25, "rewards/chosen": -0.046158790588378906, "rewards/margins": -0.0327875129878521, "rewards/rejected": -0.01337127760052681, "step": 8 }, { "epoch": 0.005333333333333333, "grad_norm": 9.211188316345215, "learning_rate": 1.507684480352292e-08, "logits/chosen": -4.912792205810547, "logits/rejected": -4.601197719573975, "logps/chosen": -154.6743927001953, "logps/rejected": -178.96322631835938, "loss": 0.7079, "rewards/accuracies": 0.5, "rewards/chosen": 0.004396629519760609, "rewards/margins": -0.02871532551944256, "rewards/rejected": 0.03311195224523544, "step": 9 }, { "epoch": 0.005925925925925926, "grad_norm": 9.900830268859863, "learning_rate": 0.0, "logits/chosen": -3.811270236968994, "logits/rejected": -3.6899518966674805, "logps/chosen": -232.9193878173828, "logps/rejected": -254.06576538085938, "loss": 0.7203, "rewards/accuracies": 0.25, "rewards/chosen": -0.048137664794921875, "rewards/margins": -0.05214844271540642, "rewards/rejected": 0.0040107727982103825, "step": 10 } ], "logging_steps": 1, "max_steps": 10, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }