{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.005925925925925926, "eval_steps": 500, "global_step": 10, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005925925925925926, "grad_norm": 10.913461685180664, "learning_rate": 5e-07, "logits/chosen": -4.444676399230957, "logits/rejected": -4.0909342765808105, "logps/chosen": -186.1875, "logps/rejected": -228.68560791015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0011851851851851852, "grad_norm": 7.564138412475586, "learning_rate": 4.849231551964771e-07, "logits/chosen": -4.530362606048584, "logits/rejected": -3.981240749359131, "logps/chosen": -192.2100830078125, "logps/rejected": -213.73086547851562, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0017777777777777779, "grad_norm": 14.393203735351562, "learning_rate": 4.415111107797445e-07, "logits/chosen": -4.539660930633545, "logits/rejected": -4.32346248626709, "logps/chosen": -200.69912719726562, "logps/rejected": -224.77163696289062, "loss": 0.6526, "rewards/accuracies": 1.0, "rewards/chosen": 0.04995880275964737, "rewards/margins": 0.08375511318445206, "rewards/rejected": -0.03379631042480469, "step": 3 }, { "epoch": 0.0023703703703703703, "grad_norm": 12.62684440612793, "learning_rate": 3.75e-07, "logits/chosen": -3.9760360717773438, "logits/rejected": -4.4724321365356445, "logps/chosen": -239.9991455078125, "logps/rejected": -202.63072204589844, "loss": 0.7198, "rewards/accuracies": 0.0, "rewards/chosen": -0.041876986622810364, "rewards/margins": -0.052445217967033386, "rewards/rejected": 0.01056823693215847, "step": 4 }, { "epoch": 0.002962962962962963, "grad_norm": 6.745312213897705, "learning_rate": 2.934120444167326e-07, "logits/chosen": -4.136646270751953, "logits/rejected": -4.701557159423828, "logps/chosen": -169.00192260742188, "logps/rejected": -149.2642059326172, "loss": 0.6818, "rewards/accuracies": 0.5, "rewards/chosen": 0.02975158765912056, "rewards/margins": 0.023634720593690872, "rewards/rejected": 0.006116867531090975, "step": 5 }, { "epoch": 0.0035555555555555557, "grad_norm": 13.741002082824707, "learning_rate": 2.065879555832674e-07, "logits/chosen": -5.057063579559326, "logits/rejected": -5.147495746612549, "logps/chosen": -193.6820526123047, "logps/rejected": -202.20425415039062, "loss": 0.6913, "rewards/accuracies": 0.75, "rewards/chosen": 0.0083160400390625, "rewards/margins": 0.004281995818018913, "rewards/rejected": 0.0040340423583984375, "step": 6 }, { "epoch": 0.004148148148148148, "grad_norm": 6.920108795166016, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -4.2448506355285645, "logits/rejected": -3.682173490524292, "logps/chosen": -176.11734008789062, "logps/rejected": -222.982666015625, "loss": 0.7008, "rewards/accuracies": 0.0, "rewards/chosen": 0.028499603271484375, "rewards/margins": -0.015093998052179813, "rewards/rejected": 0.043593600392341614, "step": 7 }, { "epoch": 0.004740740740740741, "grad_norm": 10.856575965881348, "learning_rate": 5.848888922025552e-08, "logits/chosen": -4.413946151733398, "logits/rejected": -4.419940948486328, "logps/chosen": -181.99610900878906, "logps/rejected": -196.20811462402344, "loss": 0.6967, "rewards/accuracies": 0.25, "rewards/chosen": -0.02880115620791912, "rewards/margins": -0.007002831436693668, "rewards/rejected": -0.021798323839902878, "step": 8 }, { "epoch": 0.005333333333333333, "grad_norm": 9.559263229370117, "learning_rate": 1.507684480352292e-08, "logits/chosen": -4.909343242645264, "logits/rejected": -4.601991176605225, "logps/chosen": -154.6862030029297, "logps/rejected": -179.27401733398438, "loss": 0.6929, "rewards/accuracies": 0.25, "rewards/chosen": 0.0032157916575670242, "rewards/margins": 0.0011837054044008255, "rewards/rejected": 0.0020320871844887733, "step": 9 }, { "epoch": 0.005925925925925926, "grad_norm": 10.816996574401855, "learning_rate": 0.0, "logits/chosen": -3.8037424087524414, "logits/rejected": -3.683922529220581, "logps/chosen": -232.978515625, "logps/rejected": -254.24078369140625, "loss": 0.7146, "rewards/accuracies": 0.5, "rewards/chosen": -0.05405044183135033, "rewards/margins": -0.040559008717536926, "rewards/rejected": -0.013491439633071423, "step": 10 } ], "logging_steps": 1, "max_steps": 10, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }