{
  "output_dir": "/data2/assaf/mamba/outputs/models",
  "cache_dir": "/data2/hf_cache",
  "activate_logging": true,
  "wandb_dir": "/data2/assaf/wandb/mamba",
  "run_name_addon": "",
  "record_debug_params": false,
  "eval_mode": false,
  "mamba_arch": "vanilla",
  "model_type": "mamba-130m",
  "use_finetuned_model": false,
  "load_cp": null,
  "clip_grad": true,
  "clip_grad_max_norm": 1,
  "seed": 123,
  "lr_sched_type": "const",
  "sampling_temperature": 1.2,
  "save_steps": 100,
  "eval_steps": 20,
  "grad_flow_steps": 10,
  "max_step": 20000,
  "epochs": 5,
  "model_device": "cuda:6",
  "dataset": "niah_custom",
  "train_set_size": 6144,
  "eval_set_size": 20,
  "eval_samples_to_log": 30,
  "eval_max_len": 20,
  "max_train_input_len": 20000,
  "scrolls_evaluator_path": "/data1/assaf/datasets/scrolls/evaluator/dataset_evaluator.py",
  "niah_context_len_train": 2000,
  "niah_needle_depths_eval": [0, 0.25, 0.5, 0.75, 1],
  "niah_context_lens_eval": [1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000],
  "ppl_test_context_len_train": 2000,
  "ppl_test_pred_len": 30,
  "ppl_test_context_lens_eval": [1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000],
  "ppl_test_num_windows_per_context_len_eval": 100,
  "deci_num_chunks": 1,
  "activate_decimation": false,
  "decimation_type": "max_p",
  "decimation_k": 2,
  "min_decimating_layer": 12,
  "max_decimating_layer": 20,
  "decimating_layers": [21, 27, 28, 32],
  "decimation_min_seq_len": 20,
  "decimation_max_p_L_base": 2000,
  "lr": 0.0001,
  "weight_decay": 0.1,
  "grad_accum_steps": 32,
  "activate_profiling": false
}