alexredna committed on
Commit
0e2e36c
1 Parent(s): 2f83532

Model save

Browse files
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 1.2578
24
 
25
  ## Model description
26
 
@@ -40,12 +40,12 @@ More information needed
40
 
41
  The following hyperparameters were used during training:
42
  - learning_rate: 5e-05
43
- - train_batch_size: 10
44
- - eval_batch_size: 5
45
  - seed: 42
46
  - distributed_type: multi-GPU
47
  - gradient_accumulation_steps: 20
48
- - total_train_batch_size: 200
49
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
  - lr_scheduler_type: cosine
51
  - num_epochs: 1
@@ -54,8 +54,10 @@ The following hyperparameters were used during training:
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
- | 1.3338 | 0.39 | 10 | 1.2857 |
58
- | 1.2798 | 0.79 | 20 | 1.2591 |
 
 
59
 
60
 
61
  ### Framework versions
 
20
 
21
  This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.2243
24
 
25
  ## Model description
26
 
 
40
 
41
  The following hyperparameters were used during training:
42
  - learning_rate: 5e-05
43
+ - train_batch_size: 6
44
+ - eval_batch_size: 3
45
  - seed: 42
46
  - distributed_type: multi-GPU
47
  - gradient_accumulation_steps: 20
48
+ - total_train_batch_size: 120
49
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
  - lr_scheduler_type: cosine
51
  - num_epochs: 1
 
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
+ | 1.3384 | 0.24 | 10 | 1.2810 |
58
+ | 1.2633 | 0.47 | 20 | 1.2418 |
59
+ | 1.2495 | 0.71 | 30 | 1.2277 |
60
+ | 1.2291 | 0.94 | 40 | 1.2244 |
61
 
62
 
63
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:98cd3890c51ae5ffa91d126fdd9651957d93ea83b95be60e19440fa9853572b2
3
  size 26361536
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f965e58223d70e7e4cada5a72a84a621326ead0de0547c6040e366e7c5849d7d
3
  size 26361536
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 0.98,
3
- "eval_loss": 1.257819414138794,
4
- "eval_runtime": 24.0321,
5
  "eval_samples": 662,
6
- "eval_samples_per_second": 5.368,
7
- "eval_steps_per_second": 1.082,
8
- "train_loss": 1.3208525276184082,
9
- "train_runtime": 3151.0527,
10
  "train_samples": 25778,
11
- "train_samples_per_second": 1.613,
12
- "train_steps_per_second": 0.008
13
  }
 
1
  {
2
+ "epoch": 0.99,
3
+ "eval_loss": 1.2242895364761353,
4
+ "eval_runtime": 24.7608,
5
  "eval_samples": 662,
6
+ "eval_samples_per_second": 5.21,
7
+ "eval_steps_per_second": 1.737,
8
+ "train_loss": 1.282569306237357,
9
+ "train_runtime": 3226.4767,
10
  "train_samples": 25778,
11
+ "train_samples_per_second": 1.576,
12
+ "train_steps_per_second": 0.013
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.98,
3
- "eval_loss": 1.257819414138794,
4
- "eval_runtime": 24.0321,
5
  "eval_samples": 662,
6
- "eval_samples_per_second": 5.368,
7
- "eval_steps_per_second": 1.082
8
  }
 
1
  {
2
+ "epoch": 0.99,
3
+ "eval_loss": 1.2242895364761353,
4
+ "eval_runtime": 24.7608,
5
  "eval_samples": 662,
6
+ "eval_samples_per_second": 5.21,
7
+ "eval_steps_per_second": 1.737
8
  }
runs/Jan20_09-47-06_98f107f1aa39/events.out.tfevents.1705744129.98f107f1aa39.59272.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ec6843eaefbf474d9105d7c7f25f544603a96e1b0683b083a1eae513311c0b6
3
- size 5822
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:195513f1517de6140e91b2d0ea1567bf0817d870483f26c6ad36ed3d7ca013e8
3
+ size 7318
runs/Jan20_09-47-06_98f107f1aa39/events.out.tfevents.1705747380.98f107f1aa39.59272.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c2ed3583edf8c7c082ba0e054e067c2c515a9f4a8eaf101cf3132cbda44515c
3
+ size 354
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.98,
3
- "train_loss": 1.3208525276184082,
4
- "train_runtime": 3151.0527,
5
  "train_samples": 25778,
6
- "train_samples_per_second": 1.613,
7
- "train_steps_per_second": 0.008
8
  }
 
1
  {
2
+ "epoch": 0.99,
3
+ "train_loss": 1.282569306237357,
4
+ "train_runtime": 3226.4767,
5
  "train_samples": 25778,
6
+ "train_samples_per_second": 1.576,
7
+ "train_steps_per_second": 0.013
8
  }
trainer_state.json CHANGED
@@ -1,82 +1,116 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9823182711198428,
5
  "eval_steps": 10,
6
- "global_step": 25,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.04,
13
- "learning_rate": 4.980286753286195e-05,
14
- "loss": 1.4539,
15
  "step": 1
16
  },
17
  {
18
- "epoch": 0.2,
19
- "learning_rate": 4.522542485937369e-05,
20
- "loss": 1.4374,
21
  "step": 5
22
  },
23
  {
24
- "epoch": 0.39,
25
- "learning_rate": 3.272542485937369e-05,
26
- "loss": 1.3338,
27
  "step": 10
28
  },
29
  {
30
- "epoch": 0.39,
31
- "eval_loss": 1.285738229751587,
32
- "eval_runtime": 24.0251,
33
- "eval_samples_per_second": 5.369,
34
- "eval_steps_per_second": 1.082,
35
  "step": 10
36
  },
37
  {
38
- "epoch": 0.59,
39
- "learning_rate": 1.7274575140626318e-05,
40
- "loss": 1.286,
41
  "step": 15
42
  },
43
  {
44
- "epoch": 0.79,
45
- "learning_rate": 4.7745751406263165e-06,
46
- "loss": 1.2798,
47
  "step": 20
48
  },
49
  {
50
- "epoch": 0.79,
51
- "eval_loss": 1.259059190750122,
52
- "eval_runtime": 24.0291,
53
- "eval_samples_per_second": 5.368,
54
- "eval_steps_per_second": 1.082,
55
  "step": 20
56
  },
57
  {
58
- "epoch": 0.98,
59
- "learning_rate": 0.0,
60
- "loss": 1.2639,
61
  "step": 25
62
  },
63
  {
64
- "epoch": 0.98,
65
- "step": 25,
66
- "total_flos": 6.39647288959959e+16,
67
- "train_loss": 1.3208525276184082,
68
- "train_runtime": 3151.0527,
69
- "train_samples_per_second": 1.613,
70
- "train_steps_per_second": 0.008
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  }
72
  ],
73
  "logging_steps": 5,
74
- "max_steps": 25,
75
  "num_input_tokens_seen": 0,
76
  "num_train_epochs": 1,
77
  "save_steps": 20,
78
- "total_flos": 6.39647288959959e+16,
79
- "train_batch_size": 10,
80
  "trial_name": null,
81
  "trial_params": null
82
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9905660377358491,
5
  "eval_steps": 10,
6
+ "global_step": 42,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.02,
13
+ "learning_rate": 4.9930094929529506e-05,
14
+ "loss": 1.458,
15
  "step": 1
16
  },
17
  {
18
+ "epoch": 0.12,
19
+ "learning_rate": 4.827184371610511e-05,
20
+ "loss": 1.4238,
21
  "step": 5
22
  },
23
  {
24
+ "epoch": 0.24,
25
+ "learning_rate": 4.332629679574566e-05,
26
+ "loss": 1.3384,
27
  "step": 10
28
  },
29
  {
30
+ "epoch": 0.24,
31
+ "eval_loss": 1.2809978723526,
32
+ "eval_runtime": 24.7617,
33
+ "eval_samples_per_second": 5.21,
34
+ "eval_steps_per_second": 1.737,
35
  "step": 10
36
  },
37
  {
38
+ "epoch": 0.35,
39
+ "learning_rate": 3.5847093477938956e-05,
40
+ "loss": 1.2865,
41
  "step": 15
42
  },
43
  {
44
+ "epoch": 0.47,
45
+ "learning_rate": 2.686825233966061e-05,
46
+ "loss": 1.2633,
47
  "step": 20
48
  },
49
  {
50
+ "epoch": 0.47,
51
+ "eval_loss": 1.2418025732040405,
52
+ "eval_runtime": 24.7653,
53
+ "eval_samples_per_second": 5.209,
54
+ "eval_steps_per_second": 1.736,
55
  "step": 20
56
  },
57
  {
58
+ "epoch": 0.59,
59
+ "learning_rate": 1.7631120639727393e-05,
60
+ "loss": 1.2433,
61
  "step": 25
62
  },
63
  {
64
+ "epoch": 0.71,
65
+ "learning_rate": 9.412754953531663e-06,
66
+ "loss": 1.2495,
67
+ "step": 30
68
+ },
69
+ {
70
+ "epoch": 0.71,
71
+ "eval_loss": 1.2276524305343628,
72
+ "eval_runtime": 24.7715,
73
+ "eval_samples_per_second": 5.208,
74
+ "eval_steps_per_second": 1.736,
75
+ "step": 30
76
+ },
77
+ {
78
+ "epoch": 0.83,
79
+ "learning_rate": 3.3493649053890326e-06,
80
+ "loss": 1.2364,
81
+ "step": 35
82
+ },
83
+ {
84
+ "epoch": 0.94,
85
+ "learning_rate": 2.7922934437178695e-07,
86
+ "loss": 1.2291,
87
+ "step": 40
88
+ },
89
+ {
90
+ "epoch": 0.94,
91
+ "eval_loss": 1.2243515253067017,
92
+ "eval_runtime": 24.7908,
93
+ "eval_samples_per_second": 5.204,
94
+ "eval_steps_per_second": 1.735,
95
+ "step": 40
96
+ },
97
+ {
98
+ "epoch": 0.99,
99
+ "step": 42,
100
+ "total_flos": 6.447644673468006e+16,
101
+ "train_loss": 1.282569306237357,
102
+ "train_runtime": 3226.4767,
103
+ "train_samples_per_second": 1.576,
104
+ "train_steps_per_second": 0.013
105
  }
106
  ],
107
  "logging_steps": 5,
108
+ "max_steps": 42,
109
  "num_input_tokens_seen": 0,
110
  "num_train_epochs": 1,
111
  "save_steps": 20,
112
+ "total_flos": 6.447644673468006e+16,
113
+ "train_batch_size": 6,
114
  "trial_name": null,
115
  "trial_params": null
116
  }