scottsuk0306 committed on
Commit
bd90e6c
1 Parent(s): 7d9df90

Model save

Browse files
Files changed (4) hide show
  1. README.md +5 -10
  2. all_results.json +4 -10
  3. train_results.json +4 -4
  4. trainer_state.json +103 -358
README.md CHANGED
@@ -2,7 +2,6 @@
2
  license: gemma
3
  base_model: scottsuk0306/easylm-sft-gemma-2-2b
4
  tags:
5
- - easylm
6
  - trl
7
  - reward-trainer
8
  - generated_from_trainer
@@ -20,8 +19,8 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [scottsuk0306/easylm-sft-gemma-2-2b](https://huggingface.co/scottsuk0306/easylm-sft-gemma-2-2b) on an unknown dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 0.6525
24
- - Accuracy: 0.6146
25
 
26
  ## Model description
27
 
@@ -46,7 +45,8 @@ The following hyperparameters were used during training:
46
  - seed: 42
47
  - distributed_type: multi-GPU
48
  - num_devices: 8
49
- - total_train_batch_size: 16
 
50
  - total_eval_batch_size: 16
51
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
52
  - lr_scheduler_type: cosine
@@ -56,12 +56,7 @@ The following hyperparameters were used during training:
56
 
57
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
58
  |:-------------:|:------:|:----:|:---------------:|:--------:|
59
- | 0.6722 | 0.1667 | 100 | 0.6810 | 0.6354 |
60
- | 0.6883 | 0.3333 | 200 | 0.7212 | 0.5104 |
61
- | 0.6512 | 0.5 | 300 | 0.6590 | 0.625 |
62
- | 0.6842 | 0.6667 | 400 | 0.6537 | 0.6771 |
63
- | 0.698 | 0.8333 | 500 | 0.6507 | 0.6562 |
64
- | 0.6254 | 1.0 | 600 | 0.6525 | 0.6146 |
65
 
66
 
67
  ### Framework versions
 
2
  license: gemma
3
  base_model: scottsuk0306/easylm-sft-gemma-2-2b
4
  tags:
 
5
  - trl
6
  - reward-trainer
7
  - generated_from_trainer
 
19
 
20
  This model is a fine-tuned version of [scottsuk0306/easylm-sft-gemma-2-2b](https://huggingface.co/scottsuk0306/easylm-sft-gemma-2-2b) on an unknown dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.6616
23
+ - Accuracy: 0.6771
24
 
25
  ## Model description
26
 
 
45
  - seed: 42
46
  - distributed_type: multi-GPU
47
  - num_devices: 8
48
+ - gradient_accumulation_steps: 4
49
+ - total_train_batch_size: 64
50
  - total_eval_batch_size: 16
51
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
52
  - lr_scheduler_type: cosine
 
56
 
57
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
58
  |:-------------:|:------:|:----:|:---------------:|:--------:|
59
+ | 0.6716 | 0.6667 | 100 | 0.6616 | 0.6771 |
 
 
 
 
 
60
 
61
 
62
  ### Framework versions
all_results.json CHANGED
@@ -1,15 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_accuracy": 0.6145833333333334,
4
- "eval_loss": 0.6525471806526184,
5
- "eval_runtime": 0.6682,
6
- "eval_samples": 96,
7
- "eval_samples_per_second": 143.664,
8
- "eval_steps_per_second": 8.979,
9
  "total_flos": 0.0,
10
- "train_loss": 0.6666407775878906,
11
- "train_runtime": 1051.6323,
12
  "train_samples": 9595,
13
- "train_samples_per_second": 9.124,
14
- "train_steps_per_second": 0.571
15
  }
 
1
  {
2
  "epoch": 1.0,
 
 
 
 
 
 
3
  "total_flos": 0.0,
4
+ "train_loss": 0.6696457926432292,
5
+ "train_runtime": 575.0153,
6
  "train_samples": 9595,
7
+ "train_samples_per_second": 16.687,
8
+ "train_steps_per_second": 0.261
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 0.0,
4
- "train_loss": 0.6666407775878906,
5
- "train_runtime": 1051.6323,
6
  "train_samples": 9595,
7
- "train_samples_per_second": 9.124,
8
- "train_steps_per_second": 0.571
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.6696457926432292,
5
+ "train_runtime": 575.0153,
6
  "train_samples": 9595,
7
+ "train_samples_per_second": 16.687,
8
+ "train_steps_per_second": 0.261
9
  }
trainer_state.json CHANGED
@@ -3,497 +3,242 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 100,
6
- "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
- {
12
- "epoch": 0.016666666666666666,
13
- "grad_norm": 21.375614166259766,
14
- "learning_rate": 9.99314767377287e-07,
15
- "loss": 0.691,
16
- "step": 10
17
- },
18
  {
19
  "epoch": 0.03333333333333333,
20
- "grad_norm": 11.13298511505127,
21
  "learning_rate": 9.972609476841365e-07,
22
- "loss": 0.7027,
23
- "step": 20
24
- },
25
- {
26
- "epoch": 0.05,
27
- "grad_norm": 11.09454345703125,
28
- "learning_rate": 9.938441702975689e-07,
29
- "loss": 0.6766,
30
- "step": 30
31
  },
32
  {
33
  "epoch": 0.06666666666666667,
34
- "grad_norm": 19.415067672729492,
35
  "learning_rate": 9.890738003669027e-07,
36
- "loss": 0.6706,
37
- "step": 40
38
- },
39
- {
40
- "epoch": 0.08333333333333333,
41
- "grad_norm": 15.57960319519043,
42
- "learning_rate": 9.82962913144534e-07,
43
- "loss": 0.6716,
44
- "step": 50
45
  },
46
  {
47
  "epoch": 0.1,
48
- "grad_norm": 9.34956169128418,
49
  "learning_rate": 9.755282581475767e-07,
50
- "loss": 0.7021,
51
- "step": 60
52
- },
53
- {
54
- "epoch": 0.11666666666666667,
55
- "grad_norm": 10.283132553100586,
56
- "learning_rate": 9.667902132486008e-07,
57
- "loss": 0.6837,
58
- "step": 70
59
  },
60
  {
61
  "epoch": 0.13333333333333333,
62
- "grad_norm": 15.809244155883789,
63
  "learning_rate": 9.567727288213004e-07,
64
- "loss": 0.6661,
65
- "step": 80
66
- },
67
- {
68
- "epoch": 0.15,
69
- "grad_norm": 18.98931312561035,
70
- "learning_rate": 9.455032620941839e-07,
71
- "loss": 0.6708,
72
- "step": 90
73
  },
74
  {
75
  "epoch": 0.16666666666666666,
76
- "grad_norm": 16.9694766998291,
77
  "learning_rate": 9.330127018922193e-07,
78
- "loss": 0.6722,
79
- "step": 100
80
- },
81
- {
82
- "epoch": 0.16666666666666666,
83
- "eval_accuracy": 0.6354166666666666,
84
- "eval_loss": 0.6810302734375,
85
- "eval_runtime": 0.814,
86
- "eval_samples_per_second": 117.941,
87
- "eval_steps_per_second": 7.371,
88
- "step": 100
89
- },
90
- {
91
- "epoch": 0.18333333333333332,
92
- "grad_norm": 19.83936882019043,
93
- "learning_rate": 9.19335283972712e-07,
94
- "loss": 0.6659,
95
- "step": 110
96
  },
97
  {
98
  "epoch": 0.2,
99
- "grad_norm": 13.237977981567383,
100
  "learning_rate": 9.045084971874737e-07,
101
- "loss": 0.6897,
102
- "step": 120
103
- },
104
- {
105
- "epoch": 0.21666666666666667,
106
- "grad_norm": 10.554994583129883,
107
- "learning_rate": 8.885729807284854e-07,
108
- "loss": 0.6771,
109
- "step": 130
110
  },
111
  {
112
  "epoch": 0.23333333333333334,
113
- "grad_norm": 17.837158203125,
114
  "learning_rate": 8.71572412738697e-07,
115
- "loss": 0.6973,
116
- "step": 140
117
- },
118
- {
119
- "epoch": 0.25,
120
- "grad_norm": 6.722561836242676,
121
- "learning_rate": 8.535533905932737e-07,
122
- "loss": 0.689,
123
- "step": 150
124
  },
125
  {
126
  "epoch": 0.26666666666666666,
127
- "grad_norm": 7.517063617706299,
128
  "learning_rate": 8.34565303179429e-07,
129
- "loss": 0.6788,
130
- "step": 160
131
- },
132
- {
133
- "epoch": 0.2833333333333333,
134
- "grad_norm": 20.102699279785156,
135
- "learning_rate": 8.146601955249187e-07,
136
- "loss": 0.6564,
137
- "step": 170
138
  },
139
  {
140
  "epoch": 0.3,
141
- "grad_norm": 38.80488586425781,
142
  "learning_rate": 7.938926261462365e-07,
143
- "loss": 0.7027,
144
- "step": 180
145
- },
146
- {
147
- "epoch": 0.31666666666666665,
148
- "grad_norm": 25.59081268310547,
149
- "learning_rate": 7.723195175075135e-07,
150
- "loss": 0.6154,
151
- "step": 190
152
  },
153
  {
154
  "epoch": 0.3333333333333333,
155
- "grad_norm": 30.760852813720703,
156
  "learning_rate": 7.5e-07,
157
- "loss": 0.6883,
158
- "step": 200
159
- },
160
- {
161
- "epoch": 0.3333333333333333,
162
- "eval_accuracy": 0.5104166666666666,
163
- "eval_loss": 0.7211506962776184,
164
- "eval_runtime": 0.6312,
165
- "eval_samples_per_second": 152.089,
166
- "eval_steps_per_second": 9.506,
167
- "step": 200
168
- },
169
- {
170
- "epoch": 0.35,
171
- "grad_norm": 29.57691192626953,
172
- "learning_rate": 7.269952498697734e-07,
173
- "loss": 0.6856,
174
- "step": 210
175
  },
176
  {
177
  "epoch": 0.36666666666666664,
178
- "grad_norm": 20.42752456665039,
179
  "learning_rate": 7.033683215379002e-07,
180
- "loss": 0.6582,
181
- "step": 220
182
- },
183
- {
184
- "epoch": 0.38333333333333336,
185
- "grad_norm": 11.012421607971191,
186
- "learning_rate": 6.7918397477265e-07,
187
- "loss": 0.6808,
188
- "step": 230
189
  },
190
  {
191
  "epoch": 0.4,
192
- "grad_norm": 8.908668518066406,
193
  "learning_rate": 6.545084971874736e-07,
194
- "loss": 0.6795,
195
- "step": 240
196
- },
197
- {
198
- "epoch": 0.4166666666666667,
199
- "grad_norm": 8.45557689666748,
200
- "learning_rate": 6.294095225512604e-07,
201
- "loss": 0.6611,
202
- "step": 250
203
  },
204
  {
205
  "epoch": 0.43333333333333335,
206
- "grad_norm": 14.864495277404785,
207
  "learning_rate": 6.039558454088795e-07,
208
- "loss": 0.6253,
209
- "step": 260
210
- },
211
- {
212
- "epoch": 0.45,
213
- "grad_norm": 19.124385833740234,
214
- "learning_rate": 5.782172325201155e-07,
215
- "loss": 0.6431,
216
- "step": 270
217
  },
218
  {
219
  "epoch": 0.4666666666666667,
220
- "grad_norm": 11.030838012695312,
221
  "learning_rate": 5.522642316338268e-07,
222
- "loss": 0.6351,
223
- "step": 280
224
- },
225
- {
226
- "epoch": 0.48333333333333334,
227
- "grad_norm": 11.4014892578125,
228
- "learning_rate": 5.26167978121472e-07,
229
- "loss": 0.7384,
230
- "step": 290
231
  },
232
  {
233
  "epoch": 0.5,
234
- "grad_norm": 16.28369140625,
235
  "learning_rate": 5e-07,
236
- "loss": 0.6512,
237
- "step": 300
238
- },
239
- {
240
- "epoch": 0.5,
241
- "eval_accuracy": 0.625,
242
- "eval_loss": 0.6590169072151184,
243
- "eval_runtime": 0.9151,
244
- "eval_samples_per_second": 104.903,
245
- "eval_steps_per_second": 6.556,
246
- "step": 300
247
- },
248
- {
249
- "epoch": 0.5166666666666667,
250
- "grad_norm": 28.847896575927734,
251
- "learning_rate": 4.7383202187852804e-07,
252
- "loss": 0.6759,
253
- "step": 310
254
  },
255
  {
256
  "epoch": 0.5333333333333333,
257
- "grad_norm": 11.870433807373047,
258
  "learning_rate": 4.477357683661733e-07,
259
- "loss": 0.6917,
260
- "step": 320
261
- },
262
- {
263
- "epoch": 0.55,
264
- "grad_norm": 8.195870399475098,
265
- "learning_rate": 4.2178276747988444e-07,
266
- "loss": 0.6624,
267
- "step": 330
268
  },
269
  {
270
  "epoch": 0.5666666666666667,
271
- "grad_norm": 10.456932067871094,
272
  "learning_rate": 3.960441545911204e-07,
273
- "loss": 0.6807,
274
- "step": 340
275
- },
276
- {
277
- "epoch": 0.5833333333333334,
278
- "grad_norm": 9.85155963897705,
279
- "learning_rate": 3.7059047744873955e-07,
280
- "loss": 0.6642,
281
- "step": 350
282
  },
283
  {
284
  "epoch": 0.6,
285
- "grad_norm": 15.47450065612793,
286
  "learning_rate": 3.454915028125263e-07,
287
- "loss": 0.6446,
288
- "step": 360
289
- },
290
- {
291
- "epoch": 0.6166666666666667,
292
- "grad_norm": 13.11928939819336,
293
- "learning_rate": 3.2081602522734985e-07,
294
- "loss": 0.6441,
295
- "step": 370
296
  },
297
  {
298
  "epoch": 0.6333333333333333,
299
- "grad_norm": 14.159366607666016,
300
  "learning_rate": 2.9663167846209996e-07,
301
- "loss": 0.684,
302
- "step": 380
303
- },
304
- {
305
- "epoch": 0.65,
306
- "grad_norm": 10.897772789001465,
307
- "learning_rate": 2.730047501302266e-07,
308
- "loss": 0.6696,
309
- "step": 390
310
  },
311
  {
312
  "epoch": 0.6666666666666666,
313
- "grad_norm": 10.67762565612793,
314
  "learning_rate": 2.500000000000001e-07,
315
- "loss": 0.6842,
316
- "step": 400
317
  },
318
  {
319
  "epoch": 0.6666666666666666,
320
  "eval_accuracy": 0.6770833333333334,
321
- "eval_loss": 0.6537272334098816,
322
- "eval_runtime": 0.6549,
323
- "eval_samples_per_second": 146.577,
324
- "eval_steps_per_second": 9.161,
325
- "step": 400
326
- },
327
- {
328
- "epoch": 0.6833333333333333,
329
- "grad_norm": 11.094571113586426,
330
- "learning_rate": 2.2768048249248644e-07,
331
- "loss": 0.666,
332
- "step": 410
333
  },
334
  {
335
  "epoch": 0.7,
336
- "grad_norm": 11.438158988952637,
337
  "learning_rate": 2.0610737385376348e-07,
338
- "loss": 0.6485,
339
- "step": 420
340
- },
341
- {
342
- "epoch": 0.7166666666666667,
343
- "grad_norm": 13.184762954711914,
344
- "learning_rate": 1.8533980447508135e-07,
345
- "loss": 0.7017,
346
- "step": 430
347
  },
348
  {
349
  "epoch": 0.7333333333333333,
350
- "grad_norm": 14.232587814331055,
351
  "learning_rate": 1.6543469682057104e-07,
352
- "loss": 0.6189,
353
- "step": 440
354
- },
355
- {
356
- "epoch": 0.75,
357
- "grad_norm": 14.100610733032227,
358
- "learning_rate": 1.4644660940672627e-07,
359
- "loss": 0.6203,
360
- "step": 450
361
  },
362
  {
363
  "epoch": 0.7666666666666667,
364
- "grad_norm": 17.135536193847656,
365
  "learning_rate": 1.284275872613028e-07,
366
- "loss": 0.6481,
367
- "step": 460
368
- },
369
- {
370
- "epoch": 0.7833333333333333,
371
- "grad_norm": 11.631980895996094,
372
- "learning_rate": 1.1142701927151454e-07,
373
- "loss": 0.6196,
374
- "step": 470
375
  },
376
  {
377
  "epoch": 0.8,
378
- "grad_norm": 20.285520553588867,
379
  "learning_rate": 9.549150281252632e-08,
380
- "loss": 0.6731,
381
- "step": 480
382
- },
383
- {
384
- "epoch": 0.8166666666666667,
385
- "grad_norm": 21.172334671020508,
386
- "learning_rate": 8.066471602728803e-08,
387
- "loss": 0.6452,
388
- "step": 490
389
  },
390
  {
391
  "epoch": 0.8333333333333334,
392
- "grad_norm": 15.872474670410156,
393
  "learning_rate": 6.698729810778064e-08,
394
- "loss": 0.698,
395
- "step": 500
396
- },
397
- {
398
- "epoch": 0.8333333333333334,
399
- "eval_accuracy": 0.65625,
400
- "eval_loss": 0.6507161259651184,
401
- "eval_runtime": 0.6488,
402
- "eval_samples_per_second": 147.958,
403
- "eval_steps_per_second": 9.247,
404
- "step": 500
405
- },
406
- {
407
- "epoch": 0.85,
408
- "grad_norm": 17.40620231628418,
409
- "learning_rate": 5.44967379058161e-08,
410
- "loss": 0.6668,
411
- "step": 510
412
  },
413
  {
414
  "epoch": 0.8666666666666667,
415
- "grad_norm": 18.17245101928711,
416
  "learning_rate": 4.322727117869951e-08,
417
- "loss": 0.6736,
418
- "step": 520
419
- },
420
- {
421
- "epoch": 0.8833333333333333,
422
- "grad_norm": 16.35578727722168,
423
- "learning_rate": 3.3209786751399184e-08,
424
- "loss": 0.6311,
425
- "step": 530
426
  },
427
  {
428
  "epoch": 0.9,
429
- "grad_norm": 17.59069061279297,
430
  "learning_rate": 2.4471741852423233e-08,
431
- "loss": 0.6764,
432
- "step": 540
433
- },
434
- {
435
- "epoch": 0.9166666666666666,
436
- "grad_norm": 13.054784774780273,
437
- "learning_rate": 1.7037086855465898e-08,
438
- "loss": 0.6684,
439
- "step": 550
440
  },
441
  {
442
  "epoch": 0.9333333333333333,
443
- "grad_norm": 15.742310523986816,
444
  "learning_rate": 1.0926199633097154e-08,
445
- "loss": 0.6221,
446
- "step": 560
447
- },
448
- {
449
- "epoch": 0.95,
450
- "grad_norm": 16.06201934814453,
451
- "learning_rate": 6.15582970243117e-09,
452
- "loss": 0.6677,
453
- "step": 570
454
  },
455
  {
456
  "epoch": 0.9666666666666667,
457
- "grad_norm": 14.09910774230957,
458
  "learning_rate": 2.739052315863355e-09,
459
- "loss": 0.6592,
460
- "step": 580
461
- },
462
- {
463
- "epoch": 0.9833333333333333,
464
- "grad_norm": 15.314577102661133,
465
- "learning_rate": 6.852326227130833e-10,
466
- "loss": 0.6409,
467
- "step": 590
468
  },
469
  {
470
  "epoch": 1.0,
471
- "grad_norm": 12.720383644104004,
472
  "learning_rate": 0.0,
473
- "loss": 0.6254,
474
- "step": 600
475
- },
476
- {
477
- "epoch": 1.0,
478
- "eval_accuracy": 0.6145833333333334,
479
- "eval_loss": 0.6525471806526184,
480
- "eval_runtime": 0.6796,
481
- "eval_samples_per_second": 141.25,
482
- "eval_steps_per_second": 8.828,
483
- "step": 600
484
  },
485
  {
486
  "epoch": 1.0,
487
- "step": 600,
488
  "total_flos": 0.0,
489
- "train_loss": 0.6666407775878906,
490
- "train_runtime": 1051.6323,
491
- "train_samples_per_second": 9.124,
492
- "train_steps_per_second": 0.571
493
  }
494
  ],
495
- "logging_steps": 10,
496
- "max_steps": 600,
497
  "num_input_tokens_seen": 0,
498
  "num_train_epochs": 1,
499
  "save_steps": 500,
 
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 100,
6
+ "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
 
 
 
 
 
 
 
11
  {
12
  "epoch": 0.03333333333333333,
13
+ "grad_norm": 18.165170669555664,
14
  "learning_rate": 9.972609476841365e-07,
15
+ "loss": 0.7056,
16
+ "step": 5
 
 
 
 
 
 
 
17
  },
18
  {
19
  "epoch": 0.06666666666666667,
20
+ "grad_norm": 17.16242218017578,
21
  "learning_rate": 9.890738003669027e-07,
22
+ "loss": 0.6646,
23
+ "step": 10
 
 
 
 
 
 
 
24
  },
25
  {
26
  "epoch": 0.1,
27
+ "grad_norm": 21.325637817382812,
28
  "learning_rate": 9.755282581475767e-07,
29
+ "loss": 0.6945,
30
+ "step": 15
 
 
 
 
 
 
 
31
  },
32
  {
33
  "epoch": 0.13333333333333333,
34
+ "grad_norm": 14.320093154907227,
35
  "learning_rate": 9.567727288213004e-07,
36
+ "loss": 0.6743,
37
+ "step": 20
 
 
 
 
 
 
 
38
  },
39
  {
40
  "epoch": 0.16666666666666666,
41
+ "grad_norm": 10.650166511535645,
42
  "learning_rate": 9.330127018922193e-07,
43
+ "loss": 0.676,
44
+ "step": 25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  },
46
  {
47
  "epoch": 0.2,
48
+ "grad_norm": 13.821106910705566,
49
  "learning_rate": 9.045084971874737e-07,
50
+ "loss": 0.6838,
51
+ "step": 30
 
 
 
 
 
 
 
52
  },
53
  {
54
  "epoch": 0.23333333333333334,
55
+ "grad_norm": 9.151570320129395,
56
  "learning_rate": 8.71572412738697e-07,
57
+ "loss": 0.6718,
58
+ "step": 35
 
 
 
 
 
 
 
59
  },
60
  {
61
  "epoch": 0.26666666666666666,
62
+ "grad_norm": 7.747379302978516,
63
  "learning_rate": 8.34565303179429e-07,
64
+ "loss": 0.687,
65
+ "step": 40
 
 
 
 
 
 
 
66
  },
67
  {
68
  "epoch": 0.3,
69
+ "grad_norm": 9.819681167602539,
70
  "learning_rate": 7.938926261462365e-07,
71
+ "loss": 0.6653,
72
+ "step": 45
 
 
 
 
 
 
 
73
  },
74
  {
75
  "epoch": 0.3333333333333333,
76
+ "grad_norm": 8.698646545410156,
77
  "learning_rate": 7.5e-07,
78
+ "loss": 0.6708,
79
+ "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  },
81
  {
82
  "epoch": 0.36666666666666664,
83
+ "grad_norm": 8.188970565795898,
84
  "learning_rate": 7.033683215379002e-07,
85
+ "loss": 0.688,
86
+ "step": 55
 
 
 
 
 
 
 
87
  },
88
  {
89
  "epoch": 0.4,
90
+ "grad_norm": 7.681589603424072,
91
  "learning_rate": 6.545084971874736e-07,
92
+ "loss": 0.6833,
93
+ "step": 60
 
 
 
 
 
 
 
94
  },
95
  {
96
  "epoch": 0.43333333333333335,
97
+ "grad_norm": 7.524262428283691,
98
  "learning_rate": 6.039558454088795e-07,
99
+ "loss": 0.6585,
100
+ "step": 65
 
 
 
 
 
 
 
101
  },
102
  {
103
  "epoch": 0.4666666666666667,
104
+ "grad_norm": 9.614619255065918,
105
  "learning_rate": 5.522642316338268e-07,
106
+ "loss": 0.6364,
107
+ "step": 70
 
 
 
 
 
 
 
108
  },
109
  {
110
  "epoch": 0.5,
111
+ "grad_norm": 12.577666282653809,
112
  "learning_rate": 5e-07,
113
+ "loss": 0.6842,
114
+ "step": 75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  },
116
  {
117
  "epoch": 0.5333333333333333,
118
+ "grad_norm": 11.13433837890625,
119
  "learning_rate": 4.477357683661733e-07,
120
+ "loss": 0.6968,
121
+ "step": 80
 
 
 
 
 
 
 
122
  },
123
  {
124
  "epoch": 0.5666666666666667,
125
+ "grad_norm": 9.573627471923828,
126
  "learning_rate": 3.960441545911204e-07,
127
+ "loss": 0.6679,
128
+ "step": 85
 
 
 
 
 
 
 
129
  },
130
  {
131
  "epoch": 0.6,
132
+ "grad_norm": 6.869621753692627,
133
  "learning_rate": 3.454915028125263e-07,
134
+ "loss": 0.672,
135
+ "step": 90
 
 
 
 
 
 
 
136
  },
137
  {
138
  "epoch": 0.6333333333333333,
139
+ "grad_norm": 6.626986026763916,
140
  "learning_rate": 2.9663167846209996e-07,
141
+ "loss": 0.6644,
142
+ "step": 95
 
 
 
 
 
 
 
143
  },
144
  {
145
  "epoch": 0.6666666666666666,
146
+ "grad_norm": 7.816995143890381,
147
  "learning_rate": 2.500000000000001e-07,
148
+ "loss": 0.6716,
149
+ "step": 100
150
  },
151
  {
152
  "epoch": 0.6666666666666666,
153
  "eval_accuracy": 0.6770833333333334,
154
+ "eval_loss": 0.6615803837776184,
155
+ "eval_runtime": 0.6663,
156
+ "eval_samples_per_second": 144.07,
157
+ "eval_steps_per_second": 9.004,
158
+ "step": 100
 
 
 
 
 
 
 
159
  },
160
  {
161
  "epoch": 0.7,
162
+ "grad_norm": 7.169824123382568,
163
  "learning_rate": 2.0610737385376348e-07,
164
+ "loss": 0.6527,
165
+ "step": 105
 
 
 
 
 
 
 
166
  },
167
  {
168
  "epoch": 0.7333333333333333,
169
+ "grad_norm": 7.605311393737793,
170
  "learning_rate": 1.6543469682057104e-07,
171
+ "loss": 0.661,
172
+ "step": 110
 
 
 
 
 
 
 
173
  },
174
  {
175
  "epoch": 0.7666666666666667,
176
+ "grad_norm": 8.795389175415039,
177
  "learning_rate": 1.284275872613028e-07,
178
+ "loss": 0.644,
179
+ "step": 115
 
 
 
 
 
 
 
180
  },
181
  {
182
  "epoch": 0.8,
183
+ "grad_norm": 9.404634475708008,
184
  "learning_rate": 9.549150281252632e-08,
185
+ "loss": 0.6536,
186
+ "step": 120
 
 
 
 
 
 
 
187
  },
188
  {
189
  "epoch": 0.8333333333333334,
190
+ "grad_norm": 9.008624076843262,
191
  "learning_rate": 6.698729810778064e-08,
192
+ "loss": 0.6744,
193
+ "step": 125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  },
195
  {
196
  "epoch": 0.8666666666666667,
197
+ "grad_norm": 8.526646614074707,
198
  "learning_rate": 4.322727117869951e-08,
199
+ "loss": 0.6722,
200
+ "step": 130
 
 
 
 
 
 
 
201
  },
202
  {
203
  "epoch": 0.9,
204
+ "grad_norm": 8.827940940856934,
205
  "learning_rate": 2.4471741852423233e-08,
206
+ "loss": 0.6599,
207
+ "step": 135
 
 
 
 
 
 
 
208
  },
209
  {
210
  "epoch": 0.9333333333333333,
211
+ "grad_norm": 7.812473297119141,
212
  "learning_rate": 1.0926199633097154e-08,
213
+ "loss": 0.645,
214
+ "step": 140
 
 
 
 
 
 
 
215
  },
216
  {
217
  "epoch": 0.9666666666666667,
218
+ "grad_norm": 10.083480834960938,
219
  "learning_rate": 2.739052315863355e-09,
220
+ "loss": 0.6633,
221
+ "step": 145
 
 
 
 
 
 
 
222
  },
223
  {
224
  "epoch": 1.0,
225
+ "grad_norm": 9.186869621276855,
226
  "learning_rate": 0.0,
227
+ "loss": 0.6466,
228
+ "step": 150
 
 
 
 
 
 
 
 
 
229
  },
230
  {
231
  "epoch": 1.0,
232
+ "step": 150,
233
  "total_flos": 0.0,
234
+ "train_loss": 0.6696457926432292,
235
+ "train_runtime": 575.0153,
236
+ "train_samples_per_second": 16.687,
237
+ "train_steps_per_second": 0.261
238
  }
239
  ],
240
+ "logging_steps": 5,
241
+ "max_steps": 150,
242
  "num_input_tokens_seen": 0,
243
  "num_train_epochs": 1,
244
  "save_steps": 500,