chansung commited on
Commit
44a7f2e
·
verified ·
1 Parent(s): 588cc80

Model save

Browse files
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 1.6544
24
 
25
  ## Model description
26
 
@@ -57,7 +57,7 @@ The following hyperparameters were used during training:
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:-----:|:----:|:---------------:|
60
- | 0.8317 | 1.0 | 301 | 1.6544 |
61
 
62
 
63
  ### Framework versions
 
20
 
21
  This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.6534
24
 
25
  ## Model description
26
 
 
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:-----:|:----:|:---------------:|
60
+ | 0.8265 | 1.0 | 301 | 1.6534 |
61
 
62
 
63
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 1.6543629169464111,
4
- "eval_runtime": 1.2677,
5
- "eval_samples": 16,
6
- "eval_samples_per_second": 7.888,
7
- "eval_steps_per_second": 0.789,
8
  "total_flos": 9.17802542639874e+17,
9
- "train_loss": 2.9563070911901734,
10
- "train_runtime": 3725.7135,
11
  "train_samples": 116368,
12
- "train_samples_per_second": 10.33,
13
- "train_steps_per_second": 0.081
14
  }
 
1
  {
2
  "epoch": 1.0,
 
 
 
 
 
3
  "total_flos": 9.17802542639874e+17,
4
+ "train_loss": 2.9511482283918564,
5
+ "train_runtime": 778.9577,
6
  "train_samples": 116368,
7
+ "train_samples_per_second": 49.406,
8
+ "train_steps_per_second": 0.386
9
  }
runs/Nov18_01-59-33_main-lora-gemma7b-coding-0-0/events.out.tfevents.1731913882.main-lora-gemma7b-coding-0-0.458.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:536b06f752fea660457651812e23328001c4b4f172a1057b994a0f2bb1cbafd8
3
- size 18466
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0105e1f113726d8e52c40af25e0adf9d33715470ea11320d7896d239fe48e085
3
+ size 19091
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 9.17802542639874e+17,
4
- "train_loss": 2.9563070911901734,
5
- "train_runtime": 3725.7135,
6
  "train_samples": 116368,
7
- "train_samples_per_second": 10.33,
8
- "train_steps_per_second": 0.081
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 9.17802542639874e+17,
4
+ "train_loss": 2.9511482283918564,
5
+ "train_runtime": 778.9577,
6
  "train_samples": 116368,
7
+ "train_samples_per_second": 49.406,
8
+ "train_steps_per_second": 0.386
9
  }
trainer_state.json CHANGED
@@ -10,447 +10,447 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0033222591362126247,
13
- "grad_norm": 58.848934173583984,
14
  "learning_rate": 6.451612903225806e-06,
15
- "loss": 20.4567,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.016611295681063124,
20
- "grad_norm": 35.33861541748047,
21
  "learning_rate": 3.2258064516129034e-05,
22
- "loss": 20.5366,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.03322259136212625,
27
- "grad_norm": 16.172595977783203,
28
  "learning_rate": 6.451612903225807e-05,
29
- "loss": 17.7569,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.04983388704318937,
34
- "grad_norm": 7.711285591125488,
35
  "learning_rate": 9.677419354838711e-05,
36
- "loss": 15.7111,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.0664451827242525,
41
- "grad_norm": 7.137362957000732,
42
  "learning_rate": 0.00012903225806451613,
43
- "loss": 14.1919,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.08305647840531562,
48
- "grad_norm": 2.8490428924560547,
49
  "learning_rate": 0.00016129032258064516,
50
- "loss": 13.0076,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.09966777408637874,
55
- "grad_norm": 3.174508571624756,
56
  "learning_rate": 0.00019354838709677422,
57
- "loss": 12.4104,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.11627906976744186,
62
- "grad_norm": 6.637372970581055,
63
  "learning_rate": 0.0001998917111338525,
64
- "loss": 11.4493,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.132890365448505,
69
- "grad_norm": 12.817553520202637,
70
  "learning_rate": 0.00019945218953682734,
71
- "loss": 10.3687,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.14950166112956811,
76
- "grad_norm": 17.324180603027344,
77
  "learning_rate": 0.00019867615321125795,
78
- "loss": 8.29,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.16611295681063123,
83
- "grad_norm": 20.25031852722168,
84
  "learning_rate": 0.00019756622801842143,
85
- "loss": 5.0469,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.18272425249169436,
90
- "grad_norm": 10.32929801940918,
91
  "learning_rate": 0.0001961261695938319,
92
- "loss": 2.5342,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.19933554817275748,
97
- "grad_norm": 6.437869071960449,
98
  "learning_rate": 0.00019436085063935835,
99
- "loss": 1.8296,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.2159468438538206,
104
- "grad_norm": 2.7442312240600586,
105
  "learning_rate": 0.00019227624443554425,
106
- "loss": 1.5211,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.23255813953488372,
111
- "grad_norm": 1.103617548942566,
112
  "learning_rate": 0.0001898794046299167,
113
- "loss": 1.342,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.24916943521594684,
118
- "grad_norm": 0.8008713722229004,
119
  "learning_rate": 0.00018717844136967624,
120
- "loss": 1.2224,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.26578073089701,
125
- "grad_norm": 0.9825754761695862,
126
  "learning_rate": 0.00018418249385952575,
127
- "loss": 1.1639,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.2823920265780731,
132
- "grad_norm": 1.6092805862426758,
133
  "learning_rate": 0.00018090169943749476,
134
- "loss": 1.106,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.29900332225913623,
139
- "grad_norm": 0.7608142495155334,
140
  "learning_rate": 0.0001773471592733964,
141
- "loss": 1.0649,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.31561461794019935,
146
- "grad_norm": 0.7974920272827148,
147
  "learning_rate": 0.0001735309008059829,
148
- "loss": 1.0544,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.33222591362126247,
153
- "grad_norm": 0.8733552694320679,
154
  "learning_rate": 0.00016946583704589973,
155
- "loss": 1.0418,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.3488372093023256,
160
- "grad_norm": 0.8620198965072632,
161
  "learning_rate": 0.00016516572288214552,
162
- "loss": 1.0134,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.3654485049833887,
167
- "grad_norm": 0.9430036544799805,
168
  "learning_rate": 0.00016064510853988138,
169
- "loss": 1.0117,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.38205980066445183,
174
- "grad_norm": 0.9424466490745544,
175
  "learning_rate": 0.0001559192903470747,
176
- "loss": 0.9687,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.39867109634551495,
181
- "grad_norm": 0.7452667355537415,
182
  "learning_rate": 0.00015100425897656753,
183
- "loss": 0.9678,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.4152823920265781,
188
- "grad_norm": 0.6089577674865723,
189
  "learning_rate": 0.00014591664533870118,
190
- "loss": 0.9428,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.4318936877076412,
195
- "grad_norm": 0.5161769390106201,
196
  "learning_rate": 0.00014067366430758004,
197
- "loss": 0.9344,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.4485049833887043,
202
- "grad_norm": 1.061056137084961,
203
  "learning_rate": 0.00013529305647138687,
204
- "loss": 0.9316,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.46511627906976744,
209
- "grad_norm": 1.1914814710617065,
210
  "learning_rate": 0.0001297930281038482,
211
- "loss": 0.932,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.48172757475083056,
216
- "grad_norm": 1.3082082271575928,
217
  "learning_rate": 0.00012419218955996676,
218
- "loss": 0.9225,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.4983388704318937,
223
- "grad_norm": 0.9995535612106323,
224
  "learning_rate": 0.00011850949230447145,
225
- "loss": 0.9153,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.5149501661129569,
230
- "grad_norm": 0.5884081125259399,
231
  "learning_rate": 0.00011276416478605949,
232
- "loss": 0.9105,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.53156146179402,
237
- "grad_norm": 0.9028487801551819,
238
  "learning_rate": 0.00010697564737441252,
239
- "loss": 0.9142,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.5481727574750831,
244
- "grad_norm": 0.9643915891647339,
245
  "learning_rate": 0.00010116352658013973,
246
- "loss": 0.8963,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.5647840531561462,
251
- "grad_norm": 0.6744817495346069,
252
  "learning_rate": 9.534746878022534e-05,
253
- "loss": 0.9227,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.5813953488372093,
258
- "grad_norm": 0.5441331267356873,
259
  "learning_rate": 8.954715367323468e-05,
260
- "loss": 0.8981,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.5980066445182725,
265
- "grad_norm": 0.6811143755912781,
266
  "learning_rate": 8.378220768944327e-05,
267
- "loss": 0.8746,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.6146179401993356,
272
- "grad_norm": 0.5517932176589966,
273
  "learning_rate": 7.807213758120966e-05,
274
- "loss": 0.8821,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.6312292358803987,
279
- "grad_norm": 1.0475077629089355,
280
  "learning_rate": 7.243626441830009e-05,
281
- "loss": 0.8766,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.6478405315614618,
286
- "grad_norm": 0.8785480856895447,
287
  "learning_rate": 6.68936582115042e-05,
288
- "loss": 0.8696,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.6644518272425249,
293
- "grad_norm": 0.5514780282974243,
294
  "learning_rate": 6.146307338575519e-05,
295
- "loss": 0.8435,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.6810631229235881,
300
- "grad_norm": 0.7921276688575745,
301
  "learning_rate": 5.616288532109225e-05,
302
- "loss": 0.8613,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.6976744186046512,
307
- "grad_norm": 1.2019799947738647,
308
  "learning_rate": 5.101102817619131e-05,
309
- "loss": 0.8491,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.7142857142857143,
314
- "grad_norm": 0.5968192219734192,
315
  "learning_rate": 4.6024934204848745e-05,
316
- "loss": 0.8399,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.7308970099667774,
321
- "grad_norm": 0.622888445854187,
322
  "learning_rate": 4.12214747707527e-05,
323
- "loss": 0.8589,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.7475083056478405,
328
- "grad_norm": 0.6290799379348755,
329
  "learning_rate": 3.661690326012897e-05,
330
- "loss": 0.8479,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.7641196013289037,
335
- "grad_norm": 0.5619077086448669,
336
  "learning_rate": 3.222680008542678e-05,
337
- "loss": 0.8341,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.7807308970099668,
342
- "grad_norm": 0.4804949164390564,
343
  "learning_rate": 2.8066019966134904e-05,
344
- "loss": 0.8355,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.7973421926910299,
349
- "grad_norm": 0.8997146487236023,
350
  "learning_rate": 2.4148641665113113e-05,
351
- "loss": 0.8305,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.813953488372093,
356
- "grad_norm": 0.62757408618927,
357
  "learning_rate": 2.0487920350515212e-05,
358
- "loss": 0.8352,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.8305647840531561,
363
- "grad_norm": 0.6770641207695007,
364
  "learning_rate": 1.7096242744495837e-05,
365
- "loss": 0.8365,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.8471760797342193,
370
- "grad_norm": 0.8780685663223267,
371
  "learning_rate": 1.3985085210463477e-05,
372
- "loss": 0.8397,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.8637873754152824,
377
- "grad_norm": 0.5401960611343384,
378
  "learning_rate": 1.116497492069961e-05,
379
- "loss": 0.8265,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.8803986710963455,
384
- "grad_norm": 0.5332029461860657,
385
  "learning_rate": 8.645454235739903e-06,
386
- "loss": 0.8441,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.8970099667774086,
391
- "grad_norm": 0.6243578195571899,
392
  "learning_rate": 6.435048416046863e-06,
393
- "loss": 0.8333,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.9136212624584718,
398
- "grad_norm": 0.5947065949440002,
399
  "learning_rate": 4.541236775226809e-06,
400
- "loss": 0.8311,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.9302325581395349,
405
- "grad_norm": 0.5467190742492676,
406
  "learning_rate": 2.970427372400353e-06,
407
- "loss": 0.8217,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.946843853820598,
412
- "grad_norm": 0.5683417320251465,
413
  "learning_rate": 1.7279353293586765e-06,
414
- "loss": 0.8293,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.9634551495016611,
419
- "grad_norm": 0.5739116072654724,
420
  "learning_rate": 8.17964845873831e-07,
421
- "loss": 0.833,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.9800664451827242,
426
- "grad_norm": 0.6611495614051819,
427
  "learning_rate": 2.4359497401758024e-07,
428
- "loss": 0.8317,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.9966777408637874,
433
- "grad_norm": 0.7239180207252502,
434
  "learning_rate": 6.769199623779532e-09,
435
- "loss": 0.8317,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 1.0,
440
- "eval_loss": 1.6543629169464111,
441
- "eval_runtime": 1.2337,
442
- "eval_samples_per_second": 8.106,
443
- "eval_steps_per_second": 0.811,
444
  "step": 301
445
  },
446
  {
447
  "epoch": 1.0,
448
  "step": 301,
449
  "total_flos": 9.17802542639874e+17,
450
- "train_loss": 2.9563070911901734,
451
- "train_runtime": 3725.7135,
452
- "train_samples_per_second": 10.33,
453
- "train_steps_per_second": 0.081
454
  }
455
  ],
456
  "logging_steps": 5,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0033222591362126247,
13
+ "grad_norm": 51.88744354248047,
14
  "learning_rate": 6.451612903225806e-06,
15
+ "loss": 20.0655,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.016611295681063124,
20
+ "grad_norm": 38.35562515258789,
21
  "learning_rate": 3.2258064516129034e-05,
22
+ "loss": 20.4442,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.03322259136212625,
27
+ "grad_norm": 16.541790008544922,
28
  "learning_rate": 6.451612903225807e-05,
29
+ "loss": 17.9286,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.04983388704318937,
34
+ "grad_norm": 7.802853584289551,
35
  "learning_rate": 9.677419354838711e-05,
36
+ "loss": 15.769,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.0664451827242525,
41
+ "grad_norm": 7.048831462860107,
42
  "learning_rate": 0.00012903225806451613,
43
+ "loss": 14.2071,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.08305647840531562,
48
+ "grad_norm": 2.842278003692627,
49
  "learning_rate": 0.00016129032258064516,
50
+ "loss": 13.0251,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.09966777408637874,
55
+ "grad_norm": 3.1177785396575928,
56
  "learning_rate": 0.00019354838709677422,
57
+ "loss": 12.4164,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.11627906976744186,
62
+ "grad_norm": 7.123711109161377,
63
  "learning_rate": 0.0001998917111338525,
64
+ "loss": 11.4556,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.132890365448505,
69
+ "grad_norm": 13.379329681396484,
70
  "learning_rate": 0.00019945218953682734,
71
+ "loss": 10.3635,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.14950166112956811,
76
+ "grad_norm": 17.68781280517578,
77
  "learning_rate": 0.00019867615321125795,
78
+ "loss": 8.2723,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.16611295681063123,
83
+ "grad_norm": 18.207794189453125,
84
  "learning_rate": 0.00019756622801842143,
85
+ "loss": 4.9814,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.18272425249169436,
90
+ "grad_norm": 23.382278442382812,
91
  "learning_rate": 0.0001961261695938319,
92
+ "loss": 2.4786,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.19933554817275748,
97
+ "grad_norm": 4.926826000213623,
98
  "learning_rate": 0.00019436085063935835,
99
+ "loss": 1.8199,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.2159468438538206,
104
+ "grad_norm": 2.2475671768188477,
105
  "learning_rate": 0.00019227624443554425,
106
+ "loss": 1.519,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.23255813953488372,
111
+ "grad_norm": 1.4462262392044067,
112
  "learning_rate": 0.0001898794046299167,
113
+ "loss": 1.3358,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.24916943521594684,
118
+ "grad_norm": 0.7703434824943542,
119
  "learning_rate": 0.00018717844136967624,
120
+ "loss": 1.2165,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.26578073089701,
125
+ "grad_norm": 0.6548369526863098,
126
  "learning_rate": 0.00018418249385952575,
127
+ "loss": 1.161,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.2823920265780731,
132
+ "grad_norm": 1.4386329650878906,
133
  "learning_rate": 0.00018090169943749476,
134
+ "loss": 1.1037,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.29900332225913623,
139
+ "grad_norm": 0.7257230281829834,
140
  "learning_rate": 0.0001773471592733964,
141
+ "loss": 1.0609,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.31561461794019935,
146
+ "grad_norm": 0.8698992729187012,
147
  "learning_rate": 0.0001735309008059829,
148
+ "loss": 1.048,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.33222591362126247,
153
+ "grad_norm": 0.7444359064102173,
154
  "learning_rate": 0.00016946583704589973,
155
+ "loss": 1.034,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.3488372093023256,
160
+ "grad_norm": 1.0887985229492188,
161
  "learning_rate": 0.00016516572288214552,
162
+ "loss": 1.0021,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.3654485049833887,
167
+ "grad_norm": 1.341417670249939,
168
  "learning_rate": 0.00016064510853988138,
169
+ "loss": 1.0024,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.38205980066445183,
174
+ "grad_norm": 0.7008532285690308,
175
  "learning_rate": 0.0001559192903470747,
176
+ "loss": 0.9584,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.39867109634551495,
181
+ "grad_norm": 0.8793571591377258,
182
  "learning_rate": 0.00015100425897656753,
183
+ "loss": 0.9583,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.4152823920265781,
188
+ "grad_norm": 0.7256600856781006,
189
  "learning_rate": 0.00014591664533870118,
190
+ "loss": 0.9338,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.4318936877076412,
195
+ "grad_norm": 0.5686385631561279,
196
  "learning_rate": 0.00014067366430758004,
197
+ "loss": 0.9274,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.4485049833887043,
202
+ "grad_norm": 1.2567209005355835,
203
  "learning_rate": 0.00013529305647138687,
204
+ "loss": 0.925,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.46511627906976744,
209
+ "grad_norm": 1.181892991065979,
210
  "learning_rate": 0.0001297930281038482,
211
+ "loss": 0.9243,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.48172757475083056,
216
+ "grad_norm": 0.7734186053276062,
217
  "learning_rate": 0.00012419218955996676,
218
+ "loss": 0.9127,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.4983388704318937,
223
+ "grad_norm": 0.9539685845375061,
224
  "learning_rate": 0.00011850949230447145,
225
+ "loss": 0.9106,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.5149501661129569,
230
+ "grad_norm": 0.8648198843002319,
231
  "learning_rate": 0.00011276416478605949,
232
+ "loss": 0.9035,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.53156146179402,
237
+ "grad_norm": 0.7837306261062622,
238
  "learning_rate": 0.00010697564737441252,
239
+ "loss": 0.9096,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.5481727574750831,
244
+ "grad_norm": 0.7229638695716858,
245
  "learning_rate": 0.00010116352658013973,
246
+ "loss": 0.8911,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.5647840531561462,
251
+ "grad_norm": 0.7054806351661682,
252
  "learning_rate": 9.534746878022534e-05,
253
+ "loss": 0.9145,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.5813953488372093,
258
+ "grad_norm": 0.5297299027442932,
259
  "learning_rate": 8.954715367323468e-05,
260
+ "loss": 0.8937,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.5980066445182725,
265
+ "grad_norm": 0.5183060169219971,
266
  "learning_rate": 8.378220768944327e-05,
267
+ "loss": 0.8682,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.6146179401993356,
272
+ "grad_norm": 0.5382773876190186,
273
  "learning_rate": 7.807213758120966e-05,
274
+ "loss": 0.8748,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.6312292358803987,
279
+ "grad_norm": 0.8112709522247314,
280
  "learning_rate": 7.243626441830009e-05,
281
+ "loss": 0.8694,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.6478405315614618,
286
+ "grad_norm": 0.7376631498336792,
287
  "learning_rate": 6.68936582115042e-05,
288
+ "loss": 0.8641,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.6644518272425249,
293
+ "grad_norm": 0.45725372433662415,
294
  "learning_rate": 6.146307338575519e-05,
295
+ "loss": 0.839,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.6810631229235881,
300
+ "grad_norm": 0.6437954306602478,
301
  "learning_rate": 5.616288532109225e-05,
302
+ "loss": 0.8563,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.6976744186046512,
307
+ "grad_norm": 1.122187852859497,
308
  "learning_rate": 5.101102817619131e-05,
309
+ "loss": 0.8448,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.7142857142857143,
314
+ "grad_norm": 0.7291833758354187,
315
  "learning_rate": 4.6024934204848745e-05,
316
+ "loss": 0.8338,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.7308970099667774,
321
+ "grad_norm": 0.6493574380874634,
322
  "learning_rate": 4.12214747707527e-05,
323
+ "loss": 0.8546,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.7475083056478405,
328
+ "grad_norm": 0.7318633794784546,
329
  "learning_rate": 3.661690326012897e-05,
330
+ "loss": 0.8432,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.7641196013289037,
335
+ "grad_norm": 0.5700820088386536,
336
  "learning_rate": 3.222680008542678e-05,
337
+ "loss": 0.828,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.7807308970099668,
342
+ "grad_norm": 0.4655066430568695,
343
  "learning_rate": 2.8066019966134904e-05,
344
+ "loss": 0.8318,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.7973421926910299,
349
+ "grad_norm": 0.8703047633171082,
350
  "learning_rate": 2.4148641665113113e-05,
351
+ "loss": 0.8275,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.813953488372093,
356
+ "grad_norm": 0.6757218241691589,
357
  "learning_rate": 2.0487920350515212e-05,
358
+ "loss": 0.8317,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.8305647840531561,
363
+ "grad_norm": 0.7051078677177429,
364
  "learning_rate": 1.7096242744495837e-05,
365
+ "loss": 0.8323,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.8471760797342193,
370
+ "grad_norm": 0.9345777630805969,
371
  "learning_rate": 1.3985085210463477e-05,
372
+ "loss": 0.8348,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.8637873754152824,
377
+ "grad_norm": 0.5797870755195618,
378
  "learning_rate": 1.116497492069961e-05,
379
+ "loss": 0.8215,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.8803986710963455,
384
+ "grad_norm": 0.5265169739723206,
385
  "learning_rate": 8.645454235739903e-06,
386
+ "loss": 0.8405,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.8970099667774086,
391
+ "grad_norm": 0.6980829834938049,
392
  "learning_rate": 6.435048416046863e-06,
393
+ "loss": 0.8281,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.9136212624584718,
398
+ "grad_norm": 0.5821875333786011,
399
  "learning_rate": 4.541236775226809e-06,
400
+ "loss": 0.8259,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.9302325581395349,
405
+ "grad_norm": 0.6054457426071167,
406
  "learning_rate": 2.970427372400353e-06,
407
+ "loss": 0.8171,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.946843853820598,
412
+ "grad_norm": 0.5492629408836365,
413
  "learning_rate": 1.7279353293586765e-06,
414
+ "loss": 0.8246,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.9634551495016611,
419
+ "grad_norm": 0.606873631477356,
420
  "learning_rate": 8.17964845873831e-07,
421
+ "loss": 0.8269,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.9800664451827242,
426
+ "grad_norm": 0.8362772464752197,
427
  "learning_rate": 2.4359497401758024e-07,
428
+ "loss": 0.8267,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.9966777408637874,
433
+ "grad_norm": 0.8066886067390442,
434
  "learning_rate": 6.769199623779532e-09,
435
+ "loss": 0.8265,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 1.0,
440
+ "eval_loss": 1.6534239053726196,
441
+ "eval_runtime": 0.4546,
442
+ "eval_samples_per_second": 21.998,
443
+ "eval_steps_per_second": 2.2,
444
  "step": 301
445
  },
446
  {
447
  "epoch": 1.0,
448
  "step": 301,
449
  "total_flos": 9.17802542639874e+17,
450
+ "train_loss": 2.9511482283918564,
451
+ "train_runtime": 778.9577,
452
+ "train_samples_per_second": 49.406,
453
+ "train_steps_per_second": 0.386
454
  }
455
  ],
456
  "logging_steps": 5,