chansung commited on
Commit
f79dd13
·
verified ·
1 Parent(s): 9c4c110

Model save

Browse files
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 2.3991
24
 
25
  ## Model description
26
 
@@ -57,7 +57,7 @@ The following hyperparameters were used during training:
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:------:|:----:|:---------------:|
60
- | 1.2456 | 0.9981 | 261 | 2.3991 |
61
 
62
 
63
  ### Framework versions
 
20
 
21
  This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 2.4052
24
 
25
  ## Model description
26
 
 
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:------:|:----:|:---------------:|
60
+ | 1.2442 | 0.9981 | 261 | 2.4052 |
61
 
62
 
63
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
  "epoch": 0.9980879541108987,
3
- "eval_loss": 2.3990681171417236,
4
- "eval_runtime": 1.2805,
5
- "eval_samples": 15,
6
- "eval_samples_per_second": 9.371,
7
- "eval_steps_per_second": 0.781,
8
  "total_flos": 7.958354271676662e+17,
9
- "train_loss": 4.106667400776655,
10
- "train_runtime": 3226.2655,
11
  "train_samples": 111440,
12
- "train_samples_per_second": 10.373,
13
- "train_steps_per_second": 0.081
14
  }
 
1
  {
2
  "epoch": 0.9980879541108987,
 
 
 
 
 
3
  "total_flos": 7.958354271676662e+17,
4
+ "train_loss": 4.106453996508514,
5
+ "train_runtime": 670.8506,
6
  "train_samples": 111440,
7
+ "train_samples_per_second": 49.884,
8
+ "train_steps_per_second": 0.389
9
  }
runs/Nov18_06-39-27_main-lora-gemma7b-closedqa-0-0/events.out.tfevents.1731930676.main-lora-gemma7b-closedqa-0-0.458.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8eb2eb9493bbd0739d7bf3029b4c1e1c6324a110a4538501a2ff1598d5705a14
3
- size 16782
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:045439e960d40b91d793c9c72ae0d9ced6bef45fd60651c284cf5f3f668d5445
3
+ size 17407
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9980879541108987,
3
  "total_flos": 7.958354271676662e+17,
4
- "train_loss": 4.106667400776655,
5
- "train_runtime": 3226.2655,
6
  "train_samples": 111440,
7
- "train_samples_per_second": 10.373,
8
- "train_steps_per_second": 0.081
9
  }
 
1
  {
2
  "epoch": 0.9980879541108987,
3
  "total_flos": 7.958354271676662e+17,
4
+ "train_loss": 4.106453996508514,
5
+ "train_runtime": 670.8506,
6
  "train_samples": 111440,
7
+ "train_samples_per_second": 49.884,
8
+ "train_steps_per_second": 0.389
9
  }
trainer_state.json CHANGED
@@ -10,391 +10,391 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0038240917782026767,
13
- "grad_norm": 109.6296157836914,
14
  "learning_rate": 7.4074074074074075e-06,
15
- "loss": 28.2707,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.019120458891013385,
20
- "grad_norm": 73.76856231689453,
21
  "learning_rate": 3.7037037037037037e-05,
22
- "loss": 26.7709,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.03824091778202677,
27
- "grad_norm": 27.987791061401367,
28
  "learning_rate": 7.407407407407407e-05,
29
- "loss": 22.5898,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.05736137667304015,
34
- "grad_norm": 13.016111373901367,
35
  "learning_rate": 0.00011111111111111112,
36
- "loss": 18.9729,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.07648183556405354,
41
- "grad_norm": 11.862752914428711,
42
  "learning_rate": 0.00014814814814814815,
43
- "loss": 16.2855,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.09560229445506692,
48
- "grad_norm": 2.9698574542999268,
49
  "learning_rate": 0.0001851851851851852,
50
- "loss": 15.0127,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.1147227533460803,
55
- "grad_norm": 4.096982955932617,
56
  "learning_rate": 0.00019991889981715698,
57
- "loss": 13.9569,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.1338432122370937,
62
- "grad_norm": 6.29751443862915,
63
  "learning_rate": 0.0001994237638847428,
64
- "loss": 13.1471,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.15296367112810708,
69
- "grad_norm": 13.995941162109375,
70
  "learning_rate": 0.00019848077530122083,
71
- "loss": 11.9192,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.17208413001912046,
76
- "grad_norm": 23.425451278686523,
77
  "learning_rate": 0.0001970941817426052,
78
- "loss": 9.1546,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.19120458891013384,
83
- "grad_norm": 24.60663604736328,
84
  "learning_rate": 0.00019527022909596536,
85
- "loss": 6.0786,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.21032504780114722,
90
- "grad_norm": 9.548501014709473,
91
  "learning_rate": 0.00019301713332493386,
92
- "loss": 3.2243,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.2294455066921606,
97
- "grad_norm": 6.405646324157715,
98
  "learning_rate": 0.00019034504346103823,
99
- "loss": 2.4261,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.248565965583174,
104
- "grad_norm": 2.391706943511963,
105
  "learning_rate": 0.00018726599588756145,
106
- "loss": 2.1088,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.2676864244741874,
111
- "grad_norm": 1.3095039129257202,
112
  "learning_rate": 0.00018379386012185814,
113
- "loss": 1.8208,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.28680688336520077,
118
- "grad_norm": 1.0937951803207397,
119
  "learning_rate": 0.00017994427634035015,
120
- "loss": 1.6814,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.30592734225621415,
125
- "grad_norm": 0.9703987240791321,
126
  "learning_rate": 0.00017573458492761801,
127
- "loss": 1.5943,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.32504780114722753,
132
- "grad_norm": 1.7736315727233887,
133
  "learning_rate": 0.00017118374836693406,
134
- "loss": 1.5357,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.3441682600382409,
139
- "grad_norm": 0.7995043992996216,
140
  "learning_rate": 0.00016631226582407952,
141
- "loss": 1.4938,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.3632887189292543,
146
- "grad_norm": 0.689298152923584,
147
  "learning_rate": 0.00016114208080920123,
148
- "loss": 1.4527,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.3824091778202677,
153
- "grad_norm": 1.3656119108200073,
154
  "learning_rate": 0.00015569648233264394,
155
- "loss": 1.4352,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.40152963671128106,
160
- "grad_norm": 0.8344822525978088,
161
  "learning_rate": 0.00015000000000000001,
162
- "loss": 1.4039,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.42065009560229444,
167
- "grad_norm": 0.9604928493499756,
168
  "learning_rate": 0.00014407829351891857,
169
- "loss": 1.3876,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.4397705544933078,
174
- "grad_norm": 0.645470917224884,
175
  "learning_rate": 0.00013795803711538966,
176
- "loss": 1.3706,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.4588910133843212,
181
- "grad_norm": 0.5904386639595032,
182
  "learning_rate": 0.00013166679938014726,
183
- "loss": 1.3562,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.4780114722753346,
188
- "grad_norm": 0.6904798746109009,
189
  "learning_rate": 0.00012523291908642217,
190
- "loss": 1.3502,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.497131931166348,
195
- "grad_norm": 0.8584344387054443,
196
  "learning_rate": 0.00011868537753842051,
197
- "loss": 1.3508,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.5162523900573613,
202
- "grad_norm": 1.3949240446090698,
203
  "learning_rate": 0.0001120536680255323,
204
- "loss": 1.3284,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.5353728489483748,
209
- "grad_norm": 1.0056356191635132,
210
  "learning_rate": 0.00010536766297031215,
211
- "loss": 1.318,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.5544933078393881,
216
- "grad_norm": 0.9676538109779358,
217
  "learning_rate": 9.865747936866027e-05,
218
- "loss": 1.3218,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.5736137667304015,
223
- "grad_norm": 0.8652806878089905,
224
  "learning_rate": 9.195334312832742e-05,
225
- "loss": 1.3008,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.5927342256214149,
230
- "grad_norm": 1.193677306175232,
231
  "learning_rate": 8.528545291682838e-05,
232
- "loss": 1.2873,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.6118546845124283,
237
- "grad_norm": 0.6716729998588562,
238
  "learning_rate": 7.868384413205842e-05,
239
- "loss": 1.2834,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.6309751434034416,
244
- "grad_norm": 4.712409019470215,
245
  "learning_rate": 7.217825360835473e-05,
246
- "loss": 1.2933,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.6500956022944551,
251
- "grad_norm": 1.2476301193237305,
252
  "learning_rate": 6.579798566743314e-05,
253
- "loss": 1.2727,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.6692160611854685,
258
- "grad_norm": 0.7000300288200378,
259
  "learning_rate": 5.957178011756952e-05,
260
- "loss": 1.2674,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.6883365200764818,
265
- "grad_norm": 0.6005345582962036,
266
  "learning_rate": 5.3527682795623146e-05,
267
- "loss": 1.2683,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.7074569789674953,
272
- "grad_norm": 0.6676229238510132,
273
  "learning_rate": 4.7692919235042255e-05,
274
- "loss": 1.2739,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.7265774378585086,
279
- "grad_norm": 0.5791189670562744,
280
  "learning_rate": 4.209377202891212e-05,
281
- "loss": 1.2686,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.745697896749522,
286
- "grad_norm": 0.6084746718406677,
287
  "learning_rate": 3.675546244046228e-05,
288
- "loss": 1.2485,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.7648183556405354,
293
- "grad_norm": 0.5911340117454529,
294
  "learning_rate": 3.170203679431584e-05,
295
- "loss": 1.2727,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.7839388145315488,
300
- "grad_norm": 0.7844299674034119,
301
  "learning_rate": 2.6956258160229695e-05,
302
- "loss": 1.2634,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.8030592734225621,
307
- "grad_norm": 0.6864188313484192,
308
  "learning_rate": 2.2539503817234553e-05,
309
- "loss": 1.2407,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.8221797323135756,
314
- "grad_norm": 0.7077968716621399,
315
  "learning_rate": 1.8471668960045574e-05,
316
- "loss": 1.2416,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.8413001912045889,
321
- "grad_norm": 0.6579145789146423,
322
  "learning_rate": 1.4771077081496654e-05,
323
- "loss": 1.2472,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.8604206500956023,
328
- "grad_norm": 0.8815127015113831,
329
  "learning_rate": 1.1454397434679021e-05,
330
- "loss": 1.2361,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.8795411089866156,
335
- "grad_norm": 0.8044998049736023,
336
  "learning_rate": 8.536569946574546e-06,
337
- "loss": 1.2467,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.8986615678776291,
342
- "grad_norm": 0.5612942576408386,
343
  "learning_rate": 6.030737921409169e-06,
344
- "loss": 1.2457,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.9177820267686424,
349
- "grad_norm": 0.6176737546920776,
350
  "learning_rate": 3.948188836862776e-06,
351
- "loss": 1.2367,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.9369024856596558,
356
- "grad_norm": 0.5503106117248535,
357
  "learning_rate": 2.2983034998182997e-06,
358
- "loss": 1.2424,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.9560229445506692,
363
- "grad_norm": 0.6087459921836853,
364
  "learning_rate": 1.0885137906768372e-06,
365
- "loss": 1.2377,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.9751434034416826,
370
- "grad_norm": 0.6323258876800537,
371
  "learning_rate": 3.2426918657900704e-07,
372
- "loss": 1.233,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.994263862332696,
377
- "grad_norm": 0.6436792612075806,
378
  "learning_rate": 9.012214327897006e-09,
379
- "loss": 1.2456,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.9980879541108987,
384
- "eval_loss": 2.3990681171417236,
385
- "eval_runtime": 1.2567,
386
- "eval_samples_per_second": 9.549,
387
- "eval_steps_per_second": 0.796,
388
  "step": 261
389
  },
390
  {
391
  "epoch": 0.9980879541108987,
392
  "step": 261,
393
  "total_flos": 7.958354271676662e+17,
394
- "train_loss": 4.106667400776655,
395
- "train_runtime": 3226.2655,
396
- "train_samples_per_second": 10.373,
397
- "train_steps_per_second": 0.081
398
  }
399
  ],
400
  "logging_steps": 5,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0038240917782026767,
13
+ "grad_norm": 105.78279876708984,
14
  "learning_rate": 7.4074074074074075e-06,
15
+ "loss": 27.5504,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.019120458891013385,
20
+ "grad_norm": 89.91433715820312,
21
  "learning_rate": 3.7037037037037037e-05,
22
+ "loss": 26.708,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.03824091778202677,
27
+ "grad_norm": 27.077810287475586,
28
  "learning_rate": 7.407407407407407e-05,
29
+ "loss": 22.6974,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.05736137667304015,
34
+ "grad_norm": 13.22861385345459,
35
  "learning_rate": 0.00011111111111111112,
36
+ "loss": 19.0567,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.07648183556405354,
41
+ "grad_norm": 11.456145286560059,
42
  "learning_rate": 0.00014814814814814815,
43
+ "loss": 16.3135,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.09560229445506692,
48
+ "grad_norm": 3.0151147842407227,
49
  "learning_rate": 0.0001851851851851852,
50
+ "loss": 15.0328,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.1147227533460803,
55
+ "grad_norm": 4.0936784744262695,
56
  "learning_rate": 0.00019991889981715698,
57
+ "loss": 13.9759,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.1338432122370937,
62
+ "grad_norm": 6.428543567657471,
63
  "learning_rate": 0.0001994237638847428,
64
+ "loss": 13.1664,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.15296367112810708,
69
+ "grad_norm": 13.633052825927734,
70
  "learning_rate": 0.00019848077530122083,
71
+ "loss": 11.9549,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.17208413001912046,
76
+ "grad_norm": 23.461597442626953,
77
  "learning_rate": 0.0001970941817426052,
78
+ "loss": 9.1656,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.19120458891013384,
83
+ "grad_norm": 25.158462524414062,
84
  "learning_rate": 0.00019527022909596536,
85
+ "loss": 6.0581,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.21032504780114722,
90
+ "grad_norm": 9.126226425170898,
91
  "learning_rate": 0.00019301713332493386,
92
+ "loss": 3.1728,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.2294455066921606,
97
+ "grad_norm": 6.13222074508667,
98
  "learning_rate": 0.00019034504346103823,
99
+ "loss": 2.3894,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.248565965583174,
104
+ "grad_norm": 1.9926798343658447,
105
  "learning_rate": 0.00018726599588756145,
106
+ "loss": 2.0757,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.2676864244741874,
111
+ "grad_norm": 1.3959568738937378,
112
  "learning_rate": 0.00018379386012185814,
113
+ "loss": 1.7977,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.28680688336520077,
118
+ "grad_norm": 2.5797815322875977,
119
  "learning_rate": 0.00017994427634035015,
120
+ "loss": 1.6855,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.30592734225621415,
125
+ "grad_norm": 0.8123499155044556,
126
  "learning_rate": 0.00017573458492761801,
127
+ "loss": 1.6023,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.32504780114722753,
132
+ "grad_norm": 1.2287877798080444,
133
  "learning_rate": 0.00017118374836693406,
134
+ "loss": 1.543,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.3441682600382409,
139
+ "grad_norm": 0.7896401882171631,
140
  "learning_rate": 0.00016631226582407952,
141
+ "loss": 1.5,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.3632887189292543,
146
+ "grad_norm": 1.0136632919311523,
147
  "learning_rate": 0.00016114208080920123,
148
+ "loss": 1.4581,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.3824091778202677,
153
+ "grad_norm": 1.0349676609039307,
154
  "learning_rate": 0.00015569648233264394,
155
+ "loss": 1.4393,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.40152963671128106,
160
+ "grad_norm": 0.7523437738418579,
161
  "learning_rate": 0.00015000000000000001,
162
+ "loss": 1.4075,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.42065009560229444,
167
+ "grad_norm": 0.8223061561584473,
168
  "learning_rate": 0.00014407829351891857,
169
+ "loss": 1.3899,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.4397705544933078,
174
+ "grad_norm": 0.7651774883270264,
175
  "learning_rate": 0.00013795803711538966,
176
+ "loss": 1.3739,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.4588910133843212,
181
+ "grad_norm": 0.6884527802467346,
182
  "learning_rate": 0.00013166679938014726,
183
+ "loss": 1.357,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.4780114722753346,
188
+ "grad_norm": 0.7700805068016052,
189
  "learning_rate": 0.00012523291908642217,
190
+ "loss": 1.3525,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.497131931166348,
195
+ "grad_norm": 1.0306934118270874,
196
  "learning_rate": 0.00011868537753842051,
197
+ "loss": 1.3514,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.5162523900573613,
202
+ "grad_norm": 1.1528677940368652,
203
  "learning_rate": 0.0001120536680255323,
204
+ "loss": 1.3288,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.5353728489483748,
209
+ "grad_norm": 0.7638438940048218,
210
  "learning_rate": 0.00010536766297031215,
211
+ "loss": 1.3204,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.5544933078393881,
216
+ "grad_norm": 0.8795316219329834,
217
  "learning_rate": 9.865747936866027e-05,
218
+ "loss": 1.3215,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.5736137667304015,
223
+ "grad_norm": 0.8259925246238708,
224
  "learning_rate": 9.195334312832742e-05,
225
+ "loss": 1.3012,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.5927342256214149,
230
+ "grad_norm": 1.0407475233078003,
231
  "learning_rate": 8.528545291682838e-05,
232
+ "loss": 1.2876,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.6118546845124283,
237
+ "grad_norm": 0.7297894358634949,
238
  "learning_rate": 7.868384413205842e-05,
239
+ "loss": 1.2845,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.6309751434034416,
244
+ "grad_norm": 2.0282633304595947,
245
  "learning_rate": 7.217825360835473e-05,
246
+ "loss": 1.2929,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.6500956022944551,
251
+ "grad_norm": 1.362356185913086,
252
  "learning_rate": 6.579798566743314e-05,
253
+ "loss": 1.2715,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.6692160611854685,
258
+ "grad_norm": 0.5886189937591553,
259
  "learning_rate": 5.957178011756952e-05,
260
+ "loss": 1.2666,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.6883365200764818,
265
+ "grad_norm": 0.683692991733551,
266
  "learning_rate": 5.3527682795623146e-05,
267
+ "loss": 1.2673,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.7074569789674953,
272
+ "grad_norm": 0.7453979253768921,
273
  "learning_rate": 4.7692919235042255e-05,
274
+ "loss": 1.2728,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.7265774378585086,
279
+ "grad_norm": 0.563890814781189,
280
  "learning_rate": 4.209377202891212e-05,
281
+ "loss": 1.2673,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.745697896749522,
286
+ "grad_norm": 0.6703224182128906,
287
  "learning_rate": 3.675546244046228e-05,
288
+ "loss": 1.2457,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.7648183556405354,
293
+ "grad_norm": 0.6253310441970825,
294
  "learning_rate": 3.170203679431584e-05,
295
+ "loss": 1.2703,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.7839388145315488,
300
+ "grad_norm": 0.8390852212905884,
301
  "learning_rate": 2.6956258160229695e-05,
302
+ "loss": 1.2605,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.8030592734225621,
307
+ "grad_norm": 0.6806175708770752,
308
  "learning_rate": 2.2539503817234553e-05,
309
+ "loss": 1.2402,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.8221797323135756,
314
+ "grad_norm": 0.774019718170166,
315
  "learning_rate": 1.8471668960045574e-05,
316
+ "loss": 1.241,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.8413001912045889,
321
+ "grad_norm": 0.6997012495994568,
322
  "learning_rate": 1.4771077081496654e-05,
323
+ "loss": 1.2451,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.8604206500956023,
328
+ "grad_norm": 0.8751155138015747,
329
  "learning_rate": 1.1454397434679021e-05,
330
+ "loss": 1.2339,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.8795411089866156,
335
+ "grad_norm": 0.7787600159645081,
336
  "learning_rate": 8.536569946574546e-06,
337
+ "loss": 1.2464,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.8986615678776291,
342
+ "grad_norm": 0.5735809803009033,
343
  "learning_rate": 6.030737921409169e-06,
344
+ "loss": 1.2439,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.9177820267686424,
349
+ "grad_norm": 0.6756762862205505,
350
  "learning_rate": 3.948188836862776e-06,
351
+ "loss": 1.2356,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.9369024856596558,
356
+ "grad_norm": 0.573499858379364,
357
  "learning_rate": 2.2983034998182997e-06,
358
+ "loss": 1.2411,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.9560229445506692,
363
+ "grad_norm": 0.6631729006767273,
364
  "learning_rate": 1.0885137906768372e-06,
365
+ "loss": 1.2363,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.9751434034416826,
370
+ "grad_norm": 0.5861126780509949,
371
  "learning_rate": 3.2426918657900704e-07,
372
+ "loss": 1.2316,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.994263862332696,
377
+ "grad_norm": 0.6123586297035217,
378
  "learning_rate": 9.012214327897006e-09,
379
+ "loss": 1.2442,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.9980879541108987,
384
+ "eval_loss": 2.405249834060669,
385
+ "eval_runtime": 0.4542,
386
+ "eval_samples_per_second": 26.419,
387
+ "eval_steps_per_second": 2.202,
388
  "step": 261
389
  },
390
  {
391
  "epoch": 0.9980879541108987,
392
  "step": 261,
393
  "total_flos": 7.958354271676662e+17,
394
+ "train_loss": 4.106453996508514,
395
+ "train_runtime": 670.8506,
396
+ "train_samples_per_second": 49.884,
397
+ "train_steps_per_second": 0.389
398
  }
399
  ],
400
  "logging_steps": 5,