Ct1tz commited on
Commit
f7976cb
·
verified ·
1 Parent(s): daad864

Upload 4 files

Browse files
Files changed (4) hide show
  1. optimizer.pt +3 -0
  2. scheduler.pt +3 -0
  3. trainer_state.json +945 -0
  4. training_args.bin +3 -0
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e62dc2d191b49cee9fd815ec78acb9c7a579b2a37b2b1d7f8aa6375ca3a0e23a
3
+ size 997370106
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61062d9a7fb887f32150bfb8d43225ea263194b767e580db272ea50e0e292cb0
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,945 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1000,
3
+ "best_metric": 0.8438986049887841,
4
+ "best_model_checkpoint": "./results/checkpoint-1000",
5
+ "epoch": 0.029189526267256648,
6
+ "eval_steps": 100,
7
+ "global_step": 1100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.00026535932970233315,
14
+ "grad_norm": 4.446621894836426,
15
+ "learning_rate": 3.6e-07,
16
+ "loss": 1.855,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.0005307186594046663,
21
+ "grad_norm": 6.435311317443848,
22
+ "learning_rate": 7.6e-07,
23
+ "loss": 1.8131,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.0007960779891069995,
28
+ "grad_norm": 3.5201945304870605,
29
+ "learning_rate": 1.1600000000000001e-06,
30
+ "loss": 1.7821,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.0010614373188093326,
35
+ "grad_norm": 5.47129487991333,
36
+ "learning_rate": 1.56e-06,
37
+ "loss": 1.8187,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.001326796648511666,
42
+ "grad_norm": 7.340343475341797,
43
+ "learning_rate": 1.9600000000000003e-06,
44
+ "loss": 1.7791,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.001592155978213999,
49
+ "grad_norm": 5.010741233825684,
50
+ "learning_rate": 2.3600000000000003e-06,
51
+ "loss": 1.7705,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.0018575153079163323,
56
+ "grad_norm": 4.706456184387207,
57
+ "learning_rate": 2.7600000000000003e-06,
58
+ "loss": 1.7616,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.002122874637618665,
63
+ "grad_norm": 3.865488290786743,
64
+ "learning_rate": 3.1600000000000002e-06,
65
+ "loss": 1.7265,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.0023882339673209985,
70
+ "grad_norm": 6.390440464019775,
71
+ "learning_rate": 3.5600000000000002e-06,
72
+ "loss": 1.7143,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.002653593297023332,
77
+ "grad_norm": 5.747302532196045,
78
+ "learning_rate": 3.96e-06,
79
+ "loss": 1.6675,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.002653593297023332,
84
+ "eval_accuracy": 0.3978160782284494,
85
+ "eval_f1": 0.22667861021073907,
86
+ "eval_loss": 1.7215721607208252,
87
+ "eval_precision": 0.3277163462999534,
88
+ "eval_recall": 0.3978160782284494,
89
+ "eval_runtime": 1141.6234,
90
+ "eval_samples_per_second": 66.019,
91
+ "eval_steps_per_second": 8.253,
92
+ "step": 100
93
+ },
94
+ {
95
+ "epoch": 0.0029189526267256647,
96
+ "grad_norm": 5.64985466003418,
97
+ "learning_rate": 4.360000000000001e-06,
98
+ "loss": 1.6701,
99
+ "step": 110
100
+ },
101
+ {
102
+ "epoch": 0.003184311956427998,
103
+ "grad_norm": 7.48480224609375,
104
+ "learning_rate": 4.76e-06,
105
+ "loss": 1.6665,
106
+ "step": 120
107
+ },
108
+ {
109
+ "epoch": 0.0034496712861303313,
110
+ "grad_norm": 6.59235954284668,
111
+ "learning_rate": 5.1600000000000006e-06,
112
+ "loss": 1.6049,
113
+ "step": 130
114
+ },
115
+ {
116
+ "epoch": 0.0037150306158326646,
117
+ "grad_norm": 5.440073013305664,
118
+ "learning_rate": 5.560000000000001e-06,
119
+ "loss": 1.6174,
120
+ "step": 140
121
+ },
122
+ {
123
+ "epoch": 0.0039803899455349975,
124
+ "grad_norm": 5.735574245452881,
125
+ "learning_rate": 5.9600000000000005e-06,
126
+ "loss": 1.5654,
127
+ "step": 150
128
+ },
129
+ {
130
+ "epoch": 0.00424574927523733,
131
+ "grad_norm": 5.891671180725098,
132
+ "learning_rate": 6.360000000000001e-06,
133
+ "loss": 1.5445,
134
+ "step": 160
135
+ },
136
+ {
137
+ "epoch": 0.004511108604939664,
138
+ "grad_norm": 7.2471089363098145,
139
+ "learning_rate": 6.760000000000001e-06,
140
+ "loss": 1.5545,
141
+ "step": 170
142
+ },
143
+ {
144
+ "epoch": 0.004776467934641997,
145
+ "grad_norm": 8.074533462524414,
146
+ "learning_rate": 7.16e-06,
147
+ "loss": 1.5121,
148
+ "step": 180
149
+ },
150
+ {
151
+ "epoch": 0.00504182726434433,
152
+ "grad_norm": 6.5745768547058105,
153
+ "learning_rate": 7.5600000000000005e-06,
154
+ "loss": 1.4951,
155
+ "step": 190
156
+ },
157
+ {
158
+ "epoch": 0.005307186594046664,
159
+ "grad_norm": 7.1547112464904785,
160
+ "learning_rate": 7.960000000000002e-06,
161
+ "loss": 1.5225,
162
+ "step": 200
163
+ },
164
+ {
165
+ "epoch": 0.005307186594046664,
166
+ "eval_accuracy": 0.5399965503058286,
167
+ "eval_f1": 0.4954655997286682,
168
+ "eval_loss": 1.4961189031600952,
169
+ "eval_precision": 0.5230960318752841,
170
+ "eval_recall": 0.5399965503058286,
171
+ "eval_runtime": 1146.418,
172
+ "eval_samples_per_second": 65.743,
173
+ "eval_steps_per_second": 8.219,
174
+ "step": 200
175
+ },
176
+ {
177
+ "epoch": 0.0055725459237489965,
178
+ "grad_norm": 10.55716609954834,
179
+ "learning_rate": 8.36e-06,
180
+ "loss": 1.486,
181
+ "step": 210
182
+ },
183
+ {
184
+ "epoch": 0.005837905253451329,
185
+ "grad_norm": 10.896256446838379,
186
+ "learning_rate": 8.76e-06,
187
+ "loss": 1.3551,
188
+ "step": 220
189
+ },
190
+ {
191
+ "epoch": 0.006103264583153663,
192
+ "grad_norm": 10.593475341796875,
193
+ "learning_rate": 9.16e-06,
194
+ "loss": 1.3457,
195
+ "step": 230
196
+ },
197
+ {
198
+ "epoch": 0.006368623912855996,
199
+ "grad_norm": 12.37448787689209,
200
+ "learning_rate": 9.56e-06,
201
+ "loss": 1.4264,
202
+ "step": 240
203
+ },
204
+ {
205
+ "epoch": 0.006633983242558329,
206
+ "grad_norm": 15.805830001831055,
207
+ "learning_rate": 9.960000000000001e-06,
208
+ "loss": 1.1405,
209
+ "step": 250
210
+ },
211
+ {
212
+ "epoch": 0.006899342572260663,
213
+ "grad_norm": 7.878324031829834,
214
+ "learning_rate": 1.036e-05,
215
+ "loss": 1.3623,
216
+ "step": 260
217
+ },
218
+ {
219
+ "epoch": 0.0071647019019629955,
220
+ "grad_norm": 9.763978004455566,
221
+ "learning_rate": 1.0760000000000002e-05,
222
+ "loss": 1.1918,
223
+ "step": 270
224
+ },
225
+ {
226
+ "epoch": 0.007430061231665329,
227
+ "grad_norm": 8.449042320251465,
228
+ "learning_rate": 1.1160000000000002e-05,
229
+ "loss": 1.1437,
230
+ "step": 280
231
+ },
232
+ {
233
+ "epoch": 0.007695420561367662,
234
+ "grad_norm": 10.577520370483398,
235
+ "learning_rate": 1.156e-05,
236
+ "loss": 1.0793,
237
+ "step": 290
238
+ },
239
+ {
240
+ "epoch": 0.007960779891069995,
241
+ "grad_norm": 11.707535743713379,
242
+ "learning_rate": 1.196e-05,
243
+ "loss": 1.1895,
244
+ "step": 300
245
+ },
246
+ {
247
+ "epoch": 0.007960779891069995,
248
+ "eval_accuracy": 0.6815534238214651,
249
+ "eval_f1": 0.6624035468309406,
250
+ "eval_loss": 1.228468894958496,
251
+ "eval_precision": 0.6548079319665443,
252
+ "eval_recall": 0.6815534238214651,
253
+ "eval_runtime": 1147.8213,
254
+ "eval_samples_per_second": 65.663,
255
+ "eval_steps_per_second": 8.209,
256
+ "step": 300
257
+ },
258
+ {
259
+ "epoch": 0.008226139220772328,
260
+ "grad_norm": 7.825327396392822,
261
+ "learning_rate": 1.236e-05,
262
+ "loss": 1.0,
263
+ "step": 310
264
+ },
265
+ {
266
+ "epoch": 0.00849149855047466,
267
+ "grad_norm": 9.7120943069458,
268
+ "learning_rate": 1.2760000000000001e-05,
269
+ "loss": 1.2152,
270
+ "step": 320
271
+ },
272
+ {
273
+ "epoch": 0.008756857880176995,
274
+ "grad_norm": 10.450387954711914,
275
+ "learning_rate": 1.3160000000000001e-05,
276
+ "loss": 1.2838,
277
+ "step": 330
278
+ },
279
+ {
280
+ "epoch": 0.009022217209879328,
281
+ "grad_norm": 6.901065826416016,
282
+ "learning_rate": 1.3560000000000002e-05,
283
+ "loss": 1.1989,
284
+ "step": 340
285
+ },
286
+ {
287
+ "epoch": 0.009287576539581661,
288
+ "grad_norm": 7.151400089263916,
289
+ "learning_rate": 1.396e-05,
290
+ "loss": 1.0571,
291
+ "step": 350
292
+ },
293
+ {
294
+ "epoch": 0.009552935869283994,
295
+ "grad_norm": 7.359131813049316,
296
+ "learning_rate": 1.4360000000000001e-05,
297
+ "loss": 1.2923,
298
+ "step": 360
299
+ },
300
+ {
301
+ "epoch": 0.009818295198986327,
302
+ "grad_norm": 6.762883186340332,
303
+ "learning_rate": 1.4760000000000001e-05,
304
+ "loss": 1.2465,
305
+ "step": 370
306
+ },
307
+ {
308
+ "epoch": 0.01008365452868866,
309
+ "grad_norm": 7.525689125061035,
310
+ "learning_rate": 1.516e-05,
311
+ "loss": 0.889,
312
+ "step": 380
313
+ },
314
+ {
315
+ "epoch": 0.010349013858390994,
316
+ "grad_norm": 12.986421585083008,
317
+ "learning_rate": 1.556e-05,
318
+ "loss": 1.2117,
319
+ "step": 390
320
+ },
321
+ {
322
+ "epoch": 0.010614373188093327,
323
+ "grad_norm": 10.896193504333496,
324
+ "learning_rate": 1.5960000000000003e-05,
325
+ "loss": 1.0,
326
+ "step": 400
327
+ },
328
+ {
329
+ "epoch": 0.010614373188093327,
330
+ "eval_accuracy": 0.7040295081532195,
331
+ "eval_f1": 0.7118047953474329,
332
+ "eval_loss": 1.004681944847107,
333
+ "eval_precision": 0.7430775529469658,
334
+ "eval_recall": 0.7040295081532195,
335
+ "eval_runtime": 1149.1166,
336
+ "eval_samples_per_second": 65.589,
337
+ "eval_steps_per_second": 8.199,
338
+ "step": 400
339
+ },
340
+ {
341
+ "epoch": 0.01087973251779566,
342
+ "grad_norm": 10.948391914367676,
343
+ "learning_rate": 1.636e-05,
344
+ "loss": 1.1142,
345
+ "step": 410
346
+ },
347
+ {
348
+ "epoch": 0.011145091847497993,
349
+ "grad_norm": 9.782318115234375,
350
+ "learning_rate": 1.6760000000000002e-05,
351
+ "loss": 1.0815,
352
+ "step": 420
353
+ },
354
+ {
355
+ "epoch": 0.011410451177200326,
356
+ "grad_norm": 15.20601749420166,
357
+ "learning_rate": 1.7160000000000002e-05,
358
+ "loss": 1.1052,
359
+ "step": 430
360
+ },
361
+ {
362
+ "epoch": 0.011675810506902659,
363
+ "grad_norm": 13.752850532531738,
364
+ "learning_rate": 1.756e-05,
365
+ "loss": 0.9449,
366
+ "step": 440
367
+ },
368
+ {
369
+ "epoch": 0.011941169836604993,
370
+ "grad_norm": 14.253933906555176,
371
+ "learning_rate": 1.796e-05,
372
+ "loss": 0.8107,
373
+ "step": 450
374
+ },
375
+ {
376
+ "epoch": 0.012206529166307326,
377
+ "grad_norm": 10.736539840698242,
378
+ "learning_rate": 1.8360000000000004e-05,
379
+ "loss": 0.8507,
380
+ "step": 460
381
+ },
382
+ {
383
+ "epoch": 0.012471888496009659,
384
+ "grad_norm": 19.90713119506836,
385
+ "learning_rate": 1.876e-05,
386
+ "loss": 0.8218,
387
+ "step": 470
388
+ },
389
+ {
390
+ "epoch": 0.012737247825711992,
391
+ "grad_norm": 5.8942084312438965,
392
+ "learning_rate": 1.916e-05,
393
+ "loss": 0.8322,
394
+ "step": 480
395
+ },
396
+ {
397
+ "epoch": 0.013002607155414325,
398
+ "grad_norm": 5.852909088134766,
399
+ "learning_rate": 1.9560000000000002e-05,
400
+ "loss": 0.72,
401
+ "step": 490
402
+ },
403
+ {
404
+ "epoch": 0.013267966485116658,
405
+ "grad_norm": 8.721864700317383,
406
+ "learning_rate": 1.9960000000000002e-05,
407
+ "loss": 0.8702,
408
+ "step": 500
409
+ },
410
+ {
411
+ "epoch": 0.013267966485116658,
412
+ "eval_accuracy": 0.7919303692499569,
413
+ "eval_f1": 0.7952185444780413,
414
+ "eval_loss": 0.7821776270866394,
415
+ "eval_precision": 0.8109789187273935,
416
+ "eval_recall": 0.7919303692499569,
417
+ "eval_runtime": 1149.5055,
418
+ "eval_samples_per_second": 65.566,
419
+ "eval_steps_per_second": 8.197,
420
+ "step": 500
421
+ },
422
+ {
423
+ "epoch": 0.013533325814818992,
424
+ "grad_norm": 7.045879364013672,
425
+ "learning_rate": 1.9999042145593872e-05,
426
+ "loss": 0.8162,
427
+ "step": 510
428
+ },
429
+ {
430
+ "epoch": 0.013798685144521325,
431
+ "grad_norm": 12.590984344482422,
432
+ "learning_rate": 1.9997977862920393e-05,
433
+ "loss": 0.8281,
434
+ "step": 520
435
+ },
436
+ {
437
+ "epoch": 0.014064044474223658,
438
+ "grad_norm": 4.697333812713623,
439
+ "learning_rate": 1.9996913580246914e-05,
440
+ "loss": 0.8597,
441
+ "step": 530
442
+ },
443
+ {
444
+ "epoch": 0.014329403803925991,
445
+ "grad_norm": 7.0437188148498535,
446
+ "learning_rate": 1.999584929757344e-05,
447
+ "loss": 0.6957,
448
+ "step": 540
449
+ },
450
+ {
451
+ "epoch": 0.014594763133628324,
452
+ "grad_norm": 12.778396606445312,
453
+ "learning_rate": 1.999478501489996e-05,
454
+ "loss": 0.8493,
455
+ "step": 550
456
+ },
457
+ {
458
+ "epoch": 0.014860122463330658,
459
+ "grad_norm": 30.98528480529785,
460
+ "learning_rate": 1.999372073222648e-05,
461
+ "loss": 0.8598,
462
+ "step": 560
463
+ },
464
+ {
465
+ "epoch": 0.015125481793032991,
466
+ "grad_norm": 6.842329502105713,
467
+ "learning_rate": 1.9992656449553e-05,
468
+ "loss": 0.7805,
469
+ "step": 570
470
+ },
471
+ {
472
+ "epoch": 0.015390841122735324,
473
+ "grad_norm": 7.865843772888184,
474
+ "learning_rate": 1.9991592166879526e-05,
475
+ "loss": 0.6858,
476
+ "step": 580
477
+ },
478
+ {
479
+ "epoch": 0.01565620045243766,
480
+ "grad_norm": 7.990331172943115,
481
+ "learning_rate": 1.9990527884206047e-05,
482
+ "loss": 0.8487,
483
+ "step": 590
484
+ },
485
+ {
486
+ "epoch": 0.01592155978213999,
487
+ "grad_norm": 7.9941935539245605,
488
+ "learning_rate": 1.9989463601532568e-05,
489
+ "loss": 0.7483,
490
+ "step": 600
491
+ },
492
+ {
493
+ "epoch": 0.01592155978213999,
494
+ "eval_accuracy": 0.8143533813636906,
495
+ "eval_f1": 0.8091828053145604,
496
+ "eval_loss": 0.7799807190895081,
497
+ "eval_precision": 0.8135668963695762,
498
+ "eval_recall": 0.8143533813636906,
499
+ "eval_runtime": 1149.6535,
500
+ "eval_samples_per_second": 65.558,
501
+ "eval_steps_per_second": 8.196,
502
+ "step": 600
503
+ },
504
+ {
505
+ "epoch": 0.016186919111842325,
506
+ "grad_norm": 7.86228609085083,
507
+ "learning_rate": 1.9988399318859092e-05,
508
+ "loss": 0.8267,
509
+ "step": 610
510
+ },
511
+ {
512
+ "epoch": 0.016452278441544656,
513
+ "grad_norm": 17.805816650390625,
514
+ "learning_rate": 1.9987335036185613e-05,
515
+ "loss": 0.7449,
516
+ "step": 620
517
+ },
518
+ {
519
+ "epoch": 0.01671763777124699,
520
+ "grad_norm": 20.419509887695312,
521
+ "learning_rate": 1.9986270753512134e-05,
522
+ "loss": 0.806,
523
+ "step": 630
524
+ },
525
+ {
526
+ "epoch": 0.01698299710094932,
527
+ "grad_norm": 8.158143043518066,
528
+ "learning_rate": 1.9985206470838655e-05,
529
+ "loss": 0.5455,
530
+ "step": 640
531
+ },
532
+ {
533
+ "epoch": 0.017248356430651656,
534
+ "grad_norm": 19.537343978881836,
535
+ "learning_rate": 1.998414218816518e-05,
536
+ "loss": 0.6334,
537
+ "step": 650
538
+ },
539
+ {
540
+ "epoch": 0.01751371576035399,
541
+ "grad_norm": 13.77968692779541,
542
+ "learning_rate": 1.99830779054917e-05,
543
+ "loss": 0.6848,
544
+ "step": 660
545
+ },
546
+ {
547
+ "epoch": 0.017779075090056322,
548
+ "grad_norm": 13.80045223236084,
549
+ "learning_rate": 1.9982013622818222e-05,
550
+ "loss": 0.8403,
551
+ "step": 670
552
+ },
553
+ {
554
+ "epoch": 0.018044434419758656,
555
+ "grad_norm": 7.564460277557373,
556
+ "learning_rate": 1.9980949340144743e-05,
557
+ "loss": 0.8387,
558
+ "step": 680
559
+ },
560
+ {
561
+ "epoch": 0.018309793749460988,
562
+ "grad_norm": 9.932668685913086,
563
+ "learning_rate": 1.9979885057471267e-05,
564
+ "loss": 0.6252,
565
+ "step": 690
566
+ },
567
+ {
568
+ "epoch": 0.018575153079163322,
569
+ "grad_norm": 3.894618034362793,
570
+ "learning_rate": 1.997882077479779e-05,
571
+ "loss": 0.6915,
572
+ "step": 700
573
+ },
574
+ {
575
+ "epoch": 0.018575153079163322,
576
+ "eval_accuracy": 0.8227255237564516,
577
+ "eval_f1": 0.8164082076886476,
578
+ "eval_loss": 0.7442639470100403,
579
+ "eval_precision": 0.8239897054500639,
580
+ "eval_recall": 0.8227255237564516,
581
+ "eval_runtime": 1149.8124,
582
+ "eval_samples_per_second": 65.549,
583
+ "eval_steps_per_second": 8.194,
584
+ "step": 700
585
+ },
586
+ {
587
+ "epoch": 0.018840512408865657,
588
+ "grad_norm": 8.771450996398926,
589
+ "learning_rate": 1.997775649212431e-05,
590
+ "loss": 0.5164,
591
+ "step": 710
592
+ },
593
+ {
594
+ "epoch": 0.019105871738567988,
595
+ "grad_norm": 13.675606727600098,
596
+ "learning_rate": 1.997669220945083e-05,
597
+ "loss": 0.7917,
598
+ "step": 720
599
+ },
600
+ {
601
+ "epoch": 0.019371231068270323,
602
+ "grad_norm": 7.2742462158203125,
603
+ "learning_rate": 1.9975627926777355e-05,
604
+ "loss": 0.5966,
605
+ "step": 730
606
+ },
607
+ {
608
+ "epoch": 0.019636590397972654,
609
+ "grad_norm": 8.653703689575195,
610
+ "learning_rate": 1.9974563644103876e-05,
611
+ "loss": 0.7009,
612
+ "step": 740
613
+ },
614
+ {
615
+ "epoch": 0.01990194972767499,
616
+ "grad_norm": 4.837522983551025,
617
+ "learning_rate": 1.9973499361430397e-05,
618
+ "loss": 0.6471,
619
+ "step": 750
620
+ },
621
+ {
622
+ "epoch": 0.02016730905737732,
623
+ "grad_norm": 51.784481048583984,
624
+ "learning_rate": 1.997243507875692e-05,
625
+ "loss": 0.5877,
626
+ "step": 760
627
+ },
628
+ {
629
+ "epoch": 0.020432668387079654,
630
+ "grad_norm": 10.597892761230469,
631
+ "learning_rate": 1.9971370796083442e-05,
632
+ "loss": 0.5925,
633
+ "step": 770
634
+ },
635
+ {
636
+ "epoch": 0.02069802771678199,
637
+ "grad_norm": 13.237262725830078,
638
+ "learning_rate": 1.9970306513409963e-05,
639
+ "loss": 0.6298,
640
+ "step": 780
641
+ },
642
+ {
643
+ "epoch": 0.02096338704648432,
644
+ "grad_norm": 9.751429557800293,
645
+ "learning_rate": 1.9969242230736484e-05,
646
+ "loss": 0.5414,
647
+ "step": 790
648
+ },
649
+ {
650
+ "epoch": 0.021228746376186654,
651
+ "grad_norm": 15.811433792114258,
652
+ "learning_rate": 1.996817794806301e-05,
653
+ "loss": 0.7402,
654
+ "step": 800
655
+ },
656
+ {
657
+ "epoch": 0.021228746376186654,
658
+ "eval_accuracy": 0.8336318645596996,
659
+ "eval_f1": 0.8326606774493011,
660
+ "eval_loss": 0.6281165480613708,
661
+ "eval_precision": 0.8348071025847044,
662
+ "eval_recall": 0.8336318645596996,
663
+ "eval_runtime": 1150.0313,
664
+ "eval_samples_per_second": 65.536,
665
+ "eval_steps_per_second": 8.193,
666
+ "step": 800
667
+ },
668
+ {
669
+ "epoch": 0.021494105705888986,
670
+ "grad_norm": 6.421304225921631,
671
+ "learning_rate": 1.9967113665389526e-05,
672
+ "loss": 0.5246,
673
+ "step": 810
674
+ },
675
+ {
676
+ "epoch": 0.02175946503559132,
677
+ "grad_norm": 10.94848918914795,
678
+ "learning_rate": 1.996604938271605e-05,
679
+ "loss": 0.4998,
680
+ "step": 820
681
+ },
682
+ {
683
+ "epoch": 0.022024824365293655,
684
+ "grad_norm": 18.42159080505371,
685
+ "learning_rate": 1.9964985100042572e-05,
686
+ "loss": 0.3855,
687
+ "step": 830
688
+ },
689
+ {
690
+ "epoch": 0.022290183694995986,
691
+ "grad_norm": 6.594385623931885,
692
+ "learning_rate": 1.9963920817369096e-05,
693
+ "loss": 0.6205,
694
+ "step": 840
695
+ },
696
+ {
697
+ "epoch": 0.02255554302469832,
698
+ "grad_norm": 15.90577220916748,
699
+ "learning_rate": 1.9962856534695617e-05,
700
+ "loss": 0.5837,
701
+ "step": 850
702
+ },
703
+ {
704
+ "epoch": 0.022820902354400652,
705
+ "grad_norm": 13.169767379760742,
706
+ "learning_rate": 1.996179225202214e-05,
707
+ "loss": 0.6141,
708
+ "step": 860
709
+ },
710
+ {
711
+ "epoch": 0.023086261684102986,
712
+ "grad_norm": 18.30284881591797,
713
+ "learning_rate": 1.9960727969348663e-05,
714
+ "loss": 0.5903,
715
+ "step": 870
716
+ },
717
+ {
718
+ "epoch": 0.023351621013805317,
719
+ "grad_norm": 18.575279235839844,
720
+ "learning_rate": 1.9959663686675184e-05,
721
+ "loss": 0.6436,
722
+ "step": 880
723
+ },
724
+ {
725
+ "epoch": 0.023616980343507652,
726
+ "grad_norm": 7.2494587898254395,
727
+ "learning_rate": 1.9958599404001705e-05,
728
+ "loss": 0.5912,
729
+ "step": 890
730
+ },
731
+ {
732
+ "epoch": 0.023882339673209987,
733
+ "grad_norm": 27.128528594970703,
734
+ "learning_rate": 1.9957535121328226e-05,
735
+ "loss": 0.6165,
736
+ "step": 900
737
+ },
738
+ {
739
+ "epoch": 0.023882339673209987,
740
+ "eval_accuracy": 0.8347198450291233,
741
+ "eval_f1": 0.8366928520828808,
742
+ "eval_loss": 0.6065123081207275,
743
+ "eval_precision": 0.8420416556337017,
744
+ "eval_recall": 0.8347198450291233,
745
+ "eval_runtime": 1150.012,
746
+ "eval_samples_per_second": 65.538,
747
+ "eval_steps_per_second": 8.193,
748
+ "step": 900
749
+ },
750
+ {
751
+ "epoch": 0.024147699002912318,
752
+ "grad_norm": 11.817070960998535,
753
+ "learning_rate": 1.995647083865475e-05,
754
+ "loss": 0.6235,
755
+ "step": 910
756
+ },
757
+ {
758
+ "epoch": 0.024413058332614652,
759
+ "grad_norm": 13.93792724609375,
760
+ "learning_rate": 1.9955406555981268e-05,
761
+ "loss": 0.5442,
762
+ "step": 920
763
+ },
764
+ {
765
+ "epoch": 0.024678417662316984,
766
+ "grad_norm": 12.457510948181152,
767
+ "learning_rate": 1.9954342273307792e-05,
768
+ "loss": 0.5218,
769
+ "step": 930
770
+ },
771
+ {
772
+ "epoch": 0.024943776992019318,
773
+ "grad_norm": 19.165714263916016,
774
+ "learning_rate": 1.9953277990634313e-05,
775
+ "loss": 0.6517,
776
+ "step": 940
777
+ },
778
+ {
779
+ "epoch": 0.025209136321721653,
780
+ "grad_norm": 16.57741355895996,
781
+ "learning_rate": 1.9952213707960838e-05,
782
+ "loss": 0.6778,
783
+ "step": 950
784
+ },
785
+ {
786
+ "epoch": 0.025474495651423984,
787
+ "grad_norm": 10.39408016204834,
788
+ "learning_rate": 1.995114942528736e-05,
789
+ "loss": 0.7138,
790
+ "step": 960
791
+ },
792
+ {
793
+ "epoch": 0.02573985498112632,
794
+ "grad_norm": 9.648213386535645,
795
+ "learning_rate": 1.995008514261388e-05,
796
+ "loss": 0.63,
797
+ "step": 970
798
+ },
799
+ {
800
+ "epoch": 0.02600521431082865,
801
+ "grad_norm": 18.695945739746094,
802
+ "learning_rate": 1.99490208599404e-05,
803
+ "loss": 0.6557,
804
+ "step": 980
805
+ },
806
+ {
807
+ "epoch": 0.026270573640530984,
808
+ "grad_norm": 16.400314331054688,
809
+ "learning_rate": 1.9947956577266925e-05,
810
+ "loss": 0.7228,
811
+ "step": 990
812
+ },
813
+ {
814
+ "epoch": 0.026535932970233315,
815
+ "grad_norm": 6.129698753356934,
816
+ "learning_rate": 1.9946892294593446e-05,
817
+ "loss": 0.4626,
818
+ "step": 1000
819
+ },
820
+ {
821
+ "epoch": 0.026535932970233315,
822
+ "eval_accuracy": 0.846382464939166,
823
+ "eval_f1": 0.8438986049887841,
824
+ "eval_loss": 0.6415271759033203,
825
+ "eval_precision": 0.8464411216655519,
826
+ "eval_recall": 0.846382464939166,
827
+ "eval_runtime": 1149.903,
828
+ "eval_samples_per_second": 65.544,
829
+ "eval_steps_per_second": 8.194,
830
+ "step": 1000
831
+ },
832
+ {
833
+ "epoch": 0.02680129229993565,
834
+ "grad_norm": 9.027094841003418,
835
+ "learning_rate": 1.9945828011919967e-05,
836
+ "loss": 0.7249,
837
+ "step": 1010
838
+ },
839
+ {
840
+ "epoch": 0.027066651629637985,
841
+ "grad_norm": 14.91533374786377,
842
+ "learning_rate": 1.9944763729246492e-05,
843
+ "loss": 0.6765,
844
+ "step": 1020
845
+ },
846
+ {
847
+ "epoch": 0.027332010959340316,
848
+ "grad_norm": 3.9888482093811035,
849
+ "learning_rate": 1.994369944657301e-05,
850
+ "loss": 0.7675,
851
+ "step": 1030
852
+ },
853
+ {
854
+ "epoch": 0.02759737028904265,
855
+ "grad_norm": 7.831120491027832,
856
+ "learning_rate": 1.9942635163899534e-05,
857
+ "loss": 0.5764,
858
+ "step": 1040
859
+ },
860
+ {
861
+ "epoch": 0.02786272961874498,
862
+ "grad_norm": 7.661794185638428,
863
+ "learning_rate": 1.9941570881226055e-05,
864
+ "loss": 0.6496,
865
+ "step": 1050
866
+ },
867
+ {
868
+ "epoch": 0.028128088948447316,
869
+ "grad_norm": 5.446905136108398,
870
+ "learning_rate": 1.994050659855258e-05,
871
+ "loss": 0.7055,
872
+ "step": 1060
873
+ },
874
+ {
875
+ "epoch": 0.02839344827814965,
876
+ "grad_norm": 11.230253219604492,
877
+ "learning_rate": 1.9939442315879097e-05,
878
+ "loss": 0.6096,
879
+ "step": 1070
880
+ },
881
+ {
882
+ "epoch": 0.028658807607851982,
883
+ "grad_norm": 4.4712982177734375,
884
+ "learning_rate": 1.993837803320562e-05,
885
+ "loss": 0.603,
886
+ "step": 1080
887
+ },
888
+ {
889
+ "epoch": 0.028924166937554317,
890
+ "grad_norm": 6.14396858215332,
891
+ "learning_rate": 1.9937313750532142e-05,
892
+ "loss": 0.7755,
893
+ "step": 1090
894
+ },
895
+ {
896
+ "epoch": 0.029189526267256648,
897
+ "grad_norm": 15.914993286132812,
898
+ "learning_rate": 1.9936249467858667e-05,
899
+ "loss": 0.4619,
900
+ "step": 1100
901
+ },
902
+ {
903
+ "epoch": 0.029189526267256648,
904
+ "eval_accuracy": 0.846621289920259,
905
+ "eval_f1": 0.842526016669277,
906
+ "eval_loss": 0.6318368911743164,
907
+ "eval_precision": 0.8510012460868045,
908
+ "eval_recall": 0.846621289920259,
909
+ "eval_runtime": 1149.9706,
910
+ "eval_samples_per_second": 65.54,
911
+ "eval_steps_per_second": 8.193,
912
+ "step": 1100
913
+ }
914
+ ],
915
+ "logging_steps": 10,
916
+ "max_steps": 188420,
917
+ "num_input_tokens_seen": 0,
918
+ "num_train_epochs": 5,
919
+ "save_steps": 100,
920
+ "stateful_callbacks": {
921
+ "EarlyStoppingCallback": {
922
+ "args": {
923
+ "early_stopping_patience": 3,
924
+ "early_stopping_threshold": 0.01
925
+ },
926
+ "attributes": {
927
+ "early_stopping_patience_counter": 3
928
+ }
929
+ },
930
+ "TrainerControl": {
931
+ "args": {
932
+ "should_epoch_stop": false,
933
+ "should_evaluate": false,
934
+ "should_log": false,
935
+ "should_save": true,
936
+ "should_training_stop": true
937
+ },
938
+ "attributes": {}
939
+ }
940
+ },
941
+ "total_flos": 4630920885043200.0,
942
+ "train_batch_size": 4,
943
+ "trial_name": null,
944
+ "trial_params": null
945
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63651316137ef8a43b7b68fffe153c9c989f961f0e3702d3b3c782028c356ff2
3
+ size 5240