polaris314 commited on
Commit
e693f31
·
verified ·
1 Parent(s): bd4ea92

Add checkpoint at step 3800

Browse files
checkpoints/checkpoint-step-3800/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "pad_token_id": 50256,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.1,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "task_specific_params": {
31
+ "text-generation": {
32
+ "do_sample": true,
33
+ "max_length": 50
34
+ }
35
+ },
36
+ "transformers_version": "4.56.1",
37
+ "use_cache": true,
38
+ "vocab_size": 50257
39
+ }
checkpoints/checkpoint-step-3800/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": [
5
+ 50256
6
+ ],
7
+ "pad_token_id": 50256,
8
+ "transformers_version": "4.56.1"
9
+ }
checkpoints/checkpoint-step-3800/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-step-3800/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:422a2721f583adeb4fcd526a221351b2a934adba9a1ef6d7c71252d180754284
3
+ size 497774208
checkpoints/checkpoint-step-3800/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8deac502a961c15fd488b5fded73368d93661e7246b1721fbe8a7ba49e0f5cdf
3
+ size 995644811
checkpoints/checkpoint-step-3800/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a621cb270f2b965f728cbab5d5078e1f0c88507cdf0f7b599a7f43d4fea470b
3
+ size 14645
checkpoints/checkpoint-step-3800/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8190bf4320db3ed8a3c941c3e905b8c663d78f90b5f6779890d8de842b74eee
3
+ size 1383
checkpoints/checkpoint-step-3800/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2de1834892de2e93c62657a6533cbdcde4cd4cdb55324be7ef696f2acab4f51e
3
+ size 1465
checkpoints/checkpoint-step-3800/special_tokens_map.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "unk_token": "<|endoftext|>"
12
+ }
checkpoints/checkpoint-step-3800/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-step-3800/tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": false,
15
+ "eos_token": "<|endoftext|>",
16
+ "extra_special_tokens": {},
17
+ "model_max_length": 1024,
18
+ "pad_token": "<|endoftext|>",
19
+ "tokenizer_class": "GPT2Tokenizer",
20
+ "unk_token": "<|endoftext|>"
21
+ }
checkpoints/checkpoint-step-3800/trainer_state.json ADDED
@@ -0,0 +1,718 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 3800,
3
+ "best_metric": 0.03279593959450722,
4
+ "best_model_checkpoint": "checkpoints/checkpoint-3800",
5
+ "epoch": 1.034013605442177,
6
+ "eval_steps": 200,
7
+ "global_step": 3800,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.013605442176870748,
14
+ "grad_norm": 4.6472649574279785,
15
+ "learning_rate": 4.9000000000000005e-06,
16
+ "loss": 2.8759,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.027210884353741496,
21
+ "grad_norm": 3.089015245437622,
22
+ "learning_rate": 9.900000000000002e-06,
23
+ "loss": 1.7949,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.04081632653061224,
28
+ "grad_norm": 2.3650102615356445,
29
+ "learning_rate": 1.49e-05,
30
+ "loss": 0.6938,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.05442176870748299,
35
+ "grad_norm": 1.4423813819885254,
36
+ "learning_rate": 1.9900000000000003e-05,
37
+ "loss": 0.2987,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.05442176870748299,
42
+ "eval_loss": 0.18158380687236786,
43
+ "eval_runtime": 10.2028,
44
+ "eval_samples_per_second": 58.808,
45
+ "eval_steps_per_second": 7.351,
46
+ "step": 200
47
+ },
48
+ {
49
+ "epoch": 0.06802721088435375,
50
+ "grad_norm": 0.9667473435401917,
51
+ "learning_rate": 2.4900000000000002e-05,
52
+ "loss": 0.2009,
53
+ "step": 250
54
+ },
55
+ {
56
+ "epoch": 0.08163265306122448,
57
+ "grad_norm": 1.2099648714065552,
58
+ "learning_rate": 2.9900000000000002e-05,
59
+ "loss": 0.1613,
60
+ "step": 300
61
+ },
62
+ {
63
+ "epoch": 0.09523809523809523,
64
+ "grad_norm": 1.036044955253601,
65
+ "learning_rate": 3.49e-05,
66
+ "loss": 0.1397,
67
+ "step": 350
68
+ },
69
+ {
70
+ "epoch": 0.10884353741496598,
71
+ "grad_norm": 0.7627539038658142,
72
+ "learning_rate": 3.99e-05,
73
+ "loss": 0.1236,
74
+ "step": 400
75
+ },
76
+ {
77
+ "epoch": 0.10884353741496598,
78
+ "eval_loss": 0.09973898530006409,
79
+ "eval_runtime": 10.2069,
80
+ "eval_samples_per_second": 58.784,
81
+ "eval_steps_per_second": 7.348,
82
+ "step": 400
83
+ },
84
+ {
85
+ "epoch": 0.12244897959183673,
86
+ "grad_norm": 1.1398504972457886,
87
+ "learning_rate": 4.49e-05,
88
+ "loss": 0.1149,
89
+ "step": 450
90
+ },
91
+ {
92
+ "epoch": 0.1360544217687075,
93
+ "grad_norm": 0.7201138734817505,
94
+ "learning_rate": 4.99e-05,
95
+ "loss": 0.1004,
96
+ "step": 500
97
+ },
98
+ {
99
+ "epoch": 0.14965986394557823,
100
+ "grad_norm": 0.6430326700210571,
101
+ "learning_rate": 4.9642335766423356e-05,
102
+ "loss": 0.0926,
103
+ "step": 550
104
+ },
105
+ {
106
+ "epoch": 0.16326530612244897,
107
+ "grad_norm": 0.8617527484893799,
108
+ "learning_rate": 4.9277372262773724e-05,
109
+ "loss": 0.0876,
110
+ "step": 600
111
+ },
112
+ {
113
+ "epoch": 0.16326530612244897,
114
+ "eval_loss": 0.06922342628240585,
115
+ "eval_runtime": 10.155,
116
+ "eval_samples_per_second": 59.084,
117
+ "eval_steps_per_second": 7.386,
118
+ "step": 600
119
+ },
120
+ {
121
+ "epoch": 0.17687074829931973,
122
+ "grad_norm": 0.7790252566337585,
123
+ "learning_rate": 4.891240875912409e-05,
124
+ "loss": 0.0813,
125
+ "step": 650
126
+ },
127
+ {
128
+ "epoch": 0.19047619047619047,
129
+ "grad_norm": 0.6242516040802002,
130
+ "learning_rate": 4.854744525547445e-05,
131
+ "loss": 0.0778,
132
+ "step": 700
133
+ },
134
+ {
135
+ "epoch": 0.20408163265306123,
136
+ "grad_norm": 0.5622245073318481,
137
+ "learning_rate": 4.818248175182482e-05,
138
+ "loss": 0.0733,
139
+ "step": 750
140
+ },
141
+ {
142
+ "epoch": 0.21768707482993196,
143
+ "grad_norm": 0.6843573451042175,
144
+ "learning_rate": 4.781751824817519e-05,
145
+ "loss": 0.0703,
146
+ "step": 800
147
+ },
148
+ {
149
+ "epoch": 0.21768707482993196,
150
+ "eval_loss": 0.055413372814655304,
151
+ "eval_runtime": 10.1991,
152
+ "eval_samples_per_second": 58.829,
153
+ "eval_steps_per_second": 7.354,
154
+ "step": 800
155
+ },
156
+ {
157
+ "epoch": 0.23129251700680273,
158
+ "grad_norm": 0.611773669719696,
159
+ "learning_rate": 4.745255474452555e-05,
160
+ "loss": 0.0668,
161
+ "step": 850
162
+ },
163
+ {
164
+ "epoch": 0.24489795918367346,
165
+ "grad_norm": 0.622644305229187,
166
+ "learning_rate": 4.708759124087592e-05,
167
+ "loss": 0.0649,
168
+ "step": 900
169
+ },
170
+ {
171
+ "epoch": 0.2585034013605442,
172
+ "grad_norm": 0.4426730275154114,
173
+ "learning_rate": 4.6722627737226286e-05,
174
+ "loss": 0.0637,
175
+ "step": 950
176
+ },
177
+ {
178
+ "epoch": 0.272108843537415,
179
+ "grad_norm": 0.8717305660247803,
180
+ "learning_rate": 4.635766423357664e-05,
181
+ "loss": 0.0601,
182
+ "step": 1000
183
+ },
184
+ {
185
+ "epoch": 0.272108843537415,
186
+ "eval_loss": 0.05313626304268837,
187
+ "eval_runtime": 10.1979,
188
+ "eval_samples_per_second": 58.836,
189
+ "eval_steps_per_second": 7.354,
190
+ "step": 1000
191
+ },
192
+ {
193
+ "epoch": 0.2857142857142857,
194
+ "grad_norm": 0.5126848220825195,
195
+ "learning_rate": 4.599270072992701e-05,
196
+ "loss": 0.0578,
197
+ "step": 1050
198
+ },
199
+ {
200
+ "epoch": 0.29931972789115646,
201
+ "grad_norm": 0.5319710373878479,
202
+ "learning_rate": 4.5627737226277376e-05,
203
+ "loss": 0.0574,
204
+ "step": 1100
205
+ },
206
+ {
207
+ "epoch": 0.3129251700680272,
208
+ "grad_norm": 0.7722771167755127,
209
+ "learning_rate": 4.526277372262774e-05,
210
+ "loss": 0.0564,
211
+ "step": 1150
212
+ },
213
+ {
214
+ "epoch": 0.32653061224489793,
215
+ "grad_norm": 0.4977850317955017,
216
+ "learning_rate": 4.4897810218978105e-05,
217
+ "loss": 0.0553,
218
+ "step": 1200
219
+ },
220
+ {
221
+ "epoch": 0.32653061224489793,
222
+ "eval_loss": 0.0452888123691082,
223
+ "eval_runtime": 10.1684,
224
+ "eval_samples_per_second": 59.006,
225
+ "eval_steps_per_second": 7.376,
226
+ "step": 1200
227
+ },
228
+ {
229
+ "epoch": 0.3401360544217687,
230
+ "grad_norm": 0.5616284608840942,
231
+ "learning_rate": 4.4532846715328466e-05,
232
+ "loss": 0.0529,
233
+ "step": 1250
234
+ },
235
+ {
236
+ "epoch": 0.35374149659863946,
237
+ "grad_norm": 0.41897886991500854,
238
+ "learning_rate": 4.4167883211678834e-05,
239
+ "loss": 0.0527,
240
+ "step": 1300
241
+ },
242
+ {
243
+ "epoch": 0.3673469387755102,
244
+ "grad_norm": 0.526966392993927,
245
+ "learning_rate": 4.38029197080292e-05,
246
+ "loss": 0.0518,
247
+ "step": 1350
248
+ },
249
+ {
250
+ "epoch": 0.38095238095238093,
251
+ "grad_norm": 0.7886810898780823,
252
+ "learning_rate": 4.343795620437956e-05,
253
+ "loss": 0.0514,
254
+ "step": 1400
255
+ },
256
+ {
257
+ "epoch": 0.38095238095238093,
258
+ "eval_loss": 0.042705778032541275,
259
+ "eval_runtime": 10.1958,
260
+ "eval_samples_per_second": 58.848,
261
+ "eval_steps_per_second": 7.356,
262
+ "step": 1400
263
+ },
264
+ {
265
+ "epoch": 0.3945578231292517,
266
+ "grad_norm": 0.38092342019081116,
267
+ "learning_rate": 4.307299270072993e-05,
268
+ "loss": 0.0507,
269
+ "step": 1450
270
+ },
271
+ {
272
+ "epoch": 0.40816326530612246,
273
+ "grad_norm": 0.47930657863616943,
274
+ "learning_rate": 4.27080291970803e-05,
275
+ "loss": 0.0517,
276
+ "step": 1500
277
+ },
278
+ {
279
+ "epoch": 0.4217687074829932,
280
+ "grad_norm": 0.529920220375061,
281
+ "learning_rate": 4.234306569343066e-05,
282
+ "loss": 0.0494,
283
+ "step": 1550
284
+ },
285
+ {
286
+ "epoch": 0.43537414965986393,
287
+ "grad_norm": 0.5892526507377625,
288
+ "learning_rate": 4.197810218978102e-05,
289
+ "loss": 0.0473,
290
+ "step": 1600
291
+ },
292
+ {
293
+ "epoch": 0.43537414965986393,
294
+ "eval_loss": 0.04030081257224083,
295
+ "eval_runtime": 10.1867,
296
+ "eval_samples_per_second": 58.9,
297
+ "eval_steps_per_second": 7.363,
298
+ "step": 1600
299
+ },
300
+ {
301
+ "epoch": 0.4489795918367347,
302
+ "grad_norm": 0.8671649098396301,
303
+ "learning_rate": 4.161313868613139e-05,
304
+ "loss": 0.047,
305
+ "step": 1650
306
+ },
307
+ {
308
+ "epoch": 0.46258503401360546,
309
+ "grad_norm": 0.6588522791862488,
310
+ "learning_rate": 4.124817518248175e-05,
311
+ "loss": 0.0481,
312
+ "step": 1700
313
+ },
314
+ {
315
+ "epoch": 0.47619047619047616,
316
+ "grad_norm": 0.502729594707489,
317
+ "learning_rate": 4.088321167883212e-05,
318
+ "loss": 0.0456,
319
+ "step": 1750
320
+ },
321
+ {
322
+ "epoch": 0.4897959183673469,
323
+ "grad_norm": 0.5965167284011841,
324
+ "learning_rate": 4.0518248175182486e-05,
325
+ "loss": 0.0463,
326
+ "step": 1800
327
+ },
328
+ {
329
+ "epoch": 0.4897959183673469,
330
+ "eval_loss": 0.040413301438093185,
331
+ "eval_runtime": 10.2208,
332
+ "eval_samples_per_second": 58.704,
333
+ "eval_steps_per_second": 7.338,
334
+ "step": 1800
335
+ },
336
+ {
337
+ "epoch": 0.5034013605442177,
338
+ "grad_norm": 0.38793548941612244,
339
+ "learning_rate": 4.015328467153285e-05,
340
+ "loss": 0.0451,
341
+ "step": 1850
342
+ },
343
+ {
344
+ "epoch": 0.5170068027210885,
345
+ "grad_norm": 0.3949367105960846,
346
+ "learning_rate": 3.9788321167883215e-05,
347
+ "loss": 0.0468,
348
+ "step": 1900
349
+ },
350
+ {
351
+ "epoch": 0.5306122448979592,
352
+ "grad_norm": 0.5778154134750366,
353
+ "learning_rate": 3.9423357664233576e-05,
354
+ "loss": 0.0454,
355
+ "step": 1950
356
+ },
357
+ {
358
+ "epoch": 0.54421768707483,
359
+ "grad_norm": 0.3722288906574249,
360
+ "learning_rate": 3.9058394160583944e-05,
361
+ "loss": 0.0461,
362
+ "step": 2000
363
+ },
364
+ {
365
+ "epoch": 0.54421768707483,
366
+ "eval_loss": 0.03860794007778168,
367
+ "eval_runtime": 10.1795,
368
+ "eval_samples_per_second": 58.942,
369
+ "eval_steps_per_second": 7.368,
370
+ "step": 2000
371
+ },
372
+ {
373
+ "epoch": 0.5578231292517006,
374
+ "grad_norm": 0.5253990292549133,
375
+ "learning_rate": 3.869343065693431e-05,
376
+ "loss": 0.0445,
377
+ "step": 2050
378
+ },
379
+ {
380
+ "epoch": 0.5714285714285714,
381
+ "grad_norm": 0.41715025901794434,
382
+ "learning_rate": 3.832846715328467e-05,
383
+ "loss": 0.042,
384
+ "step": 2100
385
+ },
386
+ {
387
+ "epoch": 0.5850340136054422,
388
+ "grad_norm": 0.567323625087738,
389
+ "learning_rate": 3.796350364963504e-05,
390
+ "loss": 0.0423,
391
+ "step": 2150
392
+ },
393
+ {
394
+ "epoch": 0.5986394557823129,
395
+ "grad_norm": 0.6290739178657532,
396
+ "learning_rate": 3.759854014598541e-05,
397
+ "loss": 0.0428,
398
+ "step": 2200
399
+ },
400
+ {
401
+ "epoch": 0.5986394557823129,
402
+ "eval_loss": 0.037571050226688385,
403
+ "eval_runtime": 10.1846,
404
+ "eval_samples_per_second": 58.913,
405
+ "eval_steps_per_second": 7.364,
406
+ "step": 2200
407
+ },
408
+ {
409
+ "epoch": 0.6122448979591837,
410
+ "grad_norm": 0.40591228008270264,
411
+ "learning_rate": 3.723357664233576e-05,
412
+ "loss": 0.043,
413
+ "step": 2250
414
+ },
415
+ {
416
+ "epoch": 0.6258503401360545,
417
+ "grad_norm": 0.45446255803108215,
418
+ "learning_rate": 3.686861313868613e-05,
419
+ "loss": 0.0423,
420
+ "step": 2300
421
+ },
422
+ {
423
+ "epoch": 0.6394557823129252,
424
+ "grad_norm": 0.41105887293815613,
425
+ "learning_rate": 3.65036496350365e-05,
426
+ "loss": 0.0413,
427
+ "step": 2350
428
+ },
429
+ {
430
+ "epoch": 0.6530612244897959,
431
+ "grad_norm": 0.4538460671901703,
432
+ "learning_rate": 3.613868613138686e-05,
433
+ "loss": 0.0428,
434
+ "step": 2400
435
+ },
436
+ {
437
+ "epoch": 0.6530612244897959,
438
+ "eval_loss": 0.036736100912094116,
439
+ "eval_runtime": 10.1735,
440
+ "eval_samples_per_second": 58.977,
441
+ "eval_steps_per_second": 7.372,
442
+ "step": 2400
443
+ },
444
+ {
445
+ "epoch": 0.6666666666666666,
446
+ "grad_norm": 0.42337286472320557,
447
+ "learning_rate": 3.577372262773723e-05,
448
+ "loss": 0.0406,
449
+ "step": 2450
450
+ },
451
+ {
452
+ "epoch": 0.6802721088435374,
453
+ "grad_norm": 0.3924243450164795,
454
+ "learning_rate": 3.5408759124087596e-05,
455
+ "loss": 0.0412,
456
+ "step": 2500
457
+ },
458
+ {
459
+ "epoch": 0.6938775510204082,
460
+ "grad_norm": 0.3243819773197174,
461
+ "learning_rate": 3.504379562043796e-05,
462
+ "loss": 0.0416,
463
+ "step": 2550
464
+ },
465
+ {
466
+ "epoch": 0.7074829931972789,
467
+ "grad_norm": 0.3431473970413208,
468
+ "learning_rate": 3.4678832116788325e-05,
469
+ "loss": 0.0395,
470
+ "step": 2600
471
+ },
472
+ {
473
+ "epoch": 0.7074829931972789,
474
+ "eval_loss": 0.035297442227602005,
475
+ "eval_runtime": 10.181,
476
+ "eval_samples_per_second": 58.933,
477
+ "eval_steps_per_second": 7.367,
478
+ "step": 2600
479
+ },
480
+ {
481
+ "epoch": 0.7210884353741497,
482
+ "grad_norm": 0.4607592821121216,
483
+ "learning_rate": 3.4313868613138686e-05,
484
+ "loss": 0.0392,
485
+ "step": 2650
486
+ },
487
+ {
488
+ "epoch": 0.7346938775510204,
489
+ "grad_norm": 0.5089300870895386,
490
+ "learning_rate": 3.3948905109489054e-05,
491
+ "loss": 0.0409,
492
+ "step": 2700
493
+ },
494
+ {
495
+ "epoch": 0.7482993197278912,
496
+ "grad_norm": 0.42247381806373596,
497
+ "learning_rate": 3.358394160583942e-05,
498
+ "loss": 0.041,
499
+ "step": 2750
500
+ },
501
+ {
502
+ "epoch": 0.7619047619047619,
503
+ "grad_norm": 0.5103944540023804,
504
+ "learning_rate": 3.321897810218978e-05,
505
+ "loss": 0.0396,
506
+ "step": 2800
507
+ },
508
+ {
509
+ "epoch": 0.7619047619047619,
510
+ "eval_loss": 0.03508320823311806,
511
+ "eval_runtime": 10.1991,
512
+ "eval_samples_per_second": 58.829,
513
+ "eval_steps_per_second": 7.354,
514
+ "step": 2800
515
+ },
516
+ {
517
+ "epoch": 0.7755102040816326,
518
+ "grad_norm": 0.4671725928783417,
519
+ "learning_rate": 3.2854014598540144e-05,
520
+ "loss": 0.0401,
521
+ "step": 2850
522
+ },
523
+ {
524
+ "epoch": 0.7891156462585034,
525
+ "grad_norm": 0.39482733607292175,
526
+ "learning_rate": 3.248905109489051e-05,
527
+ "loss": 0.0413,
528
+ "step": 2900
529
+ },
530
+ {
531
+ "epoch": 0.8027210884353742,
532
+ "grad_norm": 0.6017800569534302,
533
+ "learning_rate": 3.212408759124087e-05,
534
+ "loss": 0.039,
535
+ "step": 2950
536
+ },
537
+ {
538
+ "epoch": 0.8163265306122449,
539
+ "grad_norm": 0.3402301073074341,
540
+ "learning_rate": 3.175912408759124e-05,
541
+ "loss": 0.0388,
542
+ "step": 3000
543
+ },
544
+ {
545
+ "epoch": 0.8163265306122449,
546
+ "eval_loss": 0.03466026484966278,
547
+ "eval_runtime": 10.1801,
548
+ "eval_samples_per_second": 58.938,
549
+ "eval_steps_per_second": 7.367,
550
+ "step": 3000
551
+ },
552
+ {
553
+ "epoch": 0.8299319727891157,
554
+ "grad_norm": 0.31006762385368347,
555
+ "learning_rate": 3.139416058394161e-05,
556
+ "loss": 0.0372,
557
+ "step": 3050
558
+ },
559
+ {
560
+ "epoch": 0.8435374149659864,
561
+ "grad_norm": 0.6701622605323792,
562
+ "learning_rate": 3.102919708029197e-05,
563
+ "loss": 0.0393,
564
+ "step": 3100
565
+ },
566
+ {
567
+ "epoch": 0.8571428571428571,
568
+ "grad_norm": 0.398631751537323,
569
+ "learning_rate": 3.066423357664234e-05,
570
+ "loss": 0.0396,
571
+ "step": 3150
572
+ },
573
+ {
574
+ "epoch": 0.8707482993197279,
575
+ "grad_norm": 0.3951578140258789,
576
+ "learning_rate": 3.0299270072992703e-05,
577
+ "loss": 0.0383,
578
+ "step": 3200
579
+ },
580
+ {
581
+ "epoch": 0.8707482993197279,
582
+ "eval_loss": 0.03498370572924614,
583
+ "eval_runtime": 10.1869,
584
+ "eval_samples_per_second": 58.899,
585
+ "eval_steps_per_second": 7.362,
586
+ "step": 3200
587
+ },
588
+ {
589
+ "epoch": 0.8843537414965986,
590
+ "grad_norm": 0.45307376980781555,
591
+ "learning_rate": 2.9934306569343067e-05,
592
+ "loss": 0.039,
593
+ "step": 3250
594
+ },
595
+ {
596
+ "epoch": 0.8979591836734694,
597
+ "grad_norm": 0.46504005789756775,
598
+ "learning_rate": 2.9569343065693432e-05,
599
+ "loss": 0.0372,
600
+ "step": 3300
601
+ },
602
+ {
603
+ "epoch": 0.9115646258503401,
604
+ "grad_norm": 0.3962818682193756,
605
+ "learning_rate": 2.92043795620438e-05,
606
+ "loss": 0.0388,
607
+ "step": 3350
608
+ },
609
+ {
610
+ "epoch": 0.9251700680272109,
611
+ "grad_norm": 0.5040358901023865,
612
+ "learning_rate": 2.8839416058394164e-05,
613
+ "loss": 0.038,
614
+ "step": 3400
615
+ },
616
+ {
617
+ "epoch": 0.9251700680272109,
618
+ "eval_loss": 0.03590795397758484,
619
+ "eval_runtime": 10.1999,
620
+ "eval_samples_per_second": 58.824,
621
+ "eval_steps_per_second": 7.353,
622
+ "step": 3400
623
+ },
624
+ {
625
+ "epoch": 0.9387755102040817,
626
+ "grad_norm": 0.8045864701271057,
627
+ "learning_rate": 2.847445255474453e-05,
628
+ "loss": 0.0364,
629
+ "step": 3450
630
+ },
631
+ {
632
+ "epoch": 0.9523809523809523,
633
+ "grad_norm": 0.9472477436065674,
634
+ "learning_rate": 2.810948905109489e-05,
635
+ "loss": 0.0365,
636
+ "step": 3500
637
+ },
638
+ {
639
+ "epoch": 0.9659863945578231,
640
+ "grad_norm": 0.5935471057891846,
641
+ "learning_rate": 2.7744525547445254e-05,
642
+ "loss": 0.0358,
643
+ "step": 3550
644
+ },
645
+ {
646
+ "epoch": 0.9795918367346939,
647
+ "grad_norm": 0.42011234164237976,
648
+ "learning_rate": 2.737956204379562e-05,
649
+ "loss": 0.0371,
650
+ "step": 3600
651
+ },
652
+ {
653
+ "epoch": 0.9795918367346939,
654
+ "eval_loss": 0.034322191029787064,
655
+ "eval_runtime": 10.2002,
656
+ "eval_samples_per_second": 58.823,
657
+ "eval_steps_per_second": 7.353,
658
+ "step": 3600
659
+ },
660
+ {
661
+ "epoch": 0.9931972789115646,
662
+ "grad_norm": 0.757950484752655,
663
+ "learning_rate": 2.7014598540145987e-05,
664
+ "loss": 0.0367,
665
+ "step": 3650
666
+ },
667
+ {
668
+ "epoch": 1.0068027210884354,
669
+ "grad_norm": 0.37888622283935547,
670
+ "learning_rate": 2.664963503649635e-05,
671
+ "loss": 0.0376,
672
+ "step": 3700
673
+ },
674
+ {
675
+ "epoch": 1.0204081632653061,
676
+ "grad_norm": 0.4423586428165436,
677
+ "learning_rate": 2.6284671532846716e-05,
678
+ "loss": 0.0363,
679
+ "step": 3750
680
+ },
681
+ {
682
+ "epoch": 1.034013605442177,
683
+ "grad_norm": 0.526573896408081,
684
+ "learning_rate": 2.591970802919708e-05,
685
+ "loss": 0.0364,
686
+ "step": 3800
687
+ },
688
+ {
689
+ "epoch": 1.034013605442177,
690
+ "eval_loss": 0.03279593959450722,
691
+ "eval_runtime": 10.1819,
692
+ "eval_samples_per_second": 58.928,
693
+ "eval_steps_per_second": 7.366,
694
+ "step": 3800
695
+ }
696
+ ],
697
+ "logging_steps": 50,
698
+ "max_steps": 7350,
699
+ "num_input_tokens_seen": 0,
700
+ "num_train_epochs": 2,
701
+ "save_steps": 200,
702
+ "stateful_callbacks": {
703
+ "TrainerControl": {
704
+ "args": {
705
+ "should_epoch_stop": false,
706
+ "should_evaluate": false,
707
+ "should_log": false,
708
+ "should_save": true,
709
+ "should_training_stop": false
710
+ },
711
+ "attributes": {}
712
+ }
713
+ },
714
+ "total_flos": 1.58865555456e+16,
715
+ "train_batch_size": 2,
716
+ "trial_name": null,
717
+ "trial_params": null
718
+ }
checkpoints/checkpoint-step-3800/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e65f11e00edb82d09a337df334551a2d5eac2eb0f7f94aaa44d3be5e86cc7a7
3
+ size 5777
checkpoints/checkpoint-step-3800/vocab.json ADDED
The diff for this file is too large to render. See raw diff