polaris314 committed
Commit bd4ea92 · verified · 1 Parent(s): ffa9c88

Add checkpoint at step 7000

checkpoints/checkpoint-step-7000/config.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "dtype": "float32",
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "pad_token_id": 50256,
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": false,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 50
+     }
+   },
+   "transformers_version": "4.56.1",
+   "use_cache": true,
+   "vocab_size": 50257
+ }
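
This config.json describes a standard 12-layer, 12-head, 768-dimensional GPT-2 language model (the stock ~124M-parameter architecture). For reference, a minimal sketch of how a checkpoint directory like this is typically loaded with transformers; the local path is the one added in this commit, and it is assumed the LFS-tracked weights have already been fetched:

# Minimal loading sketch (assumes transformers >= 4.56 is installed and
# the LFS-backed model.safetensors is present locally).
from transformers import AutoModelForCausalLM, AutoTokenizer

ckpt_dir = "checkpoints/checkpoint-step-7000"  # directory added in this commit

tokenizer = AutoTokenizer.from_pretrained(ckpt_dir)
model = AutoModelForCausalLM.from_pretrained(ckpt_dir)  # resolves to GPT2LMHeadModel per config.json

# task_specific_params in config.json suggests sampled generation up to 50 tokens.
inputs = tokenizer("Hello", return_tensors="pt")
outputs = model.generate(**inputs, do_sample=True, max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
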
checkpoints/checkpoint-step-7000/generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 50256,
+   "eos_token_id": [
+     50256
+   ],
+   "pad_token_id": 50256,
+   "transformers_version": "4.56.1"
+ }
checkpoints/checkpoint-step-7000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-step-7000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:784db0bf710b7d2f5d21db4bd60e429a12dc79d0c719e202a19c550528b87388
+ size 497774208
checkpoints/checkpoint-step-7000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3e014ded9d6c452586406e663e6e8d5de9894ed6a94b32a9d69f78f025778dc8
+ size 995644811
checkpoints/checkpoint-step-7000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:16466d0333028c482007aea45193f9e1865b181a1c01becd71015d19745e9d94
+ size 14645
checkpoints/checkpoint-step-7000/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a68ceae7165f4c88c8c4809f6ef958049408efc4229565cd13a1f7800728538
+ size 1383
checkpoints/checkpoint-step-7000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2a19aa3c3e9956335a914b5072c3c48bed5d0544441ba0da1a5c689d79593efb
+ size 1465
checkpoints/checkpoint-step-7000/special_tokens_map.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "bos_token": "<|endoftext|>",
+   "eos_token": "<|endoftext|>",
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": "<|endoftext|>"
+ }
checkpoints/checkpoint-step-7000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-step-7000/tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "50256": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<|endoftext|>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|endoftext|>",
+   "extra_special_tokens": {},
+   "model_max_length": 1024,
+   "pad_token": "<|endoftext|>",
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>"
+ }
checkpoints/checkpoint-step-7000/trainer_state.json ADDED
@@ -0,0 +1,1294 @@
1
+ {
2
+ "best_global_step": 6800,
3
+ "best_metric": 0.029951954260468483,
4
+ "best_model_checkpoint": "checkpoints/checkpoint-6800",
5
+ "epoch": 1.9047619047619047,
6
+ "eval_steps": 200,
7
+ "global_step": 7000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.013605442176870748,
14
+ "grad_norm": 4.6472649574279785,
15
+ "learning_rate": 4.9000000000000005e-06,
16
+ "loss": 2.8759,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.027210884353741496,
21
+ "grad_norm": 3.089015245437622,
22
+ "learning_rate": 9.900000000000002e-06,
23
+ "loss": 1.7949,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.04081632653061224,
28
+ "grad_norm": 2.3650102615356445,
29
+ "learning_rate": 1.49e-05,
30
+ "loss": 0.6938,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.05442176870748299,
35
+ "grad_norm": 1.4423813819885254,
36
+ "learning_rate": 1.9900000000000003e-05,
37
+ "loss": 0.2987,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.05442176870748299,
42
+ "eval_loss": 0.18158380687236786,
43
+ "eval_runtime": 10.2028,
44
+ "eval_samples_per_second": 58.808,
45
+ "eval_steps_per_second": 7.351,
46
+ "step": 200
47
+ },
48
+ {
49
+ "epoch": 0.06802721088435375,
50
+ "grad_norm": 0.9667473435401917,
51
+ "learning_rate": 2.4900000000000002e-05,
52
+ "loss": 0.2009,
53
+ "step": 250
54
+ },
55
+ {
56
+ "epoch": 0.08163265306122448,
57
+ "grad_norm": 1.2099648714065552,
58
+ "learning_rate": 2.9900000000000002e-05,
59
+ "loss": 0.1613,
60
+ "step": 300
61
+ },
62
+ {
63
+ "epoch": 0.09523809523809523,
64
+ "grad_norm": 1.036044955253601,
65
+ "learning_rate": 3.49e-05,
66
+ "loss": 0.1397,
67
+ "step": 350
68
+ },
69
+ {
70
+ "epoch": 0.10884353741496598,
71
+ "grad_norm": 0.7627539038658142,
72
+ "learning_rate": 3.99e-05,
73
+ "loss": 0.1236,
74
+ "step": 400
75
+ },
76
+ {
77
+ "epoch": 0.10884353741496598,
78
+ "eval_loss": 0.09973898530006409,
79
+ "eval_runtime": 10.2069,
80
+ "eval_samples_per_second": 58.784,
81
+ "eval_steps_per_second": 7.348,
82
+ "step": 400
83
+ },
84
+ {
85
+ "epoch": 0.12244897959183673,
86
+ "grad_norm": 1.1398504972457886,
87
+ "learning_rate": 4.49e-05,
88
+ "loss": 0.1149,
89
+ "step": 450
90
+ },
91
+ {
92
+ "epoch": 0.1360544217687075,
93
+ "grad_norm": 0.7201138734817505,
94
+ "learning_rate": 4.99e-05,
95
+ "loss": 0.1004,
96
+ "step": 500
97
+ },
98
+ {
99
+ "epoch": 0.14965986394557823,
100
+ "grad_norm": 0.6430326700210571,
101
+ "learning_rate": 4.9642335766423356e-05,
102
+ "loss": 0.0926,
103
+ "step": 550
104
+ },
105
+ {
106
+ "epoch": 0.16326530612244897,
107
+ "grad_norm": 0.8617527484893799,
108
+ "learning_rate": 4.9277372262773724e-05,
109
+ "loss": 0.0876,
110
+ "step": 600
111
+ },
112
+ {
113
+ "epoch": 0.16326530612244897,
114
+ "eval_loss": 0.06922342628240585,
115
+ "eval_runtime": 10.155,
116
+ "eval_samples_per_second": 59.084,
117
+ "eval_steps_per_second": 7.386,
118
+ "step": 600
119
+ },
120
+ {
121
+ "epoch": 0.17687074829931973,
122
+ "grad_norm": 0.7790252566337585,
123
+ "learning_rate": 4.891240875912409e-05,
124
+ "loss": 0.0813,
125
+ "step": 650
126
+ },
127
+ {
128
+ "epoch": 0.19047619047619047,
129
+ "grad_norm": 0.6242516040802002,
130
+ "learning_rate": 4.854744525547445e-05,
131
+ "loss": 0.0778,
132
+ "step": 700
133
+ },
134
+ {
135
+ "epoch": 0.20408163265306123,
136
+ "grad_norm": 0.5622245073318481,
137
+ "learning_rate": 4.818248175182482e-05,
138
+ "loss": 0.0733,
139
+ "step": 750
140
+ },
141
+ {
142
+ "epoch": 0.21768707482993196,
143
+ "grad_norm": 0.6843573451042175,
144
+ "learning_rate": 4.781751824817519e-05,
145
+ "loss": 0.0703,
146
+ "step": 800
147
+ },
148
+ {
149
+ "epoch": 0.21768707482993196,
150
+ "eval_loss": 0.055413372814655304,
151
+ "eval_runtime": 10.1991,
152
+ "eval_samples_per_second": 58.829,
153
+ "eval_steps_per_second": 7.354,
154
+ "step": 800
155
+ },
156
+ {
157
+ "epoch": 0.23129251700680273,
158
+ "grad_norm": 0.611773669719696,
159
+ "learning_rate": 4.745255474452555e-05,
160
+ "loss": 0.0668,
161
+ "step": 850
162
+ },
163
+ {
164
+ "epoch": 0.24489795918367346,
165
+ "grad_norm": 0.622644305229187,
166
+ "learning_rate": 4.708759124087592e-05,
167
+ "loss": 0.0649,
168
+ "step": 900
169
+ },
170
+ {
171
+ "epoch": 0.2585034013605442,
172
+ "grad_norm": 0.4426730275154114,
173
+ "learning_rate": 4.6722627737226286e-05,
174
+ "loss": 0.0637,
175
+ "step": 950
176
+ },
177
+ {
178
+ "epoch": 0.272108843537415,
179
+ "grad_norm": 0.8717305660247803,
180
+ "learning_rate": 4.635766423357664e-05,
181
+ "loss": 0.0601,
182
+ "step": 1000
183
+ },
184
+ {
185
+ "epoch": 0.272108843537415,
186
+ "eval_loss": 0.05313626304268837,
187
+ "eval_runtime": 10.1979,
188
+ "eval_samples_per_second": 58.836,
189
+ "eval_steps_per_second": 7.354,
190
+ "step": 1000
191
+ },
192
+ {
193
+ "epoch": 0.2857142857142857,
194
+ "grad_norm": 0.5126848220825195,
195
+ "learning_rate": 4.599270072992701e-05,
196
+ "loss": 0.0578,
197
+ "step": 1050
198
+ },
199
+ {
200
+ "epoch": 0.29931972789115646,
201
+ "grad_norm": 0.5319710373878479,
202
+ "learning_rate": 4.5627737226277376e-05,
203
+ "loss": 0.0574,
204
+ "step": 1100
205
+ },
206
+ {
207
+ "epoch": 0.3129251700680272,
208
+ "grad_norm": 0.7722771167755127,
209
+ "learning_rate": 4.526277372262774e-05,
210
+ "loss": 0.0564,
211
+ "step": 1150
212
+ },
213
+ {
214
+ "epoch": 0.32653061224489793,
215
+ "grad_norm": 0.4977850317955017,
216
+ "learning_rate": 4.4897810218978105e-05,
217
+ "loss": 0.0553,
218
+ "step": 1200
219
+ },
220
+ {
221
+ "epoch": 0.32653061224489793,
222
+ "eval_loss": 0.0452888123691082,
223
+ "eval_runtime": 10.1684,
224
+ "eval_samples_per_second": 59.006,
225
+ "eval_steps_per_second": 7.376,
226
+ "step": 1200
227
+ },
228
+ {
229
+ "epoch": 0.3401360544217687,
230
+ "grad_norm": 0.5616284608840942,
231
+ "learning_rate": 4.4532846715328466e-05,
232
+ "loss": 0.0529,
233
+ "step": 1250
234
+ },
235
+ {
236
+ "epoch": 0.35374149659863946,
237
+ "grad_norm": 0.41897886991500854,
238
+ "learning_rate": 4.4167883211678834e-05,
239
+ "loss": 0.0527,
240
+ "step": 1300
241
+ },
242
+ {
243
+ "epoch": 0.3673469387755102,
244
+ "grad_norm": 0.526966392993927,
245
+ "learning_rate": 4.38029197080292e-05,
246
+ "loss": 0.0518,
247
+ "step": 1350
248
+ },
249
+ {
250
+ "epoch": 0.38095238095238093,
251
+ "grad_norm": 0.7886810898780823,
252
+ "learning_rate": 4.343795620437956e-05,
253
+ "loss": 0.0514,
254
+ "step": 1400
255
+ },
256
+ {
257
+ "epoch": 0.38095238095238093,
258
+ "eval_loss": 0.042705778032541275,
259
+ "eval_runtime": 10.1958,
260
+ "eval_samples_per_second": 58.848,
261
+ "eval_steps_per_second": 7.356,
262
+ "step": 1400
263
+ },
264
+ {
265
+ "epoch": 0.3945578231292517,
266
+ "grad_norm": 0.38092342019081116,
267
+ "learning_rate": 4.307299270072993e-05,
268
+ "loss": 0.0507,
269
+ "step": 1450
270
+ },
271
+ {
272
+ "epoch": 0.40816326530612246,
273
+ "grad_norm": 0.47930657863616943,
274
+ "learning_rate": 4.27080291970803e-05,
275
+ "loss": 0.0517,
276
+ "step": 1500
277
+ },
278
+ {
279
+ "epoch": 0.4217687074829932,
280
+ "grad_norm": 0.529920220375061,
281
+ "learning_rate": 4.234306569343066e-05,
282
+ "loss": 0.0494,
283
+ "step": 1550
284
+ },
285
+ {
286
+ "epoch": 0.43537414965986393,
287
+ "grad_norm": 0.5892526507377625,
288
+ "learning_rate": 4.197810218978102e-05,
289
+ "loss": 0.0473,
290
+ "step": 1600
291
+ },
292
+ {
293
+ "epoch": 0.43537414965986393,
294
+ "eval_loss": 0.04030081257224083,
295
+ "eval_runtime": 10.1867,
296
+ "eval_samples_per_second": 58.9,
297
+ "eval_steps_per_second": 7.363,
298
+ "step": 1600
299
+ },
300
+ {
301
+ "epoch": 0.4489795918367347,
302
+ "grad_norm": 0.8671649098396301,
303
+ "learning_rate": 4.161313868613139e-05,
304
+ "loss": 0.047,
305
+ "step": 1650
306
+ },
307
+ {
308
+ "epoch": 0.46258503401360546,
309
+ "grad_norm": 0.6588522791862488,
310
+ "learning_rate": 4.124817518248175e-05,
311
+ "loss": 0.0481,
312
+ "step": 1700
313
+ },
314
+ {
315
+ "epoch": 0.47619047619047616,
316
+ "grad_norm": 0.502729594707489,
317
+ "learning_rate": 4.088321167883212e-05,
318
+ "loss": 0.0456,
319
+ "step": 1750
320
+ },
321
+ {
322
+ "epoch": 0.4897959183673469,
323
+ "grad_norm": 0.5965167284011841,
324
+ "learning_rate": 4.0518248175182486e-05,
325
+ "loss": 0.0463,
326
+ "step": 1800
327
+ },
328
+ {
329
+ "epoch": 0.4897959183673469,
330
+ "eval_loss": 0.040413301438093185,
331
+ "eval_runtime": 10.2208,
332
+ "eval_samples_per_second": 58.704,
333
+ "eval_steps_per_second": 7.338,
334
+ "step": 1800
335
+ },
336
+ {
337
+ "epoch": 0.5034013605442177,
338
+ "grad_norm": 0.38793548941612244,
339
+ "learning_rate": 4.015328467153285e-05,
340
+ "loss": 0.0451,
341
+ "step": 1850
342
+ },
343
+ {
344
+ "epoch": 0.5170068027210885,
345
+ "grad_norm": 0.3949367105960846,
346
+ "learning_rate": 3.9788321167883215e-05,
347
+ "loss": 0.0468,
348
+ "step": 1900
349
+ },
350
+ {
351
+ "epoch": 0.5306122448979592,
352
+ "grad_norm": 0.5778154134750366,
353
+ "learning_rate": 3.9423357664233576e-05,
354
+ "loss": 0.0454,
355
+ "step": 1950
356
+ },
357
+ {
358
+ "epoch": 0.54421768707483,
359
+ "grad_norm": 0.3722288906574249,
360
+ "learning_rate": 3.9058394160583944e-05,
361
+ "loss": 0.0461,
362
+ "step": 2000
363
+ },
364
+ {
365
+ "epoch": 0.54421768707483,
366
+ "eval_loss": 0.03860794007778168,
367
+ "eval_runtime": 10.1795,
368
+ "eval_samples_per_second": 58.942,
369
+ "eval_steps_per_second": 7.368,
370
+ "step": 2000
371
+ },
372
+ {
373
+ "epoch": 0.5578231292517006,
374
+ "grad_norm": 0.5253990292549133,
375
+ "learning_rate": 3.869343065693431e-05,
376
+ "loss": 0.0445,
377
+ "step": 2050
378
+ },
379
+ {
380
+ "epoch": 0.5714285714285714,
381
+ "grad_norm": 0.41715025901794434,
382
+ "learning_rate": 3.832846715328467e-05,
383
+ "loss": 0.042,
384
+ "step": 2100
385
+ },
386
+ {
387
+ "epoch": 0.5850340136054422,
388
+ "grad_norm": 0.567323625087738,
389
+ "learning_rate": 3.796350364963504e-05,
390
+ "loss": 0.0423,
391
+ "step": 2150
392
+ },
393
+ {
394
+ "epoch": 0.5986394557823129,
395
+ "grad_norm": 0.6290739178657532,
396
+ "learning_rate": 3.759854014598541e-05,
397
+ "loss": 0.0428,
398
+ "step": 2200
399
+ },
400
+ {
401
+ "epoch": 0.5986394557823129,
402
+ "eval_loss": 0.037571050226688385,
403
+ "eval_runtime": 10.1846,
404
+ "eval_samples_per_second": 58.913,
405
+ "eval_steps_per_second": 7.364,
406
+ "step": 2200
407
+ },
408
+ {
409
+ "epoch": 0.6122448979591837,
410
+ "grad_norm": 0.40591228008270264,
411
+ "learning_rate": 3.723357664233576e-05,
412
+ "loss": 0.043,
413
+ "step": 2250
414
+ },
415
+ {
416
+ "epoch": 0.6258503401360545,
417
+ "grad_norm": 0.45446255803108215,
418
+ "learning_rate": 3.686861313868613e-05,
419
+ "loss": 0.0423,
420
+ "step": 2300
421
+ },
422
+ {
423
+ "epoch": 0.6394557823129252,
424
+ "grad_norm": 0.41105887293815613,
425
+ "learning_rate": 3.65036496350365e-05,
426
+ "loss": 0.0413,
427
+ "step": 2350
428
+ },
429
+ {
430
+ "epoch": 0.6530612244897959,
431
+ "grad_norm": 0.4538460671901703,
432
+ "learning_rate": 3.613868613138686e-05,
433
+ "loss": 0.0428,
434
+ "step": 2400
435
+ },
436
+ {
437
+ "epoch": 0.6530612244897959,
438
+ "eval_loss": 0.036736100912094116,
439
+ "eval_runtime": 10.1735,
440
+ "eval_samples_per_second": 58.977,
441
+ "eval_steps_per_second": 7.372,
442
+ "step": 2400
443
+ },
444
+ {
445
+ "epoch": 0.6666666666666666,
446
+ "grad_norm": 0.42337286472320557,
447
+ "learning_rate": 3.577372262773723e-05,
448
+ "loss": 0.0406,
449
+ "step": 2450
450
+ },
451
+ {
452
+ "epoch": 0.6802721088435374,
453
+ "grad_norm": 0.3924243450164795,
454
+ "learning_rate": 3.5408759124087596e-05,
455
+ "loss": 0.0412,
456
+ "step": 2500
457
+ },
458
+ {
459
+ "epoch": 0.6938775510204082,
460
+ "grad_norm": 0.3243819773197174,
461
+ "learning_rate": 3.504379562043796e-05,
462
+ "loss": 0.0416,
463
+ "step": 2550
464
+ },
465
+ {
466
+ "epoch": 0.7074829931972789,
467
+ "grad_norm": 0.3431473970413208,
468
+ "learning_rate": 3.4678832116788325e-05,
469
+ "loss": 0.0395,
470
+ "step": 2600
471
+ },
472
+ {
473
+ "epoch": 0.7074829931972789,
474
+ "eval_loss": 0.035297442227602005,
475
+ "eval_runtime": 10.181,
476
+ "eval_samples_per_second": 58.933,
477
+ "eval_steps_per_second": 7.367,
478
+ "step": 2600
479
+ },
480
+ {
481
+ "epoch": 0.7210884353741497,
482
+ "grad_norm": 0.4607592821121216,
483
+ "learning_rate": 3.4313868613138686e-05,
484
+ "loss": 0.0392,
485
+ "step": 2650
486
+ },
487
+ {
488
+ "epoch": 0.7346938775510204,
489
+ "grad_norm": 0.5089300870895386,
490
+ "learning_rate": 3.3948905109489054e-05,
491
+ "loss": 0.0409,
492
+ "step": 2700
493
+ },
494
+ {
495
+ "epoch": 0.7482993197278912,
496
+ "grad_norm": 0.42247381806373596,
497
+ "learning_rate": 3.358394160583942e-05,
498
+ "loss": 0.041,
499
+ "step": 2750
500
+ },
501
+ {
502
+ "epoch": 0.7619047619047619,
503
+ "grad_norm": 0.5103944540023804,
504
+ "learning_rate": 3.321897810218978e-05,
505
+ "loss": 0.0396,
506
+ "step": 2800
507
+ },
508
+ {
509
+ "epoch": 0.7619047619047619,
510
+ "eval_loss": 0.03508320823311806,
511
+ "eval_runtime": 10.1991,
512
+ "eval_samples_per_second": 58.829,
513
+ "eval_steps_per_second": 7.354,
514
+ "step": 2800
515
+ },
516
+ {
517
+ "epoch": 0.7755102040816326,
518
+ "grad_norm": 0.4671725928783417,
519
+ "learning_rate": 3.2854014598540144e-05,
520
+ "loss": 0.0401,
521
+ "step": 2850
522
+ },
523
+ {
524
+ "epoch": 0.7891156462585034,
525
+ "grad_norm": 0.39482733607292175,
526
+ "learning_rate": 3.248905109489051e-05,
527
+ "loss": 0.0413,
528
+ "step": 2900
529
+ },
530
+ {
531
+ "epoch": 0.8027210884353742,
532
+ "grad_norm": 0.6017800569534302,
533
+ "learning_rate": 3.212408759124087e-05,
534
+ "loss": 0.039,
535
+ "step": 2950
536
+ },
537
+ {
538
+ "epoch": 0.8163265306122449,
539
+ "grad_norm": 0.3402301073074341,
540
+ "learning_rate": 3.175912408759124e-05,
541
+ "loss": 0.0388,
542
+ "step": 3000
543
+ },
544
+ {
545
+ "epoch": 0.8163265306122449,
546
+ "eval_loss": 0.03466026484966278,
547
+ "eval_runtime": 10.1801,
548
+ "eval_samples_per_second": 58.938,
549
+ "eval_steps_per_second": 7.367,
550
+ "step": 3000
551
+ },
552
+ {
553
+ "epoch": 0.8299319727891157,
554
+ "grad_norm": 0.31006762385368347,
555
+ "learning_rate": 3.139416058394161e-05,
556
+ "loss": 0.0372,
557
+ "step": 3050
558
+ },
559
+ {
560
+ "epoch": 0.8435374149659864,
561
+ "grad_norm": 0.6701622605323792,
562
+ "learning_rate": 3.102919708029197e-05,
563
+ "loss": 0.0393,
564
+ "step": 3100
565
+ },
566
+ {
567
+ "epoch": 0.8571428571428571,
568
+ "grad_norm": 0.398631751537323,
569
+ "learning_rate": 3.066423357664234e-05,
570
+ "loss": 0.0396,
571
+ "step": 3150
572
+ },
573
+ {
574
+ "epoch": 0.8707482993197279,
575
+ "grad_norm": 0.3951578140258789,
576
+ "learning_rate": 3.0299270072992703e-05,
577
+ "loss": 0.0383,
578
+ "step": 3200
579
+ },
580
+ {
581
+ "epoch": 0.8707482993197279,
582
+ "eval_loss": 0.03498370572924614,
583
+ "eval_runtime": 10.1869,
584
+ "eval_samples_per_second": 58.899,
585
+ "eval_steps_per_second": 7.362,
586
+ "step": 3200
587
+ },
588
+ {
589
+ "epoch": 0.8843537414965986,
590
+ "grad_norm": 0.45307376980781555,
591
+ "learning_rate": 2.9934306569343067e-05,
592
+ "loss": 0.039,
593
+ "step": 3250
594
+ },
595
+ {
596
+ "epoch": 0.8979591836734694,
597
+ "grad_norm": 0.46504005789756775,
598
+ "learning_rate": 2.9569343065693432e-05,
599
+ "loss": 0.0372,
600
+ "step": 3300
601
+ },
602
+ {
603
+ "epoch": 0.9115646258503401,
604
+ "grad_norm": 0.3962818682193756,
605
+ "learning_rate": 2.92043795620438e-05,
606
+ "loss": 0.0388,
607
+ "step": 3350
608
+ },
609
+ {
610
+ "epoch": 0.9251700680272109,
611
+ "grad_norm": 0.5040358901023865,
612
+ "learning_rate": 2.8839416058394164e-05,
613
+ "loss": 0.038,
614
+ "step": 3400
615
+ },
616
+ {
617
+ "epoch": 0.9251700680272109,
618
+ "eval_loss": 0.03590795397758484,
619
+ "eval_runtime": 10.1999,
620
+ "eval_samples_per_second": 58.824,
621
+ "eval_steps_per_second": 7.353,
622
+ "step": 3400
623
+ },
624
+ {
625
+ "epoch": 0.9387755102040817,
626
+ "grad_norm": 0.8045864701271057,
627
+ "learning_rate": 2.847445255474453e-05,
628
+ "loss": 0.0364,
629
+ "step": 3450
630
+ },
631
+ {
632
+ "epoch": 0.9523809523809523,
633
+ "grad_norm": 0.9472477436065674,
634
+ "learning_rate": 2.810948905109489e-05,
635
+ "loss": 0.0365,
636
+ "step": 3500
637
+ },
638
+ {
639
+ "epoch": 0.9659863945578231,
640
+ "grad_norm": 0.5935471057891846,
641
+ "learning_rate": 2.7744525547445254e-05,
642
+ "loss": 0.0358,
643
+ "step": 3550
644
+ },
645
+ {
646
+ "epoch": 0.9795918367346939,
647
+ "grad_norm": 0.42011234164237976,
648
+ "learning_rate": 2.737956204379562e-05,
649
+ "loss": 0.0371,
650
+ "step": 3600
651
+ },
652
+ {
653
+ "epoch": 0.9795918367346939,
654
+ "eval_loss": 0.034322191029787064,
655
+ "eval_runtime": 10.2002,
656
+ "eval_samples_per_second": 58.823,
657
+ "eval_steps_per_second": 7.353,
658
+ "step": 3600
659
+ },
660
+ {
661
+ "epoch": 0.9931972789115646,
662
+ "grad_norm": 0.757950484752655,
663
+ "learning_rate": 2.7014598540145987e-05,
664
+ "loss": 0.0367,
665
+ "step": 3650
666
+ },
667
+ {
668
+ "epoch": 1.0068027210884354,
669
+ "grad_norm": 0.37888622283935547,
670
+ "learning_rate": 2.664963503649635e-05,
671
+ "loss": 0.0376,
672
+ "step": 3700
673
+ },
674
+ {
675
+ "epoch": 1.0204081632653061,
676
+ "grad_norm": 0.4423586428165436,
677
+ "learning_rate": 2.6284671532846716e-05,
678
+ "loss": 0.0363,
679
+ "step": 3750
680
+ },
681
+ {
682
+ "epoch": 1.034013605442177,
683
+ "grad_norm": 0.526573896408081,
684
+ "learning_rate": 2.591970802919708e-05,
685
+ "loss": 0.0364,
686
+ "step": 3800
687
+ },
688
+ {
689
+ "epoch": 1.034013605442177,
690
+ "eval_loss": 0.03279593959450722,
691
+ "eval_runtime": 10.1819,
692
+ "eval_samples_per_second": 58.928,
693
+ "eval_steps_per_second": 7.366,
694
+ "step": 3800
695
+ },
696
+ {
697
+ "epoch": 1.0476190476190477,
698
+ "grad_norm": 0.32934117317199707,
699
+ "learning_rate": 2.555474452554745e-05,
700
+ "loss": 0.0359,
701
+ "step": 3850
702
+ },
703
+ {
704
+ "epoch": 1.0612244897959184,
705
+ "grad_norm": 0.33360642194747925,
706
+ "learning_rate": 2.5189781021897813e-05,
707
+ "loss": 0.0376,
708
+ "step": 3900
709
+ },
710
+ {
711
+ "epoch": 1.0748299319727892,
712
+ "grad_norm": 0.6703974604606628,
713
+ "learning_rate": 2.4824817518248174e-05,
714
+ "loss": 0.0369,
715
+ "step": 3950
716
+ },
717
+ {
718
+ "epoch": 1.08843537414966,
719
+ "grad_norm": 0.3145996034145355,
720
+ "learning_rate": 2.4459854014598542e-05,
721
+ "loss": 0.0372,
722
+ "step": 4000
723
+ },
724
+ {
725
+ "epoch": 1.08843537414966,
726
+ "eval_loss": 0.0330994687974453,
727
+ "eval_runtime": 10.1837,
728
+ "eval_samples_per_second": 58.918,
729
+ "eval_steps_per_second": 7.365,
730
+ "step": 4000
731
+ },
732
+ {
733
+ "epoch": 1.1020408163265305,
734
+ "grad_norm": 0.5301318168640137,
735
+ "learning_rate": 2.4094890510948906e-05,
736
+ "loss": 0.0365,
737
+ "step": 4050
738
+ },
739
+ {
740
+ "epoch": 1.1156462585034013,
741
+ "grad_norm": 0.7140094637870789,
742
+ "learning_rate": 2.372992700729927e-05,
743
+ "loss": 0.0358,
744
+ "step": 4100
745
+ },
746
+ {
747
+ "epoch": 1.129251700680272,
748
+ "grad_norm": 0.3584352433681488,
749
+ "learning_rate": 2.3364963503649635e-05,
750
+ "loss": 0.0356,
751
+ "step": 4150
752
+ },
753
+ {
754
+ "epoch": 1.1428571428571428,
755
+ "grad_norm": 0.32996875047683716,
756
+ "learning_rate": 2.3000000000000003e-05,
757
+ "loss": 0.0363,
758
+ "step": 4200
759
+ },
760
+ {
761
+ "epoch": 1.1428571428571428,
762
+ "eval_loss": 0.03242386505007744,
763
+ "eval_runtime": 10.2256,
764
+ "eval_samples_per_second": 58.676,
765
+ "eval_steps_per_second": 7.335,
766
+ "step": 4200
767
+ },
768
+ {
769
+ "epoch": 1.1564625850340136,
770
+ "grad_norm": 0.3712214231491089,
771
+ "learning_rate": 2.2635036496350365e-05,
772
+ "loss": 0.0355,
773
+ "step": 4250
774
+ },
775
+ {
776
+ "epoch": 1.1700680272108843,
777
+ "grad_norm": 0.4751546084880829,
778
+ "learning_rate": 2.227007299270073e-05,
779
+ "loss": 0.0347,
780
+ "step": 4300
781
+ },
782
+ {
783
+ "epoch": 1.183673469387755,
784
+ "grad_norm": 0.49500930309295654,
785
+ "learning_rate": 2.1905109489051097e-05,
786
+ "loss": 0.0362,
787
+ "step": 4350
788
+ },
789
+ {
790
+ "epoch": 1.1972789115646258,
791
+ "grad_norm": 0.4981195032596588,
792
+ "learning_rate": 2.154014598540146e-05,
793
+ "loss": 0.0351,
794
+ "step": 4400
795
+ },
796
+ {
797
+ "epoch": 1.1972789115646258,
798
+ "eval_loss": 0.033444974571466446,
799
+ "eval_runtime": 10.2088,
800
+ "eval_samples_per_second": 58.773,
801
+ "eval_steps_per_second": 7.347,
802
+ "step": 4400
803
+ },
804
+ {
805
+ "epoch": 1.2108843537414966,
806
+ "grad_norm": 0.30703791975975037,
807
+ "learning_rate": 2.1175182481751826e-05,
808
+ "loss": 0.0348,
809
+ "step": 4450
810
+ },
811
+ {
812
+ "epoch": 1.2244897959183674,
813
+ "grad_norm": 0.40786242485046387,
814
+ "learning_rate": 2.081021897810219e-05,
815
+ "loss": 0.0348,
816
+ "step": 4500
817
+ },
818
+ {
819
+ "epoch": 1.2380952380952381,
820
+ "grad_norm": 0.33420053124427795,
821
+ "learning_rate": 2.044525547445256e-05,
822
+ "loss": 0.0349,
823
+ "step": 4550
824
+ },
825
+ {
826
+ "epoch": 1.251700680272109,
827
+ "grad_norm": 0.39799734950065613,
828
+ "learning_rate": 2.008029197080292e-05,
829
+ "loss": 0.0347,
830
+ "step": 4600
831
+ },
832
+ {
833
+ "epoch": 1.251700680272109,
834
+ "eval_loss": 0.03165949881076813,
835
+ "eval_runtime": 10.4306,
836
+ "eval_samples_per_second": 57.523,
837
+ "eval_steps_per_second": 7.19,
838
+ "step": 4600
839
+ },
840
+ {
841
+ "epoch": 1.2653061224489797,
842
+ "grad_norm": 0.5049504637718201,
843
+ "learning_rate": 1.9715328467153284e-05,
844
+ "loss": 0.0358,
845
+ "step": 4650
846
+ },
847
+ {
848
+ "epoch": 1.2789115646258504,
849
+ "grad_norm": 0.41933709383010864,
850
+ "learning_rate": 1.9350364963503652e-05,
851
+ "loss": 0.0338,
852
+ "step": 4700
853
+ },
854
+ {
855
+ "epoch": 1.2925170068027212,
856
+ "grad_norm": 0.6357618570327759,
857
+ "learning_rate": 1.8985401459854017e-05,
858
+ "loss": 0.0351,
859
+ "step": 4750
860
+ },
861
+ {
862
+ "epoch": 1.306122448979592,
863
+ "grad_norm": 0.34428149461746216,
864
+ "learning_rate": 1.862043795620438e-05,
865
+ "loss": 0.0342,
866
+ "step": 4800
867
+ },
868
+ {
869
+ "epoch": 1.306122448979592,
870
+ "eval_loss": 0.03147235885262489,
871
+ "eval_runtime": 10.2354,
872
+ "eval_samples_per_second": 58.62,
873
+ "eval_steps_per_second": 7.327,
874
+ "step": 4800
875
+ },
876
+ {
877
+ "epoch": 1.3197278911564627,
878
+ "grad_norm": 0.3444003760814667,
879
+ "learning_rate": 1.8255474452554746e-05,
880
+ "loss": 0.035,
881
+ "step": 4850
882
+ },
883
+ {
884
+ "epoch": 1.3333333333333333,
885
+ "grad_norm": 0.3341350853443146,
886
+ "learning_rate": 1.789051094890511e-05,
887
+ "loss": 0.0352,
888
+ "step": 4900
889
+ },
890
+ {
891
+ "epoch": 1.346938775510204,
892
+ "grad_norm": 0.3407929241657257,
893
+ "learning_rate": 1.7525547445255475e-05,
894
+ "loss": 0.0343,
895
+ "step": 4950
896
+ },
897
+ {
898
+ "epoch": 1.3605442176870748,
899
+ "grad_norm": 0.5530718564987183,
900
+ "learning_rate": 1.716058394160584e-05,
901
+ "loss": 0.0344,
902
+ "step": 5000
903
+ },
904
+ {
905
+ "epoch": 1.3605442176870748,
906
+ "eval_loss": 0.031369421631097794,
907
+ "eval_runtime": 10.1936,
908
+ "eval_samples_per_second": 58.86,
909
+ "eval_steps_per_second": 7.358,
910
+ "step": 5000
911
+ },
912
+ {
913
+ "epoch": 1.3741496598639455,
914
+ "grad_norm": 0.43771809339523315,
915
+ "learning_rate": 1.6795620437956207e-05,
916
+ "loss": 0.0359,
917
+ "step": 5050
918
+ },
919
+ {
920
+ "epoch": 1.3877551020408163,
921
+ "grad_norm": 0.3095148503780365,
922
+ "learning_rate": 1.643065693430657e-05,
923
+ "loss": 0.0339,
924
+ "step": 5100
925
+ },
926
+ {
927
+ "epoch": 1.401360544217687,
928
+ "grad_norm": 0.35238730907440186,
929
+ "learning_rate": 1.6065693430656936e-05,
930
+ "loss": 0.0344,
931
+ "step": 5150
932
+ },
933
+ {
934
+ "epoch": 1.4149659863945578,
935
+ "grad_norm": 0.5534179210662842,
936
+ "learning_rate": 1.5700729927007297e-05,
937
+ "loss": 0.0337,
938
+ "step": 5200
939
+ },
940
+ {
941
+ "epoch": 1.4149659863945578,
942
+ "eval_loss": 0.030900483950972557,
943
+ "eval_runtime": 10.2164,
944
+ "eval_samples_per_second": 58.729,
945
+ "eval_steps_per_second": 7.341,
946
+ "step": 5200
947
+ },
948
+ {
949
+ "epoch": 1.4285714285714286,
950
+ "grad_norm": 0.3276965916156769,
951
+ "learning_rate": 1.5335766423357665e-05,
952
+ "loss": 0.0347,
953
+ "step": 5250
954
+ },
955
+ {
956
+ "epoch": 1.4421768707482994,
957
+ "grad_norm": 0.421546071767807,
958
+ "learning_rate": 1.497080291970803e-05,
959
+ "loss": 0.0328,
960
+ "step": 5300
961
+ },
962
+ {
963
+ "epoch": 1.4557823129251701,
964
+ "grad_norm": 0.33375367522239685,
965
+ "learning_rate": 1.4605839416058394e-05,
966
+ "loss": 0.0338,
967
+ "step": 5350
968
+ },
969
+ {
970
+ "epoch": 1.469387755102041,
971
+ "grad_norm": 0.28487250208854675,
972
+ "learning_rate": 1.424087591240876e-05,
973
+ "loss": 0.0338,
974
+ "step": 5400
975
+ },
976
+ {
977
+ "epoch": 1.469387755102041,
978
+ "eval_loss": 0.030995788052678108,
979
+ "eval_runtime": 10.198,
980
+ "eval_samples_per_second": 58.835,
981
+ "eval_steps_per_second": 7.354,
982
+ "step": 5400
983
+ },
984
+ {
985
+ "epoch": 1.4829931972789114,
986
+ "grad_norm": 0.4164125919342041,
987
+ "learning_rate": 1.3875912408759125e-05,
988
+ "loss": 0.0347,
989
+ "step": 5450
990
+ },
991
+ {
992
+ "epoch": 1.4965986394557822,
993
+ "grad_norm": 0.5844776630401611,
994
+ "learning_rate": 1.3510948905109488e-05,
995
+ "loss": 0.0342,
996
+ "step": 5500
997
+ },
998
+ {
999
+ "epoch": 1.510204081632653,
1000
+ "grad_norm": 0.4449387490749359,
1001
+ "learning_rate": 1.3145985401459854e-05,
1002
+ "loss": 0.0329,
1003
+ "step": 5550
1004
+ },
1005
+ {
1006
+ "epoch": 1.5238095238095237,
1007
+ "grad_norm": 0.351698100566864,
1008
+ "learning_rate": 1.2781021897810219e-05,
1009
+ "loss": 0.0334,
1010
+ "step": 5600
1011
+ },
1012
+ {
1013
+ "epoch": 1.5238095238095237,
1014
+ "eval_loss": 0.030806375667452812,
1015
+ "eval_runtime": 10.1839,
1016
+ "eval_samples_per_second": 58.917,
1017
+ "eval_steps_per_second": 7.365,
1018
+ "step": 5600
1019
+ },
1020
+ {
1021
+ "epoch": 1.5374149659863945,
1022
+ "grad_norm": 0.32961511611938477,
1023
+ "learning_rate": 1.2416058394160585e-05,
1024
+ "loss": 0.0332,
1025
+ "step": 5650
1026
+ },
1027
+ {
1028
+ "epoch": 1.5510204081632653,
1029
+ "grad_norm": 0.39127564430236816,
1030
+ "learning_rate": 1.205109489051095e-05,
1031
+ "loss": 0.0329,
1032
+ "step": 5700
1033
+ },
1034
+ {
1035
+ "epoch": 1.564625850340136,
1036
+ "grad_norm": 0.47163116931915283,
1037
+ "learning_rate": 1.1686131386861314e-05,
1038
+ "loss": 0.0328,
1039
+ "step": 5750
1040
+ },
1041
+ {
1042
+ "epoch": 1.5782312925170068,
1043
+ "grad_norm": 0.34902673959732056,
1044
+ "learning_rate": 1.132116788321168e-05,
1045
+ "loss": 0.0341,
1046
+ "step": 5800
1047
+ },
1048
+ {
1049
+ "epoch": 1.5782312925170068,
1050
+ "eval_loss": 0.030648473650217056,
1051
+ "eval_runtime": 10.232,
1052
+ "eval_samples_per_second": 58.639,
1053
+ "eval_steps_per_second": 7.33,
1054
+ "step": 5800
1055
+ },
1056
+ {
1057
+ "epoch": 1.5918367346938775,
1058
+ "grad_norm": 0.34722593426704407,
1059
+ "learning_rate": 1.0956204379562045e-05,
1060
+ "loss": 0.0336,
1061
+ "step": 5850
1062
+ },
1063
+ {
1064
+ "epoch": 1.6054421768707483,
1065
+ "grad_norm": 0.3117406964302063,
1066
+ "learning_rate": 1.0591240875912409e-05,
1067
+ "loss": 0.0325,
1068
+ "step": 5900
1069
+ },
1070
+ {
1071
+ "epoch": 1.619047619047619,
1072
+ "grad_norm": 0.38354817032814026,
1073
+ "learning_rate": 1.0226277372262774e-05,
1074
+ "loss": 0.0337,
1075
+ "step": 5950
1076
+ },
1077
+ {
1078
+ "epoch": 1.6326530612244898,
1079
+ "grad_norm": 0.4977235794067383,
1080
+ "learning_rate": 9.86131386861314e-06,
1081
+ "loss": 0.033,
1082
+ "step": 6000
1083
+ },
1084
+ {
1085
+ "epoch": 1.6326530612244898,
1086
+ "eval_loss": 0.03188992664217949,
1087
+ "eval_runtime": 10.1863,
1088
+ "eval_samples_per_second": 58.903,
1089
+ "eval_steps_per_second": 7.363,
1090
+ "step": 6000
1091
+ },
1092
+ {
1093
+ "epoch": 1.6462585034013606,
1094
+ "grad_norm": 0.3820977509021759,
1095
+ "learning_rate": 9.496350364963503e-06,
1096
+ "loss": 0.034,
1097
+ "step": 6050
1098
+ },
1099
+ {
1100
+ "epoch": 1.6598639455782314,
1101
+ "grad_norm": 0.32841384410858154,
1102
+ "learning_rate": 9.131386861313869e-06,
1103
+ "loss": 0.0332,
1104
+ "step": 6100
1105
+ },
1106
+ {
1107
+ "epoch": 1.6734693877551021,
1108
+ "grad_norm": 0.31902793049812317,
1109
+ "learning_rate": 8.766423357664235e-06,
1110
+ "loss": 0.0327,
1111
+ "step": 6150
1112
+ },
1113
+ {
1114
+ "epoch": 1.6870748299319729,
1115
+ "grad_norm": 0.35828185081481934,
1116
+ "learning_rate": 8.401459854014598e-06,
1117
+ "loss": 0.0335,
1118
+ "step": 6200
1119
+ },
1120
+ {
1121
+ "epoch": 1.6870748299319729,
1122
+ "eval_loss": 0.03037342056632042,
1123
+ "eval_runtime": 10.2013,
1124
+ "eval_samples_per_second": 58.816,
1125
+ "eval_steps_per_second": 7.352,
1126
+ "step": 6200
1127
+ },
1128
+ {
1129
+ "epoch": 1.7006802721088436,
1130
+ "grad_norm": 0.3647211790084839,
1131
+ "learning_rate": 8.036496350364964e-06,
1132
+ "loss": 0.0331,
1133
+ "step": 6250
1134
+ },
1135
+ {
1136
+ "epoch": 1.7142857142857144,
1137
+ "grad_norm": 0.34833037853240967,
1138
+ "learning_rate": 7.671532846715329e-06,
1139
+ "loss": 0.0344,
1140
+ "step": 6300
1141
+ },
1142
+ {
1143
+ "epoch": 1.7278911564625852,
1144
+ "grad_norm": 0.35880446434020996,
1145
+ "learning_rate": 7.306569343065693e-06,
1146
+ "loss": 0.0326,
1147
+ "step": 6350
1148
+ },
1149
+ {
1150
+ "epoch": 1.741496598639456,
1151
+ "grad_norm": 0.31057825684547424,
1152
+ "learning_rate": 6.941605839416059e-06,
1153
+ "loss": 0.0322,
1154
+ "step": 6400
1155
+ },
1156
+ {
1157
+ "epoch": 1.741496598639456,
1158
+ "eval_loss": 0.030219364911317825,
1159
+ "eval_runtime": 10.2032,
1160
+ "eval_samples_per_second": 58.805,
1161
+ "eval_steps_per_second": 7.351,
1162
+ "step": 6400
1163
+ },
1164
+ {
1165
+ "epoch": 1.7551020408163265,
1166
+ "grad_norm": 0.3011321723461151,
1167
+ "learning_rate": 6.576642335766424e-06,
1168
+ "loss": 0.0325,
1169
+ "step": 6450
1170
+ },
1171
+ {
1172
+ "epoch": 1.7687074829931972,
1173
+ "grad_norm": 0.26797881722450256,
1174
+ "learning_rate": 6.2116788321167885e-06,
1175
+ "loss": 0.0322,
1176
+ "step": 6500
1177
+ },
1178
+ {
1179
+ "epoch": 1.782312925170068,
1180
+ "grad_norm": 0.4549163579940796,
1181
+ "learning_rate": 5.846715328467153e-06,
1182
+ "loss": 0.0325,
1183
+ "step": 6550
1184
+ },
1185
+ {
1186
+ "epoch": 1.7959183673469388,
1187
+ "grad_norm": 0.3827670216560364,
1188
+ "learning_rate": 5.481751824817518e-06,
1189
+ "loss": 0.0329,
1190
+ "step": 6600
1191
+ },
1192
+ {
1193
+ "epoch": 1.7959183673469388,
1194
+ "eval_loss": 0.030094588175415993,
1195
+ "eval_runtime": 10.1718,
1196
+ "eval_samples_per_second": 58.987,
1197
+ "eval_steps_per_second": 7.373,
1198
+ "step": 6600
1199
+ },
1200
+ {
1201
+ "epoch": 1.8095238095238095,
1202
+ "grad_norm": 0.7711276412010193,
1203
+ "learning_rate": 5.116788321167884e-06,
1204
+ "loss": 0.0327,
1205
+ "step": 6650
1206
+ },
1207
+ {
1208
+ "epoch": 1.8231292517006803,
1209
+ "grad_norm": 0.4340929388999939,
1210
+ "learning_rate": 4.751824817518248e-06,
1211
+ "loss": 0.0326,
1212
+ "step": 6700
1213
+ },
1214
+ {
1215
+ "epoch": 1.836734693877551,
1216
+ "grad_norm": 0.46433642506599426,
1217
+ "learning_rate": 4.386861313868614e-06,
1218
+ "loss": 0.0331,
1219
+ "step": 6750
1220
+ },
1221
+ {
1222
+ "epoch": 1.8503401360544216,
1223
+ "grad_norm": 0.5542903542518616,
1224
+ "learning_rate": 4.021897810218978e-06,
1225
+ "loss": 0.0336,
1226
+ "step": 6800
1227
+ },
1228
+ {
1229
+ "epoch": 1.8503401360544216,
1230
+ "eval_loss": 0.029951954260468483,
1231
+ "eval_runtime": 10.1915,
1232
+ "eval_samples_per_second": 58.872,
1233
+ "eval_steps_per_second": 7.359,
1234
+ "step": 6800
1235
+ },
1236
+ {
1237
+ "epoch": 1.8639455782312924,
1238
+ "grad_norm": 0.3692566156387329,
1239
+ "learning_rate": 3.6569343065693436e-06,
1240
+ "loss": 0.0322,
1241
+ "step": 6850
1242
+ },
1243
+ {
1244
+ "epoch": 1.8775510204081631,
1245
+ "grad_norm": 0.25918880105018616,
1246
+ "learning_rate": 3.291970802919708e-06,
1247
+ "loss": 0.0325,
1248
+ "step": 6900
1249
+ },
1250
+ {
1251
+ "epoch": 1.891156462585034,
1252
+ "grad_norm": 0.32452937960624695,
1253
+ "learning_rate": 2.927007299270073e-06,
1254
+ "loss": 0.0321,
1255
+ "step": 6950
1256
+ },
1257
+ {
1258
+ "epoch": 1.9047619047619047,
1259
+ "grad_norm": 0.3027072250843048,
1260
+ "learning_rate": 2.562043795620438e-06,
1261
+ "loss": 0.0328,
1262
+ "step": 7000
1263
+ },
1264
+ {
1265
+ "epoch": 1.9047619047619047,
1266
+ "eval_loss": 0.029974693432450294,
1267
+ "eval_runtime": 10.1955,
1268
+ "eval_samples_per_second": 58.849,
1269
+ "eval_steps_per_second": 7.356,
1270
+ "step": 7000
1271
+ }
1272
+ ],
1273
+ "logging_steps": 50,
1274
+ "max_steps": 7350,
1275
+ "num_input_tokens_seen": 0,
1276
+ "num_train_epochs": 2,
1277
+ "save_steps": 200,
1278
+ "stateful_callbacks": {
1279
+ "TrainerControl": {
1280
+ "args": {
1281
+ "should_epoch_stop": false,
1282
+ "should_evaluate": false,
1283
+ "should_log": false,
1284
+ "should_save": true,
1285
+ "should_training_stop": false
1286
+ },
1287
+ "attributes": {}
1288
+ }
1289
+ },
1290
+ "total_flos": 2.9264707584e+16,
1291
+ "train_batch_size": 2,
1292
+ "trial_name": null,
1293
+ "trial_params": null
1294
+ }
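
The trainer_state.json above is the Trainer's running log: a training loss entry every 50 steps and an evaluation every 200 steps, with the best eval_loss (0.029951954260468483) reached at step 6800. A minimal sketch of pulling the evaluation curve back out of the file, assuming it is read from the path added in this commit:

# Sketch: extract the logged evaluation losses from trainer_state.json.
import json

with open("checkpoints/checkpoint-step-7000/trainer_state.json") as f:
    state = json.load(f)

# log_history mixes training logs (loss) and eval logs (eval_loss); keep the latter.
eval_points = [(entry["step"], entry["eval_loss"])
               for entry in state["log_history"] if "eval_loss" in entry]

for step, loss in eval_points:
    print(f"step {step:5d}  eval_loss {loss:.4f}")

print("best:", state["best_metric"], "at step", state["best_global_step"])
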
checkpoints/checkpoint-step-7000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0e65f11e00edb82d09a337df334551a2d5eac2eb0f7f94aaa44d3be5e86cc7a7
+ size 5777
checkpoints/checkpoint-step-7000/vocab.json ADDED
The diff for this file is too large to render. See raw diff