chansung committed on
Commit
9f6d950
·
verified ·
1 Parent(s): 5e6ece8

Model save

Browse files
Files changed (4) hide show
  1. README.md +2 -2
  2. all_results.json +4 -9
  3. train_results.json +4 -4
  4. trainer_state.json +84 -84
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 1.3992
24
 
25
  ## Model description
26
 
@@ -57,7 +57,7 @@ The following hyperparameters were used during training:
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:-----:|:----:|:---------------:|
60
- | 0.8942 | 1.0 | 185 | 1.3992 |
61
 
62
 
63
  ### Framework versions
 
20
 
21
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.4024
24
 
25
  ## Model description
26
 
 
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:-----:|:----:|:---------------:|
60
+ | 0.893 | 1.0 | 185 | 1.4024 |
61
 
62
 
63
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 1.3991833925247192,
4
- "eval_runtime": 1.4894,
5
- "eval_samples": 16,
6
- "eval_samples_per_second": 7.385,
7
- "eval_steps_per_second": 0.671,
8
  "total_flos": 9.060485625492275e+17,
9
- "train_loss": 0.9615903693276483,
10
- "train_runtime": 3138.0854,
11
  "train_samples": 116368,
12
- "train_samples_per_second": 13.203,
13
- "train_steps_per_second": 0.059
14
  }
 
1
  {
2
  "epoch": 1.0,
 
 
 
 
 
3
  "total_flos": 9.060485625492275e+17,
4
+ "train_loss": 0.9603648733448338,
5
+ "train_runtime": 724.3175,
6
  "train_samples": 116368,
7
+ "train_samples_per_second": 57.2,
8
+ "train_steps_per_second": 0.255
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 9.060485625492275e+17,
4
- "train_loss": 0.9615903693276483,
5
- "train_runtime": 3138.0854,
6
  "train_samples": 116368,
7
- "train_samples_per_second": 13.203,
8
- "train_steps_per_second": 0.059
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 9.060485625492275e+17,
4
+ "train_loss": 0.9603648733448338,
5
+ "train_runtime": 724.3175,
6
  "train_samples": 116368,
7
+ "train_samples_per_second": 57.2,
8
+ "train_steps_per_second": 0.255
9
  }
trainer_state.json CHANGED
@@ -10,286 +10,286 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.005405405405405406,
13
- "grad_norm": 2.6925952434539795,
14
  "learning_rate": 1.0526315789473684e-05,
15
- "loss": 1.4517,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.02702702702702703,
20
- "grad_norm": 2.4539754390716553,
21
  "learning_rate": 5.2631578947368424e-05,
22
- "loss": 1.4341,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.05405405405405406,
27
- "grad_norm": 2.1194558143615723,
28
  "learning_rate": 0.00010526315789473685,
29
- "loss": 1.3761,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.08108108108108109,
34
- "grad_norm": 2.0553150177001953,
35
  "learning_rate": 0.00015789473684210527,
36
- "loss": 1.2687,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.10810810810810811,
41
- "grad_norm": 1.8468493223190308,
42
  "learning_rate": 0.00019998209226697376,
43
- "loss": 1.1537,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.13513513513513514,
48
- "grad_norm": 1.0270365476608276,
49
  "learning_rate": 0.0001993559947963185,
50
- "loss": 1.0561,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.16216216216216217,
55
- "grad_norm": 0.9126940369606018,
56
  "learning_rate": 0.00019784091409455728,
57
- "loss": 0.9985,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.1891891891891892,
62
- "grad_norm": 0.6563550233840942,
63
  "learning_rate": 0.0001954504062771555,
64
- "loss": 0.9701,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.21621621621621623,
69
- "grad_norm": 0.6633013486862183,
70
  "learning_rate": 0.00019220586030376134,
71
- "loss": 0.9619,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.24324324324324326,
76
- "grad_norm": 0.7960276007652283,
77
  "learning_rate": 0.00018813630660146488,
78
- "loss": 0.9392,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.2702702702702703,
83
- "grad_norm": 0.6284172534942627,
84
  "learning_rate": 0.00018327815731637612,
85
- "loss": 0.9387,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.2972972972972973,
90
- "grad_norm": 0.6241137981414795,
91
  "learning_rate": 0.00017767488051760857,
92
- "loss": 0.9316,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.32432432432432434,
97
- "grad_norm": 0.7423997521400452,
98
  "learning_rate": 0.0001713766112687139,
99
- "loss": 0.9287,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.35135135135135137,
104
- "grad_norm": 0.6817660331726074,
105
  "learning_rate": 0.0001644397030464877,
106
- "loss": 0.9293,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.3783783783783784,
111
- "grad_norm": 0.6054185628890991,
112
  "learning_rate": 0.00015692622352080662,
113
- "loss": 0.9134,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.40540540540540543,
118
- "grad_norm": 0.6050704121589661,
119
  "learning_rate": 0.00014890339920698334,
120
- "loss": 0.93,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.43243243243243246,
125
- "grad_norm": 0.589340090751648,
126
  "learning_rate": 0.0001404430139595877,
127
- "loss": 0.9163,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.4594594594594595,
132
- "grad_norm": 0.6608156561851501,
133
  "learning_rate": 0.0001316207666896824,
134
- "loss": 0.9211,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.4864864864864865,
139
- "grad_norm": 0.5910987854003906,
140
  "learning_rate": 0.00012251559405226941,
141
- "loss": 0.9081,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.5135135135135135,
146
- "grad_norm": 0.6293689608573914,
147
  "learning_rate": 0.00011320896416417026,
148
- "loss": 0.9123,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.5405405405405406,
153
- "grad_norm": 0.5938843488693237,
154
  "learning_rate": 0.00010378414767176705,
155
- "loss": 0.902,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.5675675675675675,
160
- "grad_norm": 0.6008319854736328,
161
  "learning_rate": 9.432547269069261e-05,
162
- "loss": 0.9061,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.5945945945945946,
167
- "grad_norm": 0.6468297839164734,
168
  "learning_rate": 8.491757028386263e-05,
169
- "loss": 0.9038,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.6216216216216216,
174
- "grad_norm": 0.6693244576454163,
175
  "learning_rate": 7.564461722890081e-05,
176
- "loss": 0.911,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.6486486486486487,
181
- "grad_norm": 0.590085506439209,
182
  "learning_rate": 6.658958285026102e-05,
183
- "loss": 0.9041,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.6756756756756757,
188
- "grad_norm": 0.5497063398361206,
189
  "learning_rate": 5.7833486654981606e-05,
190
- "loss": 0.9066,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.7027027027027027,
195
- "grad_norm": 0.5866259932518005,
196
  "learning_rate": 4.945467341434195e-05,
197
- "loss": 0.904,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.7297297297297297,
202
- "grad_norm": 0.6213558316230774,
203
  "learning_rate": 4.152811217759529e-05,
204
- "loss": 0.9036,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.7567567567567568,
209
- "grad_norm": 0.5839004516601562,
210
  "learning_rate": 3.4124725489820645e-05,
211
- "loss": 0.8903,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.7837837837837838,
216
- "grad_norm": 0.6068409085273743,
217
  "learning_rate": 2.7310754815685624e-05,
218
- "loss": 0.8978,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.8108108108108109,
223
- "grad_norm": 0.6482556462287903,
224
  "learning_rate": 2.1147167846963422e-05,
225
- "loss": 0.8958,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.8378378378378378,
230
- "grad_norm": 0.5723783373832703,
231
  "learning_rate": 1.5689112996891576e-05,
232
- "loss": 0.9028,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.8648648648648649,
237
- "grad_norm": 0.655846357345581,
238
  "learning_rate": 1.0985425962260343e-05,
239
- "loss": 0.8999,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.8918918918918919,
244
- "grad_norm": 0.6444997787475586,
245
  "learning_rate": 7.078192768243486e-06,
246
- "loss": 0.8923,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.918918918918919,
251
- "grad_norm": 0.5838979482650757,
252
  "learning_rate": 4.002373205607723e-06,
253
- "loss": 0.8973,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.9459459459459459,
258
- "grad_norm": 0.6335191130638123,
259
  "learning_rate": 1.7854880295797405e-06,
260
- "loss": 0.8826,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.972972972972973,
265
- "grad_norm": 0.5943057537078857,
266
  "learning_rate": 4.4737271914411236e-07,
267
- "loss": 0.8931,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 1.0,
272
- "grad_norm": 0.600213885307312,
273
  "learning_rate": 0.0,
274
- "loss": 0.8942,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 1.0,
279
- "eval_loss": 1.3991833925247192,
280
- "eval_runtime": 1.4887,
281
- "eval_samples_per_second": 7.389,
282
- "eval_steps_per_second": 0.672,
283
  "step": 185
284
  },
285
  {
286
  "epoch": 1.0,
287
  "step": 185,
288
  "total_flos": 9.060485625492275e+17,
289
- "train_loss": 0.9615903693276483,
290
- "train_runtime": 3138.0854,
291
- "train_samples_per_second": 13.203,
292
- "train_steps_per_second": 0.059
293
  }
294
  ],
295
  "logging_steps": 5,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.005405405405405406,
13
+ "grad_norm": 2.6678006649017334,
14
  "learning_rate": 1.0526315789473684e-05,
15
+ "loss": 1.4457,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.02702702702702703,
20
+ "grad_norm": 2.4659605026245117,
21
  "learning_rate": 5.2631578947368424e-05,
22
+ "loss": 1.4311,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.05405405405405406,
27
+ "grad_norm": 2.074678659439087,
28
  "learning_rate": 0.00010526315789473685,
29
+ "loss": 1.376,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.08108108108108109,
34
+ "grad_norm": 1.9776719808578491,
35
  "learning_rate": 0.00015789473684210527,
36
+ "loss": 1.2676,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.10810810810810811,
41
+ "grad_norm": 1.8173789978027344,
42
  "learning_rate": 0.00019998209226697376,
43
+ "loss": 1.1517,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.13513513513513514,
48
+ "grad_norm": 1.0727437734603882,
49
  "learning_rate": 0.0001993559947963185,
50
+ "loss": 1.0512,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.16216216216216217,
55
+ "grad_norm": 0.8763325214385986,
56
  "learning_rate": 0.00019784091409455728,
57
+ "loss": 0.9971,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.1891891891891892,
62
+ "grad_norm": 0.6051992774009705,
63
  "learning_rate": 0.0001954504062771555,
64
+ "loss": 0.9685,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.21621621621621623,
69
+ "grad_norm": 0.6363440752029419,
70
  "learning_rate": 0.00019220586030376134,
71
+ "loss": 0.9609,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.24324324324324326,
76
+ "grad_norm": 0.7076846957206726,
77
  "learning_rate": 0.00018813630660146488,
78
+ "loss": 0.938,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.2702702702702703,
83
+ "grad_norm": 0.5835558772087097,
84
  "learning_rate": 0.00018327815731637612,
85
+ "loss": 0.9376,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.2972972972972973,
90
+ "grad_norm": 0.6069886088371277,
91
  "learning_rate": 0.00017767488051760857,
92
+ "loss": 0.9303,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.32432432432432434,
97
+ "grad_norm": 0.7000340819358826,
98
  "learning_rate": 0.0001713766112687139,
99
+ "loss": 0.9276,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.35135135135135137,
104
+ "grad_norm": 0.6510019898414612,
105
  "learning_rate": 0.0001644397030464877,
106
+ "loss": 0.9282,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.3783783783783784,
111
+ "grad_norm": 0.6287819147109985,
112
  "learning_rate": 0.00015692622352080662,
113
+ "loss": 0.9125,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.40540540540540543,
118
+ "grad_norm": 0.6086368560791016,
119
  "learning_rate": 0.00014890339920698334,
120
+ "loss": 0.9291,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.43243243243243246,
125
+ "grad_norm": 0.5720112919807434,
126
  "learning_rate": 0.0001404430139595877,
127
+ "loss": 0.9152,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.4594594594594595,
132
+ "grad_norm": 0.6146399974822998,
133
  "learning_rate": 0.0001316207666896824,
134
+ "loss": 0.9201,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.4864864864864865,
139
+ "grad_norm": 0.5874430537223816,
140
  "learning_rate": 0.00012251559405226941,
141
+ "loss": 0.9071,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.5135135135135135,
146
+ "grad_norm": 0.609653115272522,
147
  "learning_rate": 0.00011320896416417026,
148
+ "loss": 0.9111,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.5405405405405406,
153
+ "grad_norm": 0.5806834101676941,
154
  "learning_rate": 0.00010378414767176705,
155
+ "loss": 0.9008,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.5675675675675675,
160
+ "grad_norm": 0.5760082602500916,
161
  "learning_rate": 9.432547269069261e-05,
162
+ "loss": 0.9053,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.5945945945945946,
167
+ "grad_norm": 0.6656131148338318,
168
  "learning_rate": 8.491757028386263e-05,
169
+ "loss": 0.9029,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.6216216216216216,
174
+ "grad_norm": 0.6559913754463196,
175
  "learning_rate": 7.564461722890081e-05,
176
+ "loss": 0.9103,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.6486486486486487,
181
+ "grad_norm": 0.5813584327697754,
182
  "learning_rate": 6.658958285026102e-05,
183
+ "loss": 0.9033,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.6756756756756757,
188
+ "grad_norm": 0.538943350315094,
189
  "learning_rate": 5.7833486654981606e-05,
190
+ "loss": 0.9059,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.7027027027027027,
195
+ "grad_norm": 0.5927494764328003,
196
  "learning_rate": 4.945467341434195e-05,
197
+ "loss": 0.9031,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.7297297297297297,
202
+ "grad_norm": 0.6069759726524353,
203
  "learning_rate": 4.152811217759529e-05,
204
+ "loss": 0.9027,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.7567567567567568,
209
+ "grad_norm": 0.5718995332717896,
210
  "learning_rate": 3.4124725489820645e-05,
211
+ "loss": 0.8891,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.7837837837837838,
216
+ "grad_norm": 0.6052721738815308,
217
  "learning_rate": 2.7310754815685624e-05,
218
+ "loss": 0.8972,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.8108108108108109,
223
+ "grad_norm": 0.6083750128746033,
224
  "learning_rate": 2.1147167846963422e-05,
225
+ "loss": 0.8948,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.8378378378378378,
230
+ "grad_norm": 0.5737211108207703,
231
  "learning_rate": 1.5689112996891576e-05,
232
+ "loss": 0.9016,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.8648648648648649,
237
+ "grad_norm": 0.6462253332138062,
238
  "learning_rate": 1.0985425962260343e-05,
239
+ "loss": 0.8989,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.8918918918918919,
244
+ "grad_norm": 0.6070159077644348,
245
  "learning_rate": 7.078192768243486e-06,
246
+ "loss": 0.8913,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.918918918918919,
251
+ "grad_norm": 0.5784199833869934,
252
  "learning_rate": 4.002373205607723e-06,
253
+ "loss": 0.8962,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.9459459459459459,
258
+ "grad_norm": 0.6253275871276855,
259
  "learning_rate": 1.7854880295797405e-06,
260
+ "loss": 0.8815,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.972972972972973,
265
+ "grad_norm": 0.5931078195571899,
266
  "learning_rate": 4.4737271914411236e-07,
267
+ "loss": 0.8919,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 1.0,
272
+ "grad_norm": 0.6198851466178894,
273
  "learning_rate": 0.0,
274
+ "loss": 0.893,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 1.0,
279
+ "eval_loss": 1.4024296998977661,
280
+ "eval_runtime": 0.674,
281
+ "eval_samples_per_second": 16.321,
282
+ "eval_steps_per_second": 1.484,
283
  "step": 185
284
  },
285
  {
286
  "epoch": 1.0,
287
  "step": 185,
288
  "total_flos": 9.060485625492275e+17,
289
+ "train_loss": 0.9603648733448338,
290
+ "train_runtime": 724.3175,
291
+ "train_samples_per_second": 57.2,
292
+ "train_steps_per_second": 0.255
293
  }
294
  ],
295
  "logging_steps": 5,