chansung committed
Commit a2d25a5 · verified · 1 Parent(s): 4d8079b

Model save

Files changed (4)
  1. README.md +2 -2
  2. all_results.json +4 -9
  3. train_results.json +4 -4
  4. trainer_state.json +66 -66
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the generator dataset.
 It achieves the following results on the evaluation set:
-- Loss: 2.0159
+- Loss: 2.0179
 
 ## Model description
 
@@ -57,7 +57,7 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
-| 1.3639 | 0.9965 | 142 | 2.0159 |
+| 1.3624 | 0.9965 | 142 | 2.0179 |
 
 
 ### Framework versions
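
The README hunk only updates the reported numbers; usage is unchanged. For reference, a minimal sketch of loading a fine-tuned Mistral-7B-v0.3 checkpoint such as this one with `transformers` — the checkpoint path below is a placeholder, not something defined in this commit:

```python
# Hypothetical checkpoint path: substitute the actual repo id or output directory.
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "./mistral-7b-v0.3-finetuned"  # placeholder, not part of this commit
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")

inputs = tokenizer("Hello, world", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```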
all_results.json CHANGED
@@ -1,14 +1,9 @@
 {
     "epoch": 0.9964912280701754,
-    "eval_loss": 2.01587176322937,
-    "eval_runtime": 1.5311,
-    "eval_samples": 25,
-    "eval_samples_per_second": 15.675,
-    "eval_steps_per_second": 0.653,
     "total_flos": 6.954534912540017e+17,
-    "train_loss": 1.4601248418781119,
-    "train_runtime": 2408.4577,
+    "train_loss": 1.4586160669864063,
+    "train_runtime": 560.2403,
     "train_samples": 129221,
-    "train_samples_per_second": 13.235,
-    "train_steps_per_second": 0.059
+    "train_samples_per_second": 56.899,
+    "train_steps_per_second": 0.253
 }
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
     "epoch": 0.9964912280701754,
     "total_flos": 6.954534912540017e+17,
-    "train_loss": 1.4601248418781119,
-    "train_runtime": 2408.4577,
+    "train_loss": 1.4586160669864063,
+    "train_runtime": 560.2403,
     "train_samples": 129221,
-    "train_samples_per_second": 13.235,
-    "train_steps_per_second": 0.059
+    "train_samples_per_second": 56.899,
+    "train_steps_per_second": 0.253
 }
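
all_results.json and train_results.json share the same flat key/value layout, so the summary metrics can be read back directly; a minimal sketch, assuming a local copy of train_results.json:

```python
# Print the summary training metrics from a local train_results.json.
import json

with open("train_results.json") as f:
    metrics = json.load(f)

print(f"epoch:                    {metrics['epoch']:.4f}")
print(f"train_loss:               {metrics['train_loss']:.4f}")
print(f"train_runtime (s):        {metrics['train_runtime']:.1f}")
print(f"train_samples_per_second: {metrics['train_samples_per_second']:.3f}")
```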
trainer_state.json CHANGED
@@ -10,223 +10,223 @@
   "log_history": [
     {
       "epoch": 0.007017543859649123,
-      "grad_norm": 4.245335102081299,
+      "grad_norm": 4.463099479675293,
       "learning_rate": 1.3333333333333333e-05,
-      "loss": 2.0944,
+      "loss": 2.0875,
       "step": 1
     },
     {
       "epoch": 0.03508771929824561,
-      "grad_norm": 3.7525720596313477,
+      "grad_norm": 3.755190134048462,
       "learning_rate": 6.666666666666667e-05,
-      "loss": 2.0745,
+      "loss": 2.0705,
       "step": 5
     },
     {
       "epoch": 0.07017543859649122,
-      "grad_norm": 2.4327211380004883,
+      "grad_norm": 2.5210607051849365,
       "learning_rate": 0.00013333333333333334,
-      "loss": 1.9452,
+      "loss": 1.9436,
       "step": 10
     },
     {
       "epoch": 0.10526315789473684,
-      "grad_norm": 2.3125839233398438,
+      "grad_norm": 2.3114030361175537,
       "learning_rate": 0.0002,
-      "loss": 1.8137,
+      "loss": 1.8117,
       "step": 15
     },
     {
       "epoch": 0.14035087719298245,
-      "grad_norm": 4.225382328033447,
+      "grad_norm": 2.0034232139587402,
       "learning_rate": 0.00019923607874151032,
-      "loss": 1.6506,
+      "loss": 1.6473,
       "step": 20
     },
     {
       "epoch": 0.17543859649122806,
-      "grad_norm": 1.9176355600357056,
+      "grad_norm": 1.9196051359176636,
       "learning_rate": 0.00019695598647982468,
-      "loss": 1.512,
+      "loss": 1.5117,
       "step": 25
     },
     {
       "epoch": 0.21052631578947367,
-      "grad_norm": 1.0350816249847412,
+      "grad_norm": 1.0466196537017822,
       "learning_rate": 0.00019319455943394347,
-      "loss": 1.4861,
+      "loss": 1.4862,
       "step": 30
     },
     {
       "epoch": 0.24561403508771928,
-      "grad_norm": 0.7984347343444824,
+      "grad_norm": 1.0164800882339478,
       "learning_rate": 0.00018800926628551886,
-      "loss": 1.4497,
+      "loss": 1.4486,
       "step": 35
     },
     {
       "epoch": 0.2807017543859649,
-      "grad_norm": 0.894679844379425,
+      "grad_norm": 0.8062431216239929,
       "learning_rate": 0.00018147933014790244,
-      "loss": 1.4256,
+      "loss": 1.4254,
       "step": 40
     },
     {
       "epoch": 0.3157894736842105,
-      "grad_norm": 0.7882960438728333,
+      "grad_norm": 0.7738965749740601,
       "learning_rate": 0.0001737045181617364,
-      "loss": 1.4192,
+      "loss": 1.4183,
       "step": 45
     },
     {
       "epoch": 0.3508771929824561,
-      "grad_norm": 0.8045026063919067,
+      "grad_norm": 0.8346546292304993,
       "learning_rate": 0.00016480361721016054,
-      "loss": 1.4128,
+      "loss": 1.4114,
       "step": 50
     },
     {
       "epoch": 0.38596491228070173,
-      "grad_norm": 0.7174099087715149,
+      "grad_norm": 0.8058141469955444,
       "learning_rate": 0.00015491261904230727,
-      "loss": 1.4027,
+      "loss": 1.4009,
       "step": 55
     },
     {
       "epoch": 0.42105263157894735,
-      "grad_norm": 0.9516172409057617,
+      "grad_norm": 0.802452564239502,
       "learning_rate": 0.0001441826425335387,
-      "loss": 1.4043,
+      "loss": 1.4026,
       "step": 60
     },
     {
       "epoch": 0.45614035087719296,
-      "grad_norm": 0.871509313583374,
+      "grad_norm": 0.8057828545570374,
       "learning_rate": 0.00013277762482701767,
-      "loss": 1.3877,
+      "loss": 1.3857,
       "step": 65
     },
     {
       "epoch": 0.49122807017543857,
-      "grad_norm": 0.7828186750411987,
+      "grad_norm": 0.794191837310791,
       "learning_rate": 0.00012087181663233354,
-      "loss": 1.384,
+      "loss": 1.3823,
       "step": 70
     },
     {
       "epoch": 0.5263157894736842,
-      "grad_norm": 0.8469617962837219,
+      "grad_norm": 0.7871180176734924,
       "learning_rate": 0.00010864711994907458,
-      "loss": 1.3868,
+      "loss": 1.3851,
       "step": 75
     },
     {
       "epoch": 0.5614035087719298,
-      "grad_norm": 0.7687762379646301,
+      "grad_norm": 0.7190406322479248,
       "learning_rate": 9.629030889073949e-05,
-      "loss": 1.3761,
+      "loss": 1.3743,
       "step": 80
     },
     {
       "epoch": 0.5964912280701754,
-      "grad_norm": 0.77200847864151,
+      "grad_norm": 0.7476623058319092,
       "learning_rate": 8.399017607042025e-05,
-      "loss": 1.3697,
+      "loss": 1.3684,
       "step": 85
     },
     {
       "epoch": 0.631578947368421,
-      "grad_norm": 0.8314034342765808,
+      "grad_norm": 0.8211018443107605,
       "learning_rate": 7.193464814699073e-05,
-      "loss": 1.3686,
+      "loss": 1.367,
       "step": 90
     },
     {
       "epoch": 0.6666666666666666,
-      "grad_norm": 0.8049700856208801,
+      "grad_norm": 0.824474036693573,
       "learning_rate": 6.0307914601711305e-05,
-      "loss": 1.3621,
+      "loss": 1.3609,
       "step": 95
     },
     {
       "epoch": 0.7017543859649122,
-      "grad_norm": 0.8274004459381104,
+      "grad_norm": 0.7797313928604126,
       "learning_rate": 4.928761361302269e-05,
-      "loss": 1.3719,
+      "loss": 1.3703,
       "step": 100
     },
     {
       "epoch": 0.7368421052631579,
-      "grad_norm": 0.7905752062797546,
+      "grad_norm": 0.8079975247383118,
       "learning_rate": 3.904211802492922e-05,
-      "loss": 1.3677,
+      "loss": 1.3666,
       "step": 105
     },
     {
       "epoch": 0.7719298245614035,
-      "grad_norm": 0.7010822892189026,
+      "grad_norm": 0.701257586479187,
       "learning_rate": 2.9727962875101e-05,
-      "loss": 1.3647,
+      "loss": 1.3636,
       "step": 110
     },
     {
       "epoch": 0.8070175438596491,
-      "grad_norm": 0.7270281910896301,
+      "grad_norm": 0.7054150700569153,
       "learning_rate": 2.1487453786014512e-05,
-      "loss": 1.3591,
+      "loss": 1.3577,
       "step": 115
     },
     {
       "epoch": 0.8421052631578947,
-      "grad_norm": 0.770898163318634,
+      "grad_norm": 0.7800086736679077,
       "learning_rate": 1.4446492759148411e-05,
-      "loss": 1.3728,
+      "loss": 1.3717,
       "step": 120
     },
     {
       "epoch": 0.8771929824561403,
-      "grad_norm": 0.7640476822853088,
+      "grad_norm": 0.7601162791252136,
       "learning_rate": 8.712654590675085e-06,
-      "loss": 1.3588,
+      "loss": 1.3574,
       "step": 125
     },
     {
       "epoch": 0.9122807017543859,
-      "grad_norm": 0.7663669586181641,
+      "grad_norm": 0.7522596716880798,
       "learning_rate": 4.37354329798726e-06,
-      "loss": 1.3641,
+      "loss": 1.3627,
       "step": 130
     },
     {
       "epoch": 0.9473684210526315,
-      "grad_norm": 0.7269485592842102,
+      "grad_norm": 0.7144069075584412,
       "learning_rate": 1.4954536682736719e-06,
-      "loss": 1.3634,
+      "loss": 1.3619,
       "step": 135
     },
     {
       "epoch": 0.9824561403508771,
-      "grad_norm": 0.8241817355155945,
+      "grad_norm": 0.7889108657836914,
       "learning_rate": 1.2235837857387246e-07,
-      "loss": 1.3639,
+      "loss": 1.3624,
       "step": 140
     },
     {
       "epoch": 0.9964912280701754,
-      "eval_loss": 2.01587176322937,
-      "eval_runtime": 1.5396,
-      "eval_samples_per_second": 15.588,
-      "eval_steps_per_second": 0.65,
+      "eval_loss": 2.0179381370544434,
+      "eval_runtime": 0.6745,
+      "eval_samples_per_second": 35.583,
+      "eval_steps_per_second": 1.483,
       "step": 142
     },
     {
       "epoch": 0.9964912280701754,
       "step": 142,
       "total_flos": 6.954534912540017e+17,
-      "train_loss": 1.4601248418781119,
-      "train_runtime": 2408.4577,
-      "train_samples_per_second": 13.235,
-      "train_steps_per_second": 0.059
+      "train_loss": 1.4586160669864063,
+      "train_runtime": 560.2403,
+      "train_samples_per_second": 56.899,
+      "train_steps_per_second": 0.253
     }
   ],
   "logging_steps": 5,