chansung committed

Commit c35b05b · verified · 1 Parent(s): 99f1651

Model save
README.md ADDED
@@ -0,0 +1,69 @@
+ ---
+ library_name: peft
+ license: gemma
+ base_model: google/gemma-7b
+ tags:
+ - trl
+ - sft
+ - generated_from_trainer
+ datasets:
+ - generator
+ model-index:
+ - name: gemma7b-lora-coding-11-v1
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # gemma7b-lora-coding-11-v1
+
+ This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 1.6544
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training (an equivalent `TrainingArguments` sketch follows the list):
+ - learning_rate: 0.0002
+ - train_batch_size: 8
+ - eval_batch_size: 8
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 8
+ - gradient_accumulation_steps: 2
+ - total_train_batch_size: 128
+ - total_eval_batch_size: 64
+ - optimizer: AdamW (torch) with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 1
+
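+ A minimal sketch of how these settings map onto `transformers.TrainingArguments` (the output path is illustrative; per-device batch size 8 across 8 GPUs with 2 gradient-accumulation steps gives the effective train batch size of 128 reported above):
+
+ ```python
+ from transformers import TrainingArguments
+
+ training_args = TrainingArguments(
+     output_dir="gemma7b-lora-coding-11-v1",  # illustrative output path
+     learning_rate=2e-4,
+     per_device_train_batch_size=8,
+     per_device_eval_batch_size=8,
+     gradient_accumulation_steps=2,
+     optim="adamw_torch",           # AdamW with default betas/epsilon
+     lr_scheduler_type="cosine",
+     warmup_ratio=0.1,
+     num_train_epochs=1,
+     seed=42,
+ )
+ ```
+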
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:-----:|:----:|:---------------:|
+ | 0.8317        | 1.0   | 301  | 1.6544          |
+
+
+ ### Framework versions
+
+ - PEFT 0.13.2
+ - Transformers 4.46.2
+ - Pytorch 2.3.1+cu121
+ - Datasets 3.1.0
+ - Tokenizers 0.20.3
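+
+ ### How to use
+
+ This repository contains only the LoRA adapter weights, so inference requires loading the base model and attaching the adapter with `peft`. A minimal sketch (the repo id `chansung/gemma7b-lora-coding-11-v1` is assumed from the commit author and model name):
+
+ ```python
+ import torch
+ from peft import PeftModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ base = AutoModelForCausalLM.from_pretrained(
+     "google/gemma-7b", torch_dtype=torch.bfloat16, device_map="auto"
+ )
+ tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
+
+ # Attach the LoRA adapter on top of the frozen base weights.
+ model = PeftModel.from_pretrained(base, "chansung/gemma7b-lora-coding-11-v1")  # assumed repo id
+
+ prompt = "Write a Python function that reverses a string."
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+ outputs = model.generate(**inputs, max_new_tokens=128)
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ ```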
all_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "epoch": 1.0,
+     "total_flos": 9.17802542639874e+17,
+     "train_loss": 2.9563070911901734,
+     "train_runtime": 3725.7135,
+     "train_samples": 116368,
+     "train_samples_per_second": 10.33,
+     "train_steps_per_second": 0.081
+ }
runs/Nov17_12-13-39_main-lora-gemma7b-coding-0-0/events.out.tfevents.1731864313.main-lora-gemma7b-coding-0-0.457.0 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8984720659161635c52c667925eef5dde658d5bcbd4bb20a8959b071687e9908
- size 19441
+ oid sha256:1b0dadb64bce0afa6ce664241a712a924e6e83943ff082be5ab1429bb2e4118c
+ size 20066
train_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "epoch": 1.0,
+     "total_flos": 9.17802542639874e+17,
+     "train_loss": 2.9563070911901734,
+     "train_runtime": 3725.7135,
+     "train_samples": 116368,
+     "train_samples_per_second": 10.33,
+     "train_steps_per_second": 0.081
+ }
trainer_state.json ADDED
@@ -0,0 +1,477 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 1.0,
+   "eval_steps": 500,
+   "global_step": 301,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.0033222591362126247,
+       "grad_norm": 58.848934173583984,
+       "learning_rate": 6.451612903225806e-06,
+       "loss": 20.4567,
+       "step": 1
+     },
+     {
+       "epoch": 0.016611295681063124,
+       "grad_norm": 35.33861541748047,
+       "learning_rate": 3.2258064516129034e-05,
+       "loss": 20.5366,
+       "step": 5
+     },
+     {
+       "epoch": 0.03322259136212625,
+       "grad_norm": 16.172595977783203,
+       "learning_rate": 6.451612903225807e-05,
+       "loss": 17.7569,
+       "step": 10
+     },
+     {
+       "epoch": 0.04983388704318937,
+       "grad_norm": 7.711285591125488,
+       "learning_rate": 9.677419354838711e-05,
+       "loss": 15.7111,
+       "step": 15
+     },
+     {
+       "epoch": 0.0664451827242525,
+       "grad_norm": 7.137362957000732,
+       "learning_rate": 0.00012903225806451613,
+       "loss": 14.1919,
+       "step": 20
+     },
+     {
+       "epoch": 0.08305647840531562,
+       "grad_norm": 2.8490428924560547,
+       "learning_rate": 0.00016129032258064516,
+       "loss": 13.0076,
+       "step": 25
+     },
+     {
+       "epoch": 0.09966777408637874,
+       "grad_norm": 3.174508571624756,
+       "learning_rate": 0.00019354838709677422,
+       "loss": 12.4104,
+       "step": 30
+     },
+     {
+       "epoch": 0.11627906976744186,
+       "grad_norm": 6.637372970581055,
+       "learning_rate": 0.0001998917111338525,
+       "loss": 11.4493,
+       "step": 35
+     },
+     {
+       "epoch": 0.132890365448505,
+       "grad_norm": 12.817553520202637,
+       "learning_rate": 0.00019945218953682734,
+       "loss": 10.3687,
+       "step": 40
+     },
+     {
+       "epoch": 0.14950166112956811,
+       "grad_norm": 17.324180603027344,
+       "learning_rate": 0.00019867615321125795,
+       "loss": 8.29,
+       "step": 45
+     },
+     {
+       "epoch": 0.16611295681063123,
+       "grad_norm": 20.25031852722168,
+       "learning_rate": 0.00019756622801842143,
+       "loss": 5.0469,
+       "step": 50
+     },
+     {
+       "epoch": 0.18272425249169436,
+       "grad_norm": 10.32929801940918,
+       "learning_rate": 0.0001961261695938319,
+       "loss": 2.5342,
+       "step": 55
+     },
+     {
+       "epoch": 0.19933554817275748,
+       "grad_norm": 6.437869071960449,
+       "learning_rate": 0.00019436085063935835,
+       "loss": 1.8296,
+       "step": 60
+     },
+     {
+       "epoch": 0.2159468438538206,
+       "grad_norm": 2.7442312240600586,
+       "learning_rate": 0.00019227624443554425,
+       "loss": 1.5211,
+       "step": 65
+     },
+     {
+       "epoch": 0.23255813953488372,
+       "grad_norm": 1.103617548942566,
+       "learning_rate": 0.0001898794046299167,
+       "loss": 1.342,
+       "step": 70
+     },
+     {
+       "epoch": 0.24916943521594684,
+       "grad_norm": 0.8008713722229004,
+       "learning_rate": 0.00018717844136967624,
+       "loss": 1.2224,
+       "step": 75
+     },
+     {
+       "epoch": 0.26578073089701,
+       "grad_norm": 0.9825754761695862,
+       "learning_rate": 0.00018418249385952575,
+       "loss": 1.1639,
+       "step": 80
+     },
+     {
+       "epoch": 0.2823920265780731,
+       "grad_norm": 1.6092805862426758,
+       "learning_rate": 0.00018090169943749476,
+       "loss": 1.106,
+       "step": 85
+     },
+     {
+       "epoch": 0.29900332225913623,
+       "grad_norm": 0.7608142495155334,
+       "learning_rate": 0.0001773471592733964,
+       "loss": 1.0649,
+       "step": 90
+     },
+     {
+       "epoch": 0.31561461794019935,
+       "grad_norm": 0.7974920272827148,
+       "learning_rate": 0.0001735309008059829,
+       "loss": 1.0544,
+       "step": 95
+     },
+     {
+       "epoch": 0.33222591362126247,
+       "grad_norm": 0.8733552694320679,
+       "learning_rate": 0.00016946583704589973,
+       "loss": 1.0418,
+       "step": 100
+     },
+     {
+       "epoch": 0.3488372093023256,
+       "grad_norm": 0.8620198965072632,
+       "learning_rate": 0.00016516572288214552,
+       "loss": 1.0134,
+       "step": 105
+     },
+     {
+       "epoch": 0.3654485049833887,
+       "grad_norm": 0.9430036544799805,
+       "learning_rate": 0.00016064510853988138,
+       "loss": 1.0117,
+       "step": 110
+     },
+     {
+       "epoch": 0.38205980066445183,
+       "grad_norm": 0.9424466490745544,
+       "learning_rate": 0.0001559192903470747,
+       "loss": 0.9687,
+       "step": 115
+     },
+     {
+       "epoch": 0.39867109634551495,
+       "grad_norm": 0.7452667355537415,
+       "learning_rate": 0.00015100425897656753,
+       "loss": 0.9678,
+       "step": 120
+     },
+     {
+       "epoch": 0.4152823920265781,
+       "grad_norm": 0.6089577674865723,
+       "learning_rate": 0.00014591664533870118,
+       "loss": 0.9428,
+       "step": 125
+     },
+     {
+       "epoch": 0.4318936877076412,
+       "grad_norm": 0.5161769390106201,
+       "learning_rate": 0.00014067366430758004,
+       "loss": 0.9344,
+       "step": 130
+     },
+     {
+       "epoch": 0.4485049833887043,
+       "grad_norm": 1.061056137084961,
+       "learning_rate": 0.00013529305647138687,
+       "loss": 0.9316,
+       "step": 135
+     },
+     {
+       "epoch": 0.46511627906976744,
+       "grad_norm": 1.1914814710617065,
+       "learning_rate": 0.0001297930281038482,
+       "loss": 0.932,
+       "step": 140
+     },
+     {
+       "epoch": 0.48172757475083056,
+       "grad_norm": 1.3082082271575928,
+       "learning_rate": 0.00012419218955996676,
+       "loss": 0.9225,
+       "step": 145
+     },
+     {
+       "epoch": 0.4983388704318937,
+       "grad_norm": 0.9995535612106323,
+       "learning_rate": 0.00011850949230447145,
+       "loss": 0.9153,
+       "step": 150
+     },
+     {
+       "epoch": 0.5149501661129569,
+       "grad_norm": 0.5884081125259399,
+       "learning_rate": 0.00011276416478605949,
+       "loss": 0.9105,
+       "step": 155
+     },
+     {
+       "epoch": 0.53156146179402,
+       "grad_norm": 0.9028487801551819,
+       "learning_rate": 0.00010697564737441252,
+       "loss": 0.9142,
+       "step": 160
+     },
+     {
+       "epoch": 0.5481727574750831,
+       "grad_norm": 0.9643915891647339,
+       "learning_rate": 0.00010116352658013973,
+       "loss": 0.8963,
+       "step": 165
+     },
+     {
+       "epoch": 0.5647840531561462,
+       "grad_norm": 0.6744817495346069,
+       "learning_rate": 9.534746878022534e-05,
+       "loss": 0.9227,
+       "step": 170
+     },
+     {
+       "epoch": 0.5813953488372093,
+       "grad_norm": 0.5441331267356873,
+       "learning_rate": 8.954715367323468e-05,
+       "loss": 0.8981,
+       "step": 175
+     },
+     {
+       "epoch": 0.5980066445182725,
+       "grad_norm": 0.6811143755912781,
+       "learning_rate": 8.378220768944327e-05,
+       "loss": 0.8746,
+       "step": 180
+     },
+     {
+       "epoch": 0.6146179401993356,
+       "grad_norm": 0.5517932176589966,
+       "learning_rate": 7.807213758120966e-05,
+       "loss": 0.8821,
+       "step": 185
+     },
+     {
+       "epoch": 0.6312292358803987,
+       "grad_norm": 1.0475077629089355,
+       "learning_rate": 7.243626441830009e-05,
+       "loss": 0.8766,
+       "step": 190
+     },
+     {
+       "epoch": 0.6478405315614618,
+       "grad_norm": 0.8785480856895447,
+       "learning_rate": 6.68936582115042e-05,
+       "loss": 0.8696,
+       "step": 195
+     },
+     {
+       "epoch": 0.6644518272425249,
+       "grad_norm": 0.5514780282974243,
+       "learning_rate": 6.146307338575519e-05,
+       "loss": 0.8435,
+       "step": 200
+     },
+     {
+       "epoch": 0.6810631229235881,
+       "grad_norm": 0.7921276688575745,
+       "learning_rate": 5.616288532109225e-05,
+       "loss": 0.8613,
+       "step": 205
+     },
+     {
+       "epoch": 0.6976744186046512,
+       "grad_norm": 1.2019799947738647,
+       "learning_rate": 5.101102817619131e-05,
+       "loss": 0.8491,
+       "step": 210
+     },
+     {
+       "epoch": 0.7142857142857143,
+       "grad_norm": 0.5968192219734192,
+       "learning_rate": 4.6024934204848745e-05,
+       "loss": 0.8399,
+       "step": 215
+     },
+     {
+       "epoch": 0.7308970099667774,
+       "grad_norm": 0.622888445854187,
+       "learning_rate": 4.12214747707527e-05,
+       "loss": 0.8589,
+       "step": 220
+     },
+     {
+       "epoch": 0.7475083056478405,
+       "grad_norm": 0.6290799379348755,
+       "learning_rate": 3.661690326012897e-05,
+       "loss": 0.8479,
+       "step": 225
+     },
+     {
+       "epoch": 0.7641196013289037,
+       "grad_norm": 0.5619077086448669,
+       "learning_rate": 3.222680008542678e-05,
+       "loss": 0.8341,
+       "step": 230
+     },
+     {
+       "epoch": 0.7807308970099668,
+       "grad_norm": 0.4804949164390564,
+       "learning_rate": 2.8066019966134904e-05,
+       "loss": 0.8355,
+       "step": 235
+     },
+     {
+       "epoch": 0.7973421926910299,
+       "grad_norm": 0.8997146487236023,
+       "learning_rate": 2.4148641665113113e-05,
+       "loss": 0.8305,
+       "step": 240
+     },
+     {
+       "epoch": 0.813953488372093,
+       "grad_norm": 0.62757408618927,
+       "learning_rate": 2.0487920350515212e-05,
+       "loss": 0.8352,
+       "step": 245
+     },
+     {
+       "epoch": 0.8305647840531561,
+       "grad_norm": 0.6770641207695007,
+       "learning_rate": 1.7096242744495837e-05,
+       "loss": 0.8365,
+       "step": 250
+     },
+     {
+       "epoch": 0.8471760797342193,
+       "grad_norm": 0.8780685663223267,
+       "learning_rate": 1.3985085210463477e-05,
+       "loss": 0.8397,
+       "step": 255
+     },
+     {
+       "epoch": 0.8637873754152824,
+       "grad_norm": 0.5401960611343384,
+       "learning_rate": 1.116497492069961e-05,
+       "loss": 0.8265,
+       "step": 260
+     },
+     {
+       "epoch": 0.8803986710963455,
+       "grad_norm": 0.5332029461860657,
+       "learning_rate": 8.645454235739903e-06,
+       "loss": 0.8441,
+       "step": 265
+     },
+     {
+       "epoch": 0.8970099667774086,
+       "grad_norm": 0.6243578195571899,
+       "learning_rate": 6.435048416046863e-06,
+       "loss": 0.8333,
+       "step": 270
+     },
+     {
+       "epoch": 0.9136212624584718,
+       "grad_norm": 0.5947065949440002,
+       "learning_rate": 4.541236775226809e-06,
+       "loss": 0.8311,
+       "step": 275
+     },
+     {
+       "epoch": 0.9302325581395349,
+       "grad_norm": 0.5467190742492676,
+       "learning_rate": 2.970427372400353e-06,
+       "loss": 0.8217,
+       "step": 280
+     },
+     {
+       "epoch": 0.946843853820598,
+       "grad_norm": 0.5683417320251465,
+       "learning_rate": 1.7279353293586765e-06,
+       "loss": 0.8293,
+       "step": 285
+     },
+     {
+       "epoch": 0.9634551495016611,
+       "grad_norm": 0.5739116072654724,
+       "learning_rate": 8.17964845873831e-07,
+       "loss": 0.833,
+       "step": 290
+     },
+     {
+       "epoch": 0.9800664451827242,
+       "grad_norm": 0.6611495614051819,
+       "learning_rate": 2.4359497401758024e-07,
+       "loss": 0.8317,
+       "step": 295
+     },
+     {
+       "epoch": 0.9966777408637874,
+       "grad_norm": 0.7239180207252502,
+       "learning_rate": 6.769199623779532e-09,
+       "loss": 0.8317,
+       "step": 300
+     },
+     {
+       "epoch": 1.0,
+       "eval_loss": 1.6543629169464111,
+       "eval_runtime": 1.2337,
+       "eval_samples_per_second": 8.106,
+       "eval_steps_per_second": 0.811,
+       "step": 301
+     },
+     {
+       "epoch": 1.0,
+       "step": 301,
+       "total_flos": 9.17802542639874e+17,
+       "train_loss": 2.9563070911901734,
+       "train_runtime": 3725.7135,
+       "train_samples_per_second": 10.33,
+       "train_steps_per_second": 0.081
+     }
+   ],
+   "logging_steps": 5,
+   "max_steps": 301,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 1,
+   "save_steps": 100,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 9.17802542639874e+17,
+   "train_batch_size": 8,
+   "trial_name": null,
+   "trial_params": null
+ }
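
A note on `trainer_state.json`: the `log_history` array above interleaves periodic training-loss entries with the final evaluation and summary entries. A minimal sketch for pulling out the loss curve (assuming a local copy of the file):

```python
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the periodic entries that logged a training loss.
curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
print(curve[:3])  # e.g. [(1, 20.4567), (5, 20.5366), (10, 17.7569)]
```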