chansung committed on
Commit
ce7b374
·
verified ·
1 Parent(s): 598c86d

Model save

Browse files
Files changed (4) hide show
  1. README.md +2 -2
  2. all_results.json +4 -9
  3. train_results.json +4 -4
  4. trainer_state.json +34 -34
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 2.3643
24
 
25
  ## Model description
26
 
@@ -57,7 +57,7 @@ The following hyperparameters were used during training:
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:-----:|:----:|:---------------:|
60
- | 2.2079 | 1.0 | 61 | 2.3643 |
61
 
62
 
63
  ### Framework versions
 
20
 
21
  This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 2.3604
24
 
25
  ## Model description
26
 
 
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:-----:|:----:|:---------------:|
60
+ | 2.2029 | 1.0 | 61 | 2.3604 |
61
 
62
 
63
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 2.36434006690979,
4
- "eval_runtime": 1.1364,
5
- "eval_samples": 16,
6
- "eval_samples_per_second": 3.52,
7
- "eval_steps_per_second": 0.88,
8
  "total_flos": 3.60192752912171e+17,
9
- "train_loss": 2.4251912656377574,
10
- "train_runtime": 1033.8275,
11
  "train_samples": 92634,
12
- "train_samples_per_second": 14.996,
13
- "train_steps_per_second": 0.059
14
  }
 
1
  {
2
  "epoch": 1.0,
 
 
 
 
 
3
  "total_flos": 3.60192752912171e+17,
4
+ "train_loss": 2.436295607050911,
5
+ "train_runtime": 291.8933,
6
  "train_samples": 92634,
7
+ "train_samples_per_second": 53.112,
8
+ "train_steps_per_second": 0.209
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 3.60192752912171e+17,
4
- "train_loss": 2.4251912656377574,
5
- "train_runtime": 1033.8275,
6
  "train_samples": 92634,
7
- "train_samples_per_second": 14.996,
8
- "train_steps_per_second": 0.059
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 3.60192752912171e+17,
4
+ "train_loss": 2.436295607050911,
5
+ "train_runtime": 291.8933,
6
  "train_samples": 92634,
7
+ "train_samples_per_second": 53.112,
8
+ "train_steps_per_second": 0.209
9
  }
trainer_state.json CHANGED
@@ -10,111 +10,111 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.01639344262295082,
13
- "grad_norm": 3.5152792930603027,
14
  "learning_rate": 2.857142857142857e-05,
15
- "loss": 3.1061,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.08196721311475409,
20
- "grad_norm": 2.2367053031921387,
21
  "learning_rate": 0.00014285714285714287,
22
- "loss": 3.0938,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.16393442622950818,
27
- "grad_norm": 0.8790752291679382,
28
  "learning_rate": 0.00019848077530122083,
29
- "loss": 2.925,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.2459016393442623,
34
- "grad_norm": 0.8242706656455994,
35
  "learning_rate": 0.00018936326403234125,
36
- "loss": 2.6576,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.32786885245901637,
41
- "grad_norm": 0.652793824672699,
42
  "learning_rate": 0.00017273736415730488,
43
- "loss": 2.4658,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.4098360655737705,
48
- "grad_norm": 0.5188227891921997,
49
  "learning_rate": 0.00015000000000000001,
50
- "loss": 2.3721,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.4918032786885246,
55
- "grad_norm": 0.4388259947299957,
56
  "learning_rate": 0.00012306158707424403,
57
- "loss": 2.3044,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.5737704918032787,
62
- "grad_norm": 0.3132685124874115,
63
  "learning_rate": 9.418551710895243e-05,
64
- "loss": 2.2488,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.6557377049180327,
69
- "grad_norm": 0.3067067861557007,
70
  "learning_rate": 6.579798566743314e-05,
71
- "loss": 2.2299,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.7377049180327869,
76
- "grad_norm": 0.29608020186424255,
77
  "learning_rate": 4.028414082972141e-05,
78
- "loss": 2.2148,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.819672131147541,
83
- "grad_norm": 0.2628838121891022,
84
  "learning_rate": 1.9787680724495617e-05,
85
- "loss": 2.2038,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.9016393442622951,
90
- "grad_norm": 0.2850213348865509,
91
  "learning_rate": 6.030737921409169e-06,
92
- "loss": 2.212,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.9836065573770492,
97
- "grad_norm": 0.27618488669395447,
98
  "learning_rate": 1.6918417287318245e-07,
99
- "loss": 2.2079,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 1.0,
104
- "eval_loss": 2.36434006690979,
105
- "eval_runtime": 1.1415,
106
- "eval_samples_per_second": 3.504,
107
- "eval_steps_per_second": 0.876,
108
  "step": 61
109
  },
110
  {
111
  "epoch": 1.0,
112
  "step": 61,
113
  "total_flos": 3.60192752912171e+17,
114
- "train_loss": 2.4251912656377574,
115
- "train_runtime": 1033.8275,
116
- "train_samples_per_second": 14.996,
117
- "train_steps_per_second": 0.059
118
  }
119
  ],
120
  "logging_steps": 5,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.01639344262295082,
13
+ "grad_norm": 9.396034240722656,
14
  "learning_rate": 2.857142857142857e-05,
15
+ "loss": 3.2576,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.08196721311475409,
20
+ "grad_norm": 7.002080917358398,
21
  "learning_rate": 0.00014285714285714287,
22
+ "loss": 3.2192,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.16393442622950818,
27
+ "grad_norm": 0.9778996706008911,
28
  "learning_rate": 0.00019848077530122083,
29
+ "loss": 2.948,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.2459016393442623,
34
+ "grad_norm": 0.8340317606925964,
35
  "learning_rate": 0.00018936326403234125,
36
+ "loss": 2.6706,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.32786885245901637,
41
+ "grad_norm": 0.6723042130470276,
42
  "learning_rate": 0.00017273736415730488,
43
+ "loss": 2.4683,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.4098360655737705,
48
+ "grad_norm": 0.500697135925293,
49
  "learning_rate": 0.00015000000000000001,
50
+ "loss": 2.3682,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.4918032786885246,
55
+ "grad_norm": 0.43113112449645996,
56
  "learning_rate": 0.00012306158707424403,
57
+ "loss": 2.3001,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.5737704918032787,
62
+ "grad_norm": 0.32319948077201843,
63
  "learning_rate": 9.418551710895243e-05,
64
+ "loss": 2.246,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.6557377049180327,
69
+ "grad_norm": 0.34333205223083496,
70
  "learning_rate": 6.579798566743314e-05,
71
+ "loss": 2.2259,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.7377049180327869,
76
+ "grad_norm": 0.31353285908699036,
77
  "learning_rate": 4.028414082972141e-05,
78
+ "loss": 2.2111,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.819672131147541,
83
+ "grad_norm": 0.2684493064880371,
84
  "learning_rate": 1.9787680724495617e-05,
85
+ "loss": 2.1992,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.9016393442622951,
90
+ "grad_norm": 0.2821851372718811,
91
  "learning_rate": 6.030737921409169e-06,
92
+ "loss": 2.2074,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.9836065573770492,
97
+ "grad_norm": 0.2827270030975342,
98
  "learning_rate": 1.6918417287318245e-07,
99
+ "loss": 2.2029,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 1.0,
104
+ "eval_loss": 2.3603930473327637,
105
+ "eval_runtime": 0.7894,
106
+ "eval_samples_per_second": 5.067,
107
+ "eval_steps_per_second": 1.267,
108
  "step": 61
109
  },
110
  {
111
  "epoch": 1.0,
112
  "step": 61,
113
  "total_flos": 3.60192752912171e+17,
114
+ "train_loss": 2.436295607050911,
115
+ "train_runtime": 291.8933,
116
+ "train_samples_per_second": 53.112,
117
+ "train_steps_per_second": 0.209
118
  }
119
  ],
120
  "logging_steps": 5,