chansung committed on
Commit
708dabc
·
verified ·
1 Parent(s): 0945ccc

Model save

Browse files
Files changed (4) hide show
  1. README.md +2 -2
  2. all_results.json +4 -9
  3. train_results.json +4 -4
  4. trainer_state.json +40 -40
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 1.9204
24
 
25
  ## Model description
26
 
@@ -57,7 +57,7 @@ The following hyperparameters were used during training:
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:------:|:----:|:---------------:|
60
- | 1.7207 | 0.9936 | 78 | 1.9204 |
61
 
62
 
63
  ### Framework versions
 
20
 
21
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.9245
24
 
25
  ## Model description
26
 
 
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:------:|:----:|:---------------:|
60
+ | 1.7247 | 0.9936 | 78 | 1.9245 |
61
 
62
 
63
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
  "epoch": 0.9936305732484076,
3
- "eval_loss": 1.9203951358795166,
4
- "eval_runtime": 1.4435,
5
- "eval_samples": 16,
6
- "eval_samples_per_second": 2.771,
7
- "eval_steps_per_second": 0.693,
8
  "total_flos": 3.820096642099446e+17,
9
- "train_loss": 1.8799311350553463,
10
- "train_runtime": 1320.5477,
11
  "train_samples": 92634,
12
- "train_samples_per_second": 13.248,
13
- "train_steps_per_second": 0.059
14
  }
 
1
  {
2
  "epoch": 0.9936305732484076,
 
 
 
 
 
3
  "total_flos": 3.820096642099446e+17,
4
+ "train_loss": 1.883078443698394,
5
+ "train_runtime": 302.4096,
6
  "train_samples": 92634,
7
+ "train_samples_per_second": 57.852,
8
+ "train_steps_per_second": 0.258
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9936305732484076,
3
  "total_flos": 3.820096642099446e+17,
4
- "train_loss": 1.8799311350553463,
5
- "train_runtime": 1320.5477,
6
  "train_samples": 92634,
7
- "train_samples_per_second": 13.248,
8
- "train_steps_per_second": 0.059
9
  }
 
1
  {
2
  "epoch": 0.9936305732484076,
3
  "total_flos": 3.820096642099446e+17,
4
+ "train_loss": 1.883078443698394,
5
+ "train_runtime": 302.4096,
6
  "train_samples": 92634,
7
+ "train_samples_per_second": 57.852,
8
+ "train_steps_per_second": 0.258
9
  }
trainer_state.json CHANGED
@@ -10,132 +10,132 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.012738853503184714,
13
- "grad_norm": 4.837225437164307,
14
  "learning_rate": 2.5e-05,
15
- "loss": 2.5854,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.06369426751592357,
20
- "grad_norm": 3.6128976345062256,
21
  "learning_rate": 0.000125,
22
- "loss": 2.5321,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.12738853503184713,
27
- "grad_norm": 2.7008352279663086,
28
  "learning_rate": 0.00019959742939952392,
29
- "loss": 2.3462,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.1910828025477707,
34
- "grad_norm": 2.4735682010650635,
35
  "learning_rate": 0.00019510565162951537,
36
- "loss": 2.1254,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.25477707006369427,
41
- "grad_norm": 1.8116168975830078,
42
  "learning_rate": 0.00018584487936018661,
43
- "loss": 1.9483,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.3184713375796178,
48
- "grad_norm": 1.1086759567260742,
49
  "learning_rate": 0.00017227948638273916,
50
- "loss": 1.8338,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.3821656050955414,
55
- "grad_norm": 1.007157802581787,
56
  "learning_rate": 0.00015508969814521025,
57
- "loss": 1.8085,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.445859872611465,
62
- "grad_norm": 0.9624175429344177,
63
  "learning_rate": 0.0001351374824081343,
64
- "loss": 1.7777,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.5095541401273885,
69
- "grad_norm": 1.133021593093872,
70
  "learning_rate": 0.00011342332658176555,
71
- "loss": 1.7611,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.5732484076433121,
76
- "grad_norm": 0.9674370884895325,
77
  "learning_rate": 9.103606910965666e-05,
78
- "loss": 1.7547,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.6369426751592356,
83
- "grad_norm": 0.9040862321853638,
84
  "learning_rate": 6.909830056250527e-05,
85
- "loss": 1.7319,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.7006369426751592,
90
- "grad_norm": 0.8982458710670471,
91
  "learning_rate": 4.87100722594094e-05,
92
- "loss": 1.7384,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.7643312101910829,
97
- "grad_norm": 0.8207002878189087,
98
  "learning_rate": 3.089373510131354e-05,
99
- "loss": 1.7354,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.8280254777070064,
104
- "grad_norm": 0.8674971461296082,
105
  "learning_rate": 1.6542674627869737e-05,
106
- "loss": 1.7322,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.89171974522293,
111
- "grad_norm": 0.8455283641815186,
112
  "learning_rate": 6.37651293602628e-06,
113
- "loss": 1.7258,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.9554140127388535,
118
- "grad_norm": 0.7830201387405396,
119
  "learning_rate": 9.0502382320653e-07,
120
- "loss": 1.7207,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.9936305732484076,
125
- "eval_loss": 1.9203951358795166,
126
- "eval_runtime": 1.4459,
127
- "eval_samples_per_second": 2.766,
128
- "eval_steps_per_second": 0.692,
129
  "step": 78
130
  },
131
  {
132
  "epoch": 0.9936305732484076,
133
  "step": 78,
134
  "total_flos": 3.820096642099446e+17,
135
- "train_loss": 1.8799311350553463,
136
- "train_runtime": 1320.5477,
137
- "train_samples_per_second": 13.248,
138
- "train_steps_per_second": 0.059
139
  }
140
  ],
141
  "logging_steps": 5,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.012738853503184714,
13
+ "grad_norm": 5.064015865325928,
14
  "learning_rate": 2.5e-05,
15
+ "loss": 2.5779,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.06369426751592357,
20
+ "grad_norm": 3.464812755584717,
21
  "learning_rate": 0.000125,
22
+ "loss": 2.5286,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.12738853503184713,
27
+ "grad_norm": 2.723762035369873,
28
  "learning_rate": 0.00019959742939952392,
29
+ "loss": 2.3449,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.1910828025477707,
34
+ "grad_norm": 2.2147715091705322,
35
  "learning_rate": 0.00019510565162951537,
36
+ "loss": 2.1213,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.25477707006369427,
41
+ "grad_norm": 2.67816162109375,
42
  "learning_rate": 0.00018584487936018661,
43
+ "loss": 1.9556,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.3184713375796178,
48
+ "grad_norm": 1.2481664419174194,
49
  "learning_rate": 0.00017227948638273916,
50
+ "loss": 1.8417,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.3821656050955414,
55
+ "grad_norm": 1.3122402429580688,
56
  "learning_rate": 0.00015508969814521025,
57
+ "loss": 1.8135,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.445859872611465,
62
+ "grad_norm": 1.0445932149887085,
63
  "learning_rate": 0.0001351374824081343,
64
+ "loss": 1.7823,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.5095541401273885,
69
+ "grad_norm": 1.0760376453399658,
70
  "learning_rate": 0.00011342332658176555,
71
+ "loss": 1.7652,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.5732484076433121,
76
+ "grad_norm": 0.9530027508735657,
77
  "learning_rate": 9.103606910965666e-05,
78
+ "loss": 1.7581,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.6369426751592356,
83
+ "grad_norm": 1.1503989696502686,
84
  "learning_rate": 6.909830056250527e-05,
85
+ "loss": 1.7357,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.7006369426751592,
90
+ "grad_norm": 0.9222537279129028,
91
  "learning_rate": 4.87100722594094e-05,
92
+ "loss": 1.7425,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.7643312101910829,
97
+ "grad_norm": 0.8503910303115845,
98
  "learning_rate": 3.089373510131354e-05,
99
+ "loss": 1.7397,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.8280254777070064,
104
+ "grad_norm": 0.8713019490242004,
105
  "learning_rate": 1.6542674627869737e-05,
106
+ "loss": 1.7361,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.89171974522293,
111
+ "grad_norm": 0.8647977709770203,
112
  "learning_rate": 6.37651293602628e-06,
113
+ "loss": 1.7294,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.9554140127388535,
118
+ "grad_norm": 0.8375623226165771,
119
  "learning_rate": 9.0502382320653e-07,
120
+ "loss": 1.7247,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.9936305732484076,
125
+ "eval_loss": 1.9245359897613525,
126
+ "eval_runtime": 0.6582,
127
+ "eval_samples_per_second": 6.078,
128
+ "eval_steps_per_second": 1.519,
129
  "step": 78
130
  },
131
  {
132
  "epoch": 0.9936305732484076,
133
  "step": 78,
134
  "total_flos": 3.820096642099446e+17,
135
+ "train_loss": 1.883078443698394,
136
+ "train_runtime": 302.4096,
137
+ "train_samples_per_second": 57.852,
138
+ "train_steps_per_second": 0.258
139
  }
140
  ],
141
  "logging_steps": 5,