Jinawei commited on
Commit
28939a1
·
1 Parent(s): e94e8c2

Upload 8 files

Browse files
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home.local/jianwei/workspace/archive/SparseOptimizer/output/Layer_7_12_Hid_160_768_Head_10_12_IMRatio_3.5",
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "embedding_size": 160,
9
+ "finetuning_task": "rte",
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 160,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 560,
16
+ "layer_norm_eps": 1e-12,
17
+ "max_position_embeddings": 512,
18
+ "model_type": "bert",
19
+ "num_attention_heads": 10,
20
+ "num_hidden_layers": 7,
21
+ "output_intermediate": true,
22
+ "output_past": true,
23
+ "pad_token_id": 0,
24
+ "position_embedding_type": "absolute",
25
+ "problem_type": "single_label_classification",
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.17.0",
28
+ "type_vocab_size": 2,
29
+ "use_cache": true,
30
+ "vocab_size": 30522
31
+ }
log_bs32_lr3e-05_20221118_060236_793692.txt ADDED
@@ -0,0 +1,639 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ------------> log file ==runs2/rte/1/log_bs32_lr3e-05_20221118_060236_793692.txt
2
+ Namespace(aug_train=False, data_dir='/home.local/jianwei/datasets/nlp/glue_data/RTE', do_eval=False, early_stop=True, early_stop_metric='accuracy', eval_step=120, gradient_accumulation_steps=1, learning_rate=3e-05, local_rank=0, lr_scheduler_type=<SchedulerType.CONSTANT_WITH_WARMUP: 'constant_with_warmup'>, max_length=128, max_train_steps=None, model_name_or_path='/home.local/jianwei/workspace/archive/SparseOptimizer/output/Layer_7_12_Hid_160_768_Head_10_12_IMRatio_3.5', num_train_epochs=30, num_warmup_steps=0, output_dir='runs2/rte/1', pad_to_max_length=False, per_device_eval_batch_size=32, per_device_train_batch_size=32, print_step=5, save_last=False, seed=None, task_name='rte', train_file=None, use_slow_tokenizer=False, validation_file=None, weight_decay=0.0)
3
+ Distributed environment: NO
4
+ Num processes: 1
5
+ Process index: 0
6
+ Local process index: 0
7
+ Device: cuda
8
+ Mixed precision type: fp16
9
+
10
+ Sample 595 of the training set: (tensor([ 101, 11929, 1010, 5553, 1012, 2570, 1006, 8418, 25311, 13860,
11
+ 3388, 1007, 1011, 1011, 2019, 18410, 2140, 6187, 24887, 2080,
12
+ 11183, 1010, 1037, 2280, 3539, 2704, 1010, 2180, 5978, 1005,
13
+ 1055, 4883, 2602, 2006, 4465, 1012, 102, 2047, 5077, 3539,
14
+ 2704, 2003, 2700, 1012, 102, 0, 0, 0, 0, 0,
15
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
16
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
17
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
18
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
19
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
20
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
21
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
22
+ 0, 0, 0, 0, 0, 0, 0, 0]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
23
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
24
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
25
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
26
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
27
+ 0, 0, 0, 0, 0, 0, 0, 0]), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
28
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
29
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
30
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
31
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
32
+ 0, 0, 0, 0, 0, 0, 0, 0]), tensor(1)).
33
+ Sample 2375 of the training set: (tensor([ 101, 1996, 5611, 2390, 2749, 3344, 2041, 1010, 2006, 5095,
34
+ 1010, 1037, 6923, 2510, 3169, 2046, 1996, 2225, 2924, 2237,
35
+ 1997, 15419, 2378, 1998, 2049, 13141, 3409, 1010, 2334, 9302,
36
+ 4216, 2056, 1012, 102, 1996, 5611, 2390, 3344, 2041, 1037,
37
+ 6923, 3169, 1999, 15419, 2378, 1012, 102, 0, 0, 0,
38
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
39
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
40
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
41
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
42
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
43
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
44
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
45
+ 0, 0, 0, 0, 0, 0, 0, 0]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
46
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
47
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
48
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
49
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
50
+ 0, 0, 0, 0, 0, 0, 0, 0]), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
51
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
52
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
53
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
54
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
55
+ 0, 0, 0, 0, 0, 0, 0, 0]), tensor(0)).
56
+ Sample 149 of the training set: (tensor([ 101, 2048, 9767, 8461, 2379, 2019, 5499, 2082, 1999, 4501,
57
+ 2730, 2809, 2111, 1998, 5229, 4413, 2500, 7483, 1999, 1996,
58
+ 6745, 8293, 1997, 4808, 13940, 1996, 2670, 3417, 1997, 15381,
59
+ 1012, 102, 2809, 2111, 8461, 2048, 9767, 2379, 2019, 5499,
60
+ 2082, 1999, 4501, 1012, 102, 0, 0, 0, 0, 0,
61
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
62
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
63
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
64
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
65
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
66
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
67
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
68
+ 0, 0, 0, 0, 0, 0, 0, 0]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
69
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
70
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
71
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
72
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
73
+ 0, 0, 0, 0, 0, 0, 0, 0]), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
74
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
75
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
76
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
77
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
78
+ 0, 0, 0, 0, 0, 0, 0, 0]), tensor(1)).
79
+ ***** Running training *****
80
+ Num examples = 2490
81
+ Num Epochs = 30
82
+ Instantaneous batch size per device = 32
83
+ Total train batch size (w. parallel, distributed & accumulation) = 32
84
+ Gradient Accumulation steps = 1
85
+ Total optimization steps = 2340
86
+ 000005/002340, loss: 0.694824, avg_loss: 0.691177
87
+ 000010/002340, loss: 0.707565, avg_loss: 0.693715
88
+ 000015/002340, loss: 0.699615, avg_loss: 0.693022
89
+ 000020/002340, loss: 0.699615, avg_loss: 0.693939
90
+ 000025/002340, loss: 0.699310, avg_loss: 0.694436
91
+ 000030/002340, loss: 0.698532, avg_loss: 0.694941
92
+ 000035/002340, loss: 0.686935, avg_loss: 0.694372
93
+ 000040/002340, loss: 0.696411, avg_loss: 0.694273
94
+ 000045/002340, loss: 0.692871, avg_loss: 0.693708
95
+ 000050/002340, loss: 0.687256, avg_loss: 0.693756
96
+ 000055/002340, loss: 0.701004, avg_loss: 0.693827
97
+ 000060/002340, loss: 0.691040, avg_loss: 0.693579
98
+ 000065/002340, loss: 0.689056, avg_loss: 0.693324
99
+ 000070/002340, loss: 0.696518, avg_loss: 0.693440
100
+ 000075/002340, loss: 0.696930, avg_loss: 0.693460
101
+ 000080/002340, loss: 0.693802, avg_loss: 0.693340
102
+ 000085/002340, loss: 0.688171, avg_loss: 0.693318
103
+ 000090/002340, loss: 0.698029, avg_loss: 0.693154
104
+ 000095/002340, loss: 0.689453, avg_loss: 0.692949
105
+ 000100/002340, loss: 0.690857, avg_loss: 0.692921
106
+ 000105/002340, loss: 0.689819, avg_loss: 0.692827
107
+ 000110/002340, loss: 0.682220, avg_loss: 0.692768
108
+ 000115/002340, loss: 0.700806, avg_loss: 0.692803
109
+ 000120/002340, loss: 0.701385, avg_loss: 0.692652
110
+ ***** Running dev evaluation *****
111
+ Num examples = 277
112
+ Instantaneous batch size per device = 32
113
+ epoch 1, step 120/2340: {'accuracy': 0.5523465703971119}
114
+ 000125/002340, loss: 0.693527, avg_loss: 0.692706
115
+ 000130/002340, loss: 0.689957, avg_loss: 0.692658
116
+ 000135/002340, loss: 0.685425, avg_loss: 0.692536
117
+ 000140/002340, loss: 0.690201, avg_loss: 0.692434
118
+ 000145/002340, loss: 0.686600, avg_loss: 0.692396
119
+ 000150/002340, loss: 0.678986, avg_loss: 0.692177
120
+ 000155/002340, loss: 0.679138, avg_loss: 0.691975
121
+ 000160/002340, loss: 0.694275, avg_loss: 0.691769
122
+ 000165/002340, loss: 0.692368, avg_loss: 0.691443
123
+ 000170/002340, loss: 0.680664, avg_loss: 0.691252
124
+ 000175/002340, loss: 0.666016, avg_loss: 0.690698
125
+ 000180/002340, loss: 0.671844, avg_loss: 0.690296
126
+ 000185/002340, loss: 0.651184, avg_loss: 0.689748
127
+ 000190/002340, loss: 0.659752, avg_loss: 0.688919
128
+ 000195/002340, loss: 0.662926, avg_loss: 0.688697
129
+ 000200/002340, loss: 0.643776, avg_loss: 0.688136
130
+ 000205/002340, loss: 0.693794, avg_loss: 0.687406
131
+ 000210/002340, loss: 0.716675, avg_loss: 0.686937
132
+ 000215/002340, loss: 0.665474, avg_loss: 0.686136
133
+ 000220/002340, loss: 0.625298, avg_loss: 0.685308
134
+ 000225/002340, loss: 0.656639, avg_loss: 0.685019
135
+ 000230/002340, loss: 0.673508, avg_loss: 0.684550
136
+ 000235/002340, loss: 0.575394, avg_loss: 0.682954
137
+ 000240/002340, loss: 0.615173, avg_loss: 0.681390
138
+ ***** Running dev evaluation *****
139
+ Num examples = 277
140
+ Instantaneous batch size per device = 32
141
+ epoch 3, step 240/2340: {'accuracy': 0.5884476534296029}
142
+ 000245/002340, loss: 0.566116, avg_loss: 0.679216
143
+ 000250/002340, loss: 0.662231, avg_loss: 0.677990
144
+ 000255/002340, loss: 0.742844, avg_loss: 0.677457
145
+ 000260/002340, loss: 0.744896, avg_loss: 0.677289
146
+ 000265/002340, loss: 0.524788, avg_loss: 0.675974
147
+ 000270/002340, loss: 0.573128, avg_loss: 0.674871
148
+ 000275/002340, loss: 0.698616, avg_loss: 0.674028
149
+ 000280/002340, loss: 0.661125, avg_loss: 0.672997
150
+ 000285/002340, loss: 0.577705, avg_loss: 0.671527
151
+ 000290/002340, loss: 0.529144, avg_loss: 0.669498
152
+ 000295/002340, loss: 0.548820, avg_loss: 0.668429
153
+ 000300/002340, loss: 0.533775, avg_loss: 0.667589
154
+ 000305/002340, loss: 0.724682, avg_loss: 0.666549
155
+ 000310/002340, loss: 0.618702, avg_loss: 0.667052
156
+ 000315/002340, loss: 0.600662, avg_loss: 0.666212
157
+ 000320/002340, loss: 0.560127, avg_loss: 0.665015
158
+ 000325/002340, loss: 0.667423, avg_loss: 0.663344
159
+ 000330/002340, loss: 0.520096, avg_loss: 0.661692
160
+ 000335/002340, loss: 0.589901, avg_loss: 0.659812
161
+ 000340/002340, loss: 0.718616, avg_loss: 0.658405
162
+ 000345/002340, loss: 0.523731, avg_loss: 0.657693
163
+ 000350/002340, loss: 0.597912, avg_loss: 0.656364
164
+ 000355/002340, loss: 0.510841, avg_loss: 0.654704
165
+ 000360/002340, loss: 0.598392, avg_loss: 0.652629
166
+ ***** Running dev evaluation *****
167
+ Num examples = 277
168
+ Instantaneous batch size per device = 32
169
+ epoch 4, step 360/2340: {'accuracy': 0.6137184115523465}
170
+ 000365/002340, loss: 0.509396, avg_loss: 0.650652
171
+ 000370/002340, loss: 0.625957, avg_loss: 0.649372
172
+ 000375/002340, loss: 0.632420, avg_loss: 0.648425
173
+ 000380/002340, loss: 0.562641, avg_loss: 0.647222
174
+ 000385/002340, loss: 0.649609, avg_loss: 0.645501
175
+ 000390/002340, loss: 0.361694, avg_loss: 0.643182
176
+ 000395/002340, loss: 0.425430, avg_loss: 0.642246
177
+ 000400/002340, loss: 0.577938, avg_loss: 0.640067
178
+ 000405/002340, loss: 0.554668, avg_loss: 0.638333
179
+ 000410/002340, loss: 0.505466, avg_loss: 0.636457
180
+ 000415/002340, loss: 0.531124, avg_loss: 0.634969
181
+ 000420/002340, loss: 0.425911, avg_loss: 0.633147
182
+ 000425/002340, loss: 0.532368, avg_loss: 0.632082
183
+ 000430/002340, loss: 0.569756, avg_loss: 0.630961
184
+ 000435/002340, loss: 0.451645, avg_loss: 0.629107
185
+ 000440/002340, loss: 0.459530, avg_loss: 0.627486
186
+ 000445/002340, loss: 0.380501, avg_loss: 0.625123
187
+ 000450/002340, loss: 0.565880, avg_loss: 0.624122
188
+ 000455/002340, loss: 0.422201, avg_loss: 0.621911
189
+ 000460/002340, loss: 0.671333, avg_loss: 0.620993
190
+ 000465/002340, loss: 0.427799, avg_loss: 0.618575
191
+ 000470/002340, loss: 0.301590, avg_loss: 0.616753
192
+ 000475/002340, loss: 0.517204, avg_loss: 0.614735
193
+ 000480/002340, loss: 0.473822, avg_loss: 0.612666
194
+ ***** Running dev evaluation *****
195
+ Num examples = 277
196
+ Instantaneous batch size per device = 32
197
+ epoch 6, step 480/2340: {'accuracy': 0.6209386281588448}
198
+ 000485/002340, loss: 0.235840, avg_loss: 0.610187
199
+ 000490/002340, loss: 0.535803, avg_loss: 0.608769
200
+ 000495/002340, loss: 0.447842, avg_loss: 0.606833
201
+ 000500/002340, loss: 0.359915, avg_loss: 0.604468
202
+ 000505/002340, loss: 0.473944, avg_loss: 0.601928
203
+ 000510/002340, loss: 0.487707, avg_loss: 0.600405
204
+ 000515/002340, loss: 0.280029, avg_loss: 0.599008
205
+ 000520/002340, loss: 0.509848, avg_loss: 0.597484
206
+ 000525/002340, loss: 0.646320, avg_loss: 0.596454
207
+ 000530/002340, loss: 0.350674, avg_loss: 0.594710
208
+ 000535/002340, loss: 0.480106, avg_loss: 0.593436
209
+ 000540/002340, loss: 0.560251, avg_loss: 0.593214
210
+ 000545/002340, loss: 0.387239, avg_loss: 0.591432
211
+ 000550/002340, loss: 0.277430, avg_loss: 0.589320
212
+ 000555/002340, loss: 0.280695, avg_loss: 0.587417
213
+ 000560/002340, loss: 0.330351, avg_loss: 0.585310
214
+ 000565/002340, loss: 0.391579, avg_loss: 0.583662
215
+ 000570/002340, loss: 0.280355, avg_loss: 0.582107
216
+ 000575/002340, loss: 0.359081, avg_loss: 0.580171
217
+ 000580/002340, loss: 0.367201, avg_loss: 0.578450
218
+ 000585/002340, loss: 0.430851, avg_loss: 0.577231
219
+ 000590/002340, loss: 0.331879, avg_loss: 0.575557
220
+ 000595/002340, loss: 0.333700, avg_loss: 0.573829
221
+ 000600/002340, loss: 0.309275, avg_loss: 0.571686
222
+ ***** Running dev evaluation *****
223
+ Num examples = 277
224
+ Instantaneous batch size per device = 32
225
+ epoch 7, step 600/2340: {'accuracy': 0.6425992779783394}
226
+ 000605/002340, loss: 0.461454, avg_loss: 0.570168
227
+ 000610/002340, loss: 0.434152, avg_loss: 0.568408
228
+ 000615/002340, loss: 0.565701, avg_loss: 0.567013
229
+ 000620/002340, loss: 0.281487, avg_loss: 0.564378
230
+ 000625/002340, loss: 0.183996, avg_loss: 0.562576
231
+ 000630/002340, loss: 0.308249, avg_loss: 0.560548
232
+ 000635/002340, loss: 0.492087, avg_loss: 0.558905
233
+ 000640/002340, loss: 0.276144, avg_loss: 0.556907
234
+ 000645/002340, loss: 0.379016, avg_loss: 0.555011
235
+ 000650/002340, loss: 0.257240, avg_loss: 0.553119
236
+ 000655/002340, loss: 0.260510, avg_loss: 0.550735
237
+ 000660/002340, loss: 0.482807, avg_loss: 0.549067
238
+ 000665/002340, loss: 0.313425, avg_loss: 0.547653
239
+ 000670/002340, loss: 0.244961, avg_loss: 0.545744
240
+ 000675/002340, loss: 0.386663, avg_loss: 0.544380
241
+ 000680/002340, loss: 0.137331, avg_loss: 0.541812
242
+ 000685/002340, loss: 0.301256, avg_loss: 0.539778
243
+ 000690/002340, loss: 0.284186, avg_loss: 0.537928
244
+ 000695/002340, loss: 0.521972, avg_loss: 0.536261
245
+ 000700/002340, loss: 0.718600, avg_loss: 0.535717
246
+ 000705/002340, loss: 0.237306, avg_loss: 0.534266
247
+ 000710/002340, loss: 0.164028, avg_loss: 0.532027
248
+ 000715/002340, loss: 0.235560, avg_loss: 0.530920
249
+ 000720/002340, loss: 0.224425, avg_loss: 0.529428
250
+ ***** Running dev evaluation *****
251
+ Num examples = 277
252
+ Instantaneous batch size per device = 32
253
+ epoch 9, step 720/2340: {'accuracy': 0.6462093862815884}
254
+ 000725/002340, loss: 0.250054, avg_loss: 0.527996
255
+ 000730/002340, loss: 0.213790, avg_loss: 0.526521
256
+ 000735/002340, loss: 0.339844, avg_loss: 0.525346
257
+ 000740/002340, loss: 0.192316, avg_loss: 0.523399
258
+ 000745/002340, loss: 0.322181, avg_loss: 0.521820
259
+ 000750/002340, loss: 0.114270, avg_loss: 0.519722
260
+ 000755/002340, loss: 0.242498, avg_loss: 0.517846
261
+ 000760/002340, loss: 0.234197, avg_loss: 0.515497
262
+ 000765/002340, loss: 0.332447, avg_loss: 0.513969
263
+ 000770/002340, loss: 0.163693, avg_loss: 0.512496
264
+ 000775/002340, loss: 0.260910, avg_loss: 0.511088
265
+ 000780/002340, loss: 0.236919, avg_loss: 0.509495
266
+ 000785/002340, loss: 0.151022, avg_loss: 0.507580
267
+ 000790/002340, loss: 0.489914, avg_loss: 0.506298
268
+ 000795/002340, loss: 0.175525, avg_loss: 0.504419
269
+ 000800/002340, loss: 0.274471, avg_loss: 0.502310
270
+ 000805/002340, loss: 0.308759, avg_loss: 0.500468
271
+ 000810/002340, loss: 0.227170, avg_loss: 0.498888
272
+ 000815/002340, loss: 0.112951, avg_loss: 0.496910
273
+ 000820/002340, loss: 0.168542, avg_loss: 0.495333
274
+ 000825/002340, loss: 0.163078, avg_loss: 0.493526
275
+ 000830/002340, loss: 0.208418, avg_loss: 0.492144
276
+ 000835/002340, loss: 0.204179, avg_loss: 0.490463
277
+ 000840/002340, loss: 0.262290, avg_loss: 0.488488
278
+ ***** Running dev evaluation *****
279
+ Num examples = 277
280
+ Instantaneous batch size per device = 32
281
+ epoch 10, step 840/2340: {'accuracy': 0.6245487364620939}
282
+ 000845/002340, loss: 0.166388, avg_loss: 0.486870
283
+ 000850/002340, loss: 0.221429, avg_loss: 0.485510
284
+ 000855/002340, loss: 0.376082, avg_loss: 0.484030
285
+ 000860/002340, loss: 0.083231, avg_loss: 0.482307
286
+ 000865/002340, loss: 0.161541, avg_loss: 0.480355
287
+ 000870/002340, loss: 0.180701, avg_loss: 0.478405
288
+ 000875/002340, loss: 0.175531, avg_loss: 0.476498
289
+ 000880/002340, loss: 0.148172, avg_loss: 0.475174
290
+ 000885/002340, loss: 0.110148, avg_loss: 0.473676
291
+ 000890/002340, loss: 0.177225, avg_loss: 0.472175
292
+ 000895/002340, loss: 0.051785, avg_loss: 0.470479
293
+ 000900/002340, loss: 0.239419, avg_loss: 0.469122
294
+ 000905/002340, loss: 0.294643, avg_loss: 0.467460
295
+ 000910/002340, loss: 0.372546, avg_loss: 0.466119
296
+ 000915/002340, loss: 0.160401, avg_loss: 0.464562
297
+ 000920/002340, loss: 0.389829, avg_loss: 0.463444
298
+ 000925/002340, loss: 0.461596, avg_loss: 0.462050
299
+ 000930/002340, loss: 0.169349, avg_loss: 0.460443
300
+ 000935/002340, loss: 0.274192, avg_loss: 0.459206
301
+ 000940/002340, loss: 0.245536, avg_loss: 0.457409
302
+ 000945/002340, loss: 0.124900, avg_loss: 0.455669
303
+ 000950/002340, loss: 0.258810, avg_loss: 0.453951
304
+ 000955/002340, loss: 0.328007, avg_loss: 0.452289
305
+ 000960/002340, loss: 0.243825, avg_loss: 0.450600
306
+ ***** Running dev evaluation *****
307
+ Num examples = 277
308
+ Instantaneous batch size per device = 32
309
+ epoch 12, step 960/2340: {'accuracy': 0.6389891696750902}
310
+ 000965/002340, loss: 0.201036, avg_loss: 0.449321
311
+ 000970/002340, loss: 0.091728, avg_loss: 0.447797
312
+ 000975/002340, loss: 0.182425, avg_loss: 0.446324
313
+ 000980/002340, loss: 0.159452, avg_loss: 0.444909
314
+ 000985/002340, loss: 0.142912, avg_loss: 0.443522
315
+ 000990/002340, loss: 0.304327, avg_loss: 0.442004
316
+ 000995/002340, loss: 0.117483, avg_loss: 0.440452
317
+ 001000/002340, loss: 0.156437, avg_loss: 0.438837
318
+ 001005/002340, loss: 0.032182, avg_loss: 0.437682
319
+ 001010/002340, loss: 0.063084, avg_loss: 0.436744
320
+ 001015/002340, loss: 0.258552, avg_loss: 0.435504
321
+ 001020/002340, loss: 0.091414, avg_loss: 0.434340
322
+ 001025/002340, loss: 0.100409, avg_loss: 0.432843
323
+ 001030/002340, loss: 0.064708, avg_loss: 0.431516
324
+ 001035/002340, loss: 0.459350, avg_loss: 0.430340
325
+ 001040/002340, loss: 0.195770, avg_loss: 0.428896
326
+ 001045/002340, loss: 0.101108, avg_loss: 0.427430
327
+ 001050/002340, loss: 0.162723, avg_loss: 0.425868
328
+ 001055/002340, loss: 0.170199, avg_loss: 0.424800
329
+ 001060/002340, loss: 0.066082, avg_loss: 0.423415
330
+ 001065/002340, loss: 0.139599, avg_loss: 0.422219
331
+ 001070/002340, loss: 0.089475, avg_loss: 0.420665
332
+ 001075/002340, loss: 0.115157, avg_loss: 0.419250
333
+ 001080/002340, loss: 0.085939, avg_loss: 0.417821
334
+ ***** Running dev evaluation *****
335
+ Num examples = 277
336
+ Instantaneous batch size per device = 32
337
+ epoch 13, step 1080/2340: {'accuracy': 0.6173285198555957}
338
+ 001085/002340, loss: 0.138964, avg_loss: 0.416740
339
+ 001090/002340, loss: 0.385725, avg_loss: 0.415552
340
+ 001095/002340, loss: 0.173466, avg_loss: 0.414612
341
+ 001100/002340, loss: 0.101382, avg_loss: 0.413397
342
+ 001105/002340, loss: 0.098917, avg_loss: 0.412091
343
+ 001110/002340, loss: 0.088198, avg_loss: 0.410518
344
+ 001115/002340, loss: 0.039977, avg_loss: 0.409207
345
+ 001120/002340, loss: 0.126413, avg_loss: 0.407805
346
+ 001125/002340, loss: 0.154641, avg_loss: 0.406540
347
+ 001130/002340, loss: 0.221717, avg_loss: 0.405238
348
+ 001135/002340, loss: 0.155590, avg_loss: 0.403870
349
+ 001140/002340, loss: 0.072533, avg_loss: 0.402521
350
+ 001145/002340, loss: 0.148947, avg_loss: 0.401401
351
+ 001150/002340, loss: 0.202878, avg_loss: 0.400165
352
+ 001155/002340, loss: 0.054971, avg_loss: 0.399305
353
+ 001160/002340, loss: 0.058926, avg_loss: 0.398088
354
+ 001165/002340, loss: 0.187665, avg_loss: 0.396901
355
+ 001170/002340, loss: 0.091442, avg_loss: 0.395624
356
+ 001175/002340, loss: 0.339817, avg_loss: 0.394529
357
+ 001180/002340, loss: 0.029183, avg_loss: 0.393430
358
+ 001185/002340, loss: 0.052091, avg_loss: 0.392348
359
+ 001190/002340, loss: 0.175309, avg_loss: 0.391464
360
+ 001195/002340, loss: 0.269615, avg_loss: 0.390438
361
+ 001200/002340, loss: 0.042982, avg_loss: 0.389416
362
+ ***** Running dev evaluation *****
363
+ Num examples = 277
364
+ Instantaneous batch size per device = 32
365
+ epoch 15, step 1200/2340: {'accuracy': 0.6353790613718412}
366
+ 001205/002340, loss: 0.029362, avg_loss: 0.388045
367
+ 001210/002340, loss: 0.106356, avg_loss: 0.386842
368
+ 001215/002340, loss: 0.055282, avg_loss: 0.385720
369
+ 001220/002340, loss: 0.025587, avg_loss: 0.384474
370
+ 001225/002340, loss: 0.017830, avg_loss: 0.383314
371
+ 001230/002340, loss: 0.156192, avg_loss: 0.382166
372
+ 001235/002340, loss: 0.017268, avg_loss: 0.381167
373
+ 001240/002340, loss: 0.015908, avg_loss: 0.379919
374
+ 001245/002340, loss: 0.024442, avg_loss: 0.378661
375
+ 001250/002340, loss: 0.016508, avg_loss: 0.377585
376
+ 001255/002340, loss: 0.021355, avg_loss: 0.376479
377
+ 001260/002340, loss: 0.024076, avg_loss: 0.375165
378
+ 001265/002340, loss: 0.202033, avg_loss: 0.374116
379
+ 001270/002340, loss: 0.027793, avg_loss: 0.372882
380
+ 001275/002340, loss: 0.027369, avg_loss: 0.372247
381
+ 001280/002340, loss: 0.021813, avg_loss: 0.371052
382
+ 001285/002340, loss: 0.021163, avg_loss: 0.370046
383
+ 001290/002340, loss: 0.046603, avg_loss: 0.369336
384
+ 001295/002340, loss: 0.076338, avg_loss: 0.368328
385
+ 001300/002340, loss: 0.183380, avg_loss: 0.367225
386
+ 001305/002340, loss: 0.169317, avg_loss: 0.366140
387
+ 001310/002340, loss: 0.020987, avg_loss: 0.365018
388
+ 001315/002340, loss: 0.169484, avg_loss: 0.364127
389
+ 001320/002340, loss: 0.044023, avg_loss: 0.363106
390
+ ***** Running dev evaluation *****
391
+ Num examples = 277
392
+ Instantaneous batch size per device = 32
393
+ epoch 16, step 1320/2340: {'accuracy': 0.6462093862815884}
394
+ 001325/002340, loss: 0.146640, avg_loss: 0.361943
395
+ 001330/002340, loss: 0.053370, avg_loss: 0.360778
396
+ 001335/002340, loss: 0.024849, avg_loss: 0.359785
397
+ 001340/002340, loss: 0.040356, avg_loss: 0.358545
398
+ 001345/002340, loss: 0.216520, avg_loss: 0.357564
399
+ 001350/002340, loss: 0.020188, avg_loss: 0.356442
400
+ 001355/002340, loss: 0.050854, avg_loss: 0.355434
401
+ 001360/002340, loss: 0.013922, avg_loss: 0.354336
402
+ 001365/002340, loss: 0.034302, avg_loss: 0.353537
403
+ 001370/002340, loss: 0.083984, avg_loss: 0.352530
404
+ 001375/002340, loss: 0.044313, avg_loss: 0.351671
405
+ 001380/002340, loss: 0.197178, avg_loss: 0.350656
406
+ 001385/002340, loss: 0.087372, avg_loss: 0.349721
407
+ 001390/002340, loss: 0.122292, avg_loss: 0.348657
408
+ 001395/002340, loss: 0.161705, avg_loss: 0.347780
409
+ 001400/002340, loss: 0.014310, avg_loss: 0.346943
410
+ 001405/002340, loss: 0.096345, avg_loss: 0.345930
411
+ 001410/002340, loss: 0.142292, avg_loss: 0.345120
412
+ 001415/002340, loss: 0.016984, avg_loss: 0.344193
413
+ 001420/002340, loss: 0.014843, avg_loss: 0.343171
414
+ 001425/002340, loss: 0.054250, avg_loss: 0.342329
415
+ 001430/002340, loss: 0.049341, avg_loss: 0.341417
416
+ 001435/002340, loss: 0.033567, avg_loss: 0.340340
417
+ 001440/002340, loss: 0.108241, avg_loss: 0.339508
418
+ ***** Running dev evaluation *****
419
+ Num examples = 277
420
+ Instantaneous batch size per device = 32
421
+ epoch 18, step 1440/2340: {'accuracy': 0.6137184115523465}
422
+ 001445/002340, loss: 0.148780, avg_loss: 0.338643
423
+ 001450/002340, loss: 0.121979, avg_loss: 0.337871
424
+ 001455/002340, loss: 0.015762, avg_loss: 0.337010
425
+ 001460/002340, loss: 0.197943, avg_loss: 0.336178
426
+ 001465/002340, loss: 0.019593, avg_loss: 0.335371
427
+ 001470/002340, loss: 0.129545, avg_loss: 0.334404
428
+ 001475/002340, loss: 0.015238, avg_loss: 0.333483
429
+ 001480/002340, loss: 0.016869, avg_loss: 0.332625
430
+ 001485/002340, loss: 0.011418, avg_loss: 0.331565
431
+ 001490/002340, loss: 0.338315, avg_loss: 0.330893
432
+ 001495/002340, loss: 0.288740, avg_loss: 0.330484
433
+ 001500/002340, loss: 0.148870, avg_loss: 0.329575
434
+ 001505/002340, loss: 0.013757, avg_loss: 0.328768
435
+ 001510/002340, loss: 0.016786, avg_loss: 0.327894
436
+ 001515/002340, loss: 0.013239, avg_loss: 0.326989
437
+ 001520/002340, loss: 0.024581, avg_loss: 0.326006
438
+ 001525/002340, loss: 0.017539, avg_loss: 0.325226
439
+ 001530/002340, loss: 0.067678, avg_loss: 0.324287
440
+ 001535/002340, loss: 0.024253, avg_loss: 0.323389
441
+ 001540/002340, loss: 0.077925, avg_loss: 0.322495
442
+ 001545/002340, loss: 0.024680, avg_loss: 0.321567
443
+ 001550/002340, loss: 0.012920, avg_loss: 0.320824
444
+ 001555/002340, loss: 0.023837, avg_loss: 0.320000
445
+ 001560/002340, loss: 0.221982, avg_loss: 0.319304
446
+ ***** Running dev evaluation *****
447
+ Num examples = 277
448
+ Instantaneous batch size per device = 32
449
+ epoch 19, step 1560/2340: {'accuracy': 0.6137184115523465}
450
+ 001565/002340, loss: 0.013699, avg_loss: 0.318449
451
+ 001570/002340, loss: 0.011844, avg_loss: 0.317610
452
+ 001575/002340, loss: 0.012580, avg_loss: 0.316855
453
+ 001580/002340, loss: 0.037540, avg_loss: 0.316005
454
+ 001585/002340, loss: 0.019229, avg_loss: 0.315232
455
+ 001590/002340, loss: 0.048232, avg_loss: 0.314477
456
+ 001595/002340, loss: 0.141452, avg_loss: 0.313963
457
+ 001600/002340, loss: 0.015298, avg_loss: 0.313133
458
+ 001605/002340, loss: 0.013662, avg_loss: 0.312229
459
+ 001610/002340, loss: 0.160849, avg_loss: 0.311404
460
+ 001615/002340, loss: 0.012301, avg_loss: 0.310524
461
+ 001620/002340, loss: 0.063877, avg_loss: 0.309759
462
+ 001625/002340, loss: 0.032892, avg_loss: 0.309026
463
+ 001630/002340, loss: 0.177563, avg_loss: 0.308279
464
+ 001635/002340, loss: 0.157313, avg_loss: 0.307644
465
+ 001640/002340, loss: 0.130090, avg_loss: 0.306819
466
+ 001645/002340, loss: 0.021889, avg_loss: 0.306081
467
+ 001650/002340, loss: 0.152882, avg_loss: 0.305300
468
+ 001655/002340, loss: 0.009122, avg_loss: 0.304627
469
+ 001660/002340, loss: 0.015140, avg_loss: 0.303849
470
+ 001665/002340, loss: 0.164985, avg_loss: 0.303089
471
+ 001670/002340, loss: 0.008990, avg_loss: 0.302396
472
+ 001675/002340, loss: 0.010757, avg_loss: 0.301671
473
+ 001680/002340, loss: 0.009137, avg_loss: 0.300904
474
+ ***** Running dev evaluation *****
475
+ Num examples = 277
476
+ Instantaneous batch size per device = 32
477
+ epoch 21, step 1680/2340: {'accuracy': 0.6173285198555957}
478
+ 001685/002340, loss: 0.053387, avg_loss: 0.300194
479
+ 001690/002340, loss: 0.022511, avg_loss: 0.299502
480
+ 001695/002340, loss: 0.105420, avg_loss: 0.298722
481
+ 001700/002340, loss: 0.013549, avg_loss: 0.297988
482
+ 001705/002340, loss: 0.073981, avg_loss: 0.297318
483
+ 001710/002340, loss: 0.014491, avg_loss: 0.296600
484
+ 001715/002340, loss: 0.154422, avg_loss: 0.295955
485
+ 001720/002340, loss: 0.163267, avg_loss: 0.295310
486
+ 001725/002340, loss: 0.136114, avg_loss: 0.294759
487
+ 001730/002340, loss: 0.015310, avg_loss: 0.294064
488
+ 001735/002340, loss: 0.087005, avg_loss: 0.293422
489
+ 001740/002340, loss: 0.020296, avg_loss: 0.292756
490
+ 001745/002340, loss: 0.018787, avg_loss: 0.292135
491
+ 001750/002340, loss: 0.034191, avg_loss: 0.291526
492
+ 001755/002340, loss: 0.045470, avg_loss: 0.290987
493
+ 001760/002340, loss: 0.014372, avg_loss: 0.290662
494
+ 001765/002340, loss: 0.015767, avg_loss: 0.289942
495
+ 001770/002340, loss: 0.039629, avg_loss: 0.289302
496
+ 001775/002340, loss: 0.016410, avg_loss: 0.288527
497
+ 001780/002340, loss: 0.038289, avg_loss: 0.287933
498
+ 001785/002340, loss: 0.017720, avg_loss: 0.287493
499
+ 001790/002340, loss: 0.033570, avg_loss: 0.286735
500
+ 001795/002340, loss: 0.012522, avg_loss: 0.286079
501
+ 001800/002340, loss: 0.053891, avg_loss: 0.285344
502
+ ***** Running dev evaluation *****
503
+ Num examples = 277
504
+ Instantaneous batch size per device = 32
505
+ epoch 23, step 1800/2340: {'accuracy': 0.6245487364620939}
506
+ 001805/002340, loss: 0.126177, avg_loss: 0.284716
507
+ 001810/002340, loss: 0.011923, avg_loss: 0.284070
508
+ 001815/002340, loss: 0.142181, avg_loss: 0.283613
509
+ 001820/002340, loss: 0.010828, avg_loss: 0.282998
510
+ 001825/002340, loss: 0.025087, avg_loss: 0.282492
511
+ 001830/002340, loss: 0.273915, avg_loss: 0.281916
512
+ 001835/002340, loss: 0.016827, avg_loss: 0.281382
513
+ 001840/002340, loss: 0.010785, avg_loss: 0.280767
514
+ 001845/002340, loss: 0.015339, avg_loss: 0.280337
515
+ 001850/002340, loss: 0.020906, avg_loss: 0.279696
516
+ 001855/002340, loss: 0.165239, avg_loss: 0.279069
517
+ 001860/002340, loss: 0.053642, avg_loss: 0.278450
518
+ 001865/002340, loss: 0.133574, avg_loss: 0.277862
519
+ 001870/002340, loss: 0.097644, avg_loss: 0.277226
520
+ 001875/002340, loss: 0.059441, avg_loss: 0.276570
521
+ 001880/002340, loss: 0.016699, avg_loss: 0.275948
522
+ 001885/002340, loss: 0.146401, avg_loss: 0.275488
523
+ 001890/002340, loss: 0.011636, avg_loss: 0.274799
524
+ 001895/002340, loss: 0.018686, avg_loss: 0.274214
525
+ 001900/002340, loss: 0.026965, avg_loss: 0.273611
526
+ 001905/002340, loss: 0.013933, avg_loss: 0.272935
527
+ 001910/002340, loss: 0.125580, avg_loss: 0.272318
528
+ 001915/002340, loss: 0.129783, avg_loss: 0.271802
529
+ 001920/002340, loss: 0.116678, avg_loss: 0.271278
530
+ ***** Running dev evaluation *****
531
+ Num examples = 277
532
+ Instantaneous batch size per device = 32
533
+ epoch 24, step 1920/2340: {'accuracy': 0.6173285198555957}
534
+ 001925/002340, loss: 0.254784, avg_loss: 0.270806
535
+ 001930/002340, loss: 0.157526, avg_loss: 0.270238
536
+ 001935/002340, loss: 0.031608, avg_loss: 0.269644
537
+ 001940/002340, loss: 0.009236, avg_loss: 0.269169
538
+ 001945/002340, loss: 0.009980, avg_loss: 0.268799
539
+ 001950/002340, loss: 0.033835, avg_loss: 0.268168
540
+ 001955/002340, loss: 0.051771, avg_loss: 0.267547
541
+ 001960/002340, loss: 0.142184, avg_loss: 0.267055
542
+ 001965/002340, loss: 0.046325, avg_loss: 0.266676
543
+ 001970/002340, loss: 0.041966, avg_loss: 0.266192
544
+ 001975/002340, loss: 0.020202, avg_loss: 0.265597
545
+ 001980/002340, loss: 0.125195, avg_loss: 0.265071
546
+ 001985/002340, loss: 0.019307, avg_loss: 0.264558
547
+ 001990/002340, loss: 0.011511, avg_loss: 0.263954
548
+ 001995/002340, loss: 0.092994, avg_loss: 0.263384
549
+ 002000/002340, loss: 0.098703, avg_loss: 0.262809
550
+ 002005/002340, loss: 0.017836, avg_loss: 0.262371
551
+ 002010/002340, loss: 0.047947, avg_loss: 0.261831
552
+ 002015/002340, loss: 0.157151, avg_loss: 0.261291
553
+ 002020/002340, loss: 0.063095, avg_loss: 0.260695
554
+ 002025/002340, loss: 0.239691, avg_loss: 0.260198
555
+ 002030/002340, loss: 0.008953, avg_loss: 0.259652
556
+ 002035/002340, loss: 0.008303, avg_loss: 0.259056
557
+ 002040/002340, loss: 0.133496, avg_loss: 0.258505
558
+ ***** Running dev evaluation *****
559
+ Num examples = 277
560
+ Instantaneous batch size per device = 32
561
+ epoch 26, step 2040/2340: {'accuracy': 0.6173285198555957}
562
+ 002045/002340, loss: 0.070495, avg_loss: 0.258069
563
+ 002050/002340, loss: 0.082666, avg_loss: 0.257558
564
+ 002055/002340, loss: 0.036117, avg_loss: 0.257011
565
+ 002060/002340, loss: 0.018446, avg_loss: 0.256447
566
+ 002065/002340, loss: 0.019938, avg_loss: 0.255982
567
+ 002070/002340, loss: 0.010070, avg_loss: 0.255545
568
+ 002075/002340, loss: 0.010592, avg_loss: 0.254990
569
+ 002080/002340, loss: 0.047749, avg_loss: 0.254418
570
+ 002085/002340, loss: 0.157273, avg_loss: 0.253991
571
+ 002090/002340, loss: 0.012268, avg_loss: 0.253488
572
+ 002095/002340, loss: 0.010397, avg_loss: 0.252964
573
+ 002100/002340, loss: 0.152166, avg_loss: 0.252516
574
+ 002105/002340, loss: 0.149034, avg_loss: 0.252077
575
+ 002110/002340, loss: 0.022406, avg_loss: 0.251554
576
+ 002115/002340, loss: 0.050635, avg_loss: 0.251001
577
+ 002120/002340, loss: 0.101384, avg_loss: 0.250624
578
+ 002125/002340, loss: 0.019535, avg_loss: 0.250064
579
+ 002130/002340, loss: 0.017638, avg_loss: 0.249509
580
+ 002135/002340, loss: 0.007454, avg_loss: 0.249097
581
+ 002140/002340, loss: 0.170886, avg_loss: 0.248638
582
+ 002145/002340, loss: 0.008658, avg_loss: 0.248148
583
+ 002150/002340, loss: 0.018784, avg_loss: 0.247731
584
+ 002155/002340, loss: 0.006945, avg_loss: 0.247294
585
+ 002160/002340, loss: 0.149141, avg_loss: 0.246973
586
+ ***** Running dev evaluation *****
587
+ Num examples = 277
588
+ Instantaneous batch size per device = 32
589
+ epoch 27, step 2160/2340: {'accuracy': 0.6173285198555957}
590
+ 002165/002340, loss: 0.070260, avg_loss: 0.246627
591
+ 002170/002340, loss: 0.018735, avg_loss: 0.246110
592
+ 002175/002340, loss: 0.011750, avg_loss: 0.245641
593
+ 002180/002340, loss: 0.024557, avg_loss: 0.245194
594
+ 002185/002340, loss: 0.022439, avg_loss: 0.244675
595
+ 002190/002340, loss: 0.009183, avg_loss: 0.244218
596
+ 002195/002340, loss: 0.147473, avg_loss: 0.243797
597
+ 002200/002340, loss: 0.008439, avg_loss: 0.243311
598
+ 002205/002340, loss: 0.009392, avg_loss: 0.242842
599
+ 002210/002340, loss: 0.007260, avg_loss: 0.242363
600
+ 002215/002340, loss: 0.006505, avg_loss: 0.241869
601
+ 002220/002340, loss: 0.036663, avg_loss: 0.241415
602
+ 002225/002340, loss: 0.010591, avg_loss: 0.240936
603
+ 002230/002340, loss: 0.008057, avg_loss: 0.240418
604
+ 002235/002340, loss: 0.005135, avg_loss: 0.240005
605
+ 002240/002340, loss: 0.009763, avg_loss: 0.239661
606
+ 002245/002340, loss: 0.009173, avg_loss: 0.239206
607
+ 002250/002340, loss: 0.015700, avg_loss: 0.238819
608
+ 002255/002340, loss: 0.021340, avg_loss: 0.238346
609
+ 002260/002340, loss: 0.060185, avg_loss: 0.237882
610
+ 002265/002340, loss: 0.038913, avg_loss: 0.237484
611
+ 002270/002340, loss: 0.016376, avg_loss: 0.237112
612
+ 002275/002340, loss: 0.010828, avg_loss: 0.236714
613
+ 002280/002340, loss: 0.129731, avg_loss: 0.236370
614
+ ***** Running dev evaluation *****
615
+ Num examples = 277
616
+ Instantaneous batch size per device = 32
617
+ epoch 29, step 2280/2340: {'accuracy': 0.6064981949458483}
618
+ 002285/002340, loss: 0.044581, avg_loss: 0.235897
619
+ 002290/002340, loss: 0.008923, avg_loss: 0.235524
620
+ 002295/002340, loss: 0.011697, avg_loss: 0.235179
621
+ 002300/002340, loss: 0.020234, avg_loss: 0.234708
622
+ 002305/002340, loss: 0.024606, avg_loss: 0.234225
623
+ 002310/002340, loss: 0.007431, avg_loss: 0.233798
624
+ 002315/002340, loss: 0.006717, avg_loss: 0.233382
625
+ 002320/002340, loss: 0.017990, avg_loss: 0.232940
626
+ 002325/002340, loss: 0.145197, avg_loss: 0.232597
627
+ 002330/002340, loss: 0.013951, avg_loss: 0.232139
628
+ 002335/002340, loss: 0.014238, avg_loss: 0.231719
629
+ 002340/002340, loss: 0.019154, avg_loss: 0.231268
630
+ ***** Running train evaluation *****
631
+ Num examples = 2490
632
+ Instantaneous batch size per device = 32
633
+ Train Dataset Result: {'accuracy': 0.9955823293172691}
634
+ ***** Running dev evaluation *****
635
+ Num examples = 277
636
+ Instantaneous batch size per device = 32
637
+ Dev Dataset Result: {'accuracy': 0.6101083032490975}
638
+ DEV Best Result: accuracy, 0.6462093862815884
639
+ Training time 0:02:36
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58c4433dc0148c6dcbb383b9e233378c256de46436f4b7c33785bfe5dc3da8f7
3
+ size 34299149
result.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {'accuracy': 0.5523465703971119}
2
+ {'accuracy': 0.5884476534296029}
3
+ {'accuracy': 0.6137184115523465}
4
+ {'accuracy': 0.6209386281588448}
5
+ {'accuracy': 0.6425992779783394}
6
+ {'accuracy': 0.6462093862815884}
7
+ {'accuracy': 0.6245487364620939}
8
+ {'accuracy': 0.6389891696750902}
9
+ {'accuracy': 0.6173285198555957}
10
+ {'accuracy': 0.6353790613718412}
11
+ {'accuracy': 0.6462093862815884}
12
+ {'accuracy': 0.6137184115523465}
13
+ {'accuracy': 0.6137184115523465}
14
+ {'accuracy': 0.6173285198555957}
15
+ {'accuracy': 0.6245487364620939}
16
+ {'accuracy': 0.6173285198555957}
17
+ {'accuracy': 0.6173285198555957}
18
+ {'accuracy': 0.6173285198555957}
19
+ {'accuracy': 0.6064981949458483}
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "do_basic_tokenize": true, "model_max_length": 512, "name_or_path": "/home.local/jianwei/workspace/archive/SparseOptimizer/output/Layer_7_12_Hid_160_768_Head_10_12_IMRatio_3.5", "never_split": null, "special_tokens_map_file": "/home.local/jianwei/.cache/huggingface/transformers/b680d52711d2451bbd6c6b1700365d6d731977c1357ae86bd7227f61145d3be2.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d", "tokenizer_class": "BertTokenizer"}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff