add tokenizer
Browse files- config.json +3 -10
- merges.txt +162 -0
- special_tokens_map.json +1 -0
- tokenizer.json +1 -0
- tokenizer_config.json +1 -0
- vocab.json +1 -0
config.json
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
{
|
| 2 |
-
"_name_or_path": "data/saved_models/our_90epochs_saved_model",
|
| 3 |
"architectures": [
|
| 4 |
-
"
|
| 5 |
],
|
| 6 |
"attention_probs_dropout_prob": 0.1,
|
| 7 |
"bos_token_id": 0,
|
|
@@ -10,19 +9,13 @@
|
|
| 10 |
"hidden_act": "gelu",
|
| 11 |
"hidden_dropout_prob": 0.1,
|
| 12 |
"hidden_size": 768,
|
| 13 |
-
"id2label": {
|
| 14 |
-
"0": "LABEL_0"
|
| 15 |
-
},
|
| 16 |
"initializer_range": 0.02,
|
| 17 |
"intermediate_size": 3072,
|
| 18 |
-
"label2id": {
|
| 19 |
-
"LABEL_0": 0
|
| 20 |
-
},
|
| 21 |
"layer_norm_eps": 1e-12,
|
| 22 |
"max_position_embeddings": 514,
|
| 23 |
"model_type": "roberta",
|
| 24 |
-
"num_attention_heads":
|
| 25 |
-
"num_hidden_layers":
|
| 26 |
"pad_token_id": 1,
|
| 27 |
"position_embedding_type": "absolute",
|
| 28 |
"torch_dtype": "float32",
|
|
|
|
| 1 |
{
|
|
|
|
| 2 |
"architectures": [
|
| 3 |
+
"RobertaForMaskedLM"
|
| 4 |
],
|
| 5 |
"attention_probs_dropout_prob": 0.1,
|
| 6 |
"bos_token_id": 0,
|
|
|
|
| 9 |
"hidden_act": "gelu",
|
| 10 |
"hidden_dropout_prob": 0.1,
|
| 11 |
"hidden_size": 768,
|
|
|
|
|
|
|
|
|
|
| 12 |
"initializer_range": 0.02,
|
| 13 |
"intermediate_size": 3072,
|
|
|
|
|
|
|
|
|
|
| 14 |
"layer_norm_eps": 1e-12,
|
| 15 |
"max_position_embeddings": 514,
|
| 16 |
"model_type": "roberta",
|
| 17 |
+
"num_attention_heads": 12,
|
| 18 |
+
"num_hidden_layers": 8,
|
| 19 |
"pad_token_id": 1,
|
| 20 |
"position_embedding_type": "absolute",
|
| 21 |
"torch_dtype": "float32",
|
merges.txt
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#version: 0.2 - Trained by `huggingface/tokenizers`
|
| 2 |
+
B r
|
| 3 |
+
a n
|
| 4 |
+
c h
|
| 5 |
+
Br an
|
| 6 |
+
Bran ch
|
| 7 |
+
Branch 1
|
| 8 |
+
= C
|
| 9 |
+
R i
|
| 10 |
+
n g
|
| 11 |
+
Ri ng
|
| 12 |
+
Ring 1
|
| 13 |
+
= Branch1
|
| 14 |
+
Branch 2
|
| 15 |
+
= O
|
| 16 |
+
Ring 2
|
| 17 |
+
H 1
|
| 18 |
+
C @
|
| 19 |
+
= N
|
| 20 |
+
# Branch1
|
| 21 |
+
C@ @
|
| 22 |
+
= Branch2
|
| 23 |
+
C@ H1
|
| 24 |
+
C@@ H1
|
| 25 |
+
# Branch2
|
| 26 |
+
C l
|
| 27 |
+
# C
|
| 28 |
+
/ C
|
| 29 |
+
N H1
|
| 30 |
+
+ 1
|
| 31 |
+
- 1
|
| 32 |
+
= Ring1
|
| 33 |
+
O -1
|
| 34 |
+
N +1
|
| 35 |
+
\ C
|
| 36 |
+
/ N
|
| 37 |
+
# N
|
| 38 |
+
= Ring2
|
| 39 |
+
= S
|
| 40 |
+
=N +1
|
| 41 |
+
N a
|
| 42 |
+
Na +1
|
| 43 |
+
\ N
|
| 44 |
+
S +1
|
| 45 |
+
/ O
|
| 46 |
+
\ S
|
| 47 |
+
\ O
|
| 48 |
+
Br -1
|
| 49 |
+
I -1
|
| 50 |
+
Cl -1
|
| 51 |
+
/ C@H1
|
| 52 |
+
Branch 3
|
| 53 |
+
/ C@@H1
|
| 54 |
+
= P
|
| 55 |
+
/ S
|
| 56 |
+
=N -1
|
| 57 |
+
S i
|
| 58 |
+
K +1
|
| 59 |
+
N -1
|
| 60 |
+
S e
|
| 61 |
+
L i
|
| 62 |
+
Li +1
|
| 63 |
+
+ 3
|
| 64 |
+
Cl +3
|
| 65 |
+
\ C@H1
|
| 66 |
+
Ring 3
|
| 67 |
+
\ C@@H1
|
| 68 |
+
/ N+1
|
| 69 |
+
/ P
|
| 70 |
+
\ F
|
| 71 |
+
P @
|
| 72 |
+
2 H
|
| 73 |
+
P H1
|
| 74 |
+
/ Br
|
| 75 |
+
N @
|
| 76 |
+
P +1
|
| 77 |
+
/ Cl
|
| 78 |
+
\ NH1
|
| 79 |
+
\ Br
|
| 80 |
+
@ +1
|
| 81 |
+
/ I
|
| 82 |
+
/ C@
|
| 83 |
+
T e
|
| 84 |
+
\ N+1
|
| 85 |
+
P@ @
|
| 86 |
+
1 2
|
| 87 |
+
5 I
|
| 88 |
+
\ O-1
|
| 89 |
+
12 5I
|
| 90 |
+
/ F
|
| 91 |
+
# N+1
|
| 92 |
+
\ Cl
|
| 93 |
+
N@ +1
|
| 94 |
+
\ I
|
| 95 |
+
- /
|
| 96 |
+
/ C@@
|
| 97 |
+
N@ @
|
| 98 |
+
N@ @+1
|
| 99 |
+
-/ Ring2
|
| 100 |
+
- \
|
| 101 |
+
1 4
|
| 102 |
+
B -1
|
| 103 |
+
C -1
|
| 104 |
+
S @+1
|
| 105 |
+
14 C
|
| 106 |
+
H 2
|
| 107 |
+
H 4
|
| 108 |
+
I +1
|
| 109 |
+
S -1
|
| 110 |
+
\ P
|
| 111 |
+
=S +1
|
| 112 |
+
=P @
|
| 113 |
+
Si H4
|
| 114 |
+
+ 2
|
| 115 |
+
3 H
|
| 116 |
+
@ @+1
|
| 117 |
+
A g
|
| 118 |
+
C +1
|
| 119 |
+
S @@+1
|
| 120 |
+
Cl +1
|
| 121 |
+
=S e
|
| 122 |
+
-\ Ring1
|
| 123 |
+
H 0
|
| 124 |
+
O H0
|
| 125 |
+
1 1
|
| 126 |
+
= Branch3
|
| 127 |
+
= Te
|
| 128 |
+
M g
|
| 129 |
+
O +1
|
| 130 |
+
Z n
|
| 131 |
+
\ C@
|
| 132 |
+
\ S+1
|
| 133 |
+
H1 -1
|
| 134 |
+
Se H1
|
| 135 |
+
P@ +1
|
| 136 |
+
-\ Ring2
|
| 137 |
+
11 C
|
| 138 |
+
=Te +1
|
| 139 |
+
Zn +2
|
| 140 |
+
/ NH1
|
| 141 |
+
1 8
|
| 142 |
+
A s
|
| 143 |
+
B H2
|
| 144 |
+
B H1-1
|
| 145 |
+
C a
|
| 146 |
+
H 3
|
| 147 |
+
O H1-1
|
| 148 |
+
S H2
|
| 149 |
+
=O +1
|
| 150 |
+
Se +1
|
| 151 |
+
Te H2
|
| 152 |
+
125I H1
|
| 153 |
+
-/ Ring1
|
| 154 |
+
14C H2
|
| 155 |
+
Ag +1
|
| 156 |
+
=Se +1
|
| 157 |
+
Mg H2
|
| 158 |
+
Mg +2
|
| 159 |
+
11C H3
|
| 160 |
+
18 F
|
| 161 |
+
BH2 -1
|
| 162 |
+
Ca +2
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
|
tokenizer.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"<unk>","single_word":false,"lstrip":false,"rstrip":false,"normalized":true},{"id":1,"special":true,"content":"<s>","single_word":false,"lstrip":false,"rstrip":false,"normalized":true},{"id":2,"special":true,"content":"</s>","single_word":false,"lstrip":false,"rstrip":false,"normalized":true},{"id":3,"special":true,"content":"<pad>","single_word":false,"lstrip":false,"rstrip":false,"normalized":true},{"id":4,"special":true,"content":"<mask>","single_word":false,"lstrip":true,"rstrip":false,"normalized":true}],"normalizer":null,"pre_tokenizer":{"type":"ByteLevel","add_prefix_space":false,"trim_offsets":true},"post_processor":{"type":"RobertaProcessing","sep":["</s>",2],"cls":["<s>",1],"trim_offsets":true,"add_prefix_space":false},"decoder":{"type":"ByteLevel","add_prefix_space":true,"trim_offsets":true},"model":{"type":"BPE","dropout":null,"unk_token":null,"continuing_subword_prefix":"","end_of_word_suffix":"","fuse_unk":false,"vocab":{"<unk>":0,"<s>":1,"</s>":2,"<pad>":3,"<mask>":4,"\n":5,"#":6,"+":7,"-":8,".":9,"/":10,"0":11,"1":12,"2":13,"3":14,"4":15,"5":16,"8":17,"=":18,"@":19,"A":20,"B":21,"C":22,"F":23,"H":24,"I":25,"K":26,"L":27,"M":28,"N":29,"O":30,"P":31,"R":32,"S":33,"T":34,"Z":35,"\\":36,"a":37,"c":38,"e":39,"g":40,"h":41,"i":42,"l":43,"n":44,"r":45,"s":46,"Br":47,"an":48,"ch":49,"Bran":50,"Branch":51,"Branch1":52,"=C":53,"Ri":54,"ng":55,"Ring":56,"Ring1":57,"=Branch1":58,"Branch2":59,"=O":60,"Ring2":61,"H1":62,"C@":63,"=N":64,"#Branch1":65,"C@@":66,"=Branch2":67,"C@H1":68,"C@@H1":69,"#Branch2":70,"Cl":71,"#C":72,"/C":73,"NH1":74,"+1":75,"-1":76,"=Ring1":77,"O-1":78,"N+1":79,"\\C":80,"/N":81,"#N":82,"=Ring2":83,"=S":84,"=N+1":85,"Na":86,"Na+1":87,"\\N":88,"S+1":89,"/O":90,"\\S":91,"\\O":92,"Br-1":93,"I-1":94,"Cl-1":95,"/C@H1":96,"Branch3":97,"/C@@H1":98,"=P":99,"/S":100,"=N-1":101,"Si":102,"K+1":103,"N-1":104,"Se":105,"Li":106,"Li+1":107,"+3":108,"Cl+3":109,"\\C@H1":110,"Ring3":111,"\\C@@H1":112,"/N+1":113,"/P":114,"\\F":115,"P@":116,"2H":117,"PH1":118,"/Br":119,"N@":120,"P+1":121,"/Cl":122,"\\NH1":123,"\\Br":124,"@+1":125,"/I":126,"/C@":127,"Te":128,"\\N+1":129,"P@@":130,"12":131,"5I":132,"\\O-1":133,"125I":134,"/F":135,"#N+1":136,"\\Cl":137,"N@+1":138,"\\I":139,"-/":140,"/C@@":141,"N@@":142,"N@@+1":143,"-/Ring2":144,"-\\":145,"14":146,"B-1":147,"C-1":148,"S@+1":149,"14C":150,"H2":151,"H4":152,"I+1":153,"S-1":154,"\\P":155,"=S+1":156,"=P@":157,"SiH4":158,"+2":159,"3H":160,"@@+1":161,"Ag":162,"C+1":163,"S@@+1":164,"Cl+1":165,"=Se":166,"-\\Ring1":167,"H0":168,"OH0":169,"11":170,"=Branch3":171,"=Te":172,"Mg":173,"O+1":174,"Zn":175,"\\C@":176,"\\S+1":177,"H1-1":178,"SeH1":179,"P@+1":180,"-\\Ring2":181,"11C":182,"=Te+1":183,"Zn+2":184,"/NH1":185,"18":186,"As":187,"BH2":188,"BH1-1":189,"Ca":190,"H3":191,"OH1-1":192,"SH2":193,"=O+1":194,"Se+1":195,"TeH2":196,"125IH1":197,"-/Ring1":198,"14CH2":199,"Ag+1":200,"=Se+1":201,"MgH2":202,"Mg+2":203,"11CH3":204,"18F":205,"BH2-1":206,"Ca+2":207},"merges":["B r","a n","c h","Br an","Bran ch","Branch 1","= C","R i","n g","Ri ng","Ring 1","= Branch1","Branch 2","= O","Ring 2","H 1","C @","= N","# Branch1","C@ @","= Branch2","C@ H1","C@@ H1","# Branch2","C l","# C","/ C","N H1","+ 1","- 1","= Ring1","O -1","N +1","\\ C","/ N","# N","= Ring2","= S","=N +1","N a","Na +1","\\ N","S +1","/ O","\\ S","\\ O","Br -1","I -1","Cl -1","/ C@H1","Branch 3","/ C@@H1","= P","/ S","=N -1","S i","K +1","N -1","S e","L i","Li +1","+ 3","Cl +3","\\ C@H1","Ring 3","\\ C@@H1","/ N+1","/ P","\\ F","P @","2 H","P H1","/ Br","N @","P +1","/ Cl","\\ NH1","\\ Br","@ +1","/ I","/ C@","T e","\\ N+1","P@ @","1 2","5 I","\\ O-1","12 5I","/ F","# N+1","\\ Cl","N@ +1","\\ I","- /","/ C@@","N@ @","N@ @+1","-/ Ring2","- \\","1 4","B -1","C -1","S @+1","14 C","H 2","H 4","I +1","S -1","\\ P","=S +1","=P @","Si H4","+ 2","3 H","@ @+1","A g","C +1","S @@+1","Cl +1","=S e","-\\ Ring1","H 0","O H0","1 1","= Branch3","= Te","M g","O +1","Z n","\\ C@","\\ S+1","H1 -1","Se H1","P@ +1","-\\ Ring2","11 C","=Te +1","Zn +2","/ NH1","1 8","A s","B H2","B H1-1","C a","H 3","O H1-1","S H2","=O +1","Se +1","Te H2","125I H1","-/ Ring1","14C H2","Ag +1","=Se +1","Mg H2","Mg +2","11C H3","18 F","BH2 -1","Ca +2"]}}
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "special_tokens_map_file": null, "name_or_path": "./data/bpe/", "tokenizer_class": "RobertaTokenizer"}
|
vocab.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"<unk>":0,"<s>":1,"</s>":2,"<pad>":3,"<mask>":4,"\n":5,"#":6,"+":7,"-":8,".":9,"/":10,"0":11,"1":12,"2":13,"3":14,"4":15,"5":16,"8":17,"=":18,"@":19,"A":20,"B":21,"C":22,"F":23,"H":24,"I":25,"K":26,"L":27,"M":28,"N":29,"O":30,"P":31,"R":32,"S":33,"T":34,"Z":35,"\\":36,"a":37,"c":38,"e":39,"g":40,"h":41,"i":42,"l":43,"n":44,"r":45,"s":46,"Br":47,"an":48,"ch":49,"Bran":50,"Branch":51,"Branch1":52,"=C":53,"Ri":54,"ng":55,"Ring":56,"Ring1":57,"=Branch1":58,"Branch2":59,"=O":60,"Ring2":61,"H1":62,"C@":63,"=N":64,"#Branch1":65,"C@@":66,"=Branch2":67,"C@H1":68,"C@@H1":69,"#Branch2":70,"Cl":71,"#C":72,"/C":73,"NH1":74,"+1":75,"-1":76,"=Ring1":77,"O-1":78,"N+1":79,"\\C":80,"/N":81,"#N":82,"=Ring2":83,"=S":84,"=N+1":85,"Na":86,"Na+1":87,"\\N":88,"S+1":89,"/O":90,"\\S":91,"\\O":92,"Br-1":93,"I-1":94,"Cl-1":95,"/C@H1":96,"Branch3":97,"/C@@H1":98,"=P":99,"/S":100,"=N-1":101,"Si":102,"K+1":103,"N-1":104,"Se":105,"Li":106,"Li+1":107,"+3":108,"Cl+3":109,"\\C@H1":110,"Ring3":111,"\\C@@H1":112,"/N+1":113,"/P":114,"\\F":115,"P@":116,"2H":117,"PH1":118,"/Br":119,"N@":120,"P+1":121,"/Cl":122,"\\NH1":123,"\\Br":124,"@+1":125,"/I":126,"/C@":127,"Te":128,"\\N+1":129,"P@@":130,"12":131,"5I":132,"\\O-1":133,"125I":134,"/F":135,"#N+1":136,"\\Cl":137,"N@+1":138,"\\I":139,"-/":140,"/C@@":141,"N@@":142,"N@@+1":143,"-/Ring2":144,"-\\":145,"14":146,"B-1":147,"C-1":148,"S@+1":149,"14C":150,"H2":151,"H4":152,"I+1":153,"S-1":154,"\\P":155,"=S+1":156,"=P@":157,"SiH4":158,"+2":159,"3H":160,"@@+1":161,"Ag":162,"C+1":163,"S@@+1":164,"Cl+1":165,"=Se":166,"-\\Ring1":167,"H0":168,"OH0":169,"11":170,"=Branch3":171,"=Te":172,"Mg":173,"O+1":174,"Zn":175,"\\C@":176,"\\S+1":177,"H1-1":178,"SeH1":179,"P@+1":180,"-\\Ring2":181,"11C":182,"=Te+1":183,"Zn+2":184,"/NH1":185,"18":186,"As":187,"BH2":188,"BH1-1":189,"Ca":190,"H3":191,"OH1-1":192,"SH2":193,"=O+1":194,"Se+1":195,"TeH2":196,"125IH1":197,"-/Ring1":198,"14CH2":199,"Ag+1":200,"=Se+1":201,"MgH2":202,"Mg+2":203,"11CH3":204,"18F":205,"BH2-1":206,"Ca+2":207}
|