Upload tokenizer files (vocab, config, README)
Browse files- tokenizer.json +8 -72
tokenizer.json
CHANGED
@@ -1,43 +1,5 @@
|
|
1 |
{
|
2 |
"version": "1.0",
|
3 |
-
"added_tokens": [
|
4 |
-
{
|
5 |
-
"id": 0,
|
6 |
-
"content": "<pad>",
|
7 |
-
"single_word": false,
|
8 |
-
"lstrip": false,
|
9 |
-
"rstrip": false,
|
10 |
-
"normalized": false,
|
11 |
-
"special": true
|
12 |
-
},
|
13 |
-
{
|
14 |
-
"id": 1,
|
15 |
-
"content": "<unk>",
|
16 |
-
"single_word": false,
|
17 |
-
"lstrip": false,
|
18 |
-
"rstrip": false,
|
19 |
-
"normalized": false,
|
20 |
-
"special": true
|
21 |
-
},
|
22 |
-
{
|
23 |
-
"id": 2,
|
24 |
-
"content": "<bos>",
|
25 |
-
"single_word": false,
|
26 |
-
"lstrip": false,
|
27 |
-
"rstrip": false,
|
28 |
-
"normalized": false,
|
29 |
-
"special": true
|
30 |
-
},
|
31 |
-
{
|
32 |
-
"id": 3,
|
33 |
-
"content": "<eos>",
|
34 |
-
"single_word": false,
|
35 |
-
"lstrip": false,
|
36 |
-
"rstrip": false,
|
37 |
-
"normalized": false,
|
38 |
-
"special": true
|
39 |
-
}
|
40 |
-
],
|
41 |
"added_tokens_decoder": {
|
42 |
"0": {
|
43 |
"content": "<pad>",
|
@@ -72,26 +34,6 @@
|
|
72 |
"special": true
|
73 |
}
|
74 |
},
|
75 |
-
"normalizer": {
|
76 |
-
"type": "Sequence",
|
77 |
-
"normalizers": [
|
78 |
-
{
|
79 |
-
"type": "NFC"
|
80 |
-
}
|
81 |
-
]
|
82 |
-
},
|
83 |
-
"pre_tokenizer": {
|
84 |
-
"type": "Sequence",
|
85 |
-
"pretokenizers": [
|
86 |
-
{
|
87 |
-
"type": "Whitespace"
|
88 |
-
},
|
89 |
-
{
|
90 |
-
"type": "Punctuation",
|
91 |
-
"behavior": "Isolated"
|
92 |
-
}
|
93 |
-
]
|
94 |
-
},
|
95 |
"decoder": {
|
96 |
"type": "WordPiece",
|
97 |
"unk_token": "<unk>"
|
@@ -102,12 +44,6 @@
|
|
102 |
2,
|
103 |
3
|
104 |
],
|
105 |
-
"special_tokens": {
|
106 |
-
"pad_token": 0,
|
107 |
-
"unk_token": 1,
|
108 |
-
"bos_token": 2,
|
109 |
-
"eos_token": 3
|
110 |
-
},
|
111 |
"model": {
|
112 |
"type": "WordLevel",
|
113 |
"vocab": {
|
@@ -1119,7 +1055,7 @@
|
|
1119 |
"single": [
|
1120 |
{
|
1121 |
"SpecialToken": {
|
1122 |
-
"id":
|
1123 |
"type_id": 0
|
1124 |
}
|
1125 |
},
|
@@ -1131,7 +1067,7 @@
|
|
1131 |
},
|
1132 |
{
|
1133 |
"SpecialToken": {
|
1134 |
-
"id":
|
1135 |
"type_id": 0
|
1136 |
}
|
1137 |
}
|
@@ -1139,7 +1075,7 @@
|
|
1139 |
"pair": [
|
1140 |
{
|
1141 |
"SpecialToken": {
|
1142 |
-
"id":
|
1143 |
"type_id": 0
|
1144 |
}
|
1145 |
},
|
@@ -1157,14 +1093,14 @@
|
|
1157 |
},
|
1158 |
{
|
1159 |
"SpecialToken": {
|
1160 |
-
"id":
|
1161 |
"type_id": 0
|
1162 |
}
|
1163 |
}
|
1164 |
],
|
1165 |
"special_tokens": {
|
1166 |
"<pad>": {
|
1167 |
-
"id":
|
1168 |
"ids": [
|
1169 |
0
|
1170 |
],
|
@@ -1173,7 +1109,7 @@
|
|
1173 |
]
|
1174 |
},
|
1175 |
"<unk>": {
|
1176 |
-
"id":
|
1177 |
"ids": [
|
1178 |
1
|
1179 |
],
|
@@ -1182,7 +1118,7 @@
|
|
1182 |
]
|
1183 |
},
|
1184 |
"<bos>": {
|
1185 |
-
"id":
|
1186 |
"ids": [
|
1187 |
2
|
1188 |
],
|
@@ -1191,7 +1127,7 @@
|
|
1191 |
]
|
1192 |
},
|
1193 |
"<eos>": {
|
1194 |
-
"id":
|
1195 |
"ids": [
|
1196 |
3
|
1197 |
],
|
|
|
1 |
{
|
2 |
"version": "1.0",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
"added_tokens_decoder": {
|
4 |
"0": {
|
5 |
"content": "<pad>",
|
|
|
34 |
"special": true
|
35 |
}
|
36 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
"decoder": {
|
38 |
"type": "WordPiece",
|
39 |
"unk_token": "<unk>"
|
|
|
44 |
2,
|
45 |
3
|
46 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
"model": {
|
48 |
"type": "WordLevel",
|
49 |
"vocab": {
|
|
|
1055 |
"single": [
|
1056 |
{
|
1057 |
"SpecialToken": {
|
1058 |
+
"id": 2,
|
1059 |
"type_id": 0
|
1060 |
}
|
1061 |
},
|
|
|
1067 |
},
|
1068 |
{
|
1069 |
"SpecialToken": {
|
1070 |
+
"id": 3,
|
1071 |
"type_id": 0
|
1072 |
}
|
1073 |
}
|
|
|
1075 |
"pair": [
|
1076 |
{
|
1077 |
"SpecialToken": {
|
1078 |
+
"id": 2,
|
1079 |
"type_id": 0
|
1080 |
}
|
1081 |
},
|
|
|
1093 |
},
|
1094 |
{
|
1095 |
"SpecialToken": {
|
1096 |
+
"id": 3,
|
1097 |
"type_id": 0
|
1098 |
}
|
1099 |
}
|
1100 |
],
|
1101 |
"special_tokens": {
|
1102 |
"<pad>": {
|
1103 |
+
"id": 0,
|
1104 |
"ids": [
|
1105 |
0
|
1106 |
],
|
|
|
1109 |
]
|
1110 |
},
|
1111 |
"<unk>": {
|
1112 |
+
"id": 1,
|
1113 |
"ids": [
|
1114 |
1
|
1115 |
],
|
|
|
1118 |
]
|
1119 |
},
|
1120 |
"<bos>": {
|
1121 |
+
"id": 2,
|
1122 |
"ids": [
|
1123 |
2
|
1124 |
],
|
|
|
1127 |
]
|
1128 |
},
|
1129 |
"<eos>": {
|
1130 |
+
"id": 3,
|
1131 |
"ids": [
|
1132 |
3
|
1133 |
],
|