goabonga committed on
Commit
42ab2c0
·
verified ·
1 Parent(s): 620d70a

Upload tokenizer files (vocab, config, README)

Browse files
Files changed (1) hide show
  1. tokenizer.json +8 -72
tokenizer.json CHANGED
@@ -1,43 +1,5 @@
1
  {
2
  "version": "1.0",
3
- "added_tokens": [
4
- {
5
- "id": 0,
6
- "content": "<pad>",
7
- "single_word": false,
8
- "lstrip": false,
9
- "rstrip": false,
10
- "normalized": false,
11
- "special": true
12
- },
13
- {
14
- "id": 1,
15
- "content": "<unk>",
16
- "single_word": false,
17
- "lstrip": false,
18
- "rstrip": false,
19
- "normalized": false,
20
- "special": true
21
- },
22
- {
23
- "id": 2,
24
- "content": "<bos>",
25
- "single_word": false,
26
- "lstrip": false,
27
- "rstrip": false,
28
- "normalized": false,
29
- "special": true
30
- },
31
- {
32
- "id": 3,
33
- "content": "<eos>",
34
- "single_word": false,
35
- "lstrip": false,
36
- "rstrip": false,
37
- "normalized": false,
38
- "special": true
39
- }
40
- ],
41
  "added_tokens_decoder": {
42
  "0": {
43
  "content": "<pad>",
@@ -72,26 +34,6 @@
72
  "special": true
73
  }
74
  },
75
- "normalizer": {
76
- "type": "Sequence",
77
- "normalizers": [
78
- {
79
- "type": "NFC"
80
- }
81
- ]
82
- },
83
- "pre_tokenizer": {
84
- "type": "Sequence",
85
- "pretokenizers": [
86
- {
87
- "type": "Whitespace"
88
- },
89
- {
90
- "type": "Punctuation",
91
- "behavior": "Isolated"
92
- }
93
- ]
94
- },
95
  "decoder": {
96
  "type": "WordPiece",
97
  "unk_token": "<unk>"
@@ -102,12 +44,6 @@
102
  2,
103
  3
104
  ],
105
- "special_tokens": {
106
- "pad_token": 0,
107
- "unk_token": 1,
108
- "bos_token": 2,
109
- "eos_token": 3
110
- },
111
  "model": {
112
  "type": "WordLevel",
113
  "vocab": {
@@ -1119,7 +1055,7 @@
1119
  "single": [
1120
  {
1121
  "SpecialToken": {
1122
- "id": "<bos>",
1123
  "type_id": 0
1124
  }
1125
  },
@@ -1131,7 +1067,7 @@
1131
  },
1132
  {
1133
  "SpecialToken": {
1134
- "id": "<eos>",
1135
  "type_id": 0
1136
  }
1137
  }
@@ -1139,7 +1075,7 @@
1139
  "pair": [
1140
  {
1141
  "SpecialToken": {
1142
- "id": "<bos>",
1143
  "type_id": 0
1144
  }
1145
  },
@@ -1157,14 +1093,14 @@
1157
  },
1158
  {
1159
  "SpecialToken": {
1160
- "id": "<eos>",
1161
  "type_id": 0
1162
  }
1163
  }
1164
  ],
1165
  "special_tokens": {
1166
  "<pad>": {
1167
- "id": "<pad>",
1168
  "ids": [
1169
  0
1170
  ],
@@ -1173,7 +1109,7 @@
1173
  ]
1174
  },
1175
  "<unk>": {
1176
- "id": "<unk>",
1177
  "ids": [
1178
  1
1179
  ],
@@ -1182,7 +1118,7 @@
1182
  ]
1183
  },
1184
  "<bos>": {
1185
- "id": "<bos>",
1186
  "ids": [
1187
  2
1188
  ],
@@ -1191,7 +1127,7 @@
1191
  ]
1192
  },
1193
  "<eos>": {
1194
- "id": "<eos>",
1195
  "ids": [
1196
  3
1197
  ],
 
1
  {
2
  "version": "1.0",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "added_tokens_decoder": {
4
  "0": {
5
  "content": "<pad>",
 
34
  "special": true
35
  }
36
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  "decoder": {
38
  "type": "WordPiece",
39
  "unk_token": "<unk>"
 
44
  2,
45
  3
46
  ],
 
 
 
 
 
 
47
  "model": {
48
  "type": "WordLevel",
49
  "vocab": {
 
1055
  "single": [
1056
  {
1057
  "SpecialToken": {
1058
+ "id": 2,
1059
  "type_id": 0
1060
  }
1061
  },
 
1067
  },
1068
  {
1069
  "SpecialToken": {
1070
+ "id": 3,
1071
  "type_id": 0
1072
  }
1073
  }
 
1075
  "pair": [
1076
  {
1077
  "SpecialToken": {
1078
+ "id": 2,
1079
  "type_id": 0
1080
  }
1081
  },
 
1093
  },
1094
  {
1095
  "SpecialToken": {
1096
+ "id": 3,
1097
  "type_id": 0
1098
  }
1099
  }
1100
  ],
1101
  "special_tokens": {
1102
  "<pad>": {
1103
+ "id": 0,
1104
  "ids": [
1105
  0
1106
  ],
 
1109
  ]
1110
  },
1111
  "<unk>": {
1112
+ "id": 1,
1113
  "ids": [
1114
  1
1115
  ],
 
1118
  ]
1119
  },
1120
  "<bos>": {
1121
+ "id": 2,
1122
  "ids": [
1123
  2
1124
  ],
 
1127
  ]
1128
  },
1129
  "<eos>": {
1130
+ "id": 3,
1131
  "ids": [
1132
  3
1133
  ],