Upload tokenizer #6
by asma-j-alali - opened

- tokenizer.json +4 -4
- tokenizer_config.json +3 -1
tokenizer.json
CHANGED
@@ -950,8 +950,8 @@
   "pre_tokenizer": {
     "type": "Metaspace",
     "replacement": "▁",
-    "
-    "
+    "prepend_scheme": "always",
+    "split": true
   },
   "post_processor": {
     "type": "TemplateProcessing",
@@ -1010,8 +1010,8 @@
   "decoder": {
     "type": "Metaspace",
     "replacement": "▁",
-    "
-    "
+    "prepend_scheme": "always",
+    "split": true
   },
   "model": {
     "type": "Unigram",
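For context, "prepend_scheme" and "split" are the fields the tokenizers library uses to serialize its Metaspace component in recent releases. A minimal sketch of what they control, assuming tokenizers >= 0.19 (the sample string and output are illustrative, not taken from this repo):

from tokenizers.pre_tokenizers import Metaspace

# Mirrors the settings added in this diff: "▁" as the space marker,
# "always" prepends it to the first word, split=True cuts on whitespace.
pre = Metaspace(replacement="▁", prepend_scheme="always", split=True)

print(pre.pre_tokenize_str("Hello world"))
# e.g. [('▁Hello', (0, 5)), ('▁world', (5, 11))]

The decoder hunk applies the same two fields to the matching Metaspace decoder, so encoding and decoding stay symmetric.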
tokenizer_config.json
CHANGED
@@ -1,4 +1,5 @@
 {
+  "add_prefix_space": null,
   "added_tokens_decoder": {
     "0": {
       "content": "<pad>",
@@ -927,9 +928,10 @@
     "<extra_id_98>",
     "<extra_id_99>"
   ],
-  "clean_up_tokenization_spaces":
+  "clean_up_tokenization_spaces": false,
   "eos_token": "</s>",
   "extra_ids": 100,
+  "extra_special_tokens": {},
   "model_max_length": 512,
   "pad_token": "<pad>",
   "sp_model_kwargs": {},
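These config changes surface directly when the tokenizer is loaded through transformers. A minimal sketch, assuming a recent transformers release; the repo id below is a placeholder, not this model's actual id:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("some-org/some-t5-model")  # placeholder id

# With "clean_up_tokenization_spaces": false, decode() keeps spaces
# before punctuation instead of collapsing them by default.
print(tok.clean_up_tokenization_spaces)  # False

ids = tok("Hello , world").input_ids
print(tok.decode(ids, skip_special_tokens=True))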