pcuenq (HF Staff) committed 9d9f5b8 · verified · 1 Parent(s): 6d7d090

Files changed (1): README.md (+77 -1)

README.md:
---
library_name: transformers
tags: []
---

# pcuenq/Hunyuan-7B-Instruct-tokenizer

This is a `transformers` fast tokenizer for [mlx-community/Hunyuan-7B-Instruct-3bit](https://huggingface.co/mlx-community/Hunyuan-7B-Instruct-3bit/blob/main/tokenizer_config.json).
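
It can be loaded and used like any other fast tokenizer in `transformers`. A minimal usage sketch (the sample text is illustrative):

```py
from transformers import AutoTokenizer

# Load the fast tokenizer directly from the Hub.
tokenizer = AutoTokenizer.from_pretrained("pcuenq/Hunyuan-7B-Instruct-tokenizer")

# Encode a sample string and decode it back.
ids = tokenizer("Hello, Hunyuan!")["input_ids"]
print(ids)
print(tokenizer.decode(ids))
```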

## Conversion

```py
from huggingface_hub import snapshot_download
from tokenization_hy import *
from tokenizers import normalizers
from transformers import PreTrainedTokenizerFast
from transformers.convert_slow_tokenizer import TikTokenConverter

# Download only the tokenizer-related files from the original repo.
snapshot_download(
    "mlx-community/Hunyuan-7B-Instruct-3bit",
    local_dir=".",
    allow_patterns=["hy.tiktoken", "tokenization_hy.py", "tokenizer_config.json", "special_tokens_map.json"],
)

# Load the original (slow) tiktoken-based tokenizer.
original = HYTokenizer.from_pretrained(".")

# Convert the tiktoken vocab, using the pre-tokenization pattern and
# special tokens defined in tokenization_hy.py.
converter = TikTokenConverter(
    vocab_file="hy.tiktoken",
    pattern=PAT_STR,
    additional_special_tokens=[t[1] for t in SPECIAL_TOKENS],
)
converted = converter.converted()
converted.normalizer = normalizers.NFC()

# Wrap the converted backend in a PreTrainedTokenizerFast.
t_fast = PreTrainedTokenizerFast(
    tokenizer_object=converted,
    model_input_names=original.model_input_names,
    model_max_length=256 * 1024,
    clean_up_tokenization_spaces=False,
)

# Copy the chat template from the original tokenizer and upload.
t_fast.chat_template = original.chat_template
t_fast.push_to_hub("Hunyuan-7B-Instruct-tokenizer")
```
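
Before pushing, a quick local spot-check can be useful. This small sketch is not part of the original script; it reuses `original` and `t_fast` from the snippet above, and the sample string is arbitrary:

```py
# Both tokenizers should agree on the encoding and the decoded round-trip.
sample = "Hello 世界! def tokenize(text): return text.split() 🤗"
assert t_fast.encode(sample) == original.encode(sample)
assert t_fast.decode(t_fast.encode(sample), skip_special_tokens=True) == original.decode(original.encode(sample))
```

The full verification below runs the same comparison over a much larger, multilingual corpus.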

## Verification

```py
from datasets import load_dataset
from tqdm import tqdm
from tokenization_hy import HYTokenizer
from transformers import AutoTokenizer

original = HYTokenizer.from_pretrained("mlx-community/Hunyuan-7B-Instruct-3bit")
t_fast = AutoTokenizer.from_pretrained("pcuenq/Hunyuan-7B-Instruct-tokenizer")

# Testing on XNLI

xnli = load_dataset("xnli", "all_languages", split="validation")

def verify(lang, text):
    # The fast tokenizer must produce exactly the same token ids as the original.
    encoded_original = original.encode(text)
    encoded_fast = t_fast.encode(text)
    assert encoded_fast == encoded_original, f"Fast encode error: {lang} - {text}"
    # Decoding must round-trip to the same string as well.
    decoded = original.decode(encoded_original)
    decoded_fast = t_fast.decode(encoded_fast, skip_special_tokens=True)
    assert decoded_fast == decoded, f"Fast decode error: {lang} - {text}"

# Each XNLI premise is a dict mapping language code -> text (15 languages).
for p in tqdm(xnli["premise"]):
    for lang, text in p.items():
        verify(lang, text)


# Testing on codeparrot subset

ds = load_dataset("codeparrot/github-code", streaming=True, trust_remote_code=True, split="train")

iterator = iter(ds)
for _ in tqdm(range(1000)):
    item = next(iterator)
    code = item["code"]
    lang = item["language"]
    verify(lang, code)
```
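
Since the conversion also copies the chat template, it can be exercised directly with the fast tokenizer. A short usage sketch (the message content is illustrative):

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("pcuenq/Hunyuan-7B-Instruct-tokenizer")

messages = [{"role": "user", "content": "Write a haiku about tokenizers."}]

# Render the chat-formatted prompt as a string, then as token ids.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
print(tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True))
```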