Commit cffbce0 · Update README.md
1 Parent(s): d38f5d1

README.md CHANGED

@@ -31,7 +31,7 @@ After undergoing 4-bit quantization, the CodeFuse-CodeLlama-34B-4bits model can
 
 🔥🔥🔥 2023-09-26 We are pleased to announce the release of the 4-bit quantized version of CodeFuse-CodeLlama-34B. Despite the quantization process, the model still achieves a remarkable 73.8% accuracy (greedy decoding) on the HumanEval pass@1 metric.
 
-🔥🔥🔥 2023-09-11 CodeFuse-CodeLlama34B has
+🔥🔥🔥 2023-09-11 CodeFuse-CodeLlama-34B has achieved 74.4% pass@1 (greedy decoding) on HumanEval, which is the SOTA result for open-sourced LLMs at present.
 
 <br>
 
@@ -124,24 +124,22 @@ pip install -r requirements.txt
 import os
 import torch
 import time
-from
-from auto_gptq import AutoGPTQForCausalLM
+from transformers import AutoTokenizer
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-def load_model_tokenizer(
+def load_model_tokenizer(model_name_or_local_path):
     """
-    Load model and tokenizer based on the given model name or local path of downloaded model.
+    Load model and tokenizer based on the given model name or local path of the downloaded model.
     """
-    tokenizer = AutoTokenizer.from_pretrained(
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_local_path,
                                               trust_remote_code=True,
                                               use_fast=False,
                                               legacy=False)
     tokenizer.padding_side = "left"
-    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<unk>")
-    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("</s>")
 
-    model = AutoGPTQForCausalLM.from_quantized(
+    model = AutoGPTQForCausalLM.from_quantized(model_name_or_local_path,
                                                inject_fused_attention=False,
                                                inject_fused_mlp=False,
                                                use_cuda_fp16=True,
@@ -153,7 +151,7 @@ def load_model_tokenizer(model_path):
 
 def inference(model, tokenizer, prompt):
     """
-    Use the given model and tokenizer to generate an answer for the
+    Use the given model and tokenizer to generate an answer for the specified prompt.
     """
     st = time.time()
     prompt = prompt if prompt.endswith('\n') else f'{prompt}\n'
@@ -181,11 +179,10 @@ def inference(model, tokenizer, prompt):
 
 
 if __name__ == "__main__":
-
-
+    model_name_or_local_path = '<Model name (i.e. codefuse-ai/CodeFuse-CodeLlama-34B-4bits) or local path of the downloaded model>'
     prompt = 'Please write a QuickSort program in Python'
 
-    model, tokenizer = load_model_tokenizer(
+    model, tokenizer = load_model_tokenizer(model_name_or_local_path)
     inference(model, tokenizer, prompt)
 ```
 
@@ -319,29 +316,27 @@ pip install -r requirements.txt
 import os
 import torch
 import time
-from
-from auto_gptq import AutoGPTQForCausalLM
+from transformers import AutoTokenizer
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-def load_model_tokenizer(
+def load_model_tokenizer(model_name_or_local_path):
     """
     Load model and tokenizer based on the given model name or local path of downloaded model.
     """
-    tokenizer = AutoTokenizer.from_pretrained(
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_local_path,
                                               trust_remote_code=True,
                                               use_fast=False,
                                               legacy=False)
     tokenizer.padding_side = "left"
-    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<unk>")
-    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("</s>")
 
-    model = AutoGPTQForCausalLM.from_quantized(
+    model = AutoGPTQForCausalLM.from_quantized(model_name_or_local_path,
                                                inject_fused_attention=False,
                                                inject_fused_mlp=False,
                                                use_cuda_fp16=True,
                                                disable_exllama=False,
-                                               device_map='auto' #
+                                               device_map='auto'  # Supports multiple GPUs
                                                )
     return model, tokenizer
 
@@ -366,7 +361,7 @@ def inference(model, tokenizer, prompt):
                                    do_sample=True,
                                    max_new_tokens=512,
                                    eos_token_id=tokenizer.eos_token_id,
-                                   pad_token_id=tokenizer.pad_token_id
+                                   pad_token_id=tokenizer.pad_token_id
                                    )
     print(f'generated tokens num is {len(generated_ids[0][input_ids.size(1):])}')
     outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
@@ -376,11 +371,10 @@ def inference(model, tokenizer, prompt):
 
 
 if __name__ == "__main__":
-
-
+    model_name_or_local_path = '<模型名字 (即codefuse-ai/CodeFuse-CodeLlama-34B-4bits)或者提前下载到本地的模型路径>'
     prompt = '请用Python实现一个快速排序算法'
 
-    model, tokenizer = load_model_tokenizer(
+    model, tokenizer = load_model_tokenizer(model_name_or_local_path)
     inference(model, tokenizer, prompt)
 ```
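The hunks above show only the changed fragments of the README's quick-start example. For readability, here is a rough end-to-end sketch of what the updated English-language example presumably looks like after this commit. The `load_model_tokenizer` body and the generation arguments mirror lines visible in the diff; the rest of `inference` is not shown in the diff, so the tokenization step, the final prints, and the concrete model path below are illustrative assumptions only.

```python
import os
import time

from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

os.environ["TOKENIZERS_PARALLELISM"] = "false"


def load_model_tokenizer(model_name_or_local_path):
    """Load the 4-bit GPTQ model and tokenizer from a Hub name or a local download path."""
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_local_path,
                                              trust_remote_code=True,
                                              use_fast=False,
                                              legacy=False)
    tokenizer.padding_side = "left"

    model = AutoGPTQForCausalLM.from_quantized(model_name_or_local_path,
                                               inject_fused_attention=False,
                                               inject_fused_mlp=False,
                                               use_cuda_fp16=True,
                                               disable_exllama=False,
                                               device_map='auto')  # spread layers across available GPUs
    return model, tokenizer


def inference(model, tokenizer, prompt):
    """Use the given model and tokenizer to generate an answer for the given prompt."""
    st = time.time()
    prompt = prompt if prompt.endswith('\n') else f'{prompt}\n'

    # Assumption: plain tokenization of the raw prompt; the README may wrap the prompt
    # in an instruction template that the diff does not show.
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    input_ids = inputs['input_ids']
    generated_ids = model.generate(**inputs,
                                   do_sample=True,
                                   max_new_tokens=512,
                                   eos_token_id=tokenizer.eos_token_id,
                                   pad_token_id=tokenizer.pad_token_id)
    print(f'generated tokens num is {len(generated_ids[0][input_ids.size(1):])}')
    outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    print(outputs[0])
    print(f'generation took {time.time() - st:.2f}s')


if __name__ == "__main__":
    # Hypothetical concrete value for the placeholder shown in the diff.
    model_name_or_local_path = 'codefuse-ai/CodeFuse-CodeLlama-34B-4bits'
    prompt = 'Please write a QuickSort program in Python'

    model, tokenizer = load_model_tokenizer(model_name_or_local_path)
    inference(model, tokenizer, prompt)
```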
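One side effect worth noting: the commit deletes the explicit `pad_token_id`/`eos_token_id` overrides from `load_model_tokenizer`, while the generation call still passes `pad_token_id=tokenizer.pad_token_id`. If a tokenizer checkpoint defines no pad token, that argument would be `None`. A defensive fallback in the spirit of the removed lines (hypothetical, not part of this commit) could be:

```python
# Hypothetical guard mirroring the lines this commit removes; only needed if the
# tokenizer config does not already define pad/eos tokens.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<unk>")
if tokenizer.eos_token_id is None:
    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("</s>")
```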