Commit cffbce0 · Update README.md
1 Parent(s): d38f5d1

README.md CHANGED

@@ -31,7 +31,7 @@ After undergoing 4-bit quantization, the CodeFuse-CodeLlama-34B-4bits model can
 
 🔥🔥🔥 2023-09-26 We are pleased to announce the release of the 4-bit quantized version of CodeFuse-CodeLlama-34B. Despite the quantization process, the model still achieves a remarkable 73.8% accuracy (greedy decoding) on the HumanEval pass@1 metric.
 
-🔥🔥🔥 2023-09-11 CodeFuse-CodeLlama34B has
+🔥🔥🔥 2023-09-11 CodeFuse-CodeLlama-34B has achieved 74.4% pass@1 (greedy decoding) on HumanEval, which is the SOTA result for open-sourced LLMs at present.
 
 <br>
 
@@ -124,24 +124,22 @@ pip install -r requirements.txt
 import os
 import torch
 import time
-from
-from auto_gptq import AutoGPTQForCausalLM
+from transformers import AutoTokenizer
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-def load_model_tokenizer(
+def load_model_tokenizer(model_name_or_local_path):
     """
-    Load model and tokenizer based on the given model name or local path of downloaded model.
+    Load model and tokenizer based on the given model name or local path of the downloaded model.
     """
-    tokenizer = AutoTokenizer.from_pretrained(
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_local_path,
                                               trust_remote_code=True,
                                               use_fast=False,
                                               legacy=False)
     tokenizer.padding_side = "left"
-    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<unk>")
-    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("</s>")
 
-    model = AutoGPTQForCausalLM.from_quantized(
+    model = AutoGPTQForCausalLM.from_quantized(model_name_or_local_path,
                                                inject_fused_attention=False,
                                                inject_fused_mlp=False,
                                                use_cuda_fp16=True,
@@ -153,7 +151,7 @@ def load_model_tokenizer(model_path):
 
 def inference(model, tokenizer, prompt):
     """
-    Use the given model and tokenizer to generate an answer for the
+    Use the given model and tokenizer to generate an answer for the specified prompt.
     """
     st = time.time()
     prompt = prompt if prompt.endswith('\n') else f'{prompt}\n'
@@ -181,11 +179,10 @@ def inference(model, tokenizer, prompt):
 
 
 if __name__ == "__main__":
-
-
+    model_name_or_local_path = '<Model name (i.e. codefuse-ai/CodeFuse-CodeLlama-34B-4bits) or local path of the downloaded model>'
     prompt = 'Please write a QuickSort program in Python'
 
-    model, tokenizer = load_model_tokenizer(
+    model, tokenizer = load_model_tokenizer(model_name_or_local_path)
     inference(model, tokenizer, prompt)
 ```
 
@@ -319,29 +316,27 @@ pip install -r requirements.txt
 import os
 import torch
 import time
-from
-from auto_gptq import AutoGPTQForCausalLM
+from transformers import AutoTokenizer
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-def load_model_tokenizer(
+def load_model_tokenizer(model_name_or_local_path):
     """
     Load model and tokenizer based on the given model name or local path of downloaded model.
     """
-    tokenizer = AutoTokenizer.from_pretrained(
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_local_path,
                                               trust_remote_code=True,
                                               use_fast=False,
                                               legacy=False)
     tokenizer.padding_side = "left"
-    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<unk>")
-    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("</s>")
 
-    model = AutoGPTQForCausalLM.from_quantized(
+    model = AutoGPTQForCausalLM.from_quantized(model_name_or_local_path,
                                                inject_fused_attention=False,
                                                inject_fused_mlp=False,
                                                use_cuda_fp16=True,
                                                disable_exllama=False,
-                                               device_map='auto' #
+                                               device_map='auto'  # Supports multiple GPUs
                                                )
     return model, tokenizer
 
@@ -366,7 +361,7 @@ def inference(model, tokenizer, prompt):
                                    do_sample=True,
                                    max_new_tokens=512,
                                    eos_token_id=tokenizer.eos_token_id,
-                                   pad_token_id=tokenizer.pad_token_id
+                                   pad_token_id=tokenizer.pad_token_id
                                    )
     print(f'generated tokens num is {len(generated_ids[0][input_ids.size(1):])}')
     outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
@@ -376,11 +371,10 @@ def inference(model, tokenizer, prompt):
 
 
 if __name__ == "__main__":
-
-
+    model_name_or_local_path = '<模型名字 (即codefuse-ai/CodeFuse-CodeLlama-34B-4bits)或者提前下载到本地的模型路径>'
     prompt = '请用Python实现一个快速排序算法'
 
-    model, tokenizer = load_model_tokenizer(
+    model, tokenizer = load_model_tokenizer(model_name_or_local_path)
     inference(model, tokenizer, prompt)
 ```
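The hunks above show only the changed fragments of the README's quick-start example. For readability, here is a rough end-to-end sketch of what the updated English-language example presumably looks like after this commit. The `load_model_tokenizer` body and the generation arguments mirror lines visible in the diff; the rest of `inference` is not shown in the diff, so the tokenization step, the final prints, and the concrete model path below are illustrative assumptions only.

```python
import os
import time

from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

os.environ["TOKENIZERS_PARALLELISM"] = "false"


def load_model_tokenizer(model_name_or_local_path):
    """Load the 4-bit GPTQ model and tokenizer from a Hub name or a local download path."""
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_local_path,
                                              trust_remote_code=True,
                                              use_fast=False,
                                              legacy=False)
    tokenizer.padding_side = "left"

    model = AutoGPTQForCausalLM.from_quantized(model_name_or_local_path,
                                               inject_fused_attention=False,
                                               inject_fused_mlp=False,
                                               use_cuda_fp16=True,
                                               disable_exllama=False,
                                               device_map='auto')  # spread layers across available GPUs
    return model, tokenizer


def inference(model, tokenizer, prompt):
    """Use the given model and tokenizer to generate an answer for the given prompt."""
    st = time.time()
    prompt = prompt if prompt.endswith('\n') else f'{prompt}\n'

    # Assumption: plain tokenization of the raw prompt; the README may wrap the prompt
    # in an instruction template that the diff does not show.
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    input_ids = inputs['input_ids']
    generated_ids = model.generate(**inputs,
                                   do_sample=True,
                                   max_new_tokens=512,
                                   eos_token_id=tokenizer.eos_token_id,
                                   pad_token_id=tokenizer.pad_token_id)
    print(f'generated tokens num is {len(generated_ids[0][input_ids.size(1):])}')
    outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    print(outputs[0])
    print(f'generation took {time.time() - st:.2f}s')


if __name__ == "__main__":
    # Hypothetical concrete value for the placeholder shown in the diff.
    model_name_or_local_path = 'codefuse-ai/CodeFuse-CodeLlama-34B-4bits'
    prompt = 'Please write a QuickSort program in Python'

    model, tokenizer = load_model_tokenizer(model_name_or_local_path)
    inference(model, tokenizer, prompt)
```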
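One side effect worth noting: the commit deletes the explicit `pad_token_id`/`eos_token_id` overrides from `load_model_tokenizer`, while the generation call still passes `pad_token_id=tokenizer.pad_token_id`. If a tokenizer checkpoint defines no pad token, that argument would be `None`. A defensive fallback in the spirit of the removed lines (hypothetical, not part of this commit) could be:

```python
# Hypothetical guard mirroring the lines this commit removes; only needed if the
# tokenizer config does not already define pad/eos tokens.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<unk>")
if tokenizer.eos_token_id is None:
    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("</s>")
```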