alexmarques committed (verified)
Commit d6fe5b6 · 1 Parent(s): 8a3538d

Update README.md

Files changed (1):
  1. README.md  +36 -23
README.md CHANGED
@@ -37,7 +37,7 @@ This model was obtained by quantizing the weights of [Qwen3-0.6B](https://huggin
 This optimization reduces the number of bits per parameter from 16 to 4, reducing the disk size and GPU memory requirements by approximately 75%.
 
 Only the weights of the linear operators within transformer blocks are quantized.
-Weights are quantized using a symmetric per-group scheme, with group size 128.
+Weights are quantized using an asymmetric per-group scheme, with group size 128.
 The [GPTQ](https://arxiv.org/abs/2210.17323) algorithm is applied for quantization, as implemented in the [llm-compressor](https://github.com/vllm-project/llm-compressor) library.
 
 
@@ -80,35 +80,48 @@ vLLM also supports OpenAI-compatible serving. See the [documentation](https://do
 This model was created with [llm-compressor](https://github.com/vllm-project/llm-compressor) by running the code snippet below.
 
 
-```python
-from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers import oneshot
-from transformers import AutoModelForCausalLM, AutoTokenizer
+```python
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.transformers import oneshot
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
-# Load model
-model_stub = "Qwen/Qwen3-0.6B"
-model_name = model_stub.split("/")[-1]
+# Load model
+model_stub = "Qwen/Qwen3-0.6B"
+model_name = model_stub.split("/")[-1]
 
-num_samples = 1024
-max_seq_len = 8192
+num_samples = 1024
+max_seq_len = 8192
 
-model = AutoModelForCausalLM.from_pretrained(model_stub)
+model = AutoModelForCausalLM.from_pretrained(model_stub)
 
-tokenizer = AutoTokenizer.from_pretrained(model_stub)
+tokenizer = AutoTokenizer.from_pretrained(model_stub)
 
-def preprocess_fn(example):
+def preprocess_fn(example):
     return {"text": tokenizer.apply_chat_template(example["messages"], add_generation_prompt=False, tokenize=False)}
 
-ds = load_dataset("neuralmagic/LLM_compression_calibration", split="train")
-ds = ds.map(preprocess_fn)
-
-# Configure the quantization algorithm and scheme
-recipe = GPTQModifier(
-    ignore=["lm_head"],
-    sequential_targets=["Qwen3DecoderLayer"],
-    targets="Linear",
-    scheme="W4A16",
-    dampening_frac=0.1,
+ds = load_dataset("neuralmagic/LLM_compression_calibration", split="train")
+ds = ds.map(preprocess_fn)
+
+# Configure the quantization algorithm and scheme
+recipe = GPTQModifier(
+    ignore=["lm_head"],
+    sequential_targets=["Qwen3DecoderLayer"],
+    targets="Linear",
+    dampening_frac=0.01,
+    config_groups={
+        "group0": {
+            "targets": ["Linear"],
+            "weights": {
+                "num_bits": 4,
+                "type": "int",
+                "strategy": "group",
+                "group_size": 64,
+                "symmetric": False,
+                "actorder": "weight",
+                "observer": "mse",
+            }
+        }
+    }
 )
 
 # Apply quantization
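
For readers skimming the diff: the change replaces the symmetric W4A16 group-128 recipe with an asymmetric per-group INT4 recipe ("symmetric": False, "strategy": "group", with an MSE observer and weight-ordered activation reordering). The snippet below is a minimal, illustrative round-to-nearest sketch of what asymmetric per-group weight quantization computes (one scale and zero point per group of weights); it is not the GPTQ algorithm or the llm-compressor implementation, and the helper name, tensor shapes, and use of group size 64 are assumptions made only for illustration.

```python
# Illustrative sketch only: asymmetric per-group INT4 round-to-nearest quantization.
# GPTQ (as used by llm-compressor above) additionally corrects quantization error
# column by column; this sketch shows just the per-group scale/zero-point scheme.
import torch

def fake_quantize_asym_per_group(w: torch.Tensor, group_size: int = 64, num_bits: int = 4):
    """Fake-quantize a 2-D weight matrix with one scale/zero-point per group of `group_size` weights."""
    qmin, qmax = 0, 2**num_bits - 1                             # 0..15 for INT4
    out_features, in_features = w.shape
    groups = w.reshape(out_features, in_features // group_size, group_size)

    w_min = groups.amin(dim=-1, keepdim=True)                   # per-group minimum
    w_max = groups.amax(dim=-1, keepdim=True)                   # per-group maximum
    scale = (w_max - w_min).clamp(min=1e-8) / (qmax - qmin)     # asymmetric: range spans [min, max]
    zero_point = torch.round(-w_min / scale).clamp(qmin, qmax)  # integer offset so w_min maps near qmin

    q = torch.round(groups / scale + zero_point).clamp(qmin, qmax)
    dequant = (q - zero_point) * scale                          # what the kernel reconstructs at runtime
    return dequant.reshape(out_features, in_features), scale, zero_point

# Quick check of the round-trip error on a random weight matrix
w = torch.randn(64, 256)
w_dq, scale, zp = fake_quantize_asym_per_group(w)
print("mean abs error:", (w - w_dq).abs().mean().item())
```

Storing 4-bit integers plus a small amount of per-group scale/zero-point metadata instead of 16-bit weights is what yields the roughly 75% reduction in weight storage cited in the README text.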
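
The hunk is cut off at the "# Apply quantization" comment, so the calibration call itself is not visible here, and the snippet as shown also relies on `load_dataset` without importing it. As a hedged sketch only, assuming llm-compressor's standard `oneshot` entry point and the variables already defined in the snippet (`model`, `tokenizer`, `ds`, `recipe`, `max_seq_len`, `num_samples`), the continuation typically looks like the following; the exact lines in the committed README may differ, and the output path is a placeholder.

```python
# Hedged sketch, not taken from this commit: the hunk ends before the calibration call.
# Assumes the variables defined in the README snippet above (model, tokenizer, ds,
# recipe, max_seq_len, num_samples) and llm-compressor's oneshot() entry point.
from datasets import load_dataset  # needed by the snippet above but not shown in the hunk

# Apply quantization: run GPTQ calibration over the preprocessed dataset
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=max_seq_len,
    num_calibration_samples=num_samples,
)

# Save the compressed checkpoint (directory name is a placeholder, not from the commit)
save_path = model_name + "-quantized.w4a16"
model.save_pretrained(save_path, save_compressed=True)
tokenizer.save_pretrained(save_path)
```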