Update README.md

README.md (CHANGED)
@@ -47,7 +47,9 @@ Only the weights of the linear operators within transformers blocks are quantize

### Use with vLLM

This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/latest/) backend, as shown in the example below.
-
```python
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
@@ -73,6 +75,7 @@ outputs = llm.generate(prompts, sampling_params)
generated_text = outputs[0].outputs[0].text
print(generated_text)
```

vLLM also supports OpenAI-compatible serving. See the [documentation](https://docs.vllm.ai/en/latest/) for more details.

@@ -80,101 +83,182 @@ vLLM also supports OpenAI-compatible serving. See the [documentation](https://do

This model was created by applying [LLM Compressor with calibration samples from the neuralmagic/calibration dataset](https://github.com/vllm-project/llm-compressor/blob/main/examples/multimodal_vision/llama4_example.py), as presented in the code snippet below.

```python
import torch
from datasets import load_dataset
-from transformers import Llama4ForConditionalGeneration, Llama4Processor

from llmcompressor import oneshot
-from llmcompressor.modeling import prepare_for_calibration
from llmcompressor.modifiers.quantization import GPTQModifier

-
-model_id = "meta-llama/Llama-4-Scout-17B-16E"
-
processor = Llama4Processor.from_pretrained(model_id)
-# We update `Llama4TextMoe` modules with custom `SequentialLlama4TextMoe`.
-# This change allows compatibility with vllm.
-# To apply your own custom module for experimentation, consider updating
-# `SequentialLlama4TextMoe` under llmcompressor/modeling/llama4.py
-model = prepare_for_calibration(model)

DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 8192

ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")

-
def preprocess_function(example):
    messages = []
    for message in example["messages"]:
        messages.append(
            {
-               "role": message["role"],
-               "content": [{"type": "text", "text": message["content"]}]
            }
        )
-
    return processor.apply_chat_template(
-       messages,
-       return_tensors="pt",
-       padding=False,
-       truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
        add_generation_prompt=False,
-   )
-
-
-ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)


def data_collator(batch):
    assert len(batch) == 1
    return {
-       key: torch.tensor(value)
-       if key != "pixel_values"
-       else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        for key, value in batch[0].items()
    }


-
-recipe = GPTQModifier(
-    targets="Linear",
-    scheme="W4A16",
-    ignore=[
-        "re:.*lm_head",
-        "re:.*self_attn",
-        "re:.*router",
-        "re:vision_model.*",
-        "re:multi_modal_projector.*",
-        "Llama4TextAttention",
-    ],
-)

-#
-# due to the large size of Llama4, we specify sequential targets such that
-# only one MLP is loaded into GPU memory at a time
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    data_collator=data_collator,
-
)

# Save to disk compressed.
-SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)

```

## Evaluation

@@ -328,6 +412,9 @@ This model was evaluated on the well-known OpenLLM v1, OpenLLM v2, HumanEval, an

The results were obtained using the following commands:

#### MMLU_LLAMA
```
lm_eval \
@@ -434,3 +521,4 @@ lm_eval \
  --tasks humaneval_64_instruct \
  --batch_size auto
```

The updated sections of `README.md` read as follows:

### Use with vLLM

This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/latest/) backend, as shown in the example below.
<details>
<summary>Model Usage Code</summary>

```python
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
# ...
generated_text = outputs[0].outputs[0].text
print(generated_text)
```
</details>

vLLM also supports OpenAI-compatible serving. See the [documentation](https://docs.vllm.ai/en/latest/) for more details.
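
The card itself only points at the documentation for server mode. As a rough, hypothetical illustration (not taken from the card), the sketch below assumes the quantized checkpoint has been launched with `vllm serve <quantized-model-id>` on the default port 8000, where `<quantized-model-id>` is a placeholder for this repository's model ID; any OpenAI-compatible client can then query it:

```python
# Hypothetical sketch: query a vLLM OpenAI-compatible server.
# Assumes the server was started with:
#   vllm serve <quantized-model-id> --port 8000
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # no key needed by default

response = client.chat.completions.create(
    model="<quantized-model-id>",  # must match the name the server registered
    messages=[{"role": "user", "content": "Give me a short introduction to large language models."}],
    max_tokens=256,
)
print(response.choices[0].message.content)
```

The `api_key` value is a dummy; the vLLM server does not enforce authentication unless one is configured at launch.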

This model was created by applying [LLM Compressor with calibration samples from the neuralmagic/calibration dataset](https://github.com/vllm-project/llm-compressor/blob/main/examples/multimodal_vision/llama4_example.py), as presented in the code snippet below.

<details>
<summary>Model Creation Code</summary>

```python
from transformers import Llama4ForConditionalGeneration, Llama4Processor
from transformers.quantizers.quantizers_utils import get_module_from_name
import torch
from datasets import load_dataset

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.utils.dev import skip_weights_initialize
from transformers.models.llama4.modeling_llama4 import Llama4TextMLP
from llmcompressor.modifiers.quantization import QuantizationModifier
import gc
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier


def convert_model_for_quantization(model):
    # Replace each `Llama4TextMoe` module with a `SequentialLlama4TextMoe` so the
    # expert MLPs expose plain Linear layers for quantization while keeping the
    # resulting checkpoint compatible with vLLM.
    to_delete = []
    for name, module in model.named_modules():
        module_class_name = module.__class__.__name__
        if module_class_name == "Llama4TextMoe":
            parent_module, module_name = get_module_from_name(model, name)
            parent_module._modules[module_name] = SequentialLlama4TextMoe(
                model.config.get_text_config(),
                module,
            )
            to_delete.append(module)
            print(f"Patched {name} with SequentialLlama4TextMoe", flush=True)

    for module in to_delete:
        del module
    gc.collect()
    torch.cuda.empty_cache()


class SequentialLlama4TextMoe(torch.nn.Module):
    def __init__(self, config, original_moe):
        super().__init__()
        self.top_k = config.num_experts_per_tok
        self.hidden_dim = config.hidden_size
        self.num_experts = config.num_local_experts
        self.experts = SequentialLlama4TextExperts(config, original_moe.experts)
        self.router = original_moe.router
        self.shared_expert = original_moe.shared_expert

    def forward(self, hidden_states):
        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
        router_logits = self.router(hidden_states)

        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=1)

        router_scores = (
            torch.full_like(router_logits, float("-inf"))
            .scatter_(1, router_indices, router_top_value)
            .transpose(0, 1)
        )
        router_scores = torch.sigmoid(router_scores.float()).to(hidden_states.dtype)

        # Shared expert plus a sequential pass over the routed experts.
        out = self.shared_expert(hidden_states)
        for i in range(self.num_experts):
            out += self.experts[i](hidden_states) * router_scores[i].reshape(-1, 1)

        return out, router_scores


class SequentialLlama4TextExperts(torch.nn.ModuleList):
    def __init__(self, config, original_experts):
        self.num_experts = original_experts.gate_up_proj.shape[0]
        with skip_weights_initialize():
            super().__init__([Llama4TextMLP(config) for _ in range(self.num_experts)])

        intermediate_size = original_experts.down_proj.shape[1]

        for i in range(self.num_experts):
            # Split the fused gate_up projection back into separate gate and up weights.
            gate_up = original_experts.gate_up_proj[i]
            down = original_experts.down_proj[i]

            gate_proj = gate_up[:, :intermediate_size]
            up_proj = gate_up[:, intermediate_size:]

            self[i].gate_proj.weight.data = gate_proj.t().clone().contiguous()
            self[i].up_proj.weight.data = up_proj.t().clone().contiguous()
            self[i].down_proj.weight.data = down.t().clone().contiguous()

        # Free the original fused expert weights.
        original_experts.gate_up_proj = None
        original_experts.down_proj = None
        gc.collect()
        torch.cuda.empty_cache()


model_id = "meta-llama/Llama-4-Scout-17B-16E"

model = Llama4ForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.bfloat16  # load on CPU
)
processor = Llama4Processor.from_pretrained(model_id)

convert_model_for_quantization(model)

# Oneshot arguments
DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 8192

ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")

def preprocess_function(example):
    messages = []
    for message in example["messages"]:
        messages.append(
            {
                "role": message["role"],
                "content": [{"type": "text", "text": message["content"]}]
            }
        )

    return processor.apply_chat_template(
        messages,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
        add_generation_prompt=False,
    ).to("cuda:0")

ds = ds.map(
    preprocess_function,
    batched=False,
    remove_columns=ds.column_names
)

# Define a oneshot data collator for multimodal inputs.
def data_collator(batch):
    assert len(batch) == 1
    return {
        key: torch.tensor(value) if key != "pixel_values" else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        for key, value in batch[0].items()
    }

# Recipe
recipe = QuantizationModifier(
    targets="Linear",
    scheme="NVFP4",
    ignore=[
        "re:.*lm_head",
        "re:.*self_attn",
        "re:.*router",
        "re:.*vision_model",
        "re:.*multi_modal_projector",
        "Llama4TextAttention",
    ],
    sequential_targets=["Llama4TextMLP"],
)

SAVE_DIR = f"{model_id.split('/')[1]}-{recipe.scheme}"

# Perform oneshot
oneshot(
    model=model,
    tokenizer=model_id,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    trust_remote_code_model=True,
    data_collator=data_collator,
    output_dir=SAVE_DIR
)

# Save to disk compressed.
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)

```
</details>
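
As a quick sanity check (a hypothetical sketch, not part of the card), the checkpoint written to `SAVE_DIR` can be loaded back with vLLM, mirroring the usage example earlier on this page; this assumes the installed vLLM build supports the chosen quantization scheme:

```python
from vllm import LLM, SamplingParams

# SAVE_DIR is the output directory written by the creation script above.
llm = LLM(model=SAVE_DIR)
outputs = llm.generate(["What is quantization?"], SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)
```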
## Evaluation

[...]

The results were obtained using the following commands:

<details>
<summary>Model Evaluation Commands</summary>

#### MMLU_LLAMA
```
lm_eval \
  ...
```

[...]

```
  ...
  --tasks humaneval_64_instruct \
  --batch_size auto
```
</details>