nm-research committed (verified)
Commit: d43a0ce
Parent(s): f9fd488

Update README.md

Files changed (1):
  1. README.md (+132, -44)

README.md CHANGED
@@ -47,7 +47,9 @@ Only the weights of the linear operators within transformers blocks are quantize
 ### Use with vLLM
 
 This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/latest/) backend, as shown in the example below.
-
+<details>
+<summary>Model Usage Code</summary>
+
 ```python
 from vllm import LLM, SamplingParams
 from transformers import AutoTokenizer
@@ -73,6 +75,7 @@ outputs = llm.generate(prompts, sampling_params)
 generated_text = outputs[0].outputs[0].text
 print(generated_text)
 ```
+</details>
 
 vLLM also supports OpenAI-compatible serving. See the [documentation](https://docs.vllm.ai/en/latest/) for more details.
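
For reference, OpenAI-compatible serving typically pairs a `vllm serve` process with the standard `openai` client, as in the sketch below. This example is illustrative and not part of the commit; the checkpoint path, port, and parallelism flags are placeholders to adapt to the actual deployment.

```python
# Illustrative sketch: query a vLLM OpenAI-compatible endpoint.
# Start the server separately, e.g.:
#   vllm serve <path-to-this-quantized-checkpoint> --tensor-parallel-size 4
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # local vLLM endpoint
response = client.chat.completions.create(
    model="<path-to-this-quantized-checkpoint>",  # placeholder for the served model name
    messages=[{"role": "user", "content": "Give me a short introduction to large language models."}],
    max_tokens=128,
)
print(response.choices[0].message.content)
```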
 
@@ -80,101 +83,182 @@ vLLM also supports OpenAI-compatible serving. See the [documentation](https://do
 
 This model was created by applying [LLM Compressor with calibration samples from neuralmagic/calibration dataset](https://github.com/vllm-project/llm-compressor/blob/main/examples/multimodal_vision/llama4_example.py), as presented in the code snippet below.
 
+<details>
+<summary>Model Creation Code</summary>
+
 ```python
+from transformers import Llama4ForConditionalGeneration, Llama4Processor
+from transformers.quantizers.quantizers_utils import get_module_from_name
 import torch
 from datasets import load_dataset
-from transformers import Llama4ForConditionalGeneration, Llama4Processor
 
 from llmcompressor import oneshot
-from llmcompressor.modeling import prepare_for_calibration
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils.dev import skip_weights_initialize
+from transformers.models.llama4.modeling_llama4 import Llama4TextMLP
+from llmcompressor.modifiers.quantization import QuantizationModifier
+import gc
+from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+
+def convert_model_for_quantization(model):
+    to_delete = []
+    for name, module in model.named_modules():
+        module_class_name = module.__class__.__name__
+        if module_class_name == "Llama4TextMoe":
+            parent_module, module_name = get_module_from_name(model, name)
+            parent_module._modules[module_name] = SequentialLlama4TextMoe(
+                model.config.get_text_config(),
+                module,
+            )
+            to_delete.append(module)
+            print(f"Patched {name} with SequentialLlama4TextMoe", flush=True)
+
+    for module in to_delete:
+        del module
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+class SequentialLlama4TextMoe(torch.nn.Module):
+    def __init__(self, config, original_moe):
+        super().__init__()
+        self.top_k = config.num_experts_per_tok
+        self.hidden_dim = config.hidden_size
+        self.num_experts = config.num_local_experts
+        self.experts = SequentialLlama4TextExperts(config, original_moe.experts)
+        self.router = original_moe.router
+        self.shared_expert = original_moe.shared_expert
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
+        router_logits = self.router(hidden_states)
+
+        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=1)
+
+        router_scores = (
+            torch.full_like(router_logits, float("-inf")).scatter_(1, router_indices, router_top_value).transpose(0, 1)
+        )
+        router_scores = torch.sigmoid(router_scores.float()).to(hidden_states.dtype)
+
+        out = self.shared_expert(hidden_states)
+        for i in range(self.num_experts):
+            out += self.experts[i](hidden_states) * router_scores[i].reshape(-1, 1)
+
+        return out, router_scores
+
+
+class SequentialLlama4TextExperts(torch.nn.ModuleList):
+    def __init__(self, config, original_experts):
+        self.num_experts = original_experts.gate_up_proj.shape[0]
+        with skip_weights_initialize():
+            super().__init__([Llama4TextMLP(config) for _ in range(self.num_experts)])
+
+        intermediate_size = original_experts.down_proj.shape[1]
+
+        for i in range(self.num_experts):
+            gate_up = original_experts.gate_up_proj[i]
+            down = original_experts.down_proj[i]
+
+            gate_proj = gate_up[:, :intermediate_size]
+            up_proj = gate_up[:, intermediate_size:]
+
+            self[i].gate_proj.weight.data = gate_proj.t().clone().contiguous()
+            self[i].up_proj.weight.data = up_proj.t().clone().contiguous()
+            self[i].down_proj.weight.data = down.t().clone().contiguous()
+
+        original_experts.gate_up_proj = None
+        original_experts.down_proj = None
+        gc.collect()
+        torch.cuda.empty_cache()
 
-# Select model and load it.
-model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-model = Llama4ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
+
+model_id = "meta-llama/Llama-4-Scout-17B-16E"
+
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id, torch_dtype=torch.bfloat16  # load on cpu
+)
 processor = Llama4Processor.from_pretrained(model_id)
-# We update `Llama4TextMoe` modules with custom `SequentialLlama4TextMoe`.
-# This change allows compatibility with vllm.
-# To apply your own custom module for experimentation, consider updating
-# `SequentialLlama4TextMoe` under llmcompressor/modeling/llama4.py
-model = prepare_for_calibration(model)
 
+convert_model_for_quantization(model)
+
+# Oneshot arguments
 DATASET_ID = "neuralmagic/calibration"
 NUM_CALIBRATION_SAMPLES = 512
 MAX_SEQUENCE_LENGTH = 8192
 
 ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
 
-
 def preprocess_function(example):
     messages = []
     for message in example["messages"]:
         messages.append(
             {
-                "role": message["role"],
-                "content": [{"type": "text", "text": message["content"]}],
+                "role": message["role"],
+                "content": [{"type": "text", "text": message["content"]}]
             }
         )
-
+
     return processor.apply_chat_template(
-        messages,
-        return_tensors="pt",
-        padding=False,
-        truncation=True,
+        messages,
+        return_tensors="pt",
+        padding=False,
+        truncation=True,
         max_length=MAX_SEQUENCE_LENGTH,
         tokenize=True,
         add_special_tokens=False,
         return_dict=True,
         add_generation_prompt=False,
-    )
-
-
-ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)
+    ).to("cuda:0")
 
+ds = ds.map(
+    preprocess_function,
+    batched=False,
+    remove_columns=ds.column_names
+)
 
+# Define a oneshot data collator for multimodal inputs.
 def data_collator(batch):
     assert len(batch) == 1
     return {
-        key: torch.tensor(value)
-        if key != "pixel_values"
-        else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
+        key: torch.tensor(value) if key != "pixel_values" else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
         for key, value in batch[0].items()
     }
 
+# Recipe
+recipe = QuantizationModifier(
+    targets="Linear",
+    scheme="NVFP4",
+    ignore=[
+        're:.*lm_head',
+        're:.*self_attn',
+        're:.*router',
+        're:.*vision_model',
+        're:.*multi_modal_projector',
+        "Llama4TextAttention",
+    ],
+    sequential_targets=["Llama4TextMLP"],
+)
 
-# Configure the quantization algorithm to run.
-recipe = GPTQModifier(
-    targets="Linear",
-    scheme="W4A16",
-    ignore=[
-        "re:.*lm_head",
-        "re:.*self_attn",
-        "re:.*router",
-        "re:vision_model.*",
-        "re:multi_modal_projector.*",
-        "Llama4TextAttention",
-    ],
-)
+SAVE_DIR = f"{model_id.split('/')[1]}-{recipe.scheme}"
 
-# Apply algorithms.
-# due to the large size of Llama4, we specify sequential targets such that
-# only one MLP is loaded into GPU memory at a time
+# Perform oneshot
 oneshot(
     model=model,
+    tokenizer=model_id,
     dataset=ds,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    trust_remote_code_model=True,
     data_collator=data_collator,
-    sequential_targets=["Llama4TextMLP"],
+    output_dir=SAVE_DIR
 )
 
 # Save to disk compressed.
-SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 processor.save_pretrained(SAVE_DIR)
 
 ```
+</details>
 
 ## Evaluation
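
As a quick check that the creation script above wrote a compressed checkpoint, the saved config can be inspected before running any evaluations. This is an illustrative sketch, not part of the commit: it assumes `SAVE_DIR` resolves to `Llama-4-Scout-17B-16E-NVFP4` (the name the script builds) and that `save_pretrained(..., save_compressed=True)` recorded a `quantization_config` entry in `config.json`, as compressed-tensors checkpoints normally do.

```python
# Illustrative sanity check of the saved checkpoint (see assumptions above).
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Llama-4-Scout-17B-16E-NVFP4")  # hypothetical local SAVE_DIR
# Compressed-tensors quantization metadata: scheme, group structure, ignored modules.
print(config.quantization_config)
```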
 
@@ -328,6 +412,9 @@ This model was evaluated on the well-known OpenLLM v1, OpenLLM v2, HumanEval, an
 
 The results were obtained using the following commands:
 
+<details>
+<summary>Model Evaluation Commands</summary>
+
 #### MMLU_LLAMA
 ```
 lm_eval \
@@ -434,3 +521,4 @@ lm_eval \
   --tasks humaneval_64_instruct \
   --batch_size auto
 ```
+</details>