---
license: apache-2.0
---

Generated from the following script, which applies FP8 dynamic (W8A8) post-training quantization to `tiiuae/falcon-7b-instruct` with [llm-compressor](https://github.com/vllm-project/llm-compressor):

```python
import torch
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from transformers import (
    AutoModelForCausalLM,  # type:ignore[reportPrivateImportUsage]
    AutoTokenizer,  # type:ignore[reportPrivateImportUsage]
)

MODEL_ID = "tiiuae/falcon-7b-instruct"

# Copied from <https://github.com/vllm-project/llm-compressor/blob/9d8a46418f517dd6399e2e9c179805247a7be584/examples/quantization_w8a8_fp8/README.md>
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Configure the simple PTQ recipe: FP8_DYNAMIC applies channel-wise FP8 weight
# quantization and dynamic per-token FP8 activation quantization to every
# Linear layer, leaving lm_head in its original precision.
recipe = QuantizationModifier(
    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
)

# Apply the quantization algorithm; FP8_DYNAMIC requires no calibration data.
oneshot(model=model, recipe=recipe)  # type:ignore[arg-type]

# Save the quantized model and tokenizer.
SAVE_DIR = f"data/models/{MODEL_ID.split('/')[-1]}-FP8-Dynamic"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
```
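
The saved checkpoint uses the compressed-tensors format, which vLLM can load directly (as the llm-compressor README linked above describes). A minimal sketch of running inference on it, assuming vLLM is installed and the model was saved to the `SAVE_DIR` above; this is not part of the generated script:

```python
from vllm import LLM, SamplingParams

# Path mirrors SAVE_DIR from the quantization script above.
llm = LLM(model="data/models/falcon-7b-instruct-FP8-Dynamic")

# Generate a short completion to sanity-check the quantized weights.
params = SamplingParams(max_tokens=64)
outputs = llm.generate(["What is FP8 quantization?"], params)
print(outputs[0].outputs[0].text)
```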