Generated from the following `llm-compressor` script:
```python
import torch
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from transformers import (
    AutoModelForCausalLM,  # type:ignore[reportPrivateImportUsage]
    AutoTokenizer,  # type:ignore[reportPrivateImportUsage]
)

MODEL_ID = "tiiuae/falcon-7b-instruct"

# Copied from <https://github.com/vllm-project/llm-compressor/blob/9d8a46418f517dd6399e2e9c179805247a7be584/examples/quantization_w8a8_fp8/README.md>
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Configure the simple PTQ (post-training quantization) recipe:
# dynamic FP8 quantization of every Linear layer, keeping lm_head in full precision.
recipe = QuantizationModifier(
    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
)

# Apply the quantization algorithm.
oneshot(model=model, recipe=recipe)  # type:ignore[arg-type]

# Save the quantized model and tokenizer.
SAVE_DIR = f"data/models/{MODEL_ID.split('/')[-1]}-FP8-Dynamic"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
```
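
The exported FP8-Dynamic checkpoint is stored in compressed-tensors format, which vLLM can load directly (given GPU hardware with FP8 support). Below is a minimal sketch of serving the result, assuming vLLM is installed and the checkpoint sits at the `SAVE_DIR` path from the script above; the prompt and sampling settings are purely illustrative:

```python
from vllm import LLM, SamplingParams

# Assumed output path of the quantization script above.
MODEL_PATH = "data/models/falcon-7b-instruct-FP8-Dynamic"

# vLLM picks up the quantization config from the checkpoint and applies
# dynamic FP8 activation quantization at inference time.
llm = LLM(model=MODEL_PATH)
sampling = SamplingParams(temperature=0.7, max_tokens=128)

outputs = llm.generate(["What does FP8 dynamic quantization do?"], sampling)
print(outputs[0].outputs[0].text)
```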