---
license: mit
language:
- en
- zh
base_model:
- Qwen/Qwen2.5-VL-7B-Instruct
pipeline_tag: image-text-to-text
library_name: transformers
tags:
- text-generation-inference
---
|
|
|
# Qwen2.5-VL-7B-Instruct-gptqmodel-int8
|
|
|
This is a GPTQ INT8 quantized version of [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct), produced with the [GPTQModel](https://github.com/ModelCloud/GPTQModel) toolkit.
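
## How to use

The quantized checkpoint loads like the original model through `transformers`. The snippet below is a minimal sketch, assuming the INT8 checkpoint is available locally (for example, produced by the steps in the next section) and that `gptqmodel` (with its `optimum` integration) is installed so the GPTQ layers can be loaded; the checkpoint path, image URL, and prompt are placeholders.

```python
import requests
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

# placeholder path: point this at the quantized output directory (or a Hub repo id)
model_path = "/path/to/Qwen2.5-VL-7B-Instruct-gptqmodel-int8"

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path, torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_path)

# placeholder image: any RGB image works
image = Image.open(requests.get("https://example.com/image.jpg", stream=True).raw)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "generate a caption for this image"},
        ],
    }
]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(model.device)

output_ids = model.generate(**inputs, max_new_tokens=64)
# strip the prompt tokens before decoding so only the generated caption is printed
generated = output_ids[:, inputs["input_ids"].shape[1]:]
print(processor.batch_decode(generated, skip_special_tokens=True)[0])
```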
|
|
|
## How to quantize
|
|
|
### Install
|
|
|
```bash
# Python 3.10.x or above
pip3 install -v "gptqmodel>=2.2.0" --no-build-isolation
```
|
|
|
### Quantize
|
|
|
```bash
# usage: gptqmodel_quantize.py <source model dir> <output dir> <bits>
python3 gptqmodel_quantize.py /path/to/Qwen2.5-VL-7B-Instruct/ /path/to/Qwen2.5-VL-7B-Instruct-gptqmodel-int8 8
```
|
|
|
```python
# gptqmodel_quantize.py

import os

import fire
from datasets import load_dataset

from gptqmodel import GPTQModel, QuantizeConfig
from gptqmodel.models.definitions.base_qwen2_vl import BaseQwen2VLGPTQ

# match CUDA device indices to PCI bus order, reduce allocator fragmentation,
# and force UTF-8 mode
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["PYTHONUTF8"] = "1"


def format_qwen2_vl_dataset(image, assistant):
    # one calibration sample: a user turn with the image and a caption request,
    # answered by the reference caption
    return [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "generate a caption for this image"},
            ],
        },
        {"role": "assistant", "content": assistant},
    ]


def prepare_dataset(format_func, n_sample: int = 20) -> list[list[dict]]:
    # image URLs and captions from the LAION 220k GPT4Vision caption set
    dataset = load_dataset(
        "laion/220k-GPT4Vision-captions-from-LIVIS", split=f"train[:{n_sample}]"
    )
    return [
        format_func(sample["url"], sample["caption"])
        for sample in dataset
    ]


def get_calib_dataset(model):
    if isinstance(model, BaseQwen2VLGPTQ):
        return prepare_dataset(format_qwen2_vl_dataset, n_sample=256)
    raise NotImplementedError(f"Unsupported MODEL: {model.__class__}")


def quantize(model_path: str,
             output_path: str,
             bit: int):
    quant_config = QuantizeConfig(bits=bit, group_size=128)

    model = GPTQModel.load(model_path, quant_config)
    calibration_dataset = get_calib_dataset(model)

    # increase `batch_size` to match gpu/vram specs to speed up quantization
    model.quantize(calibration_dataset, batch_size=8)

    model.save(output_path)

    # test post-quant inference
    model = GPTQModel.load(output_path)
    result = model.generate("Uncovering deep insights begins with")[0]  # tokens
    print(model.tokenizer.decode(result))  # string output


if __name__ == "__main__":
    fire.Fire(quantize)
```
|