from transformers import AutoProcessor, LlavaForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation

# Local checkpoint directory or Hugging Face Hub model id.
MODEL_ID = "llama-joycaption-beta-one-hf-llava"

# Load model and processor.
model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Configure the quantization algorithm and scheme.
# In this case, we:
#   * quantize the weights to fp8 with per-channel scales via PTQ
#   * quantize the activations to fp8 with dynamic per-token scales
# The lm_head, vision tower, and multimodal projector are left unquantized.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=["re:.*lm_head", "re:.*multi_modal_projector.*", "re:.*vision_tower.*"],
)

# Apply quantization and save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID + "-FP8-Dynamic"
oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
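
# Optional sanity check (a minimal sketch): run a short text-only generation with
# the quantized model to confirm outputs still look coherent. The prompt and
# generation settings below are illustrative assumptions, not part of the recipe.
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = processor(text="Hello, my name is", return_tensors="pt").input_ids.to(model.device)
output = model.generate(input_ids, max_new_tokens=20)
print(processor.decode(output[0]))
print("===========================================")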