---
license: mit
base_model:
- GSAI-ML/LLaDA-8B-Instruct
pipeline_tag: text-generation
---

Baby's first adventure with the diffusion language model. Had to quantize this so it would fit on a 3080 Ti - all I've got! Used Modal to do so.

If you want to replicate what I did, try this.

First, install Modal and log into the CLI:

```
uv add modal
uv run modal login
```

Then, add an environment and a volume to the project:

```
uv run modal volume create quantized-model-output
```

Then, run the quantization script:

```
uv run modal run scripts/quantize_llada.py
```

```
# scripts/quantize_llada.py
import modal

image = (
    modal.Image.from_registry("nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04", add_python="3.13")
    .apt_install("git", "curl")
    .pip_install(
        "torch>=2.7.0",
        "torchvision",
        "torchaudio",
        index_url="https://download.pytorch.org/whl/cu128",
    )
    .pip_install(
        "numpy",
        "accelerate",
        "optimum",
        "loguru",
        "transformers",
    )
    .pip_install("triton")
    .pip_install("gptqmodel")
)

# This needs to be created before running the app:
# uv run modal volume create quantized-model-output
output_volume = modal.Volume.from_name("quantized-model-output")
volume_config = {"/quantized-model-output": output_volume}

app = modal.App("quantize-llada", image=image, volumes=volume_config)

TRAIN_GPU_COUNT = 1
TRAIN_GPU = f"B200:{TRAIN_GPU_COUNT}"
TRAIN_CPU_COUNT = 4
MINUTES = 40


@app.function(gpu=TRAIN_GPU, cpu=TRAIN_CPU_COUNT, timeout=MINUTES * 60)
def quantize_model() -> None:
    import types

    import torch
    from loguru import logger
    from optimum.gptq import GPTQQuantizer
    from transformers import AutoModel, AutoTokenizer

    output_volume.reload()

    # Check if CUDA is available, show device details
    logger.info(f"CUDA available: {torch.cuda.is_available()}")
    logger.info(f"Device count: {torch.cuda.device_count() if torch.cuda.is_available() else 0}")
    logger.info(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
    logger.info(f"CUDA version: {torch.version.cuda}")
    logger.info(f"PyTorch version: {torch.__version__}")

    # Bail out if no GPU is available
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available")

    logger.info("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("GSAI-ML/LLaDA-8B-Instruct", trust_remote_code=True)

    # We need to do some shenanigans.
    # First, load the model on the CPU so we can patch the forward pass.
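    # (Why the patch appears to be needed: the GPTQ calibration pass seems to call
    # each block's forward with positional arguments, while LLaDA's custom blocks
    # expect attention_bias/layer_past/use_cache as keywords, so the wrapper below
    # accepts either form.)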
logger.info("Loading model on CPU...") model = AutoModel.from_pretrained( "GSAI-ML/LLaDA-8B-Instruct", trust_remote_code=True, device_map="cpu", torch_dtype=torch.float16, low_cpu_mem_usage=True, ) logger.info("Patching model forward pass...") def patched_forward(self, x, *args, attention_bias=None, layer_past=None, use_cache=False, **kwargs): """ Patched forward that handles both positional and keyword arguments for attention_bias """ # If attention_bias was passed as positional argument, use it if len(args) > 0 and attention_bias is None: attention_bias = args[0] args = args[1:] if len(args) > 0 and layer_past is None: layer_past = args[0] args = args[1:] if len(args) > 0: use_cache = args[0] # Call the original forward with cleaned arguments return self._original_forward(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache) # Apply patch to all LLaDALlamaBlock instances for name, module in model.named_modules(): if module.__class__.__name__ == "LLaDALlamaBlock": # Store original forward module._original_forward = module.forward # Replace with patched version module.forward = types.MethodType(patched_forward, module) logger.info(f"Patched {name}") # Move model to GPU logger.info("Moving model to GPU...") model = model.to("cuda") logger.info("Setting up GPTQ quantizer...") quantizer = GPTQQuantizer( bits=4, group_size=128, desc_act=False, sym=True, true_sequential=True, dataset="c4", tokenizer=tokenizer, block_name_to_quantize="model.transformer.blocks", ) logger.info("Quantizing model...") quantizer.quantize_model(model, tokenizer) logger.info("Quantization done, saving model...") output_path = "/quantized-model-output/llada-8b-instruct-4bit-gptq" logger.info(f"Saving model to {output_path}") quantizer.save(model, output_path) tokenizer.save_pretrained(output_path) logger.success("Quantization done, model saved!") @app.local_entrypoint() def main(): quantize_model.remote() ``` I think I also had to manually change something in the config.json -- needed to be "block_name_to_quantize": "model.transformer.blocks" instead of whatever it was, maybe "model_block_name_to_quantize" as the key or something.