chentianqi committed (verified)
Commit b202053 · 1 Parent(s): 673ac4e

Update README.md

Files changed (1):
  1. README.md +343 -2
README.md CHANGED
@@ -20,7 +20,7 @@ tags:
 
---


This model is a 4-bit GPTQ quantization of the LLaDA-8B-Base model, produced with [GPTQModel](https://github.com/ModelCloud/GPTQModel).

- **bits**: 4
- **dynamic**: null

@@ -38,8 +38,349 @@ This model has been quantized using [GPTQModel](https://github.com/ModelCloud/GPTQModel).
- **damp_percent**: 0.1
- **damp_auto_increment**: 0.0015
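
For reference, the listed settings map onto GPTQModel's `QuantizeConfig`. The snippet below is only a minimal sketch of how a similar checkpoint could be produced: the calibration data, `group_size`, source model id, and output path are illustrative assumptions, not the exact recipe used for this upload.

```python
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig

# Illustrative calibration set (assumption); the calibration data actually used is not documented here.
calibration = load_dataset(
    "allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split="train"
).select(range(1024))["text"]

# bits / damp_percent / damp_auto_increment mirror the settings listed above;
# group_size=128 is an assumed example value.
quant_config = QuantizeConfig(
    bits=4,
    group_size=128,
    damp_percent=0.1,
    damp_auto_increment=0.0015,
)

# GSAI-ML/LLaDA-8B-Base is assumed as the source checkpoint; LLaDA requires trust_remote_code.
model = GPTQModel.load("GSAI-ML/LLaDA-8B-Base", quant_config, trust_remote_code=True)
model.quantize(calibration, batch_size=2)
model.save("LLaDA-8B-Base-gptqmodel-4bit")
```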
 
## Benchmark
### Performance of Quantized Models

| Dataset       | GPTQ-4bit | FP16 |
|---------------|-----------|------|
| mmlu          | ✓         | ✓    |
| cmmlu         | ✓         | ✓    |
| arc_challenge | ✓         | ✓    |

## Example:

The script below wraps LLaDA in a custom `llada_dist` model class for `lm-evaluation-harness`, loading the 4-bit GPTQ checkpoint through `GPTQModel.load`:

```python
'''
This file is inspired by the code from https://github.com/ML-GSAI/SMDM
'''
import accelerate
import torch
import re
from pathlib import Path
import random
import numpy as np
import torch.nn.functional as F
from datasets import Dataset
from lm_eval.__main__ import cli_evaluate
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.models.huggingface import HFLM
from lm_eval.api.registry import register_model
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModel
from gptqmodel import GPTQModel


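# set_seed(), add_gumbel_noise() and get_num_transfer_tokens() are referenced later but were not
# defined in this snippet; the minimal sketches below follow LLaDA's reference sampling procedure
# so the script runs standalone. Treat them as illustrative, not as part of the original upload.
def set_seed(seed):
    # Seed every RNG the evaluation touches, for reproducibility.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def add_gumbel_noise(logits, temperature):
    # Gumbel-max style sampling; with temperature == 0 the logits are returned unchanged,
    # so the subsequent argmax is plain greedy decoding.
    if temperature == 0.:
        return logits
    logits = logits.to(torch.float64)
    noise = torch.rand_like(logits, dtype=torch.float64)
    gumbel_noise = (-torch.log(noise)) ** temperature
    return logits.exp() / gumbel_noise


def get_num_transfer_tokens(mask_index, steps):
    # Distribute the masked positions of each sequence evenly across the sampling steps.
    mask_num = mask_index.sum(dim=1, keepdim=True)
    base = mask_num // steps
    remainder = mask_num % steps
    num_transfer_tokens = base.expand(-1, steps).clone()
    for i in range(mask_num.size(0)):
        num_transfer_tokens[i, :remainder[i]] += 1
    return num_transfer_tokens

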
@register_model("llada_dist")
class LLaDAEvalHarness(LM):
    def __init__(
        self,
        model_path='',
        mask_id=126336,
        max_length=4096,
        block_length=4096,
        steps=128,
        batch_size=32,
        mc_num=128,
        is_check_greedy=True,
        cfg=0.,
        device="cuda",
        gptqmodel=True,
    ):
        """
        Args:
            model_path: LLaDA-8B-Base model path.
            mask_id: The token id of [MASK] is 126336.
            max_length: The max sequence length.
            batch_size: Mini-batch size.
            mc_num: Monte Carlo estimation iterations.
            is_check_greedy: For certain metrics like LAMBADA, the evaluation requires the model to verify whether the answer
                is generated through greedy sampling conditioned on the prompt (note that this differs from conditional
                generation). We implement this verification through the suffix_greedy_prediction() function, which
                returns a True/False judgment used for accuracy calculation.
                When is_check_greedy is set to True, the lm-evaluation-harness library automatically invokes this function.
                However, since none of the metrics in the LLaDA paper (https://arxiv.org/abs/2502.09992) require this functionality,
                we recommend setting is_check_greedy to False. This configuration causes suffix_greedy_prediction() to return False
                by default, significantly accelerating the evaluation process.
            cfg: Unsupervised classifier-free guidance scale.
        """
        super().__init__()

        accelerator = accelerate.Accelerator()
        if accelerator.num_processes > 1:
            self.accelerator = accelerator
        else:
            self.accelerator = None

        model_kwargs = {}
        if self.accelerator is not None:
            model_kwargs.update({'device_map': {'': f'{self.accelerator.device}'}})

        # The unquantized model would be loaded with AutoModel:
        # self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16, **model_kwargs)
        # Here the GPTQ-quantized checkpoint is loaded through GPTQModel instead.
        self.model = GPTQModel.load(model_path, device='cuda', trust_remote_code=True)
        self.model.eval()

        self.device = torch.device(device)
        if self.accelerator is not None:
            self.model = self.accelerator.prepare(self.model)
            self.device = torch.device(f'{self.accelerator.device}')
            self._rank = self.accelerator.local_process_index
            self._world_size = self.accelerator.num_processes

        self.mask_id = mask_id
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

        self.mc_num = mc_num
        self.batch_size = int(batch_size)
        assert mc_num % self.batch_size == 0
        self.sampling_eps = 0.
        self.max_length = max_length
        self.block_length = block_length
        self.steps = steps
        self.is_check_greedy = is_check_greedy

        self.cfg = cfg
        print(f'model: {model_path}')
        print(f'Is check greedy: {is_check_greedy}')
        print(f'cfg: {cfg}')

    @property
    def rank(self):
        return self._rank

    @property
    def world_size(self):
        return self._world_size

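    # _forward_process: sample how many answer tokens to mask (the prompt is never masked), shuffle
    # the mask within the answer region, and return the noised batch together with the per-token
    # masking probability p_mask used to reweight the masked-token loss in the Monte Carlo estimate.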
    def _forward_process(self, batch, prompt_index):
        b, l = batch.shape

        target_len = (l - prompt_index.sum()).item()
        k = torch.randint(1, target_len + 1, (), device=batch.device)

        x = torch.round(torch.linspace(float(k), k + (b - 1) * (target_len / b), steps=b, device=batch.device)).long()
        x = ((x - 1) % target_len) + 1
        assert x.min() >= 1 and x.max() <= target_len

        indices = torch.arange(target_len, device=batch.device).repeat(b, 1)
        is_mask = indices < x.unsqueeze(1)

        for i in range(b):
            is_mask[i] = is_mask[i][torch.randperm(target_len)]

        is_mask = torch.cat((torch.zeros(b, prompt_index.sum(), dtype=torch.bool, device=batch.device), is_mask), dim=1)

        noisy_batch = torch.where(is_mask, self.mask_id, batch)

        return noisy_batch, (x / target_len).unsqueeze(1).repeat(1, l)

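    # get_logits: one forward pass through the quantized model; when cfg > 0 an unconditional copy
    # (prompt tokens replaced by [MASK]) is batched alongside and classifier-free guidance is applied
    # as un_logits + (cfg + 1) * (logits - un_logits).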
    @torch.no_grad()
    def get_logits(self, batch, prompt_index):
        if self.cfg > 0.:
            assert len(prompt_index) == batch.shape[1]
            prompt_index = prompt_index.unsqueeze(0).repeat(batch.shape[0], 1)
            un_batch = batch.clone()
            un_batch[prompt_index] = self.mask_id
            batch = torch.cat([batch, un_batch])

        logits = self.model(batch).logits

        if self.cfg > 0.:
            logits, un_logits = torch.chunk(logits, 2, dim=0)
            logits = un_logits + (self.cfg + 1) * (logits - un_logits)
        return logits[:, :batch.shape[1]]

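    # get_loglikelihood: Monte Carlo estimate of log p(target | prefix), averaged over mc_num
    # independently re-masked copies of the sequence, as consumed by lm-eval loglikelihood requests.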
    @torch.no_grad()
    def get_loglikelihood(self, prefix, target):
        seq = torch.concatenate([prefix, target])[None, :]
        seq = seq.repeat((self.batch_size, 1)).to(self.device)

        prompt_index = torch.arange(seq.shape[1], device=self.device) < len(prefix)

        loss_acc = []
        for _ in range(self.mc_num // self.batch_size):
            perturbed_seq, p_mask = self._forward_process(seq, prompt_index)

            mask_indices = perturbed_seq == self.mask_id

            logits = self.get_logits(perturbed_seq, prompt_index)

            loss = F.cross_entropy(logits[mask_indices], seq[mask_indices], reduction='none') / p_mask[mask_indices]
            loss = loss.sum() / self.batch_size
            loss_acc.append(loss.item())

        return -sum(loss_acc) / len(loss_acc)

    @torch.no_grad()
    def suffix_greedy_prediction(self, prefix, target):
        if not self.is_check_greedy:
            return False

        seq = torch.full((1, len(prefix) + len(target)), self.mask_id, device=self.device)
        prompt_index = torch.arange(seq.shape[1], device=self.device) < len(prefix)
        prefix, target = prefix.to(self.device), target.to(self.device)
        seq[0, :len(prefix)] = prefix

        for i in range(len(target)):
            mask_index = (seq == self.mask_id)
            logits = self.get_logits(seq, prompt_index)[mask_index]
            x0 = torch.argmax(logits, dim=-1)

            p = torch.softmax(logits.to(torch.float32), dim=-1)
            confidence = torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)).squeeze(dim=-1)
            _, index = torch.sort(confidence, descending=True)
            x0[index[1:]] = self.mask_id
            seq[mask_index] = x0.clone()
        correct = target == seq[0, len(prefix):]
        correct = torch.all(correct)
        return correct

    def _encode_pair(self, context, continuation):
        n_spaces = len(context) - len(context.rstrip())
        if n_spaces > 0:
            continuation = context[-n_spaces:] + continuation
            context = context[:-n_spaces]

        whole_enc = self.tokenizer(context + continuation)["input_ids"]
        context_enc = self.tokenizer(context)["input_ids"]

        context_enc_len = len(context_enc)
        continuation_enc = whole_enc[context_enc_len:]

        return context_enc, continuation_enc

    def loglikelihood(self, requests):
        def _tokenize(e):
            prefix, target = self._encode_pair(e["prefix"], e["target"])
            return {
                "prefix_text": e["prefix"],
                "target_text": e["target"],
                "prefix": prefix,
                "target": target,
            }

        ds = [{"prefix": req.args[0], "target": req.args[1]} for req in requests]
        ds = Dataset.from_list(ds)
        ds = ds.map(_tokenize)
        ds = ds.with_format("torch")
        prompt_len = [len(x["prefix"]) + len(x["target"]) for x in ds]

        assert max(prompt_len) <= 4096

        out = []
        with torch.no_grad():
            for elem in tqdm(ds, desc="Computing likelihood..."):
                prefix = elem["prefix"]
                target = elem["target"]

                ll = self.get_loglikelihood(prefix, target)

                is_target_greedy_dec = self.suffix_greedy_prediction(prefix, target)

                out.append((ll, 1.0 if is_target_greedy_dec else 0.0))
                print('=' * 20)
                print('prefix: ', elem['prefix_text'])
                print('target: ', elem['target_text'])
                print(ll, is_target_greedy_dec)
                print('=' * 20, end='\n\n')
        torch.cuda.empty_cache()
        return out

    def loglikelihood_rolling(self, requests):
        raise NotImplementedError

    def generate_until(self, context, max_length, stop, **generation_kwargs):
        raise NotImplementedError

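    # _model_generate: LLaDA's block-wise diffusion sampling. The answer region starts fully masked;
    # at each step the highest-confidence predictions inside the current block are committed while
    # the remaining positions stay masked ('low_confidence' remasking), until gen_length tokens are filled.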
    @torch.no_grad()
    def _model_generate(self, context, max_length, stop, **generation_kwargs):
        '''
        Args:
            context: The prompt, a tensor of shape (1, l); the model acts as the mask predictor.
            steps: Sampling steps, less than or equal to gen_length.
            gen_length: Generated answer length.
            block_length: Block length, less than or equal to gen_length. If less than gen_length, semi-autoregressive remasking is used.
            temperature: Categorical distribution sampling temperature.
            cfg_scale: Unsupervised classifier-free guidance scale.
            remasking: Remasking strategy, 'low_confidence' or 'random'.
            mask_id: The token id of [MASK] is 126336.
        '''
        # Use the hyperparameters from the original paper.
        prompt = context

        gen_length = self.max_length
        block_length = self.block_length
        steps = self.max_length
        temperature = 0.
        cfg_scale = 0.
        remasking = 'low_confidence'
        mask_id = 126336

        x = torch.full((1, prompt.shape[1] + gen_length), mask_id, dtype=torch.long).to(self.model.device)
        x[:, :prompt.shape[1]] = prompt.clone()

        prompt_index = (x != mask_id)

        assert gen_length % block_length == 0
        num_blocks = gen_length // block_length

        assert steps % num_blocks == 0
        steps = steps // num_blocks

        for num_block in range(num_blocks):
            block_mask_index = (x[:, prompt.shape[1] + num_block * block_length: prompt.shape[1] + (num_block + 1) * block_length] == mask_id)
            num_transfer_tokens = get_num_transfer_tokens(block_mask_index, steps)
            for i in range(steps):
                mask_index = (x == mask_id)
                if cfg_scale > 0.:
                    un_x = x.clone()
                    un_x[prompt_index] = mask_id
                    x_ = torch.cat([x, un_x], dim=0)
                    logits = self.model(x_).logits
                    logits, un_logits = torch.chunk(logits, 2, dim=0)
                    logits = un_logits + (cfg_scale + 1) * (logits - un_logits)
                else:
                    logits = self.model(x).logits

                logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
                x0 = torch.argmax(logits_with_noise, dim=-1)  # b, l

                if remasking == 'low_confidence':
                    p = F.softmax(logits.to(torch.float64), dim=-1)
                    x0_p = torch.squeeze(
                        torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1)  # b, l
                elif remasking == 'random':
                    x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device)
                else:
                    raise NotImplementedError(remasking)

                x0_p[:, prompt.shape[1] + (num_block + 1) * block_length:] = -np.inf

                x0 = torch.where(mask_index, x0, x)
                confidence = torch.where(mask_index, x0_p, -np.inf)

                transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
                for j in range(confidence.shape[0]):
                    _, select_index = torch.topk(confidence[j], k=num_transfer_tokens[j, i])
                    transfer_index[j, select_index] = True
                x[transfer_index] = x0[transfer_index]

        return x


if __name__ == "__main__":
    set_seed(1234)
    cli_evaluate()
```

Run the evaluation (with the script above saved as `eval_llada_gptq.py`):

```bash
accelerate launch eval_llada_gptq.py --tasks arc_challenge --num_fewshot 0 --model llada_dist --batch_size 8 --model_args model_path=FunAGI/LLaDA-8B-Base-gptqmodel-4bit,cfg=0.5,is_check_greedy=False,mc_num=128
  ```
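
For a quick sanity check outside the harness, the `LLaDAEvalHarness` wrapper from the example can also be instantiated directly. This is a minimal sketch, assuming the script above is saved as `eval_llada_gptq.py` and a CUDA device is available; the prompt and answer strings are arbitrary.

```python
# Minimal smoke test for the LLaDAEvalHarness wrapper defined in the example script.
import torch

from eval_llada_gptq import LLaDAEvalHarness

lm = LLaDAEvalHarness(
    model_path="FunAGI/LLaDA-8B-Base-gptqmodel-4bit",
    batch_size=8,
    mc_num=8,                # a handful of Monte Carlo samples is enough for a smoke test
    is_check_greedy=False,
)

# Score an arbitrary (prefix, target) pair with the quantized model.
prefix = torch.tensor(lm.tokenizer("Question: What is 2 + 2?\nAnswer:")["input_ids"])
target = torch.tensor(lm.tokenizer(" 4")["input_ids"])
print("approx. log-likelihood of the answer:", lm.get_loglikelihood(prefix, target))
```

For actual benchmark runs, use the `accelerate launch` command above rather than this snippet.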