Upload folder using huggingface_hub
- .ipynb_checkpoints/README-checkpoint.md +116 -0
- README.md +116 -3
- custom_generate/.ipynb_checkpoints/generate-checkpoint.py +245 -0
- custom_generate/LICENSE +5 -0
- custom_generate/generate.py +245 -0
README.md
CHANGED
@@ -1,3 +1,116 @@
---
library_name: transformers
tags:
- custom_generate
---

# LagKV Cache

#### Introduction

![116](https://th.bing.com/th/id/OIP.kcsGbImg9Nq_Jn-eN_zjeAHaEK?rs=1&pid=ImgDetMain)

LagKV is an efficient and robust KV compression algorithm. It uses the information carried by lag tokens to compress the preceding ones, which significantly boosts compression performance with little computation overhead.

[Original Github](https://github.com/AI-Lab-China-Merchants-Bank/LagKV)

Details are in the following work:

[LagKV: Lag-Relative Information of the KV Cache Tells Which Tokens Are Important](https://arxiv.org/abs/2504.04704)

#### How to Use

LagKV implements the Cache interface from transformers, so it is easy to integrate into the model call.

```python
from lag_kv import LagKV
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "Qwen2.5-7B-Instruct"
device = "cuda:0"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", attn_implementation="sdpa").to(device)

prompt = "long text"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
past_key_values = LagKV(lag_size=64)
print(model.generate(input_ids, past_key_values=past_key_values))
# check the compressed KV cache size
print(past_key_values[0][0].size())
```
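
Because this repository is tagged `custom_generate`, the cache can also be driven through the `custom_generate` entry point of `transformers` (see `custom_generate/generate.py`). The snippet below is an illustrative sketch only: it assumes `transformers>=4.53`, and `<this-repo-id>` is a placeholder for the Hub id under which this folder is published.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "Qwen2.5-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto").to("cuda:0")

inputs = tokenizer("long text", return_tensors="pt").to(model.device)
# `custom_generate` loads custom_generate/generate.py from the given Hub repo;
# the extra lag_* arguments are forwarded to that generate() function.
outputs = model.generate(
    **inputs,
    custom_generate="<this-repo-id>",  # placeholder: the Hub id where this folder lives
    trust_remote_code=True,
    lag_ratio=0.5,
    lag_sink_size=16,
    lag_size=128,
    max_new_tokens=64,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```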

To compress the KV cache during the prefill stage, instead of computing it exactly first, use the following inference function (for batch_size=1 only):

```python
import torch

def inference_by_prefill_compress(model, tokenizer, inputs, max_new_tokens=256, decode=False, past_key_values=None, device="cuda"):
    if isinstance(inputs, str):
        input_ids = tokenizer([inputs], return_tensors="pt")["input_ids"].to(device)
    else:
        input_ids = inputs
    if past_key_values is None:
        past_key_values = LagKV(ratio=0.2,
                                lag_size=128,
                                layer_idx_skip_first=[],
                                use_then_compress=True)

    with torch.no_grad():
        sink_size = past_key_values.sink_size
        lag_size = past_key_values.lag_size
        trigger_len = sink_size + 2 * lag_size
        input_length = input_ids.shape[1]
        if input_length > trigger_len:
            # chunked prefill: feed the prompt in lag_size chunks so the cache
            # is compressed while it is being built
            start_idx = 0
            end_idx = trigger_len
            position_ids = torch.arange(input_length + max_new_tokens).unsqueeze(0).to(device)

            def batch_input():
                sel_input_ids = input_ids[:, start_idx:end_idx]
                q_len = end_idx - start_idx
                k_len = past_key_values.get_seq_length() + q_len
                batch_size = input_ids.shape[0]
                head_num = model.config.num_attention_heads
                # additive attention mask: the chunk attends to all cached tokens
                # and causally within itself
                attn_mask = torch.ones((k_len, q_len), device=input_ids.device, dtype=torch.bool)
                attn_mask = torch.triu(attn_mask, diagonal=1).T
                attn_mask = torch.flip(attn_mask, (0, 1))
                attn_mask = attn_mask.unsqueeze(0).unsqueeze(0)
                attn_mask = attn_mask.expand(batch_size, -1, -1, -1).expand(-1, head_num, -1, -1)
                attention_mask = torch.zeros((batch_size, head_num, q_len, k_len), device=input_ids.device, dtype=torch.bfloat16)
                attention_mask.masked_fill_(attn_mask, -torch.inf)
                return {"input_ids": sel_input_ids, "attention_mask": attention_mask}

            while start_idx < input_length:
                tmp_pos = position_ids[:, start_idx:end_idx]
                outputs = model(**batch_input(),
                                past_key_values=past_key_values,
                                position_ids=tmp_pos,
                                cache_position=tmp_pos[0])
                start_idx = end_idx
                end_idx += lag_size
                end_idx = min(end_idx, input_length)

            # greedy decoding on top of the compressed cache
            new_token_id = outputs.logits[:, -1].argmax(dim=-1).unsqueeze(-1)
            new_token_count = 1
            generated_ids = [new_token_id]
            while new_token_id[0][0] != tokenizer.eos_token_id and new_token_count < max_new_tokens + 1:
                tmp_pos = position_ids[:, (input_length + new_token_count - 1):(input_length + new_token_count)]
                outputs = model(new_token_id,
                                past_key_values=past_key_values,
                                position_ids=tmp_pos,
                                cache_position=tmp_pos[0])
                new_token_id = outputs.logits[:, -1].argmax(dim=-1).unsqueeze(-1)
                new_token_count += 1
                generated_ids.append(new_token_id)
            generated_ids = torch.cat(generated_ids, dim=-1)
        else:
            # prompt shorter than the trigger length: fall back to plain generate
            generated_ids = model.generate(input_ids, do_sample=False, max_new_tokens=max_new_tokens, past_key_values=past_key_values)
            generated_ids = generated_ids[:, input_length:]
    if decode:
        output = tokenizer.batch_decode(generated_ids)
    else:
        output = generated_ids
    return output, past_key_values
```
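
For reference, a minimal call of this helper might look like the following (not from the original repository; it reuses the `model`, `tokenizer`, and `device` defined above, and the prompt is a placeholder):

```python
long_prompt = "..."  # placeholder: a long document followed by a question about it
output, cache = inference_by_prefill_compress(
    model, tokenizer, long_prompt,
    max_new_tokens=128, decode=True, device=device,
)
print(output[0])           # the decoded answer
print(cache[0][0].size())  # key cache of layer 0 after prefill-time compression
```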
custom_generate/LICENSE
ADDED
@@ -0,0 +1,5 @@
The MIT License (MIT)

Copyright © 2025 China Merchants Bank

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
custom_generate/generate.py
ADDED
@@ -0,0 +1,245 @@
# Copyright 2025 China Merchants Bank. All rights reserved.
#
# Licensed under the MIT License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://mit-license.org
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from transformers import GenerationConfig
from transformers.cache_utils import DynamicCache
from typing import Any, Dict, List, Optional, Tuple

# Cache-controlling generation arguments that this custom generate function rejects.
# The uploaded file referenced this list without defining it; the values below are an
# assumption following the transformers-community/sink_cache template credited further down.
UNSUPPORTED_GENERATION_ARGS = [
    "cache_implementation",
    "cache_config",
    "return_legacy_cache",
]


class LagKVCache(DynamicCache):
    """
    A KV compression algorithm as described in the [LagKV paper](https://arxiv.org/abs/2504.04704).
    Like SinkCache, the algorithm combines attention sinks with a sliding window, but it additionally
    keeps selected tokens from the middle of the sequence.
    It allows the model to generate with a smaller memory footprint and faster decoding.
    Under compression, the model retains most of its information-retrieval capability, in contrast to
    the complete loss seen with SinkCache.

    It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is
    `[batch_size, num_heads, seq_len, head_dim]`.

    For chunked prefilling, see https://github.com/AI-Lab-China-Merchants-Bank/LagKV.

    Parameters:
        _distributed_cache_data:
            Inherited from DynamicCache.
        ratio (`float`):
            The ratio of tokens retained in the middle chunks.
        sink_size (`int`):
            The number of sink tokens.
        lag_size (`int`):
            The size of the partition. The subsequent partition serves as a reference for the prior one.
        score_v_ratio (`float`):
            The ratio multiplied with the score of the Value states.
        skip_layer_idx (`Optional[List[int]]`):
            A list of layer indices that skip the compression.

    Example:

        ```python
        >>> from transformers import AutoTokenizer, AutoModelForCausalLM
        >>> # LagKVCache is defined in this file (custom_generate/generate.py)

        >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

        >>> inputs = tokenizer(text="My name is Qwen2", return_tensors="pt")

        >>> # Prepare a cache class and pass it to model's forward
        >>> past_key_values = LagKVCache(ratio=0.25, lag_size=128)
        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
        >>> outputs.past_key_values  # access cache filled with key/values from generation
        LagKVCache()
        ```
    """

    def __init__(
        self,
        _distributed_cache_data=None,
        ratio: float = 0.25,
        sink_size: int = 16,
        lag_size: int = 1024,
        score_v_ratio: float = 1.0,
        skip_layer_idx: Optional[List[int]] = None,
    ):
        super().__init__(_distributed_cache_data)
        self.ratio = ratio
        self.sink_size: int = sink_size
        self.lag_size: int = lag_size
        self.score_v_ratio: float = score_v_ratio
        self.skip_layer_idx: List[int] = skip_layer_idx if skip_layer_idx is not None else []
        self._compressed_len: List[int] = []

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs=None,
    ):
        """
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.

        Return:
            A tuple containing the updated key and value states.
        """
        # Update the number of seen tokens
        if layer_idx == 0:
            self._seen_tokens += key_states.shape[-2]

        # Update the cache
        if key_states is not None:
            if len(self.key_cache) <= layer_idx:
                # There may be skipped layers, fill them with empty lists
                for _ in range(len(self.key_cache), layer_idx):
                    self.key_cache.append([])
                    self.value_cache.append([])
                    self._compressed_len.append(self.sink_size)
                self.key_cache.append(key_states)
                self.value_cache.append(value_states)
                self._compressed_len.append(self.sink_size)
            elif (
                len(self.key_cache[layer_idx]) == 0
            ):  # fills previously skipped layers; checking for tensor causes errors
                self.key_cache[layer_idx] = key_states
                self.value_cache[layer_idx] = value_states
            else:
                self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
                self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)

        if layer_idx not in self.skip_layer_idx:
            return self._compress_kv_by_lag(layer_idx)

        return self.key_cache[layer_idx], self.value_cache[layer_idx]

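    # Scoring overview: for each lag_size partition except the last, `_get_states_score`
    # min-max normalizes the states against the *next* partition (the lag reference),
    # takes the standard deviation over head_dim as a per-token score, and applies a
    # softmax within the partition. `_compress_algo` then keeps the top
    # `ratio * lag_size` tokens of each partition plus the trailing window.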
    def _get_states_score(self, base_len, in_size, end_idx, value):
        """Partition the states, then calculate the per-token scores."""
        # [batch_size, num_heads, seq_len, head_dim]
        target_v = value[:, :, base_len:end_idx]
        # [batch_size, num_heads, partition_num, lag_size, head_dim]
        target_v = target_v.view(in_size[0], in_size[1], -1, self.lag_size, in_size[-1])
        ref = target_v[:, :, 1:, :, :]
        v = target_v[:, :, :-1, :, :]

        min_r = ref.min(dim=-2).values.unsqueeze(-2).expand(-1, -1, -1, self.lag_size, -1)
        max_r = ref.max(dim=-2).values.unsqueeze(-2).expand(-1, -1, -1, self.lag_size, -1)

        score = ((v - min_r) / (max_r - min_r)).std(dim=-1).softmax(dim=-1)

        return score

    def _modify_kv(self, value, base_len, end_idx, selected_idx, tail_len):
        # idx is offset by base_len
        selected_value = torch.gather(value[:, :, base_len:end_idx], -2, selected_idx)
        value = torch.cat((value[:, :, :base_len], selected_value, value[:, :, -tail_len:]), dim=-2)
        return value

    def _compress_algo(self, layer_idx, base_len):
        """
        Calculate the scores of KV tokens in each head and partition. See the paper.
        The computation overhead of top-k is significantly reduced by partitioning.
        """
        in_size = self.key_cache[layer_idx].size()
        end_idx = base_len + ((in_size[-2] - base_len) // self.lag_size) * self.lag_size
        # [batch_size, num_heads, partition_num - 1, lag_size, head_dim]
        key_score = self._get_states_score(base_len, in_size, end_idx, self.key_cache[layer_idx])
        value_score = self._get_states_score(base_len, in_size, end_idx, self.value_cache[layer_idx])
        score = key_score + value_score * self.score_v_ratio
        # you may need to sort the index for some cases
        selected_idx = torch.topk(score, int(self.ratio * self.lag_size), dim=-1).indices
        for i in range(1, selected_idx.size()[2], 1):
            selected_idx[:, :, i] += i * self.lag_size
        selected_idx = selected_idx.reshape(in_size[0], in_size[1], -1).unsqueeze(-1).expand(-1, -1, -1, in_size[-1])
        new_base_len = base_len + selected_idx.size()[-2]
        # always keep the last window
        tail_len = self.lag_size + in_size[-2] - end_idx
        self.key_cache[layer_idx] = self._modify_kv(
            self.key_cache[layer_idx], base_len, end_idx, selected_idx, tail_len
        )
        self.value_cache[layer_idx] = self._modify_kv(
            self.value_cache[layer_idx], base_len, end_idx, selected_idx, tail_len
        )
        self._compressed_len[layer_idx] = new_base_len

    def _compress_kv_by_lag(self, layer_idx):
        """The KV cache is used first, then compressed."""
        kv_size = self.key_cache[layer_idx].size()
        base_len = self._compressed_len[layer_idx]

        keys_to_return, values_to_return = self.key_cache[layer_idx], self.value_cache[layer_idx]
        if kv_size[-2] >= base_len + 2 * self.lag_size:
            self._compress_algo(layer_idx, base_len)
        return keys_to_return, values_to_return

def generate(model, lag_ratio=0.5, lag_sink_size=16, lag_size=128, **kwargs):
    """Custom generate function for LagKVCache.
    (template from https://huggingface.co/transformers-community/sink_cache)

    Args:
        model (`PreTrainedModel`):
            The model to generate from.
        lag_ratio (`float`):
            The ratio of tokens retained in the middle chunks.
        lag_sink_size (`int`):
            The number of sink tokens.
        lag_size (`int`):
            The size of the partition. See the original paper for more information.
    """
    # 1. General sanity checks
    # 1.a. A few arguments are not allowed, especially arguments that control caches.
    generation_config = kwargs.get("generation_config")
    default_global_generation_config = GenerationConfig()
    default_model_generation_config = model.generation_config
    for arg in UNSUPPORTED_GENERATION_ARGS:
        has_custom_gen_config_arg = (
            generation_config is not None
            # = and not (match global default or match model-specific default)
            and not (
                getattr(default_model_generation_config, arg) == getattr(generation_config, arg)
                or getattr(default_global_generation_config, arg) == getattr(generation_config, arg)
            )
        )
        kwargs_has_arg = arg in kwargs and kwargs[arg] is not None
        if kwargs_has_arg or has_custom_gen_config_arg:
            raise ValueError(
                f"`{arg}` is set, but it's not supported in this custom generate function. List of "
                f"unsupported arguments: {UNSUPPORTED_GENERATION_ARGS}"
            )

    # 1.b. The model must be decoder-only
    if model.config.is_encoder_decoder:
        raise ValueError("This custom generate function only works with decoder-only models")

    # 1.c. Compatibility with transformers 4.52: we must pop `custom_generate` from kwargs, otherwise it will result
    # in an infinite loop when we call `model.generate`. This is solved in transformers 4.53.
    kwargs.pop("custom_generate", None)

    # 2. Generate with LagKVCache
    # 2.a. Prepare the cache, if it was not passed.
    past_key_values = kwargs.pop("past_key_values", None)
    if past_key_values is None:
        past_key_values = LagKVCache(ratio=lag_ratio, sink_size=lag_sink_size, lag_size=lag_size)
    elif not isinstance(past_key_values, LagKVCache):
        raise ValueError(f"`past_key_values` must be a `LagKVCache` instance, got a {type(past_key_values)} instance")

    # 2.b. Generate with the cache
    generation_outputs = model.generate(**kwargs, past_key_values=past_key_values, use_cache=True)
    return generation_outputs
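

# Usage sketch (comment only, not executed on import): this `generate` is intended to be
# reached through the Hub's `custom_generate` mechanism. `transformers>=4.53` is assumed,
# and "<this-repo-id>" is a placeholder for wherever this folder is published.
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#
#   tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
#   model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
#   inputs = tokenizer("My name is Qwen2", return_tensors="pt")
#   outputs = model.generate(
#       **inputs,
#       custom_generate="<this-repo-id>",
#       trust_remote_code=True,
#       lag_ratio=0.5,
#       lag_sink_size=16,
#       lag_size=128,
#       max_new_tokens=32,
#   )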