Build (aarch64)

Browse files

Files changed (12) hide show

build/torch26-cxx11-cu126-aarch64-linux/flash_attn/__init__.py +364 -0
build/torch26-cxx11-cu126-aarch64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so +3 -0
build/torch26-cxx11-cu126-aarch64-linux/flash_attn/_ops.py +9 -0
build/torch26-cxx98-cu126-aarch64-linux/flash_attn/__init__.py +364 -0
build/torch26-cxx98-cu126-aarch64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so +3 -0
build/torch26-cxx98-cu126-aarch64-linux/flash_attn/_ops.py +9 -0
build/torch27-cxx11-cu126-aarch64-linux/flash_attn/__init__.py +364 -0
build/torch27-cxx11-cu126-aarch64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so +3 -0
build/torch27-cxx11-cu126-aarch64-linux/flash_attn/_ops.py +9 -0
build/torch27-cxx11-cu128-aarch64-linux/flash_attn/__init__.py +364 -0
build/torch27-cxx11-cu128-aarch64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so +3 -0
build/torch27-cxx11-cu128-aarch64-linux/flash_attn/_ops.py +9 -0

build/torch26-cxx11-cu126-aarch64-linux/flash_attn/__init__.py ADDED Viewed

	@@ -0,0 +1,364 @@

+from typing import Optional, List
+import torch
+from ._ops import ops
+def mha_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    return_softmax: bool = False,
+    gen: Optional[torch.Generator] = None,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention.
+    Args:
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Optional output tensor, same shape as q
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        return_softmax: Whether to return softmax weights
+        gen: Optional random number generator
+    Returns:
+        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
+    """
+    return ops.mha_fwd(
+        q,
+        k,
+        v,
+        out,
+        alibi_slopes,
+        p_dropout,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        return_softmax,
+        gen,
+    )
+def mha_varlen_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    seqused_k: Optional[torch.Tensor] = None,
+    leftpad_k: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    max_seqlen_q: int = 0,
+    max_seqlen_k: int = 0,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    zero_tensors: bool = False,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    return_softmax: bool = False,
+    gen: Optional[torch.Generator] = None,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention with variable sequence lengths.
+    Args:
+        q: Query tensor of shape [total_q, num_heads, head_size]
+        k: Key tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        v: Value tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
+        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
+        out: Optional output tensor of shape [total_q, num_heads, head_size]
+        seqused_k: Optional tensor specifying how many keys to use per batch element [batch_size]
+        leftpad_k: Optional left padding for keys of shape [batch_size]
+        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        max_seqlen_q: Maximum sequence length for queries
+        max_seqlen_k: Maximum sequence length for keys
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        zero_tensors: Whether to zero tensors before computation
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        return_softmax: Whether to return softmax weights
+        gen: Optional random number generator
+    Returns:
+        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
+    """
+    return ops.mha_varlen_fwd(
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        seqused_k,
+        leftpad_k,
+        block_table,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        p_dropout,
+        softmax_scale,
+        zero_tensors,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        return_softmax,
+        gen,
+    )
+def mha_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    softmax_lse: torch.Tensor,
+    dq: Optional[torch.Tensor] = None,
+    dk: Optional[torch.Tensor] = None,
+    dv: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    deterministic: bool = False,
+    gen: Optional[torch.Generator] = None,
+    rng_state: Optional[torch.Tensor] = None,
+) -> List[torch.Tensor]:
+    """
+    Backward pass for multi-head attention.
+    Args:
+        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
+        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
+        dq: Optional gradient tensor for queries, same shape as q
+        dk: Optional gradient tensor for keys, same shape as k
+        dv: Optional gradient tensor for values, same shape as v
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        deterministic: Whether to use deterministic algorithms
+        gen: Optional random number generator
+        rng_state: Optional RNG state from forward pass
+    Returns:
+        List of tensors: [dq, dk, dv]
+    """
+    return ops.mha_bwd(
+        dout,
+        q,
+        k,
+        v,
+        out,
+        softmax_lse,
+        dq,
+        dk,
+        dv,
+        alibi_slopes,
+        p_dropout,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        deterministic,
+        gen,
+        rng_state,
+    )
+def mha_varlen_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    softmax_lse: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    dq: Optional[torch.Tensor] = None,
+    dk: Optional[torch.Tensor] = None,
+    dv: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    max_seqlen_q: int = 0,
+    max_seqlen_k: int = 0,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    zero_tensors: bool = False,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    deterministic: bool = False,
+    gen: Optional[torch.Generator] = None,
+    rng_state: Optional[torch.Tensor] = None,
+) -> List[torch.Tensor]:
+    """
+    Backward pass for multi-head attention with variable sequence lengths.
+    Args:
+        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
+        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
+        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
+        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
+        dq: Optional gradient tensor for queries, same shape as q
+        dk: Optional gradient tensor for keys, same shape as k
+        dv: Optional gradient tensor for values, same shape as v
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        max_seqlen_q: Maximum sequence length for queries
+        max_seqlen_k: Maximum sequence length for keys
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        zero_tensors: Whether to zero tensors before computation
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        deterministic: Whether to use deterministic algorithms
+        gen: Optional random number generator
+        rng_state: Optional RNG state from forward pass
+    Returns:
+        List of tensors: [dq, dk, dv]
+    """
+    return ops.mha_varlen_bwd(
+        dout,
+        q,
+        k,
+        v,
+        out,
+        softmax_lse,
+        dq,
+        dk,
+        dv,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        p_dropout,
+        softmax_scale,
+        zero_tensors,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        deterministic,
+        gen,
+        rng_state,
+    )
+def mha_fwd_kvcache(
+    q: torch.Tensor,
+    kcache: torch.Tensor,
+    vcache: torch.Tensor,
+    k: Optional[torch.Tensor] = None,
+    v: Optional[torch.Tensor] = None,
+    seqlens_k: Optional[torch.Tensor] = None,
+    rotary_cos: Optional[torch.Tensor] = None,
+    rotary_sin: Optional[torch.Tensor] = None,
+    cache_batch_idx: Optional[torch.Tensor] = None,
+    leftpad_k: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    out: Optional[torch.Tensor] = None,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    is_rotary_interleaved: bool = False,
+    num_splits: int = 1,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention with KV cache.
+    Args:
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        kcache: Key cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        vcache: Value cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        k: Optional new keys tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
+        v: Optional new values tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
+        seqlens_k: Optional sequence lengths for keys of shape [batch_size]
+        rotary_cos: Optional rotary cosine tensor of shape [seqlen_ro, rotary_dim/2]
+        rotary_sin: Optional rotary sine tensor of shape [seqlen_ro, rotary_dim/2]
+        cache_batch_idx: Optional indices to index into the KV cache
+        leftpad_k: Optional left padding for keys of shape [batch_size]
+        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        out: Optional output tensor, same shape as q
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        is_rotary_interleaved: Whether rotary embeddings are interleaved
+        num_splits: Number of splits for computation
+    Returns:
+        List of tensors: [output, softmax_lse]
+    """
+    return ops.mha_fwd_kvcache(
+        q,
+        kcache,
+        vcache,
+        k,
+        v,
+        seqlens_k,
+        rotary_cos,
+        rotary_sin,
+        cache_batch_idx,
+        leftpad_k,
+        block_table,
+        alibi_slopes,
+        out,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        is_rotary_interleaved,
+        num_splits,
+    )

build/torch26-cxx11-cu126-aarch64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:80fb5d7d2d79174b113a48fcf87f1ee99e58cb10e37525ce6d7fbe88b92b2b4e
+size 646378472

build/torch26-cxx11-cu126-aarch64-linux/flash_attn/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_flash_attn_dd2f0f9::{op_name}"

build/torch26-cxx98-cu126-aarch64-linux/flash_attn/__init__.py ADDED Viewed

	@@ -0,0 +1,364 @@

+from typing import Optional, List
+import torch
+from ._ops import ops
+def mha_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    return_softmax: bool = False,
+    gen: Optional[torch.Generator] = None,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention.
+    Args:
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Optional output tensor, same shape as q
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        return_softmax: Whether to return softmax weights
+        gen: Optional random number generator
+    Returns:
+        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
+    """
+    return ops.mha_fwd(
+        q,
+        k,
+        v,
+        out,
+        alibi_slopes,
+        p_dropout,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        return_softmax,
+        gen,
+    )
+def mha_varlen_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    seqused_k: Optional[torch.Tensor] = None,
+    leftpad_k: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    max_seqlen_q: int = 0,
+    max_seqlen_k: int = 0,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    zero_tensors: bool = False,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    return_softmax: bool = False,
+    gen: Optional[torch.Generator] = None,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention with variable sequence lengths.
+    Args:
+        q: Query tensor of shape [total_q, num_heads, head_size]
+        k: Key tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        v: Value tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
+        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
+        out: Optional output tensor of shape [total_q, num_heads, head_size]
+        seqused_k: Optional tensor specifying how many keys to use per batch element [batch_size]
+        leftpad_k: Optional left padding for keys of shape [batch_size]
+        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        max_seqlen_q: Maximum sequence length for queries
+        max_seqlen_k: Maximum sequence length for keys
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        zero_tensors: Whether to zero tensors before computation
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        return_softmax: Whether to return softmax weights
+        gen: Optional random number generator
+    Returns:
+        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
+    """
+    return ops.mha_varlen_fwd(
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        seqused_k,
+        leftpad_k,
+        block_table,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        p_dropout,
+        softmax_scale,
+        zero_tensors,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        return_softmax,
+        gen,
+    )
+def mha_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    softmax_lse: torch.Tensor,
+    dq: Optional[torch.Tensor] = None,
+    dk: Optional[torch.Tensor] = None,
+    dv: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    deterministic: bool = False,
+    gen: Optional[torch.Generator] = None,
+    rng_state: Optional[torch.Tensor] = None,
+) -> List[torch.Tensor]:
+    """
+    Backward pass for multi-head attention.
+    Args:
+        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
+        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
+        dq: Optional gradient tensor for queries, same shape as q
+        dk: Optional gradient tensor for keys, same shape as k
+        dv: Optional gradient tensor for values, same shape as v
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        deterministic: Whether to use deterministic algorithms
+        gen: Optional random number generator
+        rng_state: Optional RNG state from forward pass
+    Returns:
+        List of tensors: [dq, dk, dv]
+    """
+    return ops.mha_bwd(
+        dout,
+        q,
+        k,
+        v,
+        out,
+        softmax_lse,
+        dq,
+        dk,
+        dv,
+        alibi_slopes,
+        p_dropout,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        deterministic,
+        gen,
+        rng_state,
+    )
+def mha_varlen_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    softmax_lse: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    dq: Optional[torch.Tensor] = None,
+    dk: Optional[torch.Tensor] = None,
+    dv: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    max_seqlen_q: int = 0,
+    max_seqlen_k: int = 0,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    zero_tensors: bool = False,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    deterministic: bool = False,
+    gen: Optional[torch.Generator] = None,
+    rng_state: Optional[torch.Tensor] = None,
+) -> List[torch.Tensor]:
+    """
+    Backward pass for multi-head attention with variable sequence lengths.
+    Args:
+        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
+        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
+        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
+        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
+        dq: Optional gradient tensor for queries, same shape as q
+        dk: Optional gradient tensor for keys, same shape as k
+        dv: Optional gradient tensor for values, same shape as v
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        max_seqlen_q: Maximum sequence length for queries
+        max_seqlen_k: Maximum sequence length for keys
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        zero_tensors: Whether to zero tensors before computation
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        deterministic: Whether to use deterministic algorithms
+        gen: Optional random number generator
+        rng_state: Optional RNG state from forward pass
+    Returns:
+        List of tensors: [dq, dk, dv]
+    """
+    return ops.mha_varlen_bwd(
+        dout,
+        q,
+        k,
+        v,
+        out,
+        softmax_lse,
+        dq,
+        dk,
+        dv,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        p_dropout,
+        softmax_scale,
+        zero_tensors,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        deterministic,
+        gen,
+        rng_state,
+    )
+def mha_fwd_kvcache(
+    q: torch.Tensor,
+    kcache: torch.Tensor,
+    vcache: torch.Tensor,
+    k: Optional[torch.Tensor] = None,
+    v: Optional[torch.Tensor] = None,
+    seqlens_k: Optional[torch.Tensor] = None,
+    rotary_cos: Optional[torch.Tensor] = None,
+    rotary_sin: Optional[torch.Tensor] = None,
+    cache_batch_idx: Optional[torch.Tensor] = None,
+    leftpad_k: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    out: Optional[torch.Tensor] = None,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    is_rotary_interleaved: bool = False,
+    num_splits: int = 1,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention with KV cache.
+    Args:
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        kcache: Key cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        vcache: Value cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        k: Optional new keys tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
+        v: Optional new values tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
+        seqlens_k: Optional sequence lengths for keys of shape [batch_size]
+        rotary_cos: Optional rotary cosine tensor of shape [seqlen_ro, rotary_dim/2]
+        rotary_sin: Optional rotary sine tensor of shape [seqlen_ro, rotary_dim/2]
+        cache_batch_idx: Optional indices to index into the KV cache
+        leftpad_k: Optional left padding for keys of shape [batch_size]
+        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        out: Optional output tensor, same shape as q
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        is_rotary_interleaved: Whether rotary embeddings are interleaved
+        num_splits: Number of splits for computation
+    Returns:
+        List of tensors: [output, softmax_lse]
+    """
+    return ops.mha_fwd_kvcache(
+        q,
+        kcache,
+        vcache,
+        k,
+        v,
+        seqlens_k,
+        rotary_cos,
+        rotary_sin,
+        cache_batch_idx,
+        leftpad_k,
+        block_table,
+        alibi_slopes,
+        out,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        is_rotary_interleaved,
+        num_splits,
+    )

build/torch26-cxx98-cu126-aarch64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d2d761fb906a27113da522801b91d3e2c6db042253caaf9bceb6b893cf76964a
+size 646373888

build/torch26-cxx98-cu126-aarch64-linux/flash_attn/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_flash_attn_dd2f0f9::{op_name}"

build/torch27-cxx11-cu126-aarch64-linux/flash_attn/__init__.py ADDED Viewed

	@@ -0,0 +1,364 @@

+from typing import Optional, List
+import torch
+from ._ops import ops
+def mha_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    return_softmax: bool = False,
+    gen: Optional[torch.Generator] = None,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention.
+    Args:
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Optional output tensor, same shape as q
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        return_softmax: Whether to return softmax weights
+        gen: Optional random number generator
+    Returns:
+        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
+    """
+    return ops.mha_fwd(
+        q,
+        k,
+        v,
+        out,
+        alibi_slopes,
+        p_dropout,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        return_softmax,
+        gen,
+    )
+def mha_varlen_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    seqused_k: Optional[torch.Tensor] = None,
+    leftpad_k: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    max_seqlen_q: int = 0,
+    max_seqlen_k: int = 0,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    zero_tensors: bool = False,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    return_softmax: bool = False,
+    gen: Optional[torch.Generator] = None,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention with variable sequence lengths.
+    Args:
+        q: Query tensor of shape [total_q, num_heads, head_size]
+        k: Key tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        v: Value tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
+        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
+        out: Optional output tensor of shape [total_q, num_heads, head_size]
+        seqused_k: Optional tensor specifying how many keys to use per batch element [batch_size]
+        leftpad_k: Optional left padding for keys of shape [batch_size]
+        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        max_seqlen_q: Maximum sequence length for queries
+        max_seqlen_k: Maximum sequence length for keys
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        zero_tensors: Whether to zero tensors before computation
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        return_softmax: Whether to return softmax weights
+        gen: Optional random number generator
+    Returns:
+        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
+    """
+    return ops.mha_varlen_fwd(
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        seqused_k,
+        leftpad_k,
+        block_table,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        p_dropout,
+        softmax_scale,
+        zero_tensors,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        return_softmax,
+        gen,
+    )
+def mha_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    softmax_lse: torch.Tensor,
+    dq: Optional[torch.Tensor] = None,
+    dk: Optional[torch.Tensor] = None,
+    dv: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    deterministic: bool = False,
+    gen: Optional[torch.Generator] = None,
+    rng_state: Optional[torch.Tensor] = None,
+) -> List[torch.Tensor]:
+    """
+    Backward pass for multi-head attention.
+    Args:
+        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
+        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
+        dq: Optional gradient tensor for queries, same shape as q
+        dk: Optional gradient tensor for keys, same shape as k
+        dv: Optional gradient tensor for values, same shape as v
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        deterministic: Whether to use deterministic algorithms
+        gen: Optional random number generator
+        rng_state: Optional RNG state from forward pass
+    Returns:
+        List of tensors: [dq, dk, dv]
+    """
+    return ops.mha_bwd(
+        dout,
+        q,
+        k,
+        v,
+        out,
+        softmax_lse,
+        dq,
+        dk,
+        dv,
+        alibi_slopes,
+        p_dropout,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        deterministic,
+        gen,
+        rng_state,
+    )
+def mha_varlen_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    softmax_lse: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    dq: Optional[torch.Tensor] = None,
+    dk: Optional[torch.Tensor] = None,
+    dv: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    max_seqlen_q: int = 0,
+    max_seqlen_k: int = 0,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    zero_tensors: bool = False,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    deterministic: bool = False,
+    gen: Optional[torch.Generator] = None,
+    rng_state: Optional[torch.Tensor] = None,
+) -> List[torch.Tensor]:
+    """
+    Backward pass for multi-head attention with variable sequence lengths.
+    Args:
+        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
+        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
+        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
+        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
+        dq: Optional gradient tensor for queries, same shape as q
+        dk: Optional gradient tensor for keys, same shape as k
+        dv: Optional gradient tensor for values, same shape as v
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        max_seqlen_q: Maximum sequence length for queries
+        max_seqlen_k: Maximum sequence length for keys
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        zero_tensors: Whether to zero tensors before computation
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        deterministic: Whether to use deterministic algorithms
+        gen: Optional random number generator
+        rng_state: Optional RNG state from forward pass
+    Returns:
+        List of tensors: [dq, dk, dv]
+    """
+    return ops.mha_varlen_bwd(
+        dout,
+        q,
+        k,
+        v,
+        out,
+        softmax_lse,
+        dq,
+        dk,
+        dv,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        p_dropout,
+        softmax_scale,
+        zero_tensors,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        deterministic,
+        gen,
+        rng_state,
+    )
+def mha_fwd_kvcache(
+    q: torch.Tensor,
+    kcache: torch.Tensor,
+    vcache: torch.Tensor,
+    k: Optional[torch.Tensor] = None,
+    v: Optional[torch.Tensor] = None,
+    seqlens_k: Optional[torch.Tensor] = None,
+    rotary_cos: Optional[torch.Tensor] = None,
+    rotary_sin: Optional[torch.Tensor] = None,
+    cache_batch_idx: Optional[torch.Tensor] = None,
+    leftpad_k: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    out: Optional[torch.Tensor] = None,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    is_rotary_interleaved: bool = False,
+    num_splits: int = 1,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention with KV cache.
+    Args:
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        kcache: Key cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        vcache: Value cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        k: Optional new keys tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
+        v: Optional new values tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
+        seqlens_k: Optional sequence lengths for keys of shape [batch_size]
+        rotary_cos: Optional rotary cosine tensor of shape [seqlen_ro, rotary_dim/2]
+        rotary_sin: Optional rotary sine tensor of shape [seqlen_ro, rotary_dim/2]
+        cache_batch_idx: Optional indices to index into the KV cache
+        leftpad_k: Optional left padding for keys of shape [batch_size]
+        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        out: Optional output tensor, same shape as q
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        is_rotary_interleaved: Whether rotary embeddings are interleaved
+        num_splits: Number of splits for computation
+    Returns:
+        List of tensors: [output, softmax_lse]
+    """
+    return ops.mha_fwd_kvcache(
+        q,
+        kcache,
+        vcache,
+        k,
+        v,
+        seqlens_k,
+        rotary_cos,
+        rotary_sin,
+        cache_batch_idx,
+        leftpad_k,
+        block_table,
+        alibi_slopes,
+        out,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        is_rotary_interleaved,
+        num_splits,
+    )

build/torch27-cxx11-cu126-aarch64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4bdf651fa75409d3d8e04e85dd7e2ade1f263114e5c58fae0f1e2dde76f3554c
+size 646378696

build/torch27-cxx11-cu126-aarch64-linux/flash_attn/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_flash_attn_dd2f0f9::{op_name}"

build/torch27-cxx11-cu128-aarch64-linux/flash_attn/__init__.py ADDED Viewed

	@@ -0,0 +1,364 @@

+from typing import Optional, List
+import torch
+from ._ops import ops
+def mha_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    return_softmax: bool = False,
+    gen: Optional[torch.Generator] = None,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention.
+    Args:
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Optional output tensor, same shape as q
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        return_softmax: Whether to return softmax weights
+        gen: Optional random number generator
+    Returns:
+        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
+    """
+    return ops.mha_fwd(
+        q,
+        k,
+        v,
+        out,
+        alibi_slopes,
+        p_dropout,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        return_softmax,
+        gen,
+    )
+def mha_varlen_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    seqused_k: Optional[torch.Tensor] = None,
+    leftpad_k: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    max_seqlen_q: int = 0,
+    max_seqlen_k: int = 0,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    zero_tensors: bool = False,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    return_softmax: bool = False,
+    gen: Optional[torch.Generator] = None,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention with variable sequence lengths.
+    Args:
+        q: Query tensor of shape [total_q, num_heads, head_size]
+        k: Key tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        v: Value tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
+        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
+        out: Optional output tensor of shape [total_q, num_heads, head_size]
+        seqused_k: Optional tensor specifying how many keys to use per batch element [batch_size]
+        leftpad_k: Optional left padding for keys of shape [batch_size]
+        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        max_seqlen_q: Maximum sequence length for queries
+        max_seqlen_k: Maximum sequence length for keys
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        zero_tensors: Whether to zero tensors before computation
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        return_softmax: Whether to return softmax weights
+        gen: Optional random number generator
+    Returns:
+        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
+    """
+    return ops.mha_varlen_fwd(
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        seqused_k,
+        leftpad_k,
+        block_table,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        p_dropout,
+        softmax_scale,
+        zero_tensors,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        return_softmax,
+        gen,
+    )
+def mha_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    softmax_lse: torch.Tensor,
+    dq: Optional[torch.Tensor] = None,
+    dk: Optional[torch.Tensor] = None,
+    dv: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    deterministic: bool = False,
+    gen: Optional[torch.Generator] = None,
+    rng_state: Optional[torch.Tensor] = None,
+) -> List[torch.Tensor]:
+    """
+    Backward pass for multi-head attention.
+    Args:
+        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
+        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
+        dq: Optional gradient tensor for queries, same shape as q
+        dk: Optional gradient tensor for keys, same shape as k
+        dv: Optional gradient tensor for values, same shape as v
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        deterministic: Whether to use deterministic algorithms
+        gen: Optional random number generator
+        rng_state: Optional RNG state from forward pass
+    Returns:
+        List of tensors: [dq, dk, dv]
+    """
+    return ops.mha_bwd(
+        dout,
+        q,
+        k,
+        v,
+        out,
+        softmax_lse,
+        dq,
+        dk,
+        dv,
+        alibi_slopes,
+        p_dropout,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        deterministic,
+        gen,
+        rng_state,
+    )
+def mha_varlen_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    softmax_lse: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    dq: Optional[torch.Tensor] = None,
+    dk: Optional[torch.Tensor] = None,
+    dv: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    max_seqlen_q: int = 0,
+    max_seqlen_k: int = 0,
+    p_dropout: float = 0.0,
+    softmax_scale: float = 1.0,
+    zero_tensors: bool = False,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    deterministic: bool = False,
+    gen: Optional[torch.Generator] = None,
+    rng_state: Optional[torch.Tensor] = None,
+) -> List[torch.Tensor]:
+    """
+    Backward pass for multi-head attention with variable sequence lengths.
+    Args:
+        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
+        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
+        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
+        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
+        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
+        dq: Optional gradient tensor for queries, same shape as q
+        dk: Optional gradient tensor for keys, same shape as k
+        dv: Optional gradient tensor for values, same shape as v
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        max_seqlen_q: Maximum sequence length for queries
+        max_seqlen_k: Maximum sequence length for keys
+        p_dropout: Dropout probability
+        softmax_scale: Scale factor for softmax
+        zero_tensors: Whether to zero tensors before computation
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        deterministic: Whether to use deterministic algorithms
+        gen: Optional random number generator
+        rng_state: Optional RNG state from forward pass
+    Returns:
+        List of tensors: [dq, dk, dv]
+    """
+    return ops.mha_varlen_bwd(
+        dout,
+        q,
+        k,
+        v,
+        out,
+        softmax_lse,
+        dq,
+        dk,
+        dv,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        p_dropout,
+        softmax_scale,
+        zero_tensors,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        deterministic,
+        gen,
+        rng_state,
+    )
+def mha_fwd_kvcache(
+    q: torch.Tensor,
+    kcache: torch.Tensor,
+    vcache: torch.Tensor,
+    k: Optional[torch.Tensor] = None,
+    v: Optional[torch.Tensor] = None,
+    seqlens_k: Optional[torch.Tensor] = None,
+    rotary_cos: Optional[torch.Tensor] = None,
+    rotary_sin: Optional[torch.Tensor] = None,
+    cache_batch_idx: Optional[torch.Tensor] = None,
+    leftpad_k: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    alibi_slopes: Optional[torch.Tensor] = None,
+    out: Optional[torch.Tensor] = None,
+    softmax_scale: float = 1.0,
+    is_causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    softcap: float = 0.0,
+    is_rotary_interleaved: bool = False,
+    num_splits: int = 1,
+) -> List[torch.Tensor]:
+    """
+    Forward pass for multi-head attention with KV cache.
+    Args:
+        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
+        kcache: Key cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        vcache: Value cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
+        k: Optional new keys tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
+        v: Optional new values tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
+        seqlens_k: Optional sequence lengths for keys of shape [batch_size]
+        rotary_cos: Optional rotary cosine tensor of shape [seqlen_ro, rotary_dim/2]
+        rotary_sin: Optional rotary sine tensor of shape [seqlen_ro, rotary_dim/2]
+        cache_batch_idx: Optional indices to index into the KV cache
+        leftpad_k: Optional left padding for keys of shape [batch_size]
+        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
+        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
+        out: Optional output tensor, same shape as q
+        softmax_scale: Scale factor for softmax
+        is_causal: Whether to use causal attention
+        window_size_left: Window size for left context (-1 for unlimited)
+        window_size_right: Window size for right context (-1 for unlimited)
+        softcap: Soft cap for attention weights
+        is_rotary_interleaved: Whether rotary embeddings are interleaved
+        num_splits: Number of splits for computation
+    Returns:
+        List of tensors: [output, softmax_lse]
+    """
+    return ops.mha_fwd_kvcache(
+        q,
+        kcache,
+        vcache,
+        k,
+        v,
+        seqlens_k,
+        rotary_cos,
+        rotary_sin,
+        cache_batch_idx,
+        leftpad_k,
+        block_table,
+        alibi_slopes,
+        out,
+        softmax_scale,
+        is_causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        is_rotary_interleaved,
+        num_splits,
+    )

build/torch27-cxx11-cu128-aarch64-linux/flash_attn/_flash_attn_dd2f0f9.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a23b3783a4b8aa7f7f03d964cc476baa3521e8d2bad7fd0f7376afbed2640dac
+size 1503161696

build/torch27-cxx11-cu128-aarch64-linux/flash_attn/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _flash_attn_dd2f0f9
+ops = torch.ops._flash_attn_dd2f0f9
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_flash_attn_dd2f0f9::{op_name}"