kernels-community
/

flash-attn2

Kernels

Model card | Files and versions

xet

Community

drbh committed on Jul 9

Commit

56449c1

1 Parent(s): 09eec95

feat: bump api on readme

Browse files

Files changed (2) hide show

README.md +8 -14
scripts/readme_example.py +3 -8

README.md CHANGED Viewed

@@ -30,8 +30,6 @@ torch.manual_seed(42)
 flash_attn = get_kernel("kernels-community/flash-attn")
 device = torch.device("cuda")
-print("Flash Attention functions:", [i for i in dir(flash_attn) if i.startswith("mha")])
 # Create test tensors
 B, S, H, D = 2, 5, 4, 8  # batch, seq_len, heads, head_dim
 q = k = v = torch.randn(B, S, H, D, device=device, dtype=torch.float16)
@@ -46,12 +44,11 @@ def reference_attention(query, key, value, causal=False):
 # 1. Standard attention
 print("\n1. Standard attention:")
 out_ref = reference_attention(q, k, v)
-out_flash = flash_attn.mha_fwd(
     q=q,
     k=k,
     v=v,
     is_causal=False,
-    softmax_scale=1.0 / (D ** 0.5),  # scale factor
 )[0]
 print(f"Reference output: {out_ref.shape}")
 print(f"Flash output: {out_flash.shape}")
@@ -61,12 +58,11 @@ print(f"Outputs close: {torch.allclose(out_flash, out_ref, atol=1e-2, rtol=1e-3)
 print("\n2. Causal attention:")
 out_ref_causal = reference_attention(q, k, v, causal=True)
-out_causal = flash_attn.mha_fwd(
     q=q,
     k=k,
     v=v,
     is_causal=True,
-    softmax_scale=1.0 / (D ** 0.5),  # scale factor
 )[0]
 print(f"Reference causal output: {out_ref_causal.shape}")
 print(f"Flash causal output: {out_causal.shape}")
@@ -74,7 +70,7 @@ print(f"Outputs close: {torch.allclose(out_causal, out_ref_causal, atol=1e-2, rt
 def var_reference_attention(q, k, v, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, causal=False):
     batch_size = cu_seqlens_q.shape[0] - 1
-    # Return output in packed format
     total_tokens_q = q.shape[0]
     out = torch.zeros((total_tokens_q, q.shape[1], q.shape[2]), device=q.device, dtype=q.dtype)
@@ -111,7 +107,7 @@ cu_k = torch.tensor([0, 4, 9, 12], device=device, dtype=torch.int32)
 out_var_ref = var_reference_attention(q_var, k_var, v_var, cu_q, cu_k, max_seqlen_q=4, max_seqlen_k=5, causal=False)
 # Custom function to handle variable
-out_var = flash_attn.mha_varlen_fwd(
     q=q_var,
     k=k_var,
     v=v_var,
@@ -119,7 +115,6 @@ out_var = flash_attn.mha_varlen_fwd(
     cu_seqlens_k=cu_k,
     max_seqlen_q=4,
     max_seqlen_k=5,
-    softmax_scale=1.0 / (D ** 0.5),  # scale factor
 )[0]
 print(f"Variable length output: {out_var.shape}")
 print(f"Reference variable length output: {out_var_ref.shape}")
@@ -133,21 +128,20 @@ uv run scripts/readme_example.py
 ```
 ```txt
-Reading inline script metadata from `flash-attn/scripts/readme_example.py`
-Fetching 4 files: 100%|█████████████████████████████████████████| 4/4 [00:00<00:00, 33354.31it/s]
-Flash Attention functions: ['mha_bwd', 'mha_fwd', 'mha_fwd_kvcache', 'mha_varlen_bwd', 'mha_varlen_fwd']
 1. Standard attention:
 Reference output: torch.Size([2, 5, 4, 8])
 Flash output: torch.Size([2, 5, 4, 8])
 Outputs close: True
-1. Causal attention:
 Reference causal output: torch.Size([2, 5, 4, 8])
 Flash causal output: torch.Size([2, 5, 4, 8])
 Outputs close: True
-1. Variable length sequences:
 Variable length output: torch.Size([10, 4, 8])
 Reference variable length output: torch.Size([10, 4, 8])
 Outputs close: True

 flash_attn = get_kernel("kernels-community/flash-attn")
 device = torch.device("cuda")
 # Create test tensors
 B, S, H, D = 2, 5, 4, 8  # batch, seq_len, heads, head_dim
 q = k = v = torch.randn(B, S, H, D, device=device, dtype=torch.float16)
 # 1. Standard attention
 print("\n1. Standard attention:")
 out_ref = reference_attention(q, k, v)
+out_flash = flash_attn.fwd(
     q=q,
     k=k,
     v=v,
     is_causal=False,
 )[0]
 print(f"Reference output: {out_ref.shape}")
 print(f"Flash output: {out_flash.shape}")
 print("\n2. Causal attention:")
 out_ref_causal = reference_attention(q, k, v, causal=True)
+out_causal = flash_attn.fwd(
     q=q,
     k=k,
     v=v,
     is_causal=True,
 )[0]
 print(f"Reference causal output: {out_ref_causal.shape}")
 print(f"Flash causal output: {out_causal.shape}")
 def var_reference_attention(q, k, v, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, causal=False):
     batch_size = cu_seqlens_q.shape[0] - 1
+    # Return output in packed format (same as flash attention)
     total_tokens_q = q.shape[0]
     out = torch.zeros((total_tokens_q, q.shape[1], q.shape[2]), device=q.device, dtype=q.dtype)
 out_var_ref = var_reference_attention(q_var, k_var, v_var, cu_q, cu_k, max_seqlen_q=4, max_seqlen_k=5, causal=False)
 # Custom function to handle variable
+out_var = flash_attn.varlen_fwd(
     q=q_var,
     k=k_var,
     v=v_var,
     cu_seqlens_k=cu_k,
     max_seqlen_q=4,
     max_seqlen_k=5,
 )[0]
 print(f"Variable length output: {out_var.shape}")
 print(f"Reference variable length output: {out_var_ref.shape}")
 ```
 ```txt
+Reading inline script metadata from `scripts/readme_example.py`
+Fetching 20 files: 100%|██████████████████████████████████████████████████| 20/20 [00:00<00:00, 16371.21it/s]
 1. Standard attention:
 Reference output: torch.Size([2, 5, 4, 8])
 Flash output: torch.Size([2, 5, 4, 8])
 Outputs close: True
+2. Causal attention:
 Reference causal output: torch.Size([2, 5, 4, 8])
 Flash causal output: torch.Size([2, 5, 4, 8])
 Outputs close: True
+3. Variable length sequences:
 Variable length output: torch.Size([10, 4, 8])
 Reference variable length output: torch.Size([10, 4, 8])
 Outputs close: True

scripts/readme_example.py CHANGED Viewed

@@ -13,8 +13,6 @@ torch.manual_seed(42)
 flash_attn = get_kernel("kernels-community/flash-attn")
 device = torch.device("cuda")
-print("Flash Attention functions:", [i for i in dir(flash_attn) if i.startswith("mha")])
 # Create test tensors
 B, S, H, D = 2, 5, 4, 8  # batch, seq_len, heads, head_dim
 q = k = v = torch.randn(B, S, H, D, device=device, dtype=torch.float16)
@@ -29,12 +27,11 @@ def reference_attention(query, key, value, causal=False):
 # 1. Standard attention
 print("\n1. Standard attention:")
 out_ref = reference_attention(q, k, v)
-out_flash = flash_attn.mha_fwd(
     q=q,
     k=k,
     v=v,
     is_causal=False,
-    softmax_scale=1.0 / (D ** 0.5),  # scale factor
 )[0]
 print(f"Reference output: {out_ref.shape}")
 print(f"Flash output: {out_flash.shape}")
@@ -44,12 +41,11 @@ print(f"Outputs close: {torch.allclose(out_flash, out_ref, atol=1e-2, rtol=1e-3)
 print("\n2. Causal attention:")
 out_ref_causal = reference_attention(q, k, v, causal=True)
-out_causal = flash_attn.mha_fwd(
     q=q,
     k=k,
     v=v,
     is_causal=True,
-    softmax_scale=1.0 / (D ** 0.5),  # scale factor
 )[0]
 print(f"Reference causal output: {out_ref_causal.shape}")
 print(f"Flash causal output: {out_causal.shape}")
@@ -94,7 +90,7 @@ cu_k = torch.tensor([0, 4, 9, 12], device=device, dtype=torch.int32)
 out_var_ref = var_reference_attention(q_var, k_var, v_var, cu_q, cu_k, max_seqlen_q=4, max_seqlen_k=5, causal=False)
 # Custom function to handle variable
-out_var = flash_attn.mha_varlen_fwd(
     q=q_var,
     k=k_var,
     v=v_var,
@@ -102,7 +98,6 @@ out_var = flash_attn.mha_varlen_fwd(
     cu_seqlens_k=cu_k,
     max_seqlen_q=4,
     max_seqlen_k=5,
-    softmax_scale=1.0 / (D ** 0.5),  # scale factor
 )[0]
 print(f"Variable length output: {out_var.shape}")
 print(f"Reference variable length output: {out_var_ref.shape}")

 flash_attn = get_kernel("kernels-community/flash-attn")
 device = torch.device("cuda")
 # Create test tensors
 B, S, H, D = 2, 5, 4, 8  # batch, seq_len, heads, head_dim
 q = k = v = torch.randn(B, S, H, D, device=device, dtype=torch.float16)
 # 1. Standard attention
 print("\n1. Standard attention:")
 out_ref = reference_attention(q, k, v)
+out_flash = flash_attn.fwd(
     q=q,
     k=k,
     v=v,
     is_causal=False,
 )[0]
 print(f"Reference output: {out_ref.shape}")
 print(f"Flash output: {out_flash.shape}")
 print("\n2. Causal attention:")
 out_ref_causal = reference_attention(q, k, v, causal=True)
+out_causal = flash_attn.fwd(
     q=q,
     k=k,
     v=v,
     is_causal=True,
 )[0]
 print(f"Reference causal output: {out_ref_causal.shape}")
 print(f"Flash causal output: {out_causal.shape}")
 out_var_ref = var_reference_attention(q_var, k_var, v_var, cu_q, cu_k, max_seqlen_q=4, max_seqlen_k=5, causal=False)
 # Custom function to handle variable
+out_var = flash_attn.varlen_fwd(
     q=q_var,
     k=k_var,
     v=v_var,
     cu_seqlens_k=cu_k,
     max_seqlen_q=4,
     max_seqlen_k=5,
 )[0]
 print(f"Variable length output: {out_var.shape}")
 print(f"Reference variable length output: {out_var_ref.shape}")