drbh committed
Commit 3099d65 · Parent(s): 809bfb7

fix: simplify and align signatures further

Files changed:
- .gitignore +5 -1
- flash_attn/flash_api.cpp +17 -33
- tests/test_flash_attn.py +56 -31
- torch-ext/torch_binding.cpp +4 -1
- torch-ext/torch_binding.h +9 -9
.gitignore
CHANGED

@@ -5,4 +5,8 @@ cmake
 result
 CMakeLists.txt
 setup.py
-pyproject.toml
+pyproject.toml
+.venv
+torch-ext/registration.h
+torch-ext/flash_attn/*.so
+torch-ext/flash_attn/_ops.py
flash_attn/flash_api.cpp
CHANGED

@@ -1475,40 +1475,24 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he
 }
 } // namespace FLASH_NAMESPACE
 
-//
+// Prefer the most minimal wrapper possible to avoid unnecessary copies or conversions.
 std::vector<torch::Tensor>
-mha_fwd(
-    [old parameter list truncated in the rendered diff]
-
-    // Prepare the optional arguments as non-const references.
-    std::optional<at::Tensor> out = out_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(out_.value())) : std::nullopt;
-    std::optional<at::Tensor> alibi_slopes = alibi_slopes_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(alibi_slopes_.value())) : std::nullopt;
-
-    if (!out.has_value()){
-        out = torch::empty_like(q);
-    }
-
-    // Convert double to float and int64_t to int.
-    float p_dropout_float = static_cast<float>(p_dropout);
-    float softmax_scale_float = static_cast<float>(softmax_scale);
-    float softcap_float = static_cast<float>(softcap);
-    int window_size_left_int = static_cast<int>(window_size_left);
-    int window_size_right_int = static_cast<int>(window_size_right);
-
-    return FLASH_NAMESPACE::mha_fwd(const_cast<at::Tensor &>(q), k, v, out, alibi_slopes, p_dropout_float, softmax_scale_float, is_causal, window_size_left_int, window_size_right_int, softcap_float, return_softmax, gen);
+mha_fwd(torch::Tensor &q, const torch::Tensor &k, const torch::Tensor &v,
+        c10::optional<torch::Tensor> out_,
+        c10::optional<torch::Tensor> alibi_slopes_,
+        const double p_dropout, const double softmax_scale, bool is_causal,
+        const int64_t window_size_left, const int64_t window_size_right,
+        const double softcap, const bool return_softmax,
+        c10::optional<at::Generator> gen_) {
+
+    printf("Confirm this path is taken\n");
+    auto result = FLASH_NAMESPACE::mha_fwd(
+        q, k, v, out_, alibi_slopes_, static_cast<float>(p_dropout),
+        static_cast<float>(softmax_scale), is_causal,
+        static_cast<int>(window_size_left), static_cast<int>(window_size_right),
+        static_cast<float>(softcap), return_softmax, gen_);
+
+    return result;
 }
 
 std::vector<torch::Tensor>
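The removed wrapper allocated the output with torch::empty_like(q) when out_ was absent and narrowed the double/int64_t arguments into named locals; the new wrapper forwards the optionals unchanged and narrows inline with static_cast (schema "float"/"int" arguments arrive in C++ as double/int64_t). A rough Python-side sketch of the two calling styles, assuming the binding is exposed as the flash_attn.mha_fwd used in the test below; passing None for the optional tensors is inferred from the Tensor? schema and is not something this commit's test does:

import torch
import flash_attn  # extension module exercised by tests/test_flash_attn.py

shape = (1, 128, 8, 64)  # (batch, seqlen, num_heads, head_dim)
q = torch.randn(shape, device="cuda", dtype=torch.float16)
k = torch.randn(shape, device="cuda", dtype=torch.float16)
v = torch.randn(shape, device="cuda", dtype=torch.float16)

# Style 1: preallocate the output buffer, mirroring the argument values in the test.
out_buf = torch.empty(shape, device="cuda", dtype=torch.float16)
out, softmax_lse, p, rng_state = flash_attn.mha_fwd(
    q, k, v,
    out_buf,   # out_
    None,      # alibi_slopes_
    0.0,       # p_dropout
    1.0,       # softmax_scale
    False,     # is_causal
    0, 0,      # window_size_left, window_size_right
    0.0,       # softcap
    False,     # return_softmax
    None,      # gen_
)

# Style 2: pass None for out_ and let the kernel allocate the output
# (the removed wrapper did that allocation explicitly; the inner
# FLASH_NAMESPACE::mha_fwd is assumed to handle it now).
out2, *_ = flash_attn.mha_fwd(q, k, v, None, None, 0.0, 1.0, False, 0, 0, 0.0, False, None)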
tests/test_flash_attn.py
CHANGED

@@ -1,38 +1,63 @@
 import torch
-
 import flash_attn
 
+# make reproducible
+torch.manual_seed(0)
+
+def _attention_torch(query, key, value, *, backend):
+    query, key, value = (x.transpose(1, 2).contiguous() for x in (query, key, value))
+    with torch.nn.attention.sdpa_kernel(backend):
+        out = torch.nn.functional.scaled_dot_product_attention(query, key, value)
+    out = out.transpose(1, 2).contiguous()
+    return out
 
-# TODO: improve and add more tests
 def test_flash_attn():
-    [old test body truncated in the rendered diff]
+    # ===== Testing shape: (1, 4224, 24, 128) =====
+    batch_size = 1
+    seq_len = 4224
+    num_attention_heads = 24
+    attention_head_dim = 128
+
+    shape = (batch_size, seq_len, num_attention_heads, attention_head_dim)
+
+    query = torch.randn(shape, device="cuda", dtype=torch.float16)
+    key = torch.randn(shape, device="cuda", dtype=torch.float16)
+    value = torch.randn(shape, device="cuda", dtype=torch.float16)
+
+    golden_truth = _attention_torch(query, key, value, backend=torch.nn.attention.SDPBackend.MATH)
+
+    print("Golden truth shape:", golden_truth.shape)
+
+    # print query sum
+    print("Query sum:", query.sum().item())
+
+    # now use the flash attention
+    out, softmax_lse, p, rng_state = flash_attn.mha_fwd(
+        query,
+        key,
+        value,
+        torch.empty(shape, device="cuda", dtype=torch.half),
+        torch.empty(num_attention_heads, device="cuda", dtype=torch.float32),
+        0.0,
+        1.0,
+        False,
+        0,
+        0,
+        0.0,
+        False,
+        None,
     )
 
-
+    print("Flash attention output shape:", out.shape)
+
+    # print query sum
+    print(query.sum().item())
+
+    # compare
+    diff = (out - golden_truth).abs().max()
+    print("Max absolute difference:", diff.item())
+
+    assert out.shape == (1, 4224, 24, 128)
+    assert diff < 1e-2
+
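The new test allocates CUDA float16 tensors, so it needs a GPU to run; something like "pytest -s tests/test_flash_attn.py" (with -s so the debug prints stay visible) is one way to exercise it, though no runner invocation is part of this commit.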
torch-ext/torch_binding.cpp
CHANGED

@@ -14,7 +14,10 @@
 // }
 
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
-  ops.def("mha_fwd(Tensor! q, Tensor! k, Tensor! v, Tensor? out_, Tensor? [rest of the old one-line schema truncated in the rendered diff]
+  ops.def("mha_fwd(Tensor! q, Tensor! k, Tensor! v, Tensor? out_, Tensor? "
+          "alibi_slopes_, float p_dropout, float softmax_scale, bool "
+          "is_causal, int window_size_left, int window_size_right, float "
+          "softcap, bool return_softmax, Generator? gen_) -> Tensor[]");
   ops.impl("mha_fwd", torch::kCUDA, &mha_fwd);
 
   ops.def("mha_varlen_fwd(Tensor! q, Tensor! k, Tensor! v, Tensor? out_, Tensor cu_seqlens_q, Tensor cu_seqlens_k, Tensor? seqused_k_, Tensor? leftpad_k_, Tensor? block_table_, Tensor? alibi_slopes_, int max_seqlen_q, int max_seqlen_k, float p_dropout, float softmax_scale, bool zero_tensors, bool is_causal, int window_size_left, int window_size_right, float softcap, bool return_softmax, Generator? gen_) -> Tensor[]");
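In this schema, Tensor? and Generator? parameters accept None from Python, and the "float"/"int" schema types arrive in C++ as double/int64_t, which is what the wrapper in flash_attn/flash_api.cpp narrows back down with static_cast. A rough sketch of calling the raw registered op, assuming the library is loaded under a namespace written here as _flash_attn_ops purely for illustration (the real name comes from TORCH_EXTENSION_NAME and is re-exported through torch-ext/flash_attn/_ops.py):

import torch

# Hypothetical namespace; substitute whatever TORCH_EXTENSION_NAME expands to.
mha_fwd = torch.ops._flash_attn_ops.mha_fwd

q = torch.randn(1, 64, 4, 64, device="cuda", dtype=torch.float16)
k = torch.randn_like(q)
v = torch.randn_like(q)

# Tensor?/Generator? -> None is allowed; float -> Python float; int -> Python int.
result = mha_fwd(q, k, v, None, None, 0.0, 64 ** -0.5, False, 0, 0, 0.0, False, None)
out = result[0]  # schema returns Tensor[]; element 0 is the attention output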
torch-ext/torch_binding.h
CHANGED

@@ -3,19 +3,19 @@
 #include <torch/torch.h>
 
 std::vector<torch::Tensor>
-mha_fwd(
-    const torch::Tensor &k,
-    const torch::Tensor &v,
-    [two parameter lines truncated in the rendered diff]
+mha_fwd(torch::Tensor &q,
+        const torch::Tensor &k,
+        const torch::Tensor &v,
+        c10::optional<torch::Tensor> out_,
+        c10::optional<torch::Tensor> alibi_slopes_,
         const double p_dropout,
         const double softmax_scale,
-    bool is_causal
-    const int64_t window_size_left
+        bool is_causal,
+        const int64_t window_size_left,
         const int64_t window_size_right,
         const double softcap,
-    const bool return_softmax
-    [parameter line truncated in the rendered diff]
+        const bool return_softmax,
+        c10::optional<at::Generator> gen_);
 
 std::vector<torch::Tensor>
 mha_varlen_fwd(