
Commit 9386701

eqy authored and pytorchmergebot committed
[cuDNN][SDPA] cuDNN SDPA refactor/cleanup, nested tensor backward, test priority bump for sm90, sm100 (pytorch#149282)
Cleans up tuple/tensor boilerplate in cuDNN SDPA in preparation for the nested/ragged tensor backward.

Pull Request resolved: pytorch#149282
Approved by: https://github.com/drisspg
1 parent 8521a69 commit 9386701

File tree

11 files changed (+996, -422 lines)


aten/src/ATen/native/cudnn/MHA.cpp

Lines changed: 706 additions & 341 deletions
Large diffs are not rendered by default.
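
Note (not part of the diff): since this large refactor is not rendered, a minimal usage sketch of the path it touches may help. The sketch below assumes a libtorch build with CUDA; whether the cuDNN SDPA backend is actually selected at runtime depends on the backend-selection logic and the hardware (the PR title also mentions a test priority bump for sm90/sm100).

    // Illustrative only; not code from this commit.
    #include <torch/torch.h>

    int main() {
      if (!torch::cuda::is_available()) return 0;  // nothing to demo without a GPU
      const int64_t b = 2, h = 8, s = 128, d = 64;
      auto opts = torch::dtype(torch::kHalf).device(torch::kCUDA);
      auto q = torch::randn({b, h, s, d}, opts).requires_grad_(true);
      auto k = torch::randn({b, h, s, d}, opts).requires_grad_(true);
      auto v = torch::randn({b, h, s, d}, opts).requires_grad_(true);
      // Forward may route to the fused cuDNN kernel (_cudnn_attention_forward).
      auto out = at::scaled_dot_product_attention(q, k, v, /*attn_mask=*/{},
                                                  /*dropout_p=*/0.0, /*is_causal=*/true);
      // Backward then reaches _scaled_dot_product_cudnn_attention_backward_cuda,
      // which (after this commit) forwards to the new _cudnn_attention_backward op.
      out.sum().backward();
      return 0;
    }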

aten/src/ATen/native/cudnn/MHA.h

Lines changed: 27 additions & 0 deletions
@@ -70,4 +70,31 @@ void run_cudnn_SDP_bprop(
     const Tensor& dropoutseed,
     const Tensor& dropoutoffset);
 
+void run_cudnn_SDP_bprop_nestedtensor(
+    int64_t b,
+    int64_t h_q,
+    int64_t h_k,
+    int64_t h_v,
+    int64_t s_q,
+    int64_t s_kv,
+    int64_t d_qk,
+    int64_t d_v,
+    float scaling_factor,
+    bool is_causal,
+    float dropout_probability,
+    const Tensor& cum_seqlen_q,
+    const Tensor& cum_seqlen_kv,
+    const Tensor& q,
+    const Tensor& k,
+    const Tensor& v,
+    const std::optional<Tensor>& attn_bias,
+    const Tensor& o,
+    const Tensor& dO,
+    const Tensor& softmaxstats,
+    Tensor& dQ,
+    Tensor& dK,
+    Tensor& dV,
+    const Tensor& dropoutseed,
+    const Tensor& dropoutoffset);
+
 } // namespace at::native
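
Note (not part of the diff): the cum_seqlen_q / cum_seqlen_kv arguments of the new entry point describe a ragged batch as cumulative offsets with batch_size + 1 entries. A minimal sketch of that layout, with a hypothetical helper name and the common int32 convention for offset tensors:

    // Illustrative only; build_cum_seqlens is not part of this commit.
    #include <torch/torch.h>
    #include <vector>

    at::Tensor build_cum_seqlens(const std::vector<int64_t>& seqlens) {
      auto lens = torch::tensor(seqlens);                       // int64 CPU tensor
      auto cum = torch::cat({torch::zeros({1}, lens.options()),
                             torch::cumsum(lens, /*dim=*/0)});  // {0, s0, s0+s1, ...}
      return cum.to(torch::kInt);  // offsets are commonly passed as int32
    }

    // e.g. lengths {3, 5, 2} -> offsets {0, 3, 8, 10}; the batch size is then
    // offsets.size(0) - 1, matching cum_seq_q.size(0) - 1 used later in this diff.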

aten/src/ATen/native/native_functions.yaml

Lines changed: 6 additions & 0 deletions
@@ -14958,6 +14958,7 @@
 - func: _scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor attn_bias, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, *, float? scale=None) -> (Tensor, Tensor, Tensor)
   dispatch:
     CUDA: _scaled_dot_product_cudnn_attention_backward_cuda
+    NestedTensorCUDA: _scaled_dot_product_cudnn_attention_nestedtensor_backward_cuda
   tags: nondeterministic_seeded
 
 - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor rng_state, Tensor unused, Tensor debug_attn_mask)
@@ -14990,6 +14991,11 @@
     CUDA: _cudnn_attention_forward
   tags: nondeterministic_seeded
 
+- func: _cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor attn_bias, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, *, float? scale=None) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CUDA: _cudnn_attention_backward
+  tags: nondeterministic_seeded
+
 - func: _triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor
   variants: function
   dispatch:
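
Note (not part of the diff): the new schema line generates a C++ entry point, at::_cudnn_attention_backward, whose CUDA kernel is the renamed function in attention_backward.cu below. A hedged sketch of a direct call, with placeholder tensors and the dense (non-varlen) convention of passing undefined cum_seq tensors:

    // Illustrative only; not code from this commit.
    #include <ATen/ATen.h>
    #include <optional>

    void call_cudnn_sdpa_backward(const at::Tensor& grad_out, const at::Tensor& q,
                                  const at::Tensor& k, const at::Tensor& v,
                                  const at::Tensor& out, const at::Tensor& lse,
                                  const at::Tensor& seed, const at::Tensor& offset) {
      // Dense CUDA tensors hit the CUDA: _cudnn_attention_backward kernel;
      // undefined cum_seq_q / cum_seq_k selects the non-nested branch.
      auto [dq, dk, dv] = at::_cudnn_attention_backward(
          grad_out, q, k, v, out, lse, seed, offset,
          /*attn_bias=*/at::Tensor(), /*cum_seq_q=*/at::Tensor(),
          /*cum_seq_k=*/at::Tensor(), /*max_q=*/q.size(2), /*max_k=*/k.size(2),
          /*dropout_p=*/0.0, /*is_causal=*/false, /*scale=*/std::nullopt);
      (void)dq; (void)dk; (void)dv;
    }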

aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp

Lines changed: 57 additions & 0 deletions
@@ -349,6 +349,63 @@ _scaled_dot_product_cudnn_attention_nestedtensor_cuda(
   return std::make_tuple(std::move(attention), std::move(log_sumexp), cumulative_sequence_length_q, cumulative_sequence_length_kv, max_seqlen_batch_q, max_seqlen_batch_kv, std::move(cudnn_seed), std::move(cudnn_offset), Tensor());
 }
 
+std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_cudnn_attention_nestedtensor_backward_cuda(
+    const Tensor& grad_out,
+    const Tensor& query,
+    const Tensor& key,
+    const Tensor& value,
+    const Tensor& out,
+    const Tensor& logsumexp,
+    const Tensor& philox_seed,
+    const Tensor& philox_offset,
+    const Tensor& attn_bias,
+    const Tensor& cum_seq_q,
+    const Tensor& cum_seq_k,
+    const int64_t max_q,
+    const int64_t max_k,
+    double dropout_p,
+    bool is_causal,
+    std::optional<double> scale) {
+  if (!grad_out.defined()) {
+    return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
+  }
+  auto [
+      grad_out_buffer_reshaped,
+      query_buffer_reshaped,
+      key_buffer_reshaped,
+      value_buffer_reshaped,
+      output_buffer_reshaped] =
+      preprocessing::sdpa_nested_preprocessing_backward(
+          grad_out,
+          query,
+          key,
+          value,
+          out,
+          cum_seq_q,
+          cum_seq_k,
+          max_q,
+          max_k);
+
+  auto [dq, dk, dv] = at::_cudnn_attention_backward(grad_out_buffer_reshaped,
+                                                    query_buffer_reshaped,
+                                                    key_buffer_reshaped,
+                                                    value_buffer_reshaped,
+                                                    output_buffer_reshaped,
+                                                    logsumexp,
+                                                    philox_seed,
+                                                    philox_offset,
+                                                    attn_bias,
+                                                    cum_seq_q,
+                                                    cum_seq_k,
+                                                    max_q,
+                                                    max_k,
+                                                    dropout_p,
+                                                    is_causal,
+                                                    scale);
+  return std::make_tuple(dq, dk, dv);
+}
+
+
 std::tuple<at::Tensor, at::Tensor, at::Tensor> _scaled_dot_product_flash_attention_backward_nested(
     const at::Tensor& grad_out_,
     const at::Tensor& query,
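
Note (not part of the diff): sdpa_nested_preprocessing_backward hands the dense op packed "buffer_reshaped" views of the ragged inputs. The packing idea, illustrated with a hypothetical helper (each sequence of shape (s_i, h, d) is concatenated into one (sum s_i, h, d) buffer addressed via the cumulative offsets):

    // Illustrative only; not code from this commit.
    #include <torch/torch.h>
    #include <vector>

    at::Tensor pack_ragged(const std::vector<at::Tensor>& seqs) {
      // Variable-length sequences are stored back to back instead of padded
      // to (B, S_max, h, d); cum_seq_q / cum_seq_k record where each one starts.
      return torch::cat(seqs, /*dim=*/0);
    }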

aten/src/ATen/native/transformers/cuda/attention.cu

Lines changed: 0 additions & 10 deletions
@@ -848,16 +848,6 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt, Tensor, Ten
     // TODO(eqy): support debug_attn_mask
     return std::make_tuple(std::move(attention), std::move(log_sumexp), Tensor(), Tensor(), max_seqlen_batch_q, max_seqlen_batch_kv, std::move(cudnn_seed), std::move(cudnn_offset), Tensor());
   } else {
-    //auto [
-    //    query_buffer_reshaped,
-    //    key_buffer_reshaped,
-    //    value_buffer_reshaped,
-    //    cumulative_sequence_length_q,
-    //    cumulative_sequence_length_kv,
-    //    max_seqlen_batch_q,
-    //    max_seqlen_batch_kv,
-    //    output_shape] = preprocessing::sdpa_nested_preprocessing(query, key, value);
-    // C10_LOG_API_USAGE_ONCE("torch.sdpa.flash_attention_cudnn");
     // TODO(eqy): debug mask support
     // BHSD ...
     const int64_t batch_size = cumulative_sequence_length_q.value().size(0) - 1;

aten/src/ATen/native/transformers/cuda/attention_backward.cu

Lines changed: 145 additions & 47 deletions
@@ -24,6 +24,8 @@
 #include <ATen/Functions.h>
 #include <ATen/NativeFunctions.h>
 #else
+#include <ATen/ops/_cudnn_attention_backward.h>
+#include <ATen/ops/_cudnn_attention_backward_native.h>
 #include <ATen/ops/_flash_attention_backward.h>
 #include <ATen/ops/_flash_attention_backward_native.h>
 #include <ATen/ops/_efficient_attention_backward.h>
@@ -170,7 +172,7 @@ std::tuple<Tensor, Tensor, Tensor> _flash_attention_backward(
     return std::make_tuple(Tensor(), Tensor(), Tensor());
 }
 
-std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_cudnn_attention_backward_cuda(
+std::tuple<Tensor, Tensor, Tensor> _cudnn_attention_backward(
     const Tensor& grad_out,
     const Tensor& query,
     const Tensor& key,
@@ -197,57 +199,117 @@ std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_cudnn_attention_backward_
     }
   }
 
-  const int64_t batch_size = query.size(0);
-  const int64_t num_heads = query.size(1);
-  const int64_t head_dim_qk = query.size(3);
-  const int64_t head_dim_v = value.size(3);
+  const bool is_nested = cum_seq_q.defined();
   const int64_t max_seqlen_batch_q = query.size(2);
   const int64_t max_seqlen_batch_k = key.size(2);
 
-  // This is needed because SaveVariable automatically converts
-  // std::optional to undefined tensor
-  std::optional<Tensor> attn_bias_;
-  if (attn_bias.defined()) {
-    attn_bias_ = attn_bias;
-  }
-  if (attn_bias_.has_value()) {
-    const auto bias_dim = attn_bias_.value().dim();
-    if (bias_dim == 2) {
-      attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
-    } else if (bias_dim == 3) {
-      attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
-    } else {
-      TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D");
-      attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k});
+  if (!is_nested) {
+    const int64_t batch_size = query.size(0);
+    const int64_t num_heads = query.size(1);
+    const int64_t head_dim_qk = query.size(3);
+    const int64_t head_dim_v = value.size(3);
+
+    // This is needed because SaveVariable automatically converts
+    // std::optional to undefined tensor
+    std::optional<Tensor> attn_bias_;
+    if (attn_bias.defined()) {
+      attn_bias_ = attn_bias;
+    }
+    if (attn_bias_.has_value()) {
+      const auto bias_dim = attn_bias_.value().dim();
+      if (bias_dim == 2) {
+        attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
+      } else if (bias_dim == 3) {
+        attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
+      } else {
+        TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D");
+        attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k});
+      }
     }
-  }
 
-  const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float();
-  auto dq = at::empty_like(query);
-  auto dk = at::empty_like(key);
-  auto dv = at::empty_like(value);
-  run_cudnn_SDP_bprop(batch_size /*int64_t b*/,
-                      num_heads /*int64_t h*/,
-                      max_q/*int64_t s_q*/,
-                      max_k/*int64_t s_kv*/,
-                      head_dim_qk /*int64_t d_qk*/,
-                      head_dim_v /*int64_t d_v*/,
-                      softmax_scale /*float scaling_factor*/,
-                      is_causal /*bool is_causal*/,
-                      dropout_p /*float dropout_probability*/,
-                      query /*const Tensor& q*/,
-                      key /*const Tensor& k*/,
-                      value /*const Tensor& v*/,
-                      attn_bias_ /*const std::optional<Tensor>& attn_bias*/,
-                      out /*const Tensor& o*/,
-                      grad_out/*const Tensor& dO*/,
-                      logsumexp.unsqueeze(-1)/*const Tensor& softmaxstats*/,
-                      dq/*Tensor& dQ*/,
-                      dk/*Tensor& dK*/,
-                      dv/*Tensor& dV*/,
-                      philox_seed/*Tensor& dropoutseed*/,
-                      philox_offset/*Tensor& dropoutoffset*/);
-  return std::make_tuple(std::move(dq), std::move(dk), std::move(dv));
+    const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float();
+    auto dq = at::empty_like(query);
+    auto dk = at::empty_like(key);
+    auto dv = at::empty_like(value);
+    run_cudnn_SDP_bprop(batch_size /*int64_t b*/,
+                        num_heads /*int64_t h*/,
+                        max_q/*int64_t s_q*/,
+                        max_k/*int64_t s_kv*/,
+                        head_dim_qk /*int64_t d_qk*/,
+                        head_dim_v /*int64_t d_v*/,
+                        softmax_scale /*float scaling_factor*/,
+                        is_causal /*bool is_causal*/,
+                        dropout_p /*float dropout_probability*/,
+                        query /*const Tensor& q*/,
+                        key /*const Tensor& k*/,
+                        value /*const Tensor& v*/,
+                        attn_bias_ /*const std::optional<Tensor>& attn_bias*/,
+                        out /*const Tensor& o*/,
+                        grad_out/*const Tensor& dO*/,
+                        logsumexp.unsqueeze(-1)/*const Tensor& softmaxstats*/,
+                        dq/*Tensor& dQ*/,
+                        dk/*Tensor& dK*/,
+                        dv/*Tensor& dV*/,
+                        philox_seed/*Tensor& dropoutseed*/,
+                        philox_offset/*Tensor& dropoutoffset*/);
+    return std::make_tuple(std::move(dq), std::move(dk), std::move(dv));
+  } else {
+    // BHSD ...
+    const int64_t batch_size = cum_seq_q.size(0) - 1;
+    const int64_t num_heads_q = query.size(-2);
+    const int64_t num_heads_k = key.size(-2);
+    const int64_t num_heads_v = value.size(-2);
+    const int64_t head_dim_qk = query.size(-1);
+    const int64_t head_dim_v = value.size(-1);
+    std::optional<Tensor> attn_bias_;
+    if (attn_bias.defined()) {
+      attn_bias_ = attn_bias;
+    }
+    if (attn_bias_.has_value()) {
+      const auto bias_dim = attn_bias_.value().dim();
+      if (bias_dim == 2) {
+        attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
+      } else if (bias_dim == 3) {
+        attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
+      } else {
+        attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k});
+        TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D");
+      }
+    }
+
+    auto dq = at::empty_like(query);
+    auto dk = at::empty_like(key);
+    auto dv = at::empty_like(value);
+
+    const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked();
+    run_cudnn_SDP_bprop_nestedtensor(
+        batch_size,
+        num_heads_q,
+        num_heads_k,
+        num_heads_v,
+        max_seqlen_batch_q,
+        max_seqlen_batch_k,
+        head_dim_qk,
+        head_dim_v,
+        softmax_scale,
+        is_causal,
+        dropout_p,
+        cum_seq_q,
+        cum_seq_k,
+        query,
+        key,
+        value,
+        attn_bias_,
+        out,
+        grad_out,
+        logsumexp,
+        dq,
+        dk,
+        dv,
+        philox_seed,
+        philox_offset);
+    return std::make_tuple(std::move(dq), std::move(dk), std::move(dv));
+  }
 }
 
 std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
@@ -950,4 +1012,40 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> _scaled_dot_product_e
       grad_q.transpose(1, 2), grad_k.transpose(1, 2), grad_v.transpose(1, 2), grad_bias);
 }
 
+std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_cudnn_attention_backward_cuda(
+    const Tensor& grad_out,
+    const Tensor& query,
+    const Tensor& key,
+    const Tensor& value,
+    const Tensor& out,
+    const Tensor& logsumexp,
+    const Tensor& philox_seed,
+    const Tensor& philox_offset,
+    const Tensor& attn_bias,
+    const Tensor& cum_seq_q,
+    const Tensor& cum_seq_k,
+    const int64_t max_q,
+    const int64_t max_k,
+    double dropout_p,
+    bool is_causal,
+    std::optional<double> scale) {
+  return at::_cudnn_attention_backward(
+      grad_out,
+      query,
+      key,
+      value,
+      out,
+      logsumexp,
+      philox_seed,
+      philox_offset,
+      attn_bias,
+      cum_seq_q,
+      cum_seq_k,
+      max_q,
+      max_k,
+      dropout_p,
+      is_causal,
+      scale);
+}
+
 } // namespace at::native
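
Note (not part of the diff): in both branches above, sdp::calculate_scale falls back to the conventional 1/sqrt(head_dim) softmax scale when the optional scale argument is empty. A simplified sketch of that default, for reference:

    // Illustrative only; not code from this commit.
    #include <cmath>
    #include <cstdint>
    #include <optional>

    inline double softmax_scale_or_default(std::optional<double> scale, int64_t head_dim_qk) {
      // Matches the usual scaled-dot-product-attention convention of 1/sqrt(d_qk).
      return scale.has_value() ? *scale : 1.0 / std::sqrt(static_cast<double>(head_dim_qk));
    }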
