rebase to latest

LuFinch · LuFinch · commit 89c6a49e712a · 2025-11-18T00:28:04.000-08:00
diff --git a/src/ATen/native/transformers/xpu/flash_attn/sycltla/kernel/xe_sdpa_fwd_bshd.h b/src/ATen/native/transformers/xpu/flash_attn/sycltla/kernel/xe_sdpa_fwd_bshd.h
@@ -164,6 +164,7 @@ class FMHAPrefill {
       Arguments const& args,
       void* workspace) {
     (void)workspace;
+
     return {
         args.mode,
         args.problem_shape,
@@ -438,6 +439,29 @@ class FMHAPrefill {
           prefetch(tiled_prefetch_v, pVgV(_, i, _, nblock));
         }
 
+        // Prevent numerical errors when seq_len_kv is not fully divisible by
+        // QK_BLK_N
+        const int item_id = thread_idx % SubgroupSize;
+        if (seq_len_kv % QK_BLK_N != 0) {
+          int col_idx = item_id + nblock * QK_BLK_N;
+          int remainder = seq_len_kv % QK_BLK_N;
+          int cutoff = (seq_len_kv / QK_BLK_N) * QK_BLK_N + remainder;
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int n = 0; n < FragsN; n++, col_idx += get<1>(MmaAtomShape())) {
+            CUTLASS_PRAGMA_UNROLL
+            for (int m = 0; m < FragsM; m++) {
+              int row_idx = m * Vec + seq_coord;
+              CUTLASS_PRAGMA_UNROLL
+              for (int row = 0; row < Vec; row++, row_idx++) {
+                if (col_idx >= cutoff) {
+                  tSr(row, m, n) = ElementAccumulator{-INFINITY};
+                }
+              }
+            }
+          }
+        }
+
         CollectiveSoftmaxEpilogue softmax(params.softmax);
         softmax(nblock == 0, tSr, max_reg, sum_reg, out_reg);
 
@@ -479,6 +503,8 @@ class FMHAPrefill {
         // mask the elements of each tile using the bottom right masking
         const int item_id = thread_idx % SubgroupSize;
         int col_idx = item_id + (nblock_limit - 1) * QK_BLK_N;
+        int remainder = seq_len_kv % QK_BLK_N;
+        int cutoff = (seq_len_kv / QK_BLK_N) * QK_BLK_N + remainder;
         CUTLASS_PRAGMA_UNROLL
         for (int n = 0; n < FragsN;
              n++, col_idx += get<1>(MmaAtomShape())) { // 4
@@ -487,8 +513,12 @@ class FMHAPrefill {
             int row_idx = m * Vec + seq_coord;
             CUTLASS_PRAGMA_UNROLL
             for (int row = 0; row < Vec; row++, row_idx++) { // 8
-              if (row_idx < first_non_masked_sequence ||
-                  col_idx > row_idx - first_non_masked_sequence) {
+              if (row_idx < first_non_masked_sequence || // for the sequence
+                                                         // that is fully masked
+                  col_idx > row_idx -
+                          first_non_masked_sequence || // for the bottom right
+                                                       // triangular masking
+                  col_idx >= cutoff) { // for seq_len_kv not fully divisible
                 tSr(row, m, n) = ElementAccumulator{-INFINITY};
               }
             }
diff --git a/src/ATen/native/transformers/xpu/flash_attn/sycltla/mha_bwd.cpp b/src/ATen/native/transformers/xpu/flash_attn/sycltla/mha_bwd.cpp
@@ -22,7 +22,7 @@ void compute_o_dot_do(
     const int bidh) {
   // The thread index.
   constexpr int kBlockM = T::kBlockM;
-  // constexpr int kBlockN = T::kBlockN;
+  constexpr int kBlockN = T::kBlockN;
   constexpr int kHeadDim = T::kHeadDim;
   constexpr int kNSGs = T::kNSGs;
   constexpr int SubgroupSize = T::SubgroupSize;
@@ -31,8 +31,8 @@ void compute_o_dot_do(
 
   auto sg = compat::get_nd_item<1>().get_sub_group();
   auto group = compat::get_nd_item<1>().get_group();
-  // auto first_thread_in_sg_idx = sg.get_group_linear_id() *
-  // trait.SubgroupSize;
+  auto first_thread_in_sg_idx = sg.get_group_linear_id() * trait.SubgroupSize;
+
   auto bofst = Boffset(param);
 
   const index_t o_offset = bofst.o_offset(bidb, bidh, m_block * kBlockM);
@@ -209,7 +209,7 @@ CUTLASS_DEVICE void apply_mask_causal(
   auto sg = compat::get_nd_item<1>().get_sub_group();
   auto group = compat::get_nd_item<1>().get_group();
   int sg_local_id = sg.get_local_id();
-  // int sg_group_id = sg.get_group_id();
+  int sg_group_id = sg.get_group_id();
   Tensor rC_2d = make_tensor(rC.data(), convert_layout_2d_layout(rC.layout()));
   CUTLASS_PRAGMA_UNROLL
   for (int n = 0; n < size<1>(tensor); ++n) {
@@ -371,8 +371,8 @@ void dq_dk_dv_1colblock(
   constexpr int kBlockM = Trait::kBlockM;
   constexpr int kBlockN = Trait::kBlockN;
   constexpr bool is_causal = Trait::is_causal;
-  // constexpr int kNSGs = Trait::kNSGs;
-  // constexpr int SubgroupSize = Trait::SubgroupSize;
+  constexpr int kNSGs = Trait::kNSGs;
+  constexpr int SubgroupSize = Trait::SubgroupSize;
   auto sg = compat::get_nd_item<1>().get_sub_group();
   auto group = compat::get_nd_item<1>().get_group();
   auto first_thread_in_sg_idx = sg.get_group_linear_id() * trait.SubgroupSize;
@@ -675,7 +675,7 @@ void dq_dk_dv_1colblock(
   const int max_m_block = ceil_div(param.seq_len_q, kBlockM);
   const int tail_m = param.seq_len_q % kBlockM;
 
-  // cutlass::NumericConverter<T, float> converter;
+  cutlass::NumericConverter<T, float> converter;
 
   // clear accumulator
   clear(tdVrdV);
@@ -880,7 +880,7 @@ void convert_dq(
     int bidb,
     int bidh) {
   constexpr int kBlockM = T::kBlockM;
-  // constexpr int kBlockN = T::kBlockN;
+  constexpr int kBlockN = T::kBlockN;
   constexpr int kHeadDim = T::kHeadDim;
   using DType = typename T::DType;
   using VType = typename T::VType;