Commit 9bcbd65

rebase fwd kernel to 32f7463e5fcbe8a958204c90fcf16379fb6dad6e
1 parent ef82015 commit 9bcbd65

File tree

5 files changed: +195 -479 lines changed

src/ATen/native/transformers/xpu/flash_attn/sycltla/collective/xe_flash_attn_prefill_mma_bshd.h

Lines changed: 14 additions & 119 deletions
@@ -118,54 +118,39 @@ struct FlashPrefillMma<
   using ElementAccumulator = typename TiledMmaQK::ValTypeC;
   static constexpr bool CausalMask = CausalMask_;
   static constexpr int SubgroupSize = DispatchPolicy::SubgroupSize;
-
   using MmaAtomShape = typename MmaAtom::Shape_MNK;
-
   static constexpr auto PV_ATOM_M =
       decltype(get<0>(SubgroupLayout{}.shape()))::value;
   static constexpr auto PV_ATOM_N =
       decltype(get<1>(SubgroupLayout{}.shape()))::value;
   static constexpr auto PV_ATOM_K =
       decltype(get<2>(SubgroupLayout{}.shape()))::value;
-
   using SubgroupTileShapePV =
       decltype(cute::shape_div(TileShapePV{}, (SubgroupLayout{}.shape())));
-
   static constexpr auto QK_BLK_M = get<0>(TileShapeQK{});
   static constexpr auto QK_BLK_N = get<1>(TileShapeQK{});
   static constexpr auto QK_BLK_K = get<2>(TileShapeQK{});
-
-  // This TiledMma is only required to serve the specific tiling requirements
-  // for matrix K. This is due to the consumption of matrix K by all subgroups
-  // within a workgroup.
-  static constexpr auto QK_ATOM_M = PV_ATOM_M; // 8
-  static constexpr auto QK_ATOM_N = PV_ATOM_N; // 1
-  static constexpr auto QK_ATOM_K = PV_ATOM_K; // 1
-
-  using SubgroupTileShapeQK = decltype(cute::shape_div(
-      TileShapeQK{},
-      SubgroupLayout{}.shape())); // 128, 64, 32 / 16, 1, 1 = (8, 64, 32 )
-
+  static constexpr auto QK_ATOM_M = PV_ATOM_M;
+  static constexpr auto QK_ATOM_N = PV_ATOM_N;
+  static constexpr auto QK_ATOM_K = PV_ATOM_K;
+  using SubgroupTileShapeQK =
+      decltype(cute::shape_div(TileShapeQK{}, SubgroupLayout{}.shape()));
   static constexpr auto QK_SG_M = get<0>(SubgroupTileShapeQK{});
   static constexpr auto QK_SG_N = get<1>(SubgroupTileShapeQK{});
   static constexpr auto QK_SG_K = get<2>(SubgroupTileShapeQK{});
-
   static constexpr bool is_var_len =
       cutlass::fmha::collective::is_variable_length_v<
           tuple_element_t<3, ProblemShapeType>>;
-
   using FragsShapeS = decltype(cute::shape_div(
       take<0, 2>(SubgroupTileShapeQK{}),
-      take<0, 2>(MmaAtomShape()))); // 8, 64, 32 / 8, 16, 16 (1, 4)
+      take<0, 2>(MmaAtomShape())));
   static constexpr int Vec =
-      (get<0>(MmaAtomShape()) * get<1>(MmaAtomShape())) / SubgroupSize; // 8
+      (get<0>(MmaAtomShape()) * get<1>(MmaAtomShape())) / SubgroupSize;
   static constexpr int FragsM = get<0>(FragsShapeS{});
-  static constexpr int FragsNS = get<1>(FragsShapeS{}); // 4
-
+  static constexpr int FragsNS = get<1>(FragsShapeS{});
   static constexpr uint32_t MaxThreadsPerBlock =
       size(SubgroupLayout{}) * SubgroupSize;
   using CopyThreadShape = Shape<_1, Int<SubgroupSize>>;
-
   using traits_load_Q = Copy_Traits<GmemTiledCopyQ, StrideQ>;
   using atom_load_Q = Copy_Atom<traits_load_Q, ElementQ>;
   using val_layout_load_Q = decltype(make_layout(
@@ -174,7 +159,6 @@ struct FlashPrefillMma<
       atom_load_Q{},
       Layout<CopyThreadShape>{},
       val_layout_load_Q{}));
-
   using traits_load_K = Copy_Traits<GmemTiledCopyK, StrideK>;
   using atom_load_K = Copy_Atom<traits_load_K, ElementK>;
   using val_layout_load_K = decltype(make_layout(
@@ -183,7 +167,6 @@ struct FlashPrefillMma<
       atom_load_K{},
       Layout<CopyThreadShape>{},
       val_layout_load_K{}));
-
   using traits_load_V = Copy_Traits<GmemTiledCopyV, StrideV>;
   using atom_load_V = Copy_Atom<traits_load_V, ElementV>;
   using val_layout_load_V = decltype(make_layout(
@@ -195,6 +178,7 @@ struct FlashPrefillMma<
   template <typename T>
   static constexpr bool is_fp8_v =
       cute::is_same_v<T, float_e4m3_t> || cute::is_same_v<T, float_e5m2_t>;
+
   // Host side kernel arguments
   struct Arguments {
     ElementQ const* ptr_Q;
@@ -222,7 +206,6 @@ struct FlashPrefillMma<
       Arguments const& args,
       void* workspace) {
     (void)workspace;
-
     auto
         [batch,
          num_heads_q,
@@ -231,7 +214,6 @@ struct FlashPrefillMma<
          seq_len_kv,
          head_size_qk,
          head_size_vo] = problem_shape;
-
     auto tensorQ = make_tensor(
         make_gmem_ptr(args.ptr_Q),
         make_layout(
@@ -250,7 +232,6 @@ struct FlashPrefillMma<
     XE_Copy_Q copyQ{XE_Copy_Q{}.with(tensorQ)};
     XE_Copy_K copyK{XE_Copy_K{}.with(tensorK)};
     XE_Copy_V copyV{XE_Copy_V{}.with(tensorV)};
-
     return Params{copyQ, copyK, copyV};
   }
 
@@ -265,22 +246,16 @@ struct FlashPrefillMma<
     int thread_idx = static_cast<int>(ThreadIdxX());
     auto thr_copy_Q = params.gmem_tiled_copy_q.get_slice(thread_idx);
     auto thr_copy_K = params.gmem_tiled_copy_k.get_slice(thread_idx);
-    // Instantiate the MMA object
     TiledMmaQK tiled_mma;
-    // To make all threads in a warp have the same global tensors pass in the
-    // index of thread 0 in each warp
     auto sg = compat::get_nd_item<1>().get_sub_group();
     auto first_thread_in_sg_idx =
         sg.get_group_id()[0] * DispatchPolicy::SubgroupSize;
     auto thread_mma_q = tiled_mma.get_slice(first_thread_in_sg_idx);
     auto thread_mma_k = tiled_mma.get_slice(0);
-
     // Partition
     Tensor tCgQ = thread_mma_q.partition_A(gQ);
     Tensor tCgK = thread_mma_k.partition_B(gK);
-
     // Create fragments
-    // TODO(Codeplay): fix this, this is probably not general
     using TCrQ_Type =
         cute::conditional_t<is_fp8_v<ElementQ>, uint8_t, ElementQ>;
     using TCrK_Type =
@@ -289,68 +264,18 @@ struct FlashPrefillMma<
         params.gmem_tiled_copy_q, take<0, 3>(tCgQ.shape())));
     Tensor tCrK = make_tensor<TCrK_Type>(make_fragment_layout(
         params.gmem_tiled_copy_k, take<0, 3>(tCgK.shape())));
-
     // Retile registers for copies
     Tensor tQrQ = thr_copy_Q.retile_D(tCrQ);
     Tensor tKrK = thr_copy_K.retile_D(tCrK);
-
     // Retile global tile for copies
     Tensor tQgQ = thr_copy_Q.retile_S(tCgQ);
     Tensor tKgK = thr_copy_K.retile_S(tCgK);
 
-#if CUTLASS_ENABLE_DEBUG_PRINTS
-#define PRINT(x) \
-  print(#x ": "); \
-  print(x); \
-  print("\n");
-    if (cute::thread(LOG_THREAD, LOG_GROUP)) {
-      print("======================= Q: \n");
-      PRINT(gQ);
-      PRINT(tCrQ);
-      PRINT(tCgQ);
-      PRINT(tQrQ);
-      PRINT(tQgQ);
-
-      print("===================== K :\n");
-      PRINT(gK);
-      PRINT(tCrK);
-      PRINT(tCgK);
-      PRINT(tKrK);
-      PRINT(tKgK);
-
-      print("===================== Config: \n");
-      PRINT(MaxThreadsPerBlock);
-      PRINT(SubgroupTileShapeQK{});
-    }
-#undef PRINT
-#endif
-
-    //
     // Mainloop
-    //
-
     for (int k_tile = 0; k_tile < k_tile_count; ++k_tile) {
       copy(params.gmem_tiled_copy_q, tQgQ(_, _, _, k_tile), tQrQ);
       copy(params.gmem_tiled_copy_k, tKgK(_, _, _, k_tile), tKrK);
-      if constexpr (is_fp8_v<ElementQ> && is_fp8_v<ElementK>) {
-        auto tCrQ_ = make_fragment_like<half_t>(tCrQ);
-        convert_FP8_to_FP16<ElementQ>(tCrQ, tCrQ_);
-        auto tCrK_ = make_fragment_like<half_t>(tCrK);
-        convert_FP8_to_FP16<ElementK>(tCrK, tCrK_);
-        cute::gemm(tiled_mma, accum, tCrQ_, tCrK_, frag_src);
-
-      } else if constexpr (is_fp8_v<ElementQ> && !is_fp8_v<ElementK>) {
-        auto tCrQ_ = make_fragment_like<half_t>(tCrQ);
-        convert_FP8_to_FP16<ElementQ>(tCrQ, tCrQ_);
-        cute::gemm(tiled_mma, accum, tCrQ_, tCrK, frag_src);
-
-      } else if constexpr (!is_fp8_v<ElementQ> && is_fp8_v<ElementK>) {
-        auto tCrK_ = make_fragment_like<half_t>(tCrK);
-        convert_FP8_to_FP16<ElementK>(tCrK, tCrK_);
-        cute::gemm(tiled_mma, accum, tCrQ, tCrK_, frag_src);
-      } else {
-        cute::gemm(tiled_mma, accum, tCrQ, tCrK, frag_src);
-      }
+      cute::gemm(tiled_mma, accum, tCrQ, tCrK, frag_src);
     }
   }
   template <
@@ -366,10 +291,7 @@ struct FlashPrefillMma<
       FragSrc const& frag_src,
       Params const& params) {
     int thread_idx = static_cast<int>(ThreadIdxX());
-    // Instantiate the MMA object
     TiledMmaPV tiled_mma;
-    // Tile GV to the shape of <64,64> and loop over the HeadSize/64 to avoid
-    // Register spill
     Tensor gV_ = take<0, 3>(
         local_tile(gV, select<1, 2>(TileShapePV{}), make_coord(_, _)));
     auto sg = compat::get_nd_item<1>().get_sub_group();
@@ -381,49 +303,20 @@ struct FlashPrefillMma<
         cute::conditional_t<is_fp8_v<ElementV>, uint8_t, ElementV>;
     Tensor tCrV = make_tensor<TCrV_Type>(make_fragment_layout(
         params.gmem_tiled_copy_v, take<0, 3>(tCgV.shape())));
-
     // Partition the copying of A and B tiles across the threads
     auto gmem_thr_copy_V = params.gmem_tiled_copy_v.get_slice(thread_idx);
     Tensor tVrV = gmem_thr_copy_V.retile_D(tCrV);
     Tensor tVgV = gmem_thr_copy_V.retile_S(tCgV);
 
-#if CUTLASS_ENABLE_DEBUG_PRINTS
-#define PRINT(x) \
-  print(#x ": "); \
-  print(x); \
-  print("\n");
-    if (cute::thread(LOG_THREAD, LOG_GROUP)) {
-      print("===================== V :\n");
-      PRINT(gV);
-      PRINT(tCrV);
-      PRINT(tCgV);
-      PRINT(tVrV);
-      PRINT(tVgV);
-
-      print("===================== Config: \n");
-      PRINT(MaxThreadsPerBlock);
-      PRINT(SubgroupTileShapePV{});
-    }
-#undef PRINT
-#endif
-
-    // 7) Convert S to P (FP32 -> BF16)
+    // Convert S to P (FP32 -> BF16)
     Tensor tPr = convert_type<typename TiledMmaPV::ValTypeA>(tSr);
     //
     // Mainloop
     //
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < tile_count; i++) {
       copy(params.gmem_tiled_copy_v, tVgV(_, _, _, i), tVrV);
-      if constexpr (is_fp8_v<ElementV>) {
-        auto tCrV_ = make_fragment_like<half_t>(tCrV);
-        convert_FP8_to_FP16<ElementV>(tCrV, tCrV_);
-        cute::gemm(
-            tiled_mma, accum(_, _, _, i), tPr, tCrV_, frag_src(_, _, _, i));
-      } else {
-        cute::gemm(
-            tiled_mma, accum(_, _, _, i), tPr, tCrV, frag_src(_, _, _, i));
-      }
+      cute::gemm(tiled_mma, accum(_, _, _, i), tPr, tCrV, frag_src(_, _, _, i));
     }
   }
 
@@ -496,3 +389,5 @@ struct FlashPrefillMma<
 };
 
 } // namespace cutlass::flash_attention::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
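The inline comments deleted in this file recorded the concrete tile arithmetic the header relies on: with TileShapeQK = (128, 64, 32) and a (16, 1, 1) SubgroupLayout, cute::shape_div gives a per-subgroup QK tile of (8, 64, 32); an (8, 16, 16) MMA atom then covers its (M, N) face in (1, 4) fragments with Vec = 8. Below is a minimal sketch of that arithmetic in plain constexpr C++ standing in for CuTe; the shape values come from the deleted comments, not from any build configuration.

#include <array>
#include <cstdio>

// Element-wise division, a stand-in for cute::shape_div on rank-3 shapes.
constexpr std::array<int, 3> shape_div(std::array<int, 3> a, std::array<int, 3> b) {
  return {a[0] / b[0], a[1] / b[1], a[2] / b[2]};
}

int main() {
  constexpr std::array<int, 3> tile_shape_qk{128, 64, 32}; // workgroup QK tile (assumed)
  constexpr std::array<int, 3> subgroup_layout{16, 1, 1};  // subgroups per dimension (assumed)
  constexpr auto sg_tile = shape_div(tile_shape_qk, subgroup_layout);
  static_assert(sg_tile[0] == 8 && sg_tile[1] == 64 && sg_tile[2] == 32,
                "SubgroupTileShapeQK = (8, 64, 32)");

  // FragsShapeS divides the subgroup tile's (M, N) by the MMA atom's (M, N) = (8, 16):
  constexpr int frags_m = sg_tile[0] / 8;  // 1 (FragsM)
  constexpr int frags_n = sg_tile[1] / 16; // 4 (FragsNS)
  // Vec = atom M * atom N / SubgroupSize = 8 * 16 / 16 = 8
  constexpr int vec = (8 * 16) / 16;
  std::printf("SG tile (%d, %d, %d), frags (%d, %d), Vec %d\n",
              sg_tile[0], sg_tile[1], sg_tile[2], frags_m, frags_n, vec);
}

Compiled as C++17, the static_assert documents the same (8, 64, 32) result the deleted comment recorded.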

src/ATen/native/transformers/xpu/flash_attn/sycltla/collective/xe_flash_attn_sdpa_fwd_bshd_epilogue.h

Lines changed: 4 additions & 29 deletions
@@ -166,6 +166,7 @@ class FlashPrefillEpilogue<
     return true;
   }
 
+  // The main operator
   CUTLASS_HOST_DEVICE
   FlashPrefillEpilogue(Params const& params_, TensorStorage const&)
       : params(params_) {}
@@ -187,16 +188,13 @@ class FlashPrefillEpilogue<
       int const& q_head_coord,
       float softmax_scale) {
     using namespace cute;
-
     static constexpr bool is_var_len =
         cutlass::fmha::collective::is_variable_length_v<
             tuple_element_t<2, ProblemShape>>;
-
     using FragOutLayout = typename FragOut::layout_type;
     constexpr int Vec = shape<0>(FragOutLayout{});
     constexpr int FragsM = shape<1>(FragOutLayout{});
     constexpr int FragsN = size(select<2, 3>(shape(FragOutLayout{})));
-
     auto g = compat::get_nd_item<1>().get_sub_group();
     auto out_reg = make_tensor(
         static_cast<decltype(out)&&>(out).data(),
@@ -231,14 +229,9 @@ class FlashPrefillEpilogue<
     // Indexing variables
     auto [batch, num_heads_q, head_size_vo] = select<0, 1, 6>(problem_shape);
     auto [seq_len_qo] = select<0>(sequence_length_shape);
-    // Represent the full output tensor
-    // Tensor mO_mnl = cute::get_xe_tensor(make_shape(seq_len_qo, head_size_vo,
-    // (is_var_len ? batch : 1) * num_heads_q));
     Tensor mO_mnl =
         cute::get_xe_tensor(make_shape(seq_len_qo, head_size_vo, 1));
-
     auto [m_coord, n_coord, k_coord, l_coord] = tile_coord;
-    // Tile the output tensor per WG
     Tensor g_wg_O = local_tile(
         mO_mnl,
         select<0, 1>(TileShapeOutput{}),
@@ -247,21 +240,14 @@ class FlashPrefillEpilogue<
         get<2>(typename TiledMmaOutput::ThrLayoutVMNK{}.shape());
     auto m_sg = get_sub_group_id() / ATOM_N;
     auto n_sg = get_sub_group_id() % ATOM_N;
-    // Tile the output tensor per SG
     Tensor gO = local_tile(
         g_wg_O,
         SubgroupTileShape{},
         make_coord(m_sg, n_sg, _),
         Step<_1, _1, X>{}); // (BLK_M,BLK_N,m,n,l)
     auto thread_xe_store_o = params.xe_store_o.get_thread_slice(ThreadIdxX());
     Tensor tOgO = thread_xe_store_o.partition_D(gO);
-
     Tensor final_out_reg = make_fragment_like<ElementOutput>(out_reg);
-    // iff ElementOutput == ElementAccumulator, then convert_type doesn't do the
-    // right conversion iff ElementOutput == fp8, there is no NumericConverter
-    // specialization available for both the above cases, we call copy() which
-    // internally performs a static_cast op on the data. for ElementOutput ==
-    // bf16 | fp16, convert_type calls relevant NumericConverter specialization.
     if constexpr (
         cute::is_any_of_v<
             ElementOutput,
@@ -280,30 +266,17 @@ class FlashPrefillEpilogue<
     int lane_id = static_cast<int>(sg.get_local_linear_id());
     int sub_group_id = get_sub_group_id();
     const int BLK_M = size(select<0>(TileShapeOutput{}));
-
-    // write along the sequence.
-    // use the entire sub_group to write lse since all
-    // work items within subgroup have the same sum() data stored
-    // in registers
     auto blk_m_coord = get<0>(tile_coord); // seq_len_blk_idx
-
     size_t lse_offset =
         k_coord * num_heads_q * seq_len_qo + // shift the batch -- batch_idx *
                                              // num_heads_q * seq_len_qo -- OK
         q_head_coord *
             seq_len_qo + // shift the head -- head_q * seq_len_qo -- ok
         m_coord * BLK_M; // shift to the particular tile
-
     int localtile_seq_coord = 0;
-
-    // Calculate the sequence coordinate
-    // The coordinate value should be within [0.. seq_len_qo - 1]
     localtile_seq_coord = sub_group_id * SubgroupSize +
-        lane_id; // one subgroup will handle 16 (usually) sequence
-
-    // checked
+        lane_id; // one subgroup will handle 16 sequence
     int seq_coord = m_coord * BLK_M + localtile_seq_coord;
-
     // Check that if this is within the seq_len_qo
     if (seq_coord < seq_len_qo) {
       auto cur_sum = rowsum[lane_id];
@@ -356,3 +329,5 @@ class FlashPrefillEpilogue<
 } // namespace collective
 } // namespace flash_attention
 } // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
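The LSE indexing kept by this file linearizes writes as batch, then head, then tile: lse_offset = k_coord * num_heads_q * seq_len_qo + q_head_coord * seq_len_qo + m_coord * BLK_M, with each work item in a subgroup covering one sequence row and the store guarded by seq_coord < seq_len_qo. A standalone sketch of that indexing follows; num_heads_q, seq_len_qo, BLK_M, SubgroupSize, and the coordinate values are illustrative assumptions, not values taken from the kernel.

#include <cstddef>
#include <cstdio>

int main() {
  const int num_heads_q = 8, seq_len_qo = 1000;         // assumed problem sizes
  const int BLK_M = 128, SubgroupSize = 16;             // assumed tile/subgroup sizes
  const int k_coord = 1, q_head_coord = 3, m_coord = 7; // batch, head, tile coords

  // Same linearization as the epilogue: batch shift, head shift, tile shift.
  std::size_t lse_offset =
      static_cast<std::size_t>(k_coord) * num_heads_q * seq_len_qo +
      static_cast<std::size_t>(q_head_coord) * seq_len_qo +
      static_cast<std::size_t>(m_coord) * BLK_M;

  for (int sub_group_id = 0; sub_group_id < 2; ++sub_group_id) {
    for (int lane_id = 0; lane_id < 2; ++lane_id) {
      int localtile_seq_coord = sub_group_id * SubgroupSize + lane_id;
      int seq_coord = m_coord * BLK_M + localtile_seq_coord;
      if (seq_coord < seq_len_qo) // same bound check as the epilogue
        std::printf("lse[%zu + %d] <- row %d\n", lse_offset,
                    localtile_seq_coord, seq_coord);
    }
  }
}

The bound check mirrors the epilogue's guard, which keeps partial tiles at the end of the sequence from writing out of range.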
