#include <ATen/native/transformers/xpu/flash_attn/flash_api.h>
#include <ATen/native/transformers/xpu/flash_attn/sycltla/flash_api.h>

namespace sycltla {

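// Compile-time availability probe: true only when Torch XPU was built with
// the SYCLTLA flash attention kernels (the USE_SYCLTLA build flag).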
bool is_flash_attention_available() {
#ifndef USE_SYCLTLA
  return false;
#else
  return true;
#endif
}

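// Forward pass. Returns (attention, logsumexp, cumulative_sequence_length_q,
// cumulative_sequence_length_k, max_seqlen_batch_q, max_seqlen_batch_k,
// philox_seed, philox_offset). Raises at runtime if SYCLTLA support was
// compiled out.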
std::tuple<
    at::Tensor,
    at::Tensor,
    at::Tensor,
    at::Tensor,
    c10::SymInt,
    c10::SymInt,
    at::Tensor,
    at::Tensor>
flash_attention_forward(
    const at::Tensor& query,
    const at::Tensor& key,
    const at::Tensor& value,
    const double dropout,
    const bool is_causal,
    const float scale) {
#ifndef USE_SYCLTLA
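  // Built without SYCLTLA: fail loudly. TORCH_CHECK(false, ...) throws, so
  // the return below is unreachable and only keeps the signature well-formed.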
  TORCH_CHECK(
      false,
      "flash_attention_forward: Torch XPU was not compiled with SYCLTLA support.");
  return std::make_tuple(
      at::Tensor(),
      at::Tensor(),
      at::Tensor(),
      at::Tensor(),
      c10::SymInt(0),
      c10::SymInt(0),
      at::Tensor(),
      at::Tensor());
#else
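  // Built with SYCLTLA: run the fused forward kernel and pass its outputs
  // through unchanged.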
  auto
      [attention,
       logsumexp,
       cumulative_sequence_length_q,
       cumulative_sequence_length_k,
       max_seqlen_batch_q,
       max_seqlen_batch_k,
       philox_seed,
       philox_offset] =
          flash_attention_forward_sycltla(
              query, key, value, dropout, is_causal, scale);
  return std::make_tuple(
      std::move(attention),
      std::move(logsumexp),
      std::move(cumulative_sequence_length_q),
      std::move(cumulative_sequence_length_k),
      std::move(max_seqlen_batch_q),
      std::move(max_seqlen_batch_k),
      std::move(philox_seed),
      std::move(philox_offset));
#endif
}

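// Backward pass. Consumes the forward outputs (out, logsumexp, cumulative
// sequence lengths, philox state) and returns the gradients with respect to
// query, key, and value. Raises at runtime if SYCLTLA support was compiled
// out.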
std::tuple<at::Tensor, at::Tensor, at::Tensor> flash_attention_backward(
    const at::Tensor& grad_out,
    const at::Tensor& query,
    const at::Tensor& key,
    const at::Tensor& value,
    const at::Tensor& out,
    const at::Tensor& logsumexp,
    const at::Tensor& cumulative_sequence_length_q,
    const at::Tensor& cumulative_sequence_length_k,
    const int64_t max_seqlen_batch_q,
    const int64_t max_seqlen_batch_k,
    const double dropout,
    const bool is_causal,
    const at::Tensor& philox_seed,
    const at::Tensor& philox_offset,
    const float scale) {
#ifndef USE_SYCLTLA
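  // Same guard as the forward pass: throw when the kernels are absent.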
  TORCH_CHECK(
      false,
      "flash_attention_backward: Torch XPU was not compiled with SYCLTLA support.");
  return std::make_tuple(at::Tensor(), at::Tensor(), at::Tensor());
#else
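  // Built with SYCLTLA: compute the input gradients with the fused backward
  // kernel.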
  auto [grad_query, grad_key, grad_value] = flash_attention_backward_sycltla(
      grad_out,
      query,
      key,
      value,
      out,
      logsumexp,
      cumulative_sequence_length_q,
      cumulative_sequence_length_k,
      max_seqlen_batch_q,
      max_seqlen_batch_k,
      dropout,
      is_causal,
      philox_seed,
      philox_offset,
      scale);
  return std::make_tuple(
      std::move(grad_query), std::move(grad_key), std::move(grad_value));
#endif
}
} // namespace sycltla