intel · guangyey · Mar 7, 2025 · Feb 21, 2025 · Feb 25, 2025 · Feb 26, 2025
diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake
@@ -4,6 +4,7 @@ endif()
 set(Codegen_XPU_cmake_included true)
 
 set(BUILD_TORCH_XPU_ATEN_GENERATED "${CMAKE_BINARY_DIR}/xpu/ATen")
+set(BUILD_TORCH_ATEN_GENERATED "${CMAKE_BINARY_DIR}/aten/src/ATen") # Passed as --dst-header-dir to install_xpu_headers.py in GEN_XPU.
 file(MAKE_DIRECTORY ${BUILD_TORCH_XPU_ATEN_GENERATED})
 
 set(RegisterXPU_GENERATED ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp)
@@ -49,6 +50,38 @@ function(GEN_XPU file_yaml)
     --xpu
   )
 
+  set(XPU_INSTALL_HEADER_COMMAND # Reused twice below: with --dry-run at configure time, and as a codegen post-process build step.
+    "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/install_xpu_headers.py
+    --src-header-dir ${BUILD_TORCH_XPU_ATEN_GENERATED}
+    --dst-header-dir ${BUILD_TORCH_ATEN_GENERATED}
+  )
+
+  execute_process( # Runs the XPU codegen command with --dry-run while CMake configures.
+    COMMAND
+    ${XPU_CODEGEN_COMMAND}
+    --generate headers
+    --dry-run
+    --output-dependencies ${BUILD_TORCH_XPU_ATEN_GENERATED}/generated_headers.cmake
+    RESULT_VARIABLE RETURN_VALUE
+    WORKING_DIRECTORY ${TORCH_ROOT}
+  )
+
+  if(NOT RETURN_VALUE EQUAL 0) # Any non-zero (or non-numeric) result aborts configuration.
+    message(FATAL_ERROR "Failed to get generated_headers list (result: ${RETURN_VALUE})")
+  endif()
+
+  execute_process( # Runs install_xpu_headers.py with --dry-run while CMake configures.
+    COMMAND
+    ${XPU_INSTALL_HEADER_COMMAND}
+    --dry-run
+    RESULT_VARIABLE RETURN_VALUE
+    WORKING_DIRECTORY ${TORCH_ROOT}
+  )
+
+  if(NOT RETURN_VALUE EQUAL 0) # Any non-zero (or non-numeric) result aborts configuration.
+    message(FATAL_ERROR "Failed to get XPU header list to install (result: ${RETURN_VALUE})")
+  endif()
+
   add_custom_command(
     COMMENT "Generating XPU ATen Codegen..."
     OUTPUT ${generated_files}
@@ -66,14 +99,13 @@ function(GEN_XPU file_yaml)
     COMMAND
     ${REGISTER_FALLBACK_CMD}
     # Codegen post-process
-    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterXPU_GENERATED}
-    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseXPU_GENERATED}
-    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseCsrXPU_GENERATED}
-    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterNestedTensorXPU_GENERATED}
+    COMMAND
+    ${XPU_INSTALL_HEADER_COMMAND}
     WORKING_DIRECTORY ${TORCH_ROOT}
     DEPENDS
     ${CODEGEN_XPU_YAML_DIR}/native/${file_yaml}
     ${XPUFallback_TEMPLATE}
+    ${TORCH_XPU_OPS_ROOT}/tools/codegen/install_xpu_headers.py
   )
 
   # Post codegen delete the copied templates folder only on Windows.
@@ -99,11 +131,7 @@ GEN_XPU(
   ${XPU_AOTI_SHIM_SOURCE}
 )
 
-# The c_shim_xpu.cpp needs include files in ${CMAKE_BINARY_DIR}/xpu/ATen/ops/*.h)
-# The include path is auto generated as "#include <ATen/ops/*.h">
-# To follow the design of aoti codegen, here ${CMAKE_BINARY_DIR}/xpu is added to
-# $TORCH_XPU_OPS_INCLUDE_DIRS, so that "#include <ATen/ops/*.h>" works.
-list(APPEND TORCH_XPU_OPS_INCLUDE_DIRS ${CMAKE_BINARY_DIR}/xpu)
+include(${BUILD_TORCH_XPU_ATEN_GENERATED}/xpu_ops_generated_headers.cmake) # Presumably defines xpu_ops_generated_headers (installed from src/ATen/CMakeLists.txt) - TODO confirm which step writes this file.
 
 list(APPEND xpu_generated_src
   ${RegisterXPU_GENERATED}

diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt
@@ -19,3 +19,7 @@ set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE)
 foreach(HEADER  ${xpu_h})
   install(FILES ${HEADER} DESTINATION "${AT_INSTALL_INCLUDE_DIR}/ATen/xpu")
 endforeach()
+
+foreach(HEADER  ${xpu_ops_generated_headers}) # One install rule per listed header; destination quoted like the ATen/xpu loop above.
+  install(FILES ${HEADER} DESTINATION "${AT_INSTALL_INCLUDE_DIR}/ATen/ops")
+endforeach()
diff --git a/src/ATen/native/sparse/xpu/SparseCsrTensorMath.cpp b/src/ATen/native/sparse/xpu/SparseCsrTensorMath.cpp
@@ -1,7 +1,7 @@
 #include <ATen/native/sparse/SparseStubs.h>
 #include <ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.h>
-#include <xpu/ATen/ops/_convert_indices_from_coo_to_csr_native.h>
-#include <xpu/ATen/ops/_convert_indices_from_csr_to_coo_native.h>
+#include <ATen/ops/_convert_indices_from_coo_to_csr_native.h>
+#include <ATen/ops/_convert_indices_from_csr_to_coo_native.h>
 
 namespace at::native {
 

diff --git a/src/ATen/native/xpu/Activation.cpp b/src/ATen/native/xpu/Activation.cpp
@@ -7,9 +7,9 @@
 #include <ATen/native/TensorIterator.h>
 
 #include <ATen/ops/empty_like.h>
-#include <xpu/ATen/ops/empty.h>
-#include <xpu/ATen/ops/gelu_backward_native.h>
-#include <xpu/ATen/ops/gelu_native.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/gelu_backward_native.h>
+#include <ATen/ops/gelu_native.h>
 
 #include <ATen/native/xpu/sycl/ActivationEluKernels.h>
 #include <ATen/native/xpu/sycl/ActivationGeluKernel.h>

diff --git a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp
@@ -7,8 +7,8 @@
 
 #include <ATen/ops/mean.h>
 #include <ATen/ops/zeros_like.h>
-#include <xpu/ATen/ops/_adaptive_avg_pool2d_backward_native.h>
-#include <xpu/ATen/ops/_adaptive_avg_pool2d_native.h>
+#include <ATen/ops/_adaptive_avg_pool2d_backward_native.h>
+#include <ATen/ops/_adaptive_avg_pool2d_native.h>
 
 #include <ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h>
 

diff --git a/src/ATen/native/xpu/AdaptiveAveragePooling3d.cpp b/src/ATen/native/xpu/AdaptiveAveragePooling3d.cpp
@@ -4,8 +4,8 @@
 
 #include <ATen/ops/empty.h>
 #include <ATen/ops/empty_like.h>
-#include <xpu/ATen/ops/adaptive_avg_pool3d_backward_native.h>
-#include <xpu/ATen/ops/adaptive_avg_pool3d_native.h>
+#include <ATen/ops/adaptive_avg_pool3d_backward_native.h>
+#include <ATen/ops/adaptive_avg_pool3d_native.h>
 
 namespace at::native {
 

diff --git a/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp b/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp
@@ -4,8 +4,8 @@
 #include <ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h>
 #include <comm/RegisterUtils.h>
 
-#include <xpu/ATen/ops/adaptive_max_pool2d_backward_native.h>
-#include <xpu/ATen/ops/adaptive_max_pool2d_native.h>
+#include <ATen/ops/adaptive_max_pool2d_backward_native.h>
+#include <ATen/ops/adaptive_max_pool2d_native.h>
 
 namespace at {
 namespace native {

diff --git a/src/ATen/native/xpu/AdaptiveMaxPooling3d.cpp b/src/ATen/native/xpu/AdaptiveMaxPooling3d.cpp
@@ -4,8 +4,8 @@
 #include <ATen/native/xpu/sycl/AdaptiveMaxPooling3dKernels.h>
 
 #include <ATen/ops/empty.h>
-#include <xpu/ATen/ops/adaptive_max_pool3d_backward_native.h>
-#include <xpu/ATen/ops/adaptive_max_pool3d_native.h>
+#include <ATen/ops/adaptive_max_pool3d_backward_native.h>
+#include <ATen/ops/adaptive_max_pool3d_native.h>
 
 namespace at {
 namespace native {

diff --git a/src/ATen/native/xpu/AveragePool2d.cpp b/src/ATen/native/xpu/AveragePool2d.cpp
@@ -5,8 +5,8 @@
 #include <ATen/native/xpu/sycl/AveragePool2dKernels.h>
 #include <comm/RegisterUtils.h>
 
-#include <xpu/ATen/ops/avg_pool2d_backward_native.h>
-#include <xpu/ATen/ops/avg_pool2d_native.h>
+#include <ATen/ops/avg_pool2d_backward_native.h>
+#include <ATen/ops/avg_pool2d_native.h>
 
 namespace at {
 namespace native {

diff --git a/src/ATen/native/xpu/AveragePool3d.cpp b/src/ATen/native/xpu/AveragePool3d.cpp
@@ -1,8 +1,8 @@
 #include <ATen/core/Tensor.h>
 #include <ATen/native/xpu/sycl/AveragePool3dKernels.h>
 
-#include <xpu/ATen/ops/avg_pool3d_backward_native.h>
-#include <xpu/ATen/ops/avg_pool3d_native.h>
+#include <ATen/ops/avg_pool3d_backward_native.h>
+#include <ATen/ops/avg_pool3d_native.h>
 
 namespace at {
 namespace native {

diff --git a/src/ATen/native/xpu/BinaryOps.cpp b/src/ATen/native/xpu/BinaryOps.cpp
@@ -4,7 +4,7 @@
 #include <ATen/native/DispatchStub.h>
 #include <ATen/native/TensorIterator.h>
 
-#include <xpu/ATen/ops/add_native.h>
+#include <ATen/ops/add_native.h>
 
 #include <ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h>
 #include <ATen/native/xpu/sycl/BinaryGeometricKernels.h>

diff --git a/src/ATen/native/xpu/Col2Im.cpp b/src/ATen/native/xpu/Col2Im.cpp
@@ -7,7 +7,7 @@
 #include <ATen/native/xpu/sycl/Col2ImKernel.h>
 
 #include <comm/xpu_aten.h>
-#include <xpu/ATen/ops/col2im_native.h>
+#include <ATen/ops/col2im_native.h>
 
 namespace at::native {
 

diff --git a/src/ATen/native/xpu/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/DilatedMaxPool2d.cpp
@@ -4,9 +4,9 @@
 #include <ATen/native/xpu/sycl/DilatedMaxPool2d.h>
 #include <comm/RegisterUtils.h>
 
-#include <xpu/ATen/ops/max.h>
-#include <xpu/ATen/ops/max_pool2d_with_indices_backward_native.h>
-#include <xpu/ATen/ops/max_pool2d_with_indices_native.h>
+#include <ATen/ops/max.h>
+#include <ATen/ops/max_pool2d_with_indices_backward_native.h>
+#include <ATen/ops/max_pool2d_with_indices_native.h>
 
 namespace at {
 namespace native {

diff --git a/src/ATen/native/xpu/DilatedMaxPool3d.cpp b/src/ATen/native/xpu/DilatedMaxPool3d.cpp
@@ -2,8 +2,8 @@
 #include <ATen/native/xpu/sycl/DilatedMaxPool3d.h>
 
 #include <ATen/ops/empty.h>
-#include <xpu/ATen/ops/max_pool3d_with_indices_backward_native.h>
-#include <xpu/ATen/ops/max_pool3d_with_indices_native.h>
+#include <ATen/ops/max_pool3d_with_indices_backward_native.h>
+#include <ATen/ops/max_pool3d_with_indices_native.h>
 namespace at {
 namespace native {
 

diff --git a/src/ATen/native/xpu/Dropout.cpp b/src/ATen/native/xpu/Dropout.cpp
@@ -3,8 +3,8 @@
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/xpu/sycl/DropoutKernels.h>
 
-#include <xpu/ATen/ops/native_dropout_backward_native.h>
-#include <xpu/ATen/ops/native_dropout_native.h>
+#include <ATen/ops/native_dropout_backward_native.h>
+#include <ATen/ops/native_dropout_native.h>
 
 #include <comm/xpu_aten.h>
 

diff --git a/src/ATen/native/xpu/Embedding.cpp b/src/ATen/native/xpu/Embedding.cpp
@@ -1,6 +1,6 @@
 #include <ATen/core/op_registration/adaption.h>
 
-#include <xpu/ATen/ops/embedding_dense_backward_native.h>
+#include <ATen/ops/embedding_dense_backward_native.h>
 
 #include <ATen/native/xpu/sycl/EmbeddingKernels.h>
 #include <comm/xpu_aten.h>

diff --git a/src/ATen/native/xpu/EmbeddingBag.cpp b/src/ATen/native/xpu/EmbeddingBag.cpp
@@ -1,5 +1,5 @@
-#include <xpu/ATen/ops/_embedding_bag_forward_only_native.h>
-#include <xpu/ATen/ops/_embedding_bag_native.h>
+#include <ATen/ops/_embedding_bag_forward_only_native.h>
+#include <ATen/ops/_embedding_bag_native.h>
 
 #include <ATen/native/xpu/sycl/EmbeddingBagKernels.h>
 #include <comm/xpu_aten.h>

diff --git a/src/ATen/native/xpu/Equal.cpp b/src/ATen/native/xpu/Equal.cpp
@@ -1,6 +1,6 @@
 #include <ATen/NamedTensorUtils.h>
 
-#include <xpu/ATen/ops/equal_native.h>
+#include <ATen/ops/equal_native.h>
 
 namespace at {
 namespace xpu {

diff --git a/src/ATen/native/xpu/ForeachOpScalarList.cpp b/src/ATen/native/xpu/ForeachOpScalarList.cpp
@@ -16,8 +16,8 @@
 #include <ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h>
 #include <ATen/native/xpu/sycl/ForeachTernaryOpScalarListKernels.h>
 
-#include <xpu/ATen/ops/_foreach_add_native.h>
-#include <xpu/ATen/ops/_foreach_mul_native.h>
+#include <ATen/ops/_foreach_add_native.h>
+#include <ATen/ops/_foreach_mul_native.h>
 
 namespace at {
 namespace native {

diff --git a/src/ATen/native/xpu/ForeachReduceOp.cpp b/src/ATen/native/xpu/ForeachReduceOp.cpp
@@ -1,8 +1,8 @@
 #include <ATen/native/ForeachUtils.h>
 
 #include <ATen/native/xpu/sycl/ForeachReduceKernels.h>
-#include <xpu/ATen/ops/_foreach_max_native.h>
-#include <xpu/ATen/ops/_foreach_norm_native.h>
+#include <ATen/ops/_foreach_max_native.h>
+#include <ATen/ops/_foreach_norm_native.h>
 
 namespace at {
 namespace native {

diff --git a/src/ATen/native/xpu/FractionalMaxPool2d.cpp b/src/ATen/native/xpu/FractionalMaxPool2d.cpp
@@ -3,8 +3,8 @@
 #include <ATen/native/cpu/mixed_data_type.h>
 #include <ATen/native/xpu/sycl/FractionalMaxPool2dKernels.h>
 
-#include <xpu/ATen/ops/fractional_max_pool2d_backward_native.h>
-#include <xpu/ATen/ops/fractional_max_pool2d_native.h>
+#include <ATen/ops/fractional_max_pool2d_backward_native.h>
+#include <ATen/ops/fractional_max_pool2d_native.h>
 
 namespace at::native {
 

diff --git a/src/ATen/native/xpu/FractionalMaxPool3d.cpp b/src/ATen/native/xpu/FractionalMaxPool3d.cpp
@@ -4,8 +4,8 @@
 #include <ATen/native/xpu/sycl/FractionalMaxPool3dKernels.h>
 #include <ATen/ops/empty.h>
 
-#include <xpu/ATen/ops/fractional_max_pool3d_backward_native.h>
-#include <xpu/ATen/ops/fractional_max_pool3d_native.h>
+#include <ATen/ops/fractional_max_pool3d_backward_native.h>
+#include <ATen/ops/fractional_max_pool3d_native.h>
 
 namespace at::native {
 

diff --git a/src/ATen/native/xpu/Im2Col.cpp b/src/ATen/native/xpu/Im2Col.cpp
@@ -4,7 +4,7 @@
 #include <ATen/native/TensorIterator.h>
 #include <torch/library.h>
 
-#include <xpu/ATen/ops/im2col_native.h>
+#include <ATen/ops/im2col_native.h>
 
 #include <ATen/native/xpu/sycl/Im2ColKernel.h>
 #include <comm/xpu_aten.h>

diff --git a/src/ATen/native/xpu/Indexing.cpp b/src/ATen/native/xpu/Indexing.cpp
@@ -10,7 +10,7 @@
 #include <comm/xpu_aten.h>
 
 #include <ATen/ops/index.h>
-#include <xpu/ATen/ops/index_native.h>
+#include <ATen/ops/index_native.h>
 
 namespace at {
 namespace native {

diff --git a/src/ATen/native/xpu/LossMultiMargin.cpp b/src/ATen/native/xpu/LossMultiMargin.cpp
@@ -2,8 +2,8 @@
 #include <ATen/native/xpu/sycl/MultiMarginLossKernels.h>
 
 #include <ATen/ops/empty.h>
-#include <xpu/ATen/ops/multi_margin_loss_backward_native.h>
-#include <xpu/ATen/ops/multi_margin_loss_native.h>
+#include <ATen/ops/multi_margin_loss_backward_native.h>
+#include <ATen/ops/multi_margin_loss_native.h>
 
 namespace at::native {
 

diff --git a/src/ATen/native/xpu/LossNLL.cpp b/src/ATen/native/xpu/LossNLL.cpp
@@ -5,8 +5,8 @@
 #include <comm/RegisterUtils.h>
 #include <comm/xpu_aten.h>
 
-#include <xpu/ATen/ops/nll_loss_backward_native.h>
-#include <xpu/ATen/ops/nll_loss_forward_native.h>
+#include <ATen/ops/nll_loss_backward_native.h>
+#include <ATen/ops/nll_loss_forward_native.h>
 
 namespace at {
 namespace native {

diff --git a/src/ATen/native/xpu/PinnedMemoryAllocator.cpp b/src/ATen/native/xpu/PinnedMemoryAllocator.cpp
@@ -3,7 +3,7 @@
 #include <ATen/xpu/PinnedMemoryAllocator.h>
 #include <comm/xpu_aten.h>
 
-#include <xpu/ATen/ops/is_pinned_native.h>
+#include <ATen/ops/is_pinned_native.h>
 
 namespace at {
 namespace native {

diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp
@@ -10,10 +10,10 @@
 #include <comm/xpu_aten.h>
 #include <torch/library.h>
 
-#include <xpu/ATen/ops/arange_native.h>
-#include <xpu/ATen/ops/linspace_native.h>
-#include <xpu/ATen/ops/logspace_native.h>
-#include <xpu/ATen/ops/range_native.h>
+#include <ATen/ops/arange_native.h>
+#include <ATen/ops/linspace_native.h>
+#include <ATen/ops/logspace_native.h>
+#include <ATen/ops/range_native.h>
 
 namespace at {
 

diff --git a/src/ATen/native/xpu/ReflectionPad.cpp b/src/ATen/native/xpu/ReflectionPad.cpp
@@ -6,12 +6,12 @@
 
 #include <ATen/ops/empty.h>
 #include <ATen/ops/zeros_like.h>
-#include <xpu/ATen/ops/reflection_pad1d_backward_native.h>
-#include <xpu/ATen/ops/reflection_pad1d_native.h>
-#include <xpu/ATen/ops/reflection_pad2d_backward_native.h>
-#include <xpu/ATen/ops/reflection_pad2d_native.h>
-#include <xpu/ATen/ops/reflection_pad3d_backward_native.h>
-#include <xpu/ATen/ops/reflection_pad3d_native.h>
+#include <ATen/ops/reflection_pad1d_backward_native.h>
+#include <ATen/ops/reflection_pad1d_native.h>
+#include <ATen/ops/reflection_pad2d_backward_native.h>
+#include <ATen/ops/reflection_pad2d_native.h>
+#include <ATen/ops/reflection_pad3d_backward_native.h>
+#include <ATen/ops/reflection_pad3d_native.h>
 #include "ATen/TensorMeta.h"
 
 namespace at {

diff --git a/src/ATen/native/xpu/ReplicationPadding.cpp b/src/ATen/native/xpu/ReplicationPadding.cpp
@@ -6,12 +6,12 @@
 
 #include <comm/RegisterUtils.h>
 
-#include <xpu/ATen/ops/replication_pad1d_backward_native.h>
-#include <xpu/ATen/ops/replication_pad1d_native.h>
-#include <xpu/ATen/ops/replication_pad2d_backward_native.h>
-#include <xpu/ATen/ops/replication_pad2d_native.h>
-#include <xpu/ATen/ops/replication_pad3d_backward_native.h>
-#include <xpu/ATen/ops/replication_pad3d_native.h>
+#include <ATen/ops/replication_pad1d_backward_native.h>
+#include <ATen/ops/replication_pad1d_native.h>
+#include <ATen/ops/replication_pad2d_backward_native.h>
+#include <ATen/ops/replication_pad2d_native.h>
+#include <ATen/ops/replication_pad3d_backward_native.h>
+#include <ATen/ops/replication_pad3d_native.h>
 
 namespace at {
 namespace native {