From 556b6b17a54f19ed04d479c7cb874c041043564c Mon Sep 17 00:00:00 2001
From: "Yu, Guangye" <guangye.yu@intel.com>
Date: Fri, 21 Feb 2025 17:06:17 +0000
Subject: [PATCH 1/3] install codegen header to torch/include

---
 cmake/Codegen.cmake                  |  44 +++++++--
 src/ATen/CMakeLists.txt              |   4 +
 tools/codegen/install_xpu_headers.py | 130 +++++++++++++++++++++++++++
 tools/codegen/remove_headers.py      |  31 -------
 4 files changed, 173 insertions(+), 36 deletions(-)
 create mode 100644 tools/codegen/install_xpu_headers.py
 delete mode 100644 tools/codegen/remove_headers.py

diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake
index 993decc324..92a6a40b90 100644
--- a/cmake/Codegen.cmake
+++ b/cmake/Codegen.cmake
@@ -4,6 +4,7 @@ endif()
 set(Codegen_XPU_cmake_included true)
 
 set(BUILD_TORCH_XPU_ATEN_GENERATED "${CMAKE_BINARY_DIR}/xpu/ATen")
+set(BUILD_TORCH_ATEN_GENERATED "${CMAKE_BINARY_DIR}/aten/src/ATen")
 file(MAKE_DIRECTORY ${BUILD_TORCH_XPU_ATEN_GENERATED})
 
 set(RegisterXPU_GENERATED ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp)
@@ -49,6 +50,38 @@ function(GEN_XPU file_yaml)
     --xpu
   )
 
+  set(XPU_INSTALL_HEADER_COMMAND
+    "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/install_xpu_headers.py
+    --src-header-dir ${BUILD_TORCH_XPU_ATEN_GENERATED}
+    --dst-header-dir ${BUILD_TORCH_ATEN_GENERATED}
+  )
+
+  execute_process(
+    COMMAND
+    ${XPU_CODEGEN_COMMAND}
+    --generate headers
+    --dry-run
+    --output-dependencies ${BUILD_TORCH_XPU_ATEN_GENERATED}/generated_headers.cmake
+    RESULT_VARIABLE RETURN_VALUE
+    WORKING_DIRECTORY ${TORCH_ROOT}
+  )
+
+  if(NOT RETURN_VALUE EQUAL 0)
+    message(FATAL_ERROR "Failed to get generated_headers list")
+  endif()
+
+  execute_process(
+    COMMAND
+    ${XPU_INSTALL_HEADER_COMMAND}
+    --dry-run
+    RESULT_VARIABLE RETURN_VALUE
+    WORKING_DIRECTORY ${TORCH_ROOT}
+  )
+
+  if(NOT RETURN_VALUE EQUAL 0)
+    message(FATAL_ERROR "Failed to get XPU header list to install")
+  endif()
+
   add_custom_command(
     COMMENT "Generating XPU ATen Codegen..."
     OUTPUT ${generated_files}
@@ -65,15 +98,14 @@ function(GEN_XPU file_yaml)
     --aoti-install-dir=${XPU_AOTI_INSTALL_DIR}
     COMMAND
     ${REGISTER_FALLBACK_CMD}
-    # Codegen post-process
-    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterXPU_GENERATED}
-    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseXPU_GENERATED}
-    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseCsrXPU_GENERATED}
-    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterNestedTensorXPU_GENERATED}
+    # Codegen post-process: install the generated XPU headers into the torch ATen build directory
+    COMMAND
+    ${XPU_INSTALL_HEADER_COMMAND}
     WORKING_DIRECTORY ${TORCH_ROOT}
     DEPENDS
     ${CODEGEN_XPU_YAML_DIR}/native/${file_yaml}
     ${XPUFallback_TEMPLATE}
+    ${TORCH_XPU_OPS_ROOT}/tools/codegen/install_xpu_headers.py
   )
 
   # Post codegen delete the copied templates folder only on Windows.
@@ -99,6 +131,8 @@ GEN_XPU(
   ${XPU_AOTI_SHIM_SOURCE}
 )
 
+include(${BUILD_TORCH_XPU_ATEN_GENERATED}/xpu_ops_generated_headers.cmake)
+
 # The c_shim_xpu.cpp needs include files in ${CMAKE_BINARY_DIR}/xpu/ATen/ops/*.h)
 # The include path is auto generated as "#include <ATen/ops/*.h">
 # To follow the design of aoti codegen, here ${CMAKE_BINARY_DIR}/xpu is added to
diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt
index 22e0601111..ad936acb8e 100644
--- a/src/ATen/CMakeLists.txt
+++ b/src/ATen/CMakeLists.txt
@@ -19,3 +19,7 @@ set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE)
 foreach(HEADER  ${xpu_h})
   install(FILES ${HEADER} DESTINATION "${AT_INSTALL_INCLUDE_DIR}/ATen/xpu")
 endforeach()
+
+foreach(HEADER  ${xpu_ops_generated_headers})  # headers listed by xpu_ops_generated_headers.cmake
+  install(FILES "${HEADER}" DESTINATION "${AT_INSTALL_INCLUDE_DIR}/ATen/ops")
+endforeach()
diff --git a/tools/codegen/install_xpu_headers.py b/tools/codegen/install_xpu_headers.py
new file mode 100644
index 0000000000..c6fdefb4a1
--- /dev/null
+++ b/tools/codegen/install_xpu_headers.py
@@ -0,0 +1,130 @@
+import argparse
+import os
+import re
+import shutil
+
+
+# Both directories are mandatory; letting argparse reject a missing one gives a clear
+# usage error instead of a TypeError from os.path.join(None, ...) later on.
+parser = argparse.ArgumentParser(description="Utils for append ops headers")
+parser.add_argument("--src-header-dir", type=str, required=True, help="torch-xpu-ops build header file path")
+parser.add_argument("--dst-header-dir", type=str, required=True, help="torch build header file path")
+parser.add_argument(
+    "--dry-run", action="store_true", help="only write xpu_ops_generated_headers.cmake; do not modify headers"
+)
+args = parser.parse_args()
+
+
+def append_xpu_function_header(src, dst):
+    r"""
+    Append the "#include <ATen/ops/...>" lines of src (XPUFunctions_inl.h) to dst unless dst already has them.
+    """
+    if args.dry_run:
+        return
+
+    with open(dst) as fr:
+        lines = fr.readlines()
+    while lines and lines[-1].strip() == "":
+        lines.pop()
+    if lines and not lines[-1].endswith("\n"):
+        lines[-1] += "\n"  # keep the first appended include on its own line
+    with open(src) as fr:
+        for line in fr:
+            # Skip includes dst already contains so that re-running this step is idempotent.
+            if re.match(r"^#include <ATen/ops/.*", line) and line not in lines:
+                lines.append(line)
+    with open(dst, "w") as fw:
+        fw.writelines(lines)
+
+def parse_ops_headers(src):
+    r"""
+    Return the header file names that the file src lists under an ".../ATen/ops/" directory.
+    """
+    with open(src) as fr:
+        src_text = fr.read()
+    # Escape the dot so only names ending in a literal ".h" are captured.
+    ops_headers = re.findall(r".*/ATen/+ops/(.*\.h)", src_text)
+    return ops_headers
+
+
+def classify_ops_headers(src_dir, dst_dir):
+    r"""
+    Split the source ops headers into those the destination also lists (common) and XPU-only ones, keeping order.
+    """
+    src_ops_headers = parse_ops_headers(os.path.join(src_dir, "ops_generated_headers.cmake"))
+    dst_ops_headers = set(parse_ops_headers(os.path.join(dst_dir, "ops_generated_headers.cmake")))  # O(1) lookups
+    common_headers = [f for f in src_ops_headers if f in dst_ops_headers]
+    xpu_ops_headers = [f for f in src_ops_headers if f not in dst_ops_headers]
+    return common_headers, xpu_ops_headers
+
+
+def generate_xpu_ops_headers_cmake(src_dir, dst_dir, xpu_ops_headers):
+    r"""
+    Write xpu_ops_generated_headers.cmake; paths use forward slashes because CMake treats backslashes as escapes.
+    """
+    paths = [os.path.join(dst_dir, header).replace("\\", "/") for header in xpu_ops_headers]
+    with open(os.path.join(src_dir, "xpu_ops_generated_headers.cmake"), "w") as fw:
+        fw.write("set(xpu_ops_generated_headers\n")
+        fw.writelines(f'    "{path}"\n' for path in paths)
+        fw.write(")\n")
+
+
+def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers):
+    r"""
+    Copy XPU-only ops headers to dst_dir and splice missing XPU declarations into the headers both builds share.
+    """
+    if args.dry_run:
+        return
+
+    for f in xpu_ops_headers:
+        # TODO: fix the incorrect op info registered in native_functions.yaml
+        # assert "xpu" in f, f"Error: The function signature or namespace in '{f}' is incorrect. Expected 'xpu' to be present."
+        shutil.copy(os.path.join(src_dir, f), os.path.join(dst_dir, f))
+
+    for f in common_headers:
+        src = os.path.join(src_dir, f)
+        dst = os.path.join(dst_dir, f)
+        with open(src) as fr:
+            src_text = fr.read()
+        xpu_declarations = re.findall(r"^TORCH_API.*xpu.*?;\n", src_text, re.MULTILINE)
+        xpu_declarations.extend(
+            re.findall(r"struct TORCH_XPU_API.*xpu.*?{.*?};\n", src_text, re.DOTALL)
+        )
+
+        with open(dst) as fr:
+            dst_lines = fr.readlines()
+        dst_text = "".join(dst_lines)
+        # Only declarations the destination header lacks are spliced in.
+        missing = [decl for decl in xpu_declarations if decl not in dst_text]
+        # They go directly in front of the first declaration the destination already has.
+        anchor = next(
+            (i for i, line in enumerate(dst_lines) if re.match(r"^(TORCH_API.*;|struct TORCH_API.*)", line)),
+            None,
+        )
+        # Leave an up-to-date header untouched: rewriting identical content would still bump
+        # its mtime, which can trigger rebuilds of the sources that include it.
+        if anchor is None or not missing:
+            continue
+        dst_lines[anchor:anchor] = missing
+        with open(dst, "w") as fw:
+            fw.writelines(dst_lines)
+
+
+def main():
+    # Merge the XPUFunctions_inl.h includes, then classify, list and install the per-op headers.
+    src_root, dst_root = args.src_header_dir, args.dst_header_dir
+    inl_name = "XPUFunctions_inl.h"
+    append_xpu_function_header(
+        os.path.join(src_root, inl_name), os.path.join(dst_root, inl_name)
+    )
+
+    src_ops_dir = os.path.join(src_root, "ops")
+    dst_ops_dir = os.path.join(dst_root, "ops")
+    shared, xpu_only = classify_ops_headers(src_root, dst_root)
+    # The CMake list is written even in dry-run mode; the header edits are not.
+    generate_xpu_ops_headers_cmake(src_root, dst_ops_dir, xpu_only)
+    append_xpu_ops_headers(src_ops_dir, dst_ops_dir, shared, xpu_only)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/codegen/remove_headers.py b/tools/codegen/remove_headers.py
deleted file mode 100644
index 8f5e24e642..0000000000
--- a/tools/codegen/remove_headers.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import argparse
-import re
-
-parser = argparse.ArgumentParser(description="Utils for remove unused headers")
-parser.add_argument("--register_xpu_path", type=str, help="file location of RegisterXPU.cpp")
-args = parser.parse_args()
-
-def rm_as_strided_native():
-    with open(args.register_xpu_path) as fr:
-        lines = fr.readlines()
-
-        with open(args.register_xpu_path, 'w') as fw:
-            for ln in lines:
-                if "#include <ATen/ops/as_strided_native.h>" not in ln:
-                    fw.write(ln)
-
-def replace_op_headers():
-    with open(args.register_xpu_path) as fr:
-        lines = fr.readlines()
-        patt = r'#include <ATen/ops'
-        rep = r'#include <xpu/ATen/ops'
-        with open(args.register_xpu_path, 'w') as fw:
-            for ln in lines:
-                if 'empty.h' in ln:
-                    continue
-                replaced = re.sub(patt, rep, ln)
-                fw.write(replaced)
-
-if __name__ == "__main__":
-    # rm_as_strided_native()
-    replace_op_headers()

From 2345c4e589120782d52b459934388d81e70e4efe Mon Sep 17 00:00:00 2001
From: "Yu, Guangye" <guangye.yu@intel.com>
Date: Tue, 25 Feb 2025 17:59:37 +0000
Subject: [PATCH 2/3] replace include header

---
 cmake/Codegen.cmake                              |  6 ------
 .../native/sparse/xpu/SparseCsrTensorMath.cpp    |  4 ++--
 src/ATen/native/xpu/Activation.cpp               |  6 +++---
 src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp |  4 ++--
 src/ATen/native/xpu/AdaptiveAveragePooling3d.cpp |  4 ++--
 src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp     |  4 ++--
 src/ATen/native/xpu/AdaptiveMaxPooling3d.cpp     |  4 ++--
 src/ATen/native/xpu/AveragePool2d.cpp            |  4 ++--
 src/ATen/native/xpu/AveragePool3d.cpp            |  4 ++--
 src/ATen/native/xpu/BinaryOps.cpp                |  2 +-
 src/ATen/native/xpu/Col2Im.cpp                   |  2 +-
 src/ATen/native/xpu/DilatedMaxPool2d.cpp         |  6 +++---
 src/ATen/native/xpu/DilatedMaxPool3d.cpp         |  4 ++--
 src/ATen/native/xpu/Dropout.cpp                  |  4 ++--
 src/ATen/native/xpu/Embedding.cpp                |  2 +-
 src/ATen/native/xpu/EmbeddingBag.cpp             |  4 ++--
 src/ATen/native/xpu/Equal.cpp                    |  2 +-
 src/ATen/native/xpu/ForeachOpScalarList.cpp      |  4 ++--
 src/ATen/native/xpu/ForeachReduceOp.cpp          |  4 ++--
 src/ATen/native/xpu/FractionalMaxPool2d.cpp      |  4 ++--
 src/ATen/native/xpu/FractionalMaxPool3d.cpp      |  4 ++--
 src/ATen/native/xpu/Im2Col.cpp                   |  2 +-
 src/ATen/native/xpu/Indexing.cpp                 |  2 +-
 src/ATen/native/xpu/LossMultiMargin.cpp          |  4 ++--
 src/ATen/native/xpu/LossNLL.cpp                  |  4 ++--
 src/ATen/native/xpu/PinnedMemoryAllocator.cpp    |  2 +-
 src/ATen/native/xpu/RangeFactories.cpp           |  8 ++++----
 src/ATen/native/xpu/ReflectionPad.cpp            | 12 ++++++------
 src/ATen/native/xpu/ReplicationPadding.cpp       | 12 ++++++------
 src/ATen/native/xpu/Resize.cpp                   | 13 +++----------
 src/ATen/native/xpu/SoftMax.cpp                  |  8 ++++----
 src/ATen/native/xpu/SummaryOps.cpp               |  2 +-
 src/ATen/native/xpu/TensorAdvancedIndexing.cpp   |  6 +++---
 src/ATen/native/xpu/TensorFactories.cpp          |  2 +-
 src/ATen/native/xpu/TensorShape.cpp              |  6 +++---
 src/ATen/native/xpu/TensorTopK.cpp               |  2 +-
 src/ATen/native/xpu/TriangluarOps.cpp            |  4 ++--
 src/ATen/native/xpu/UpSampleBicubic2d.cpp        |  8 ++++----
 src/ATen/native/xpu/UpSampleBilinear2d.cpp       |  8 ++++----
 src/ATen/native/xpu/UpSampleLinear1d.cpp         |  4 ++--
 src/ATen/native/xpu/UpSampleNearest1d.cpp        |  8 ++++----
 src/ATen/native/xpu/UpSampleNearest2d.cpp        |  8 ++++----
 src/ATen/native/xpu/UpSampleNearest3d.cpp        | 16 ++++++++--------
 src/ATen/native/xpu/UpSampleTrilinear3d.cpp      |  4 ++--
 src/ATen/native/xpu/XPUScalar.cpp                |  2 +-
 45 files changed, 108 insertions(+), 121 deletions(-)

diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake
index 92a6a40b90..9cf4459162 100644
--- a/cmake/Codegen.cmake
+++ b/cmake/Codegen.cmake
@@ -133,12 +133,6 @@ GEN_XPU(
 
 include(${BUILD_TORCH_XPU_ATEN_GENERATED}/xpu_ops_generated_headers.cmake)
 
-# The c_shim_xpu.cpp needs include files in ${CMAKE_BINARY_DIR}/xpu/ATen/ops/*.h)
-# The include path is auto generated as "#include <ATen/ops/*.h">
-# To follow the design of aoti codegen, here ${CMAKE_BINARY_DIR}/xpu is added to
-# $TORCH_XPU_OPS_INCLUDE_DIRS, so that "#include <ATen/ops/*.h>" works.
-list(APPEND TORCH_XPU_OPS_INCLUDE_DIRS ${CMAKE_BINARY_DIR}/xpu)
-
 list(APPEND xpu_generated_src
   ${RegisterXPU_GENERATED}
   ${RegisterSparseXPU_GENERATED}
diff --git a/src/ATen/native/sparse/xpu/SparseCsrTensorMath.cpp b/src/ATen/native/sparse/xpu/SparseCsrTensorMath.cpp
index 38564914b4..965de7b482 100644
--- a/src/ATen/native/sparse/xpu/SparseCsrTensorMath.cpp
+++ b/src/ATen/native/sparse/xpu/SparseCsrTensorMath.cpp
@@ -1,7 +1,7 @@
 #include <ATen/native/sparse/SparseStubs.h>
 #include <ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.h>
-#include <xpu/ATen/ops/_convert_indices_from_coo_to_csr_native.h>
-#include <xpu/ATen/ops/_convert_indices_from_csr_to_coo_native.h>
+#include <ATen/ops/_convert_indices_from_coo_to_csr_native.h>
+#include <ATen/ops/_convert_indices_from_csr_to_coo_native.h>
 
 namespace at::native {
 
diff --git a/src/ATen/native/xpu/Activation.cpp b/src/ATen/native/xpu/Activation.cpp
index 87cac9c36f..a19249c197 100644
--- a/src/ATen/native/xpu/Activation.cpp
+++ b/src/ATen/native/xpu/Activation.cpp
@@ -7,9 +7,9 @@
 #include <ATen/native/TensorIterator.h>
 
 #include <ATen/ops/empty_like.h>
-#include <xpu/ATen/ops/empty.h>
-#include <xpu/ATen/ops/gelu_backward_native.h>
-#include <xpu/ATen/ops/gelu_native.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/gelu_backward_native.h>
+#include <ATen/ops/gelu_native.h>
 
 #include <ATen/native/xpu/sycl/ActivationEluKernels.h>
 #include <ATen/native/xpu/sycl/ActivationGeluKernel.h>
diff --git a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp
index 4a34e70d13..4b3efcebe4 100644
--- a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp
+++ b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp
@@ -7,8 +7,8 @@
 
 #include <ATen/ops/mean.h>
 #include <ATen/ops/zeros_like.h>
-#include <xpu/ATen/ops/_adaptive_avg_pool2d_backward_native.h>
-#include <xpu/ATen/ops/_adaptive_avg_pool2d_native.h>
+#include <ATen/ops/_adaptive_avg_pool2d_backward_native.h>
+#include <ATen/ops/_adaptive_avg_pool2d_native.h>
 
 #include <ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h>
 
diff --git a/src/ATen/native/xpu/AdaptiveAveragePooling3d.cpp b/src/ATen/native/xpu/AdaptiveAveragePooling3d.cpp
index 1a445b8ec3..86ffe0f57f 100644
--- a/src/ATen/native/xpu/AdaptiveAveragePooling3d.cpp
+++ b/src/ATen/native/xpu/AdaptiveAveragePooling3d.cpp
@@ -4,8 +4,8 @@
 
 #include <ATen/ops/empty.h>
 #include <ATen/ops/empty_like.h>
-#include <xpu/ATen/ops/adaptive_avg_pool3d_backward_native.h>
-#include <xpu/ATen/ops/adaptive_avg_pool3d_native.h>
+#include <ATen/ops/adaptive_avg_pool3d_backward_native.h>
+#include <ATen/ops/adaptive_avg_pool3d_native.h>
 
 namespace at::native {
 
diff --git a/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp b/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp
index 6098072ac1..c587cde358 100644
--- a/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp
+++ b/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp
@@ -4,8 +4,8 @@
 #include <ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h>
 #include <comm/RegisterUtils.h>
 
-#include <xpu/ATen/ops/adaptive_max_pool2d_backward_native.h>
-#include <xpu/ATen/ops/adaptive_max_pool2d_native.h>
+#include <ATen/ops/adaptive_max_pool2d_backward_native.h>
+#include <ATen/ops/adaptive_max_pool2d_native.h>
 
 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/AdaptiveMaxPooling3d.cpp b/src/ATen/native/xpu/AdaptiveMaxPooling3d.cpp
index 7610dbd455..3bca6156bc 100644
--- a/src/ATen/native/xpu/AdaptiveMaxPooling3d.cpp
+++ b/src/ATen/native/xpu/AdaptiveMaxPooling3d.cpp
@@ -4,8 +4,8 @@
 #include <ATen/native/xpu/sycl/AdaptiveMaxPooling3dKernels.h>
 
 #include <ATen/ops/empty.h>
-#include <xpu/ATen/ops/adaptive_max_pool3d_backward_native.h>
-#include <xpu/ATen/ops/adaptive_max_pool3d_native.h>
+#include <ATen/ops/adaptive_max_pool3d_backward_native.h>
+#include <ATen/ops/adaptive_max_pool3d_native.h>
 
 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/AveragePool2d.cpp b/src/ATen/native/xpu/AveragePool2d.cpp
index 326ad8a517..7647aa5621 100644
--- a/src/ATen/native/xpu/AveragePool2d.cpp
+++ b/src/ATen/native/xpu/AveragePool2d.cpp
@@ -5,8 +5,8 @@
 #include <ATen/native/xpu/sycl/AveragePool2dKernels.h>
 #include <comm/RegisterUtils.h>
 
-#include <xpu/ATen/ops/avg_pool2d_backward_native.h>
-#include <xpu/ATen/ops/avg_pool2d_native.h>
+#include <ATen/ops/avg_pool2d_backward_native.h>
+#include <ATen/ops/avg_pool2d_native.h>
 
 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/AveragePool3d.cpp b/src/ATen/native/xpu/AveragePool3d.cpp
index 471e98a27a..97eda5ae1e 100644
--- a/src/ATen/native/xpu/AveragePool3d.cpp
+++ b/src/ATen/native/xpu/AveragePool3d.cpp
@@ -1,8 +1,8 @@
 #include <ATen/core/Tensor.h>
 #include <ATen/native/xpu/sycl/AveragePool3dKernels.h>
 
-#include <xpu/ATen/ops/avg_pool3d_backward_native.h>
-#include <xpu/ATen/ops/avg_pool3d_native.h>
+#include <ATen/ops/avg_pool3d_backward_native.h>
+#include <ATen/ops/avg_pool3d_native.h>
 
 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/BinaryOps.cpp b/src/ATen/native/xpu/BinaryOps.cpp
index 53a8e56d26..18654eda17 100644
--- a/src/ATen/native/xpu/BinaryOps.cpp
+++ b/src/ATen/native/xpu/BinaryOps.cpp
@@ -4,7 +4,7 @@
 #include <ATen/native/DispatchStub.h>
 #include <ATen/native/TensorIterator.h>
 
-#include <xpu/ATen/ops/add_native.h>
+#include <ATen/ops/add_native.h>
 
 #include <ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h>
 #include <ATen/native/xpu/sycl/BinaryGeometricKernels.h>
diff --git a/src/ATen/native/xpu/Col2Im.cpp b/src/ATen/native/xpu/Col2Im.cpp
index 2a6742e5e1..71c42fd6b7 100644
--- a/src/ATen/native/xpu/Col2Im.cpp
+++ b/src/ATen/native/xpu/Col2Im.cpp
@@ -7,7 +7,7 @@
 #include <ATen/native/xpu/sycl/Col2ImKernel.h>
 
 #include <comm/xpu_aten.h>
-#include <xpu/ATen/ops/col2im_native.h>
+#include <ATen/ops/col2im_native.h>
 
 namespace at::native {
 
diff --git a/src/ATen/native/xpu/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/DilatedMaxPool2d.cpp
index a08227b470..c13e76bb5f 100644
--- a/src/ATen/native/xpu/DilatedMaxPool2d.cpp
+++ b/src/ATen/native/xpu/DilatedMaxPool2d.cpp
@@ -4,9 +4,9 @@
 #include <ATen/native/xpu/sycl/DilatedMaxPool2d.h>
 #include <comm/RegisterUtils.h>
 
-#include <xpu/ATen/ops/max.h>
-#include <xpu/ATen/ops/max_pool2d_with_indices_backward_native.h>
-#include <xpu/ATen/ops/max_pool2d_with_indices_native.h>
+#include <ATen/ops/max.h>
+#include <ATen/ops/max_pool2d_with_indices_backward_native.h>
+#include <ATen/ops/max_pool2d_with_indices_native.h>
 
 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/DilatedMaxPool3d.cpp b/src/ATen/native/xpu/DilatedMaxPool3d.cpp
index f19e8c5305..56d9ba0dcc 100644
--- a/src/ATen/native/xpu/DilatedMaxPool3d.cpp
+++ b/src/ATen/native/xpu/DilatedMaxPool3d.cpp
@@ -2,8 +2,8 @@
 #include <ATen/native/xpu/sycl/DilatedMaxPool3d.h>
 
 #include <ATen/ops/empty.h>
-#include <xpu/ATen/ops/max_pool3d_with_indices_backward_native.h>
-#include <xpu/ATen/ops/max_pool3d_with_indices_native.h>
+#include <ATen/ops/max_pool3d_with_indices_backward_native.h>
+#include <ATen/ops/max_pool3d_with_indices_native.h>
 namespace at {
 namespace native {
 
diff --git a/src/ATen/native/xpu/Dropout.cpp b/src/ATen/native/xpu/Dropout.cpp
index bfb704e5f1..5cc9ded92f 100644
--- a/src/ATen/native/xpu/Dropout.cpp
+++ b/src/ATen/native/xpu/Dropout.cpp
@@ -3,8 +3,8 @@
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/xpu/sycl/DropoutKernels.h>
 
-#include <xpu/ATen/ops/native_dropout_backward_native.h>
-#include <xpu/ATen/ops/native_dropout_native.h>
+#include <ATen/ops/native_dropout_backward_native.h>
+#include <ATen/ops/native_dropout_native.h>
 
 #include <comm/xpu_aten.h>
 
diff --git a/src/ATen/native/xpu/Embedding.cpp b/src/ATen/native/xpu/Embedding.cpp
index 204a324fdf..42b8cd67d1 100644
--- a/src/ATen/native/xpu/Embedding.cpp
+++ b/src/ATen/native/xpu/Embedding.cpp
@@ -1,6 +1,6 @@
 #include <ATen/core/op_registration/adaption.h>
 
-#include <xpu/ATen/ops/embedding_dense_backward_native.h>
+#include <ATen/ops/embedding_dense_backward_native.h>
 
 #include <ATen/native/xpu/sycl/EmbeddingKernels.h>
 #include <comm/xpu_aten.h>
diff --git a/src/ATen/native/xpu/EmbeddingBag.cpp b/src/ATen/native/xpu/EmbeddingBag.cpp
index 25e9e8d1e2..120370d6b2 100644
--- a/src/ATen/native/xpu/EmbeddingBag.cpp
+++ b/src/ATen/native/xpu/EmbeddingBag.cpp
@@ -1,5 +1,5 @@
-#include <xpu/ATen/ops/_embedding_bag_forward_only_native.h>
-#include <xpu/ATen/ops/_embedding_bag_native.h>
+#include <ATen/ops/_embedding_bag_forward_only_native.h>
+#include <ATen/ops/_embedding_bag_native.h>
 
 #include <ATen/native/xpu/sycl/EmbeddingBagKernels.h>
 #include <comm/xpu_aten.h>
diff --git a/src/ATen/native/xpu/Equal.cpp b/src/ATen/native/xpu/Equal.cpp
index dcee9b3809..bc91268435 100644
--- a/src/ATen/native/xpu/Equal.cpp
+++ b/src/ATen/native/xpu/Equal.cpp
@@ -1,6 +1,6 @@
 #include <ATen/NamedTensorUtils.h>
 
-#include <xpu/ATen/ops/equal_native.h>
+#include <ATen/ops/equal_native.h>
 
 namespace at {
 namespace xpu {
diff --git a/src/ATen/native/xpu/ForeachOpScalarList.cpp b/src/ATen/native/xpu/ForeachOpScalarList.cpp
index 87c1f0ce39..2ec48cf0fc 100644
--- a/src/ATen/native/xpu/ForeachOpScalarList.cpp
+++ b/src/ATen/native/xpu/ForeachOpScalarList.cpp
@@ -16,8 +16,8 @@
 #include <ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h>
 #include <ATen/native/xpu/sycl/ForeachTernaryOpScalarListKernels.h>
 
-#include <xpu/ATen/ops/_foreach_add_native.h>
-#include <xpu/ATen/ops/_foreach_mul_native.h>
+#include <ATen/ops/_foreach_add_native.h>
+#include <ATen/ops/_foreach_mul_native.h>
 
 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/ForeachReduceOp.cpp b/src/ATen/native/xpu/ForeachReduceOp.cpp
index a9ef1ff44c..6b104dda2b 100644
--- a/src/ATen/native/xpu/ForeachReduceOp.cpp
+++ b/src/ATen/native/xpu/ForeachReduceOp.cpp
@@ -1,8 +1,8 @@
 #include <ATen/native/ForeachUtils.h>
 
 #include <ATen/native/xpu/sycl/ForeachReduceKernels.h>
-#include <xpu/ATen/ops/_foreach_max_native.h>
-#include <xpu/ATen/ops/_foreach_norm_native.h>
+#include <ATen/ops/_foreach_max_native.h>
+#include <ATen/ops/_foreach_norm_native.h>
 
 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/FractionalMaxPool2d.cpp b/src/ATen/native/xpu/FractionalMaxPool2d.cpp
index e0ddea64b4..2586fe17ca 100644
--- a/src/ATen/native/xpu/FractionalMaxPool2d.cpp
+++ b/src/ATen/native/xpu/FractionalMaxPool2d.cpp
@@ -3,8 +3,8 @@
 #include <ATen/native/cpu/mixed_data_type.h>
 #include <ATen/native/xpu/sycl/FractionalMaxPool2dKernels.h>
 
-#include <xpu/ATen/ops/fractional_max_pool2d_backward_native.h>
-#include <xpu/ATen/ops/fractional_max_pool2d_native.h>
+#include <ATen/ops/fractional_max_pool2d_backward_native.h>
+#include <ATen/ops/fractional_max_pool2d_native.h>
 
 namespace at::native {
 
diff --git a/src/ATen/native/xpu/FractionalMaxPool3d.cpp b/src/ATen/native/xpu/FractionalMaxPool3d.cpp
index 29d6acf8cb..a0ac54b1be 100644
--- a/src/ATen/native/xpu/FractionalMaxPool3d.cpp
+++ b/src/ATen/native/xpu/FractionalMaxPool3d.cpp
@@ -4,8 +4,8 @@
 #include <ATen/native/xpu/sycl/FractionalMaxPool3dKernels.h>
 #include <ATen/ops/empty.h>
 
-#include <xpu/ATen/ops/fractional_max_pool3d_backward_native.h>
-#include <xpu/ATen/ops/fractional_max_pool3d_native.h>
+#include <ATen/ops/fractional_max_pool3d_backward_native.h>
+#include <ATen/ops/fractional_max_pool3d_native.h>
 
 namespace at::native {
 
diff --git a/src/ATen/native/xpu/Im2Col.cpp b/src/ATen/native/xpu/Im2Col.cpp
index eb9f4077ac..200b56831f 100644
--- a/src/ATen/native/xpu/Im2Col.cpp
+++ b/src/ATen/native/xpu/Im2Col.cpp
@@ -4,7 +4,7 @@
 #include <ATen/native/TensorIterator.h>
 #include <torch/library.h>
 
-#include <xpu/ATen/ops/im2col_native.h>
+#include <ATen/ops/im2col_native.h>
 
 #include <ATen/native/xpu/sycl/Im2ColKernel.h>
 #include <comm/xpu_aten.h>
diff --git a/src/ATen/native/xpu/Indexing.cpp b/src/ATen/native/xpu/Indexing.cpp
index bb8c07a92e..fe4dc79fcd 100644
--- a/src/ATen/native/xpu/Indexing.cpp
+++ b/src/ATen/native/xpu/Indexing.cpp
@@ -10,7 +10,7 @@
 #include <comm/xpu_aten.h>
 
 #include <ATen/ops/index.h>
-#include <xpu/ATen/ops/index_native.h>
+#include <ATen/ops/index_native.h>
 
 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/LossMultiMargin.cpp b/src/ATen/native/xpu/LossMultiMargin.cpp
index 2db427135b..6f8076a0f8 100644
--- a/src/ATen/native/xpu/LossMultiMargin.cpp
+++ b/src/ATen/native/xpu/LossMultiMargin.cpp
@@ -2,8 +2,8 @@
 #include <ATen/native/xpu/sycl/MultiMarginLossKernels.h>
 
 #include <ATen/ops/empty.h>
-#include <xpu/ATen/ops/multi_margin_loss_backward_native.h>
-#include <xpu/ATen/ops/multi_margin_loss_native.h>
+#include <ATen/ops/multi_margin_loss_backward_native.h>
+#include <ATen/ops/multi_margin_loss_native.h>
 
 namespace at::native {
 
diff --git a/src/ATen/native/xpu/LossNLL.cpp b/src/ATen/native/xpu/LossNLL.cpp
index d80fef7462..28cceca996 100644
--- a/src/ATen/native/xpu/LossNLL.cpp
+++ b/src/ATen/native/xpu/LossNLL.cpp
@@ -5,8 +5,8 @@
 #include <comm/RegisterUtils.h>
 #include <comm/xpu_aten.h>
 
-#include <xpu/ATen/ops/nll_loss_backward_native.h>
-#include <xpu/ATen/ops/nll_loss_forward_native.h>
+#include <ATen/ops/nll_loss_backward_native.h>
+#include <ATen/ops/nll_loss_forward_native.h>
 
 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/PinnedMemoryAllocator.cpp b/src/ATen/native/xpu/PinnedMemoryAllocator.cpp
index a12b686b2e..88c9e46c21 100644
--- a/src/ATen/native/xpu/PinnedMemoryAllocator.cpp
+++ b/src/ATen/native/xpu/PinnedMemoryAllocator.cpp
@@ -3,7 +3,7 @@
 #include <ATen/xpu/PinnedMemoryAllocator.h>
 #include <comm/xpu_aten.h>
 
-#include <xpu/ATen/ops/is_pinned_native.h>
+#include <ATen/ops/is_pinned_native.h>
 
 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp
index bfa0f1545e..4cbed8c739 100644
--- a/src/ATen/native/xpu/RangeFactories.cpp
+++ b/src/ATen/native/xpu/RangeFactories.cpp
@@ -10,10 +10,10 @@
 #include <comm/xpu_aten.h>
 #include <torch/library.h>
 
-#include <xpu/ATen/ops/arange_native.h>
-#include <xpu/ATen/ops/linspace_native.h>
-#include <xpu/ATen/ops/logspace_native.h>
-#include <xpu/ATen/ops/range_native.h>
+#include <ATen/ops/arange_native.h>
+#include <ATen/ops/linspace_native.h>
+#include <ATen/ops/logspace_native.h>
+#include <ATen/ops/range_native.h>
 
 namespace at {
 
diff --git a/src/ATen/native/xpu/ReflectionPad.cpp b/src/ATen/native/xpu/ReflectionPad.cpp
index a881519146..0c9ee7da45 100644
--- a/src/ATen/native/xpu/ReflectionPad.cpp
+++ b/src/ATen/native/xpu/ReflectionPad.cpp
@@ -6,12 +6,12 @@
 
 #include <ATen/ops/empty.h>
 #include <ATen/ops/zeros_like.h>
-#include <xpu/ATen/ops/reflection_pad1d_backward_native.h>
-#include <xpu/ATen/ops/reflection_pad1d_native.h>
-#include <xpu/ATen/ops/reflection_pad2d_backward_native.h>
-#include <xpu/ATen/ops/reflection_pad2d_native.h>
-#include <xpu/ATen/ops/reflection_pad3d_backward_native.h>
-#include <xpu/ATen/ops/reflection_pad3d_native.h>
+#include <ATen/ops/reflection_pad1d_backward_native.h>
+#include <ATen/ops/reflection_pad1d_native.h>
+#include <ATen/ops/reflection_pad2d_backward_native.h>
+#include <ATen/ops/reflection_pad2d_native.h>
+#include <ATen/ops/reflection_pad3d_backward_native.h>
+#include <ATen/ops/reflection_pad3d_native.h>
 #include "ATen/TensorMeta.h"
 
 namespace at {
diff --git a/src/ATen/native/xpu/ReplicationPadding.cpp b/src/ATen/native/xpu/ReplicationPadding.cpp
index 3f00938450..e72ff0a4c9 100644
--- a/src/ATen/native/xpu/ReplicationPadding.cpp
+++ b/src/ATen/native/xpu/ReplicationPadding.cpp
@@ -6,12 +6,12 @@
 
 #include <comm/RegisterUtils.h>
 
-#include <xpu/ATen/ops/replication_pad1d_backward_native.h>
-#include <xpu/ATen/ops/replication_pad1d_native.h>
-#include <xpu/ATen/ops/replication_pad2d_backward_native.h>
-#include <xpu/ATen/ops/replication_pad2d_native.h>
-#include <xpu/ATen/ops/replication_pad3d_backward_native.h>
-#include <xpu/ATen/ops/replication_pad3d_native.h>
+#include <ATen/ops/replication_pad1d_backward_native.h>
+#include <ATen/ops/replication_pad1d_native.h>
+#include <ATen/ops/replication_pad2d_backward_native.h>
+#include <ATen/ops/replication_pad2d_native.h>
+#include <ATen/ops/replication_pad3d_backward_native.h>
+#include <ATen/ops/replication_pad3d_native.h>
 
 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/Resize.cpp b/src/ATen/native/xpu/Resize.cpp
index 66c95302b1..19b37ff0c6 100644
--- a/src/ATen/native/xpu/Resize.cpp
+++ b/src/ATen/native/xpu/Resize.cpp
@@ -6,20 +6,13 @@
 #include <torch/library.h>
 
 #include <ATen/native/Resize.h>
-#include <xpu/ATen/ops/copy.h>
-#include <xpu/ATen/ops/resize_native.h>
-#include <xpu/ATen/ops/set_native.h>
+#include <ATen/ops/copy.h>
+#include <ATen/ops/resize_native.h>
+#include <ATen/ops/set_native.h>
 
 #include <ATen/native/xpu/sycl/ResizeKernel.h>
 
 namespace at {
-
-namespace native {
-const at::Tensor& resize_(
-    const at::Tensor& self,
-    at::IntArrayRef size,
-    ::std::optional<at::MemoryFormat> memory_format = ::std::nullopt);
-}
 namespace native::xpu {
 
 const Tensor& resize_xpu_(
diff --git a/src/ATen/native/xpu/SoftMax.cpp b/src/ATen/native/xpu/SoftMax.cpp
index f155165ceb..052a3d9cf1 100644
--- a/src/ATen/native/xpu/SoftMax.cpp
+++ b/src/ATen/native/xpu/SoftMax.cpp
@@ -5,10 +5,10 @@
 #include <comm/RegisterUtils.h>
 #include <comm/xpu_aten.h>
 
-#include <xpu/ATen/ops/_log_softmax_backward_data_native.h>
-#include <xpu/ATen/ops/_log_softmax_native.h>
-#include <xpu/ATen/ops/_softmax_backward_data_native.h>
-#include <xpu/ATen/ops/_softmax_native.h>
+#include <ATen/ops/_log_softmax_backward_data_native.h>
+#include <ATen/ops/_log_softmax_native.h>
+#include <ATen/ops/_softmax_backward_data_native.h>
+#include <ATen/ops/_softmax_native.h>
 namespace at::native {
 
 TORCH_IMPL_FUNC(softmax_xpu_out)
diff --git a/src/ATen/native/xpu/SummaryOps.cpp b/src/ATen/native/xpu/SummaryOps.cpp
index 953004227b..22a52e4e2d 100644
--- a/src/ATen/native/xpu/SummaryOps.cpp
+++ b/src/ATen/native/xpu/SummaryOps.cpp
@@ -2,7 +2,7 @@
 #include <ATen/native/xpu/sycl/SummaryOpsKernels.h>
 #include <comm/SYCLContext.h>
 
-#include <xpu/ATen/ops/bincount_native.h>
+#include <ATen/ops/bincount_native.h>
 
 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/TensorAdvancedIndexing.cpp b/src/ATen/native/xpu/TensorAdvancedIndexing.cpp
index bd24aa3a03..a1ce1fad53 100644
--- a/src/ATen/native/xpu/TensorAdvancedIndexing.cpp
+++ b/src/ATen/native/xpu/TensorAdvancedIndexing.cpp
@@ -22,9 +22,9 @@
 
 #include <ATen/ops/index_add_meta.h>
 #include <ATen/ops/index_reduce_meta.h>
-#include <xpu/ATen/ops/index_add_native.h>
-#include <xpu/ATen/ops/index_reduce_native.h> //generated
-//#include <xpu/ATen/ops/index_reduce_prod_native.h> //generated
+#include <ATen/ops/index_add_native.h>
+#include <ATen/ops/index_reduce_native.h> //generated
+//#include <ATen/ops/index_reduce_prod_native.h> //generated
 
 namespace at {
 
diff --git a/src/ATen/native/xpu/TensorFactories.cpp b/src/ATen/native/xpu/TensorFactories.cpp
index 3caef39ba9..2c0faa5353 100644
--- a/src/ATen/native/xpu/TensorFactories.cpp
+++ b/src/ATen/native/xpu/TensorFactories.cpp
@@ -5,7 +5,7 @@
 
 #include <ATen/ops/empty_native.h>
 #include <ATen/ops/empty_strided_native.h>
-#include <xpu/ATen/ops/_efficientzerotensor_native.h>
+#include <ATen/ops/_efficientzerotensor_native.h>
 
 #include <ATen/native/xpu/sycl/ComplexKernels.h>
 #include <ATen/native/xpu/sycl/RandpermKernel.h>
diff --git a/src/ATen/native/xpu/TensorShape.cpp b/src/ATen/native/xpu/TensorShape.cpp
index b237b4336d..aae14c1b60 100644
--- a/src/ATen/native/xpu/TensorShape.cpp
+++ b/src/ATen/native/xpu/TensorShape.cpp
@@ -9,9 +9,9 @@
 #include <ATen/native/xpu/sycl/ShapeKernels.h>
 #include <ATen/native/xpu/sycl/TensorShapeKernels.h>
 #include <comm/RegisterUtils.h>
-#include <xpu/ATen/ops/as_strided_copy_native.h>
-#include <xpu/ATen/ops/as_strided_native.h>
-#include <xpu/ATen/ops/cat_native.h>
+#include <ATen/ops/as_strided_copy_native.h>
+#include <ATen/ops/as_strided_native.h>
+#include <ATen/ops/cat_native.h>
 
 namespace at {
 
diff --git a/src/ATen/native/xpu/TensorTopK.cpp b/src/ATen/native/xpu/TensorTopK.cpp
index ab3fc52509..0c79610cd3 100644
--- a/src/ATen/native/xpu/TensorTopK.cpp
+++ b/src/ATen/native/xpu/TensorTopK.cpp
@@ -5,7 +5,7 @@
 
 #include <comm/RegisterUtils.h>
 
-#include <xpu/ATen/ops/topk_native.h>
+#include <ATen/ops/topk_native.h>
 
 namespace at {
 
diff --git a/src/ATen/native/xpu/TriangluarOps.cpp b/src/ATen/native/xpu/TriangluarOps.cpp
index 3db5e967ba..39213b00fe 100644
--- a/src/ATen/native/xpu/TriangluarOps.cpp
+++ b/src/ATen/native/xpu/TriangluarOps.cpp
@@ -5,8 +5,8 @@
 #include <comm/RegisterUtils.h>
 #include <comm/xpu_aten.h>
 
-#include <xpu/ATen/ops/tril_native.h>
-#include <xpu/ATen/ops/triu_native.h>
+#include <ATen/ops/tril_native.h>
+#include <ATen/ops/triu_native.h>
 
 namespace at::native {
 
diff --git a/src/ATen/native/xpu/UpSampleBicubic2d.cpp b/src/ATen/native/xpu/UpSampleBicubic2d.cpp
index 7e0e4de402..388c6d0e45 100644
--- a/src/ATen/native/xpu/UpSampleBicubic2d.cpp
+++ b/src/ATen/native/xpu/UpSampleBicubic2d.cpp
@@ -5,10 +5,10 @@
 #include <ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h>
 #include <comm/RegisterUtils.h>
 
-#include <xpu/ATen/ops/upsample_bicubic2d_backward_native.h>
-#include <xpu/ATen/ops/upsample_bicubic2d_native.h>
-#include <xpu/ATen/ops/_upsample_bicubic2d_aa_backward_native.h>
-#include <xpu/ATen/ops/_upsample_bicubic2d_aa_native.h>
+#include <ATen/ops/upsample_bicubic2d_backward_native.h>
+#include <ATen/ops/upsample_bicubic2d_native.h>
+#include <ATen/ops/_upsample_bicubic2d_aa_backward_native.h>
+#include <ATen/ops/_upsample_bicubic2d_aa_native.h>
 namespace at {
 namespace native {
 TORCH_IMPL_FUNC(upsample_bicubic2d_out_xpu)
diff --git a/src/ATen/native/xpu/UpSampleBilinear2d.cpp b/src/ATen/native/xpu/UpSampleBilinear2d.cpp
index aec7071938..91bc5219bf 100644
--- a/src/ATen/native/xpu/UpSampleBilinear2d.cpp
+++ b/src/ATen/native/xpu/UpSampleBilinear2d.cpp
@@ -4,10 +4,10 @@
 #include <ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h>
 #include <comm/RegisterUtils.h>
 
-#include <xpu/ATen/ops/upsample_bilinear2d_backward_native.h>
-#include <xpu/ATen/ops/upsample_bilinear2d_native.h>
-#include <xpu/ATen/ops/_upsample_bilinear2d_aa_backward_native.h>
-#include <xpu/ATen/ops/_upsample_bilinear2d_aa_native.h>
+#include <ATen/ops/upsample_bilinear2d_backward_native.h>
+#include <ATen/ops/upsample_bilinear2d_native.h>
+#include <ATen/ops/_upsample_bilinear2d_aa_backward_native.h>
+#include <ATen/ops/_upsample_bilinear2d_aa_native.h>
 
 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/UpSampleLinear1d.cpp b/src/ATen/native/xpu/UpSampleLinear1d.cpp
index 13dfa33dea..388f6c2573 100644
--- a/src/ATen/native/xpu/UpSampleLinear1d.cpp
+++ b/src/ATen/native/xpu/UpSampleLinear1d.cpp
@@ -5,8 +5,8 @@
 #include <comm/RegisterUtils.h>
 #include "ATen/core/ATen_fwd.h"
 
-#include <xpu/ATen/ops/upsample_linear1d_backward_native.h>
-#include <xpu/ATen/ops/upsample_linear1d_native.h>
+#include <ATen/ops/upsample_linear1d_backward_native.h>
+#include <ATen/ops/upsample_linear1d_native.h>
 
 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/UpSampleNearest1d.cpp b/src/ATen/native/xpu/UpSampleNearest1d.cpp
index 30287e4b25..7603a43e9c 100644
--- a/src/ATen/native/xpu/UpSampleNearest1d.cpp
+++ b/src/ATen/native/xpu/UpSampleNearest1d.cpp
@@ -2,10 +2,10 @@
 #include <ATen/native/xpu/sycl/UpSampleNearest1dKernels.h>
 #include <comm/xpu_aten.h>
 
-#include <xpu/ATen/ops/_upsample_nearest_exact1d_backward_native.h>
-#include <xpu/ATen/ops/_upsample_nearest_exact1d_native.h>
-#include <xpu/ATen/ops/upsample_nearest1d_backward_native.h>
-#include <xpu/ATen/ops/upsample_nearest1d_native.h>
+#include <ATen/ops/_upsample_nearest_exact1d_backward_native.h>
+#include <ATen/ops/_upsample_nearest_exact1d_native.h>
+#include <ATen/ops/upsample_nearest1d_backward_native.h>
+#include <ATen/ops/upsample_nearest1d_native.h>
 
 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/UpSampleNearest2d.cpp b/src/ATen/native/xpu/UpSampleNearest2d.cpp
index 9ebbd74b14..c906a703f2 100644
--- a/src/ATen/native/xpu/UpSampleNearest2d.cpp
+++ b/src/ATen/native/xpu/UpSampleNearest2d.cpp
@@ -2,10 +2,10 @@
 #include <ATen/native/xpu/sycl/UpSampleNearest2dKernels.h>
 #include <comm/xpu_aten.h>
 
-#include <xpu/ATen/ops/_upsample_nearest_exact2d_backward_native.h>
-#include <xpu/ATen/ops/_upsample_nearest_exact2d_native.h>
-#include <xpu/ATen/ops/upsample_nearest2d_backward_native.h>
-#include <xpu/ATen/ops/upsample_nearest2d_native.h>
+#include <ATen/ops/_upsample_nearest_exact2d_backward_native.h>
+#include <ATen/ops/_upsample_nearest_exact2d_native.h>
+#include <ATen/ops/upsample_nearest2d_backward_native.h>
+#include <ATen/ops/upsample_nearest2d_native.h>
 namespace at {
 
 namespace native {
diff --git a/src/ATen/native/xpu/UpSampleNearest3d.cpp b/src/ATen/native/xpu/UpSampleNearest3d.cpp
index 5528b0ac22..8cc0bb9f89 100644
--- a/src/ATen/native/xpu/UpSampleNearest3d.cpp
+++ b/src/ATen/native/xpu/UpSampleNearest3d.cpp
@@ -1,14 +1,14 @@
 #include <ATen/core/Tensor.h>
 #include <ATen/native/xpu/sycl/UpSampleNearest3dKernels.h>
 
-#include <xpu/ATen/ops/_upsample_nearest_exact3d.h>
-#include <xpu/ATen/ops/_upsample_nearest_exact3d_backward.h>
-#include <xpu/ATen/ops/_upsample_nearest_exact3d_backward_native.h>
-#include <xpu/ATen/ops/_upsample_nearest_exact3d_native.h>
-#include <xpu/ATen/ops/upsample_nearest3d.h>
-#include <xpu/ATen/ops/upsample_nearest3d_backward.h>
-#include <xpu/ATen/ops/upsample_nearest3d_backward_native.h>
-#include <xpu/ATen/ops/upsample_nearest3d_native.h>
+#include <ATen/ops/_upsample_nearest_exact3d.h>
+#include <ATen/ops/_upsample_nearest_exact3d_backward.h>
+#include <ATen/ops/_upsample_nearest_exact3d_backward_native.h>
+#include <ATen/ops/_upsample_nearest_exact3d_native.h>
+#include <ATen/ops/upsample_nearest3d.h>
+#include <ATen/ops/upsample_nearest3d_backward.h>
+#include <ATen/ops/upsample_nearest3d_backward_native.h>
+#include <ATen/ops/upsample_nearest3d_native.h>
 
 namespace at::native {
 
diff --git a/src/ATen/native/xpu/UpSampleTrilinear3d.cpp b/src/ATen/native/xpu/UpSampleTrilinear3d.cpp
index 4c46a07c66..e6a28ca844 100644
--- a/src/ATen/native/xpu/UpSampleTrilinear3d.cpp
+++ b/src/ATen/native/xpu/UpSampleTrilinear3d.cpp
@@ -2,8 +2,8 @@
 #include <ATen/native/xpu/sycl/UpSampleTrilinear3dKernels.h>
 #include <comm/SYCLContext.h>
 
-#include <xpu/ATen/ops/upsample_trilinear3d_backward_native.h>
-#include <xpu/ATen/ops/upsample_trilinear3d_native.h>
+#include <ATen/ops/upsample_trilinear3d_backward_native.h>
+#include <ATen/ops/upsample_trilinear3d_native.h>
 
 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/XPUScalar.cpp b/src/ATen/native/xpu/XPUScalar.cpp
index d47dd78713..25acb44d14 100644
--- a/src/ATen/native/xpu/XPUScalar.cpp
+++ b/src/ATen/native/xpu/XPUScalar.cpp
@@ -3,7 +3,7 @@
 #include <ATen/EmptyTensor.h>
 #include <ATen/core/Tensor.h>
 #include <comm/SYCLContext.h>
-#include <xpu/ATen/ops/_local_scalar_dense_native.h>
+#include <ATen/ops/_local_scalar_dense_native.h>
 
 namespace at::native {
 

From 0a8ed2eac5c2dc23292daf2d60168ed64018f391 Mon Sep 17 00:00:00 2001
From: "Yu, Guangye" <guangye.yu@intel.com>
Date: Wed, 26 Feb 2025 16:04:32 +0000
Subject: [PATCH 3/3] add a unit test to guard the change

---
 cmake/Codegen.cmake                     |  2 +-
 test/regressions/test_xpu_ops_header.py | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 test/regressions/test_xpu_ops_header.py

diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake
index 9cf4459162..fbbcc3e195 100644
--- a/cmake/Codegen.cmake
+++ b/cmake/Codegen.cmake
@@ -98,7 +98,7 @@ function(GEN_XPU file_yaml)
     --aoti-install-dir=${XPU_AOTI_INSTALL_DIR}
     COMMAND
     ${REGISTER_FALLBACK_CMD}
-    # # Codegen post-process
+    # Codegen post-process
     COMMAND
     ${XPU_INSTALL_HEADER_COMMAND}
     WORKING_DIRECTORY ${TORCH_ROOT}
diff --git a/test/regressions/test_xpu_ops_header.py b/test/regressions/test_xpu_ops_header.py
new file mode 100644
index 0000000000..844c9b25a2
--- /dev/null
+++ b/test/regressions/test_xpu_ops_header.py
@@ -0,0 +1,21 @@
+# Owner(s): ["module: intel"]
+import os
+
+import torch
+from torch.testing._internal.common_utils import TestCase
+
+
+class TestXpuOpsHeader(TestCase):
+    def test_xpu_ops_header(self):
+        """XPU codegen headers must be installed into torch/include/ATen/ops."""
+        ops_dir = os.path.join(os.path.dirname(torch.__file__), "include", "ATen/ops")
+        headers = (
+            "cat_xpu_dispatch.h",
+            "index_fill_xpu_dispatch.h",
+            "col2im_native.h",
+        )
+        for name in headers:  # name the file so a failure says what is missing
+            path = os.path.join(ops_dir, name)
+            self.assertTrue(os.path.exists(path), f"missing header: {path}")
+        with open(os.path.join(ops_dir, "col2im_native.h")) as fr:
+            self.assertIn("col2im_xpu", fr.read())