From 9682a8224d7b4e42b749a38e7bcede0de5146fe8 Mon Sep 17 00:00:00 2001
From: Slawomir Siwek <slawomir.siwek@intel.com>
Date: Tue, 7 Oct 2025 19:18:35 +0200
Subject: [PATCH 1/4] Cast intermediate tensor to bfloat16

Truncate precision to match CPU implementation
---
 src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp
index 8e8b09d5cf..60ec6c16a4 100644
--- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp
+++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp
@@ -134,8 +134,9 @@ struct Logit0Functor {
   using T_ACC = acc_type_device<scalar_t, c10::DeviceType::XPU>;
   scalar_t operator()(scalar_t x) const {
     const T_ACC x_acc = static_cast<T_ACC>(x);
+    const T_ACC div = static_cast<T_ACC>(static_cast<scalar_t>(x_acc / (T_ACC(1) - x_acc));
     // suppress compiler optimization on data type promotion.
-    volatile T_ACC res = std::log(x_acc / (T_ACC(1) - x_acc));
+    volatile T_ACC res = std::log(div);
     return res;
   }
 };

From 0ff60c8540d9658a7b106200c86b2f80f432d126 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C5=82awomir=20Siwek?= <slawomir.siwek@intel.com>
Date: Wed, 8 Oct 2025 11:17:55 +0000
Subject: [PATCH 2/4] Fix unbalanced parenthesis in Logit0Functor and apply the same cast in Logit1Functor

---
 src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp
index 60ec6c16a4..f192b93b05 100644
--- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp
+++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp
@@ -134,8 +134,8 @@ struct Logit0Functor {
   using T_ACC = acc_type_device<scalar_t, c10::DeviceType::XPU>;
   scalar_t operator()(scalar_t x) const {
     const T_ACC x_acc = static_cast<T_ACC>(x);
-    const T_ACC div = static_cast<T_ACC>(static_cast<scalar_t>(x_acc / (T_ACC(1) - x_acc));
     // suppress compiler optimization on data type promotion.
+    const T_ACC div = static_cast<T_ACC>(static_cast<scalar_t>(x_acc / (T_ACC(1) - x_acc)));
     volatile T_ACC res = std::log(div);
     return res;
   }
@@ -148,7 +148,8 @@ struct Logit1Functor {
     const T_ACC x_acc = static_cast<T_ACC>(x);
     T_ACC z = x_acc < lo_ ? lo_ : (x_acc > hi_ ? hi_ : x_acc);
     // suppress compiler optimization on data type promotion.
-    volatile T_ACC res = std::log(z / (T_ACC(1) - z));
+    const T_ACC div = static_cast<T_ACC>(static_cast<scalar_t>(z / (T_ACC(1) - z)));
+    volatile T_ACC res = std::log(div);
     return res;
   }
   Logit1Functor(const T_ACC lo, const T_ACC hi) : lo_(lo), hi_(hi) {}

From c788ff14ee95385672c145e82300e8a1f8e2e9d2 Mon Sep 17 00:00:00 2001
From: Pawel Swider <pawel.swider@yintel.com>
Date: Wed, 8 Oct 2025 11:17:13 +0000
Subject: [PATCH 3/4] Modify files in install_xpu_headers only if they change

---
 tools/codegen/install_xpu_headers.py | 103 ++++++++++++++++++++-------
 1 file changed, 76 insertions(+), 27 deletions(-)

diff --git a/tools/codegen/install_xpu_headers.py b/tools/codegen/install_xpu_headers.py
index 807a509bc6..a0a7855221 100644
--- a/tools/codegen/install_xpu_headers.py
+++ b/tools/codegen/install_xpu_headers.py
@@ -4,6 +4,7 @@
 import shutil
 from pathlib import Path
 
+VERBOSE = False
 
 parser = argparse.ArgumentParser(description="Utils for append ops headers")
 parser.add_argument(
@@ -20,34 +21,53 @@ def append_xpu_function_header(src, dst):
     r"""
     Cleans trailing empty lines from the destination file, then appends #include
     lines from the source file that match `#include <ATen/ops/...` to the destination.
+    Only modifies file if changes are actually needed.
     """
     if args.dry_run:
         return
 
-    # Read source file and match header lines
-    with open(src, encoding="utf-8") as fr:
-        src_text = fr.read()
+    try:
+        with open(src, encoding="utf-8") as fr:
+            src_text = fr.read()
+    except (IOError, OSError) as e:
+        if VERBOSE:
+            print(f"Warning: Could not read source file {src}: {e}")
+        return
+
     pattern = r"^#include <ATen/ops/.*>\s*\r?\n"
     matches = re.findall(pattern, src_text, re.MULTILINE)
     if not matches:
         return
 
-    with open(dst, "r+", encoding="utf-8") as f:
-        dst_lines = f.readlines()
-        dst_text = "".join(dst_lines)
-        missing_headers = [match for match in matches if match not in dst_text]
-        if not missing_headers:
-            return
+    try:
+        with open(dst, "r", encoding="utf-8") as f:
+            dst_lines = f.readlines()
+            dst_text = "".join(dst_lines)
+    except (IOError, OSError) as e:
+        if VERBOSE:
+            print(f"Warning: Could not read destination file {dst}: {e}")
+        return
 
-        # Remove trailing empty lines from dst_lines
-        while dst_lines and not dst_lines[-1].strip():
-            dst_lines.pop()
+    missing_headers = [match for match in matches if match not in dst_text]
+    if not missing_headers:
+        return
 
-        f.seek(0)
-        f.truncate()
-        f.writelines(dst_lines)
-        # Append missing headers to the end of the file
-        f.writelines(missing_headers)
+    new_dst_lines = dst_lines.copy()
+    
+    while new_dst_lines and not new_dst_lines[-1].strip():
+        new_dst_lines.pop()
+    new_dst_lines.extend(missing_headers)
+    
+    new_content = "".join(new_dst_lines)
+    old_content = "".join(dst_lines)
+    
+    if new_content != old_content:
+        try:
+            with open(dst, "w", encoding="utf-8") as f:
+                f.writelines(new_dst_lines)
+        except (IOError, OSError) as e:
+            if VERBOSE:
+                print(f"Error: Could not write to {dst}: {e}")
 
 
 def parse_ops_headers(src):
@@ -78,18 +98,36 @@ def classify_ops_headers(src_dir, dst_dir):
 
 def generate_xpu_ops_headers_cmake(src_dir, dst_dir, xpu_ops_headers):
     r"""
-    Generate XPU ops headers xpu_ops_generated_headers.cmake
+    Generate XPU ops headers xpu_ops_generated_headers.cmake only if content changes
     """
-    with open(os.path.join(src_dir, "xpu_ops_generated_headers.cmake"), "w", encoding="utf-8") as fw:
-        fw.write("set(xpu_ops_generated_headers\n")
-        for header in xpu_ops_headers:
-            fw.write(f'    "{Path(os.path.join(dst_dir, header)).as_posix()}"\n')
-        fw.write(")\n")
+    output_file = os.path.join(src_dir, "xpu_ops_generated_headers.cmake")
+
+    # Generate new content
+    new_content = "set(xpu_ops_generated_headers\n"
+    for header in xpu_ops_headers:
+        new_content += f'    "{Path(os.path.join(dst_dir, header)).as_posix()}"\n'
+    new_content += ")\n"
+
+    # Check if file exists and has same content
+    should_write = True
+    if os.path.exists(output_file):
+        try:
+            with open(output_file, "r", encoding="utf-8") as f:
+                existing_content = f.read()
+            should_write = existing_content != new_content
+        except (IOError, OSError):
+            # If we can't read the file, write it anyway
+            should_write = True
+
+    if should_write:
+        with open(output_file, "w", encoding="utf-8") as fw:
+            fw.write(new_content)
 
 
 def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers):
     r"""
     For XPU-specific ops headers, copy them to destination build and append XPU declarations to common headers.
+    Copies and appends are done only if leading to file changes to prevent unnecessary recompilations.
     """
     if args.dry_run:
         return
@@ -99,7 +137,16 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers):
         # assert "xpu" in f, f"Error: The function signature or namespace in '{f}' is incorrect. Expected 'xpu' to be present."
         src = os.path.join(src_dir, f)
         dst = os.path.join(dst_dir, f)
-        shutil.copy(src, dst)
+        # Only copy if src and dst differ or dst does not exist
+        should_copy = True
+        if os.path.exists(dst):
+            try:
+                with open(src, "rb") as fsrc, open(dst, "rb") as fdst:
+                    should_copy = fsrc.read() != fdst.read()
+            except (IOError, OSError):
+                should_copy = True
+        if should_copy:
+            shutil.copy(src, dst)
 
     for f in common_headers:
         src = os.path.join(src_dir, f)
@@ -118,6 +165,7 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers):
         with open(dst, "r+", encoding="utf-8") as f:
             dst_lines = f.readlines()
             dst_text = "".join(dst_lines)
+            old_content = "".join(dst_lines)
             missing_declarations = []
             insertion_index = None
             for index, line in enumerate(dst_lines):
@@ -133,9 +181,10 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers):
                     break
             assert (insertion_index is not None), f"Error: No TORCH_API declaration found in {dst}."
 
-            f.seek(0)
-            f.writelines(dst_lines)
-            f.truncate()
+            if old_content != "".join(dst_lines):
+                f.seek(0)
+                f.writelines(dst_lines)
+                f.truncate()
 
 
 def main():

From 3996d7122645ea0d5825ef2199f03e0d7ba0cd27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C5=82awomir=20Siwek?= <slawomir.siwek@intel.com>
Date: Thu, 9 Oct 2025 12:06:28 +0000
Subject: [PATCH 4/4] Calculate in declared precision

---
 .../xpu/sycl/UnarySpecialOpsKernels.cpp       | 33 +++++++------------
 1 file changed, 12 insertions(+), 21 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp
index f192b93b05..cd6c277889 100644
--- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp
+++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp
@@ -130,33 +130,24 @@ void exp2_kernel(TensorIteratorBase& iter) {
 }
 
 template <typename scalar_t>
-struct Logit0Functor {
-  using T_ACC = acc_type_device<scalar_t, c10::DeviceType::XPU>;
+struct LogitFunctor {
   scalar_t operator()(scalar_t x) const {
-    const T_ACC x_acc = static_cast<T_ACC>(x);
-    // suppress compiler optimization on data type promotion.
-    const T_ACC div = static_cast<T_ACC>(static_cast<scalar_t>(x_acc / (T_ACC(1) - x_acc)));
-    volatile T_ACC res = std::log(div);
-    return res;
+    return std::log(x / (1 - x));
   }
 };
 
 template <typename scalar_t>
-struct Logit1Functor {
+struct LogitEpsFunctor {
   using T_ACC = acc_type_device<scalar_t, c10::DeviceType::XPU>;
   scalar_t operator()(scalar_t x) const {
-    const T_ACC x_acc = static_cast<T_ACC>(x);
-    T_ACC z = x_acc < lo_ ? lo_ : (x_acc > hi_ ? hi_ : x_acc);
-    // suppress compiler optimization on data type promotion.
-    const T_ACC div = static_cast<T_ACC>(static_cast<scalar_t>(z / (T_ACC(1) - z)));
-    volatile T_ACC res = std::log(div);
-    return res;
+    scalar_t x_clamped = x < low_ ? low_ : (x > high_ ? high_ : x);
+    return std::log(x_clamped / (1 - x_clamped));
   }
-  Logit1Functor(const T_ACC lo, const T_ACC hi) : lo_(lo), hi_(hi) {}
+  LogitEpsFunctor(const T_ACC low, const T_ACC high) : low_(low), high_(high) {}
 
  private:
-  T_ACC lo_;
-  T_ACC hi_;
+  scalar_t low_;
+  scalar_t high_;
 };
 
 void logit_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar) {
@@ -169,11 +160,11 @@ void logit_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar) {
         using T_ACC = acc_type_device<scalar_t, c10::DeviceType::XPU>;
         const T_ACC eps = eps_scalar.to<T_ACC>();
         if (eps < T_ACC(0)) {
-          gpu_kernel(iter, Logit0Functor<scalar_t>());
+          gpu_kernel(iter, LogitFunctor<scalar_t>());
         } else {
-          const T_ACC lo = eps;
-          const T_ACC hi = T_ACC(1) - eps;
-          gpu_kernel(iter, Logit1Functor<scalar_t>(lo, hi));
+          const T_ACC low = eps;
+          const T_ACC high = T_ACC(1) - eps;
+          gpu_kernel(iter, LogitEpsFunctor<scalar_t>(low, high));
         }
       });
 }