From 9682a8224d7b4e42b749a38e7bcede0de5146fe8 Mon Sep 17 00:00:00 2001 From: Slawomir Siwek Date: Tue, 7 Oct 2025 19:18:35 +0200 Subject: [PATCH 1/4] Cast intermediate tensor to bfloat16 Truncate precision to match CPU implementation --- src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp index 8e8b09d5cf..60ec6c16a4 100644 --- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp @@ -134,8 +134,9 @@ struct Logit0Functor { using T_ACC = acc_type_device; scalar_t operator()(scalar_t x) const { const T_ACC x_acc = static_cast(x); + const T_ACC div = static_cast(static_cast(x_acc / (T_ACC(1) - x_acc)); // suppress compiler optimization on data type promotion. - volatile T_ACC res = std::log(x_acc / (T_ACC(1) - x_acc)); + volatile T_ACC res = std::log(div); return res; } }; From 0ff60c8540d9658a7b106200c86b2f80f432d126 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Wed, 8 Oct 2025 11:17:55 +0000 Subject: [PATCH 2/4] syntax --- src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp index 60ec6c16a4..f192b93b05 100644 --- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp @@ -134,8 +134,8 @@ struct Logit0Functor { using T_ACC = acc_type_device; scalar_t operator()(scalar_t x) const { const T_ACC x_acc = static_cast(x); - const T_ACC div = static_cast(static_cast(x_acc / (T_ACC(1) - x_acc)); // suppress compiler optimization on data type promotion. + const T_ACC div = static_cast(static_cast(x_acc / (T_ACC(1) - x_acc))); volatile T_ACC res = std::log(div); return res; } @@ -148,7 +148,8 @@ struct Logit1Functor { const T_ACC x_acc = static_cast(x); T_ACC z = x_acc < lo_ ? lo_ : (x_acc > hi_ ? hi_ : x_acc); // suppress compiler optimization on data type promotion. - volatile T_ACC res = std::log(z / (T_ACC(1) - z)); + const T_ACC div = static_cast(static_cast(z / (T_ACC(1) - z))); + volatile T_ACC res = std::log(div); return res; } Logit1Functor(const T_ACC lo, const T_ACC hi) : lo_(lo), hi_(hi) {} From c788ff14ee95385672c145e82300e8a1f8e2e9d2 Mon Sep 17 00:00:00 2001 From: Pawel Swider Date: Wed, 8 Oct 2025 11:17:13 +0000 Subject: [PATCH 3/4] Modify files in install_xpu_headers only if they changes --- tools/codegen/install_xpu_headers.py | 103 ++++++++++++++++++++------- 1 file changed, 76 insertions(+), 27 deletions(-) diff --git a/tools/codegen/install_xpu_headers.py b/tools/codegen/install_xpu_headers.py index 807a509bc6..a0a7855221 100644 --- a/tools/codegen/install_xpu_headers.py +++ b/tools/codegen/install_xpu_headers.py @@ -4,6 +4,7 @@ import shutil from pathlib import Path +VERBOSE = False parser = argparse.ArgumentParser(description="Utils for append ops headers") parser.add_argument( @@ -20,34 +21,53 @@ def append_xpu_function_header(src, dst): r""" Cleans trailing empty lines from the destination file, then appends #include lines from the source file that match `#include \s*\r?\n" matches = re.findall(pattern, src_text, re.MULTILINE) if not matches: return - with open(dst, "r+", encoding="utf-8") as f: - dst_lines = f.readlines() - dst_text = "".join(dst_lines) - missing_headers = [match for match in matches if match not in dst_text] - if not missing_headers: - return + try: + with open(dst, "r", encoding="utf-8") as f: + dst_lines = f.readlines() + dst_text = "".join(dst_lines) + except (IOError, OSError) as e: + if VERBOSE: + print(f"Warning: Could not read destination file {dst}: {e}") + return - # Remove trailing empty lines from dst_lines - while dst_lines and not dst_lines[-1].strip(): - dst_lines.pop() + missing_headers = [match for match in matches if match not in dst_text] + if not missing_headers: + return - f.seek(0) - f.truncate() - f.writelines(dst_lines) - # Append missing headers to the end of the file - f.writelines(missing_headers) + new_dst_lines = dst_lines.copy() + + while new_dst_lines and not new_dst_lines[-1].strip(): + new_dst_lines.pop() + new_dst_lines.extend(missing_headers) + + new_content = "".join(new_dst_lines) + old_content = "".join(dst_lines) + + if new_content != old_content: + try: + with open(dst, "w", encoding="utf-8") as f: + f.writelines(new_dst_lines) + except (IOError, OSError) as e: + if VERBOSE: + print(f"Error: Could not write to {dst}: {e}") def parse_ops_headers(src): @@ -78,18 +98,36 @@ def classify_ops_headers(src_dir, dst_dir): def generate_xpu_ops_headers_cmake(src_dir, dst_dir, xpu_ops_headers): r""" - Generate XPU ops headers xpu_ops_generated_headers.cmake + Generate XPU ops headers xpu_ops_generated_headers.cmake only if content changes """ - with open(os.path.join(src_dir, "xpu_ops_generated_headers.cmake"), "w", encoding="utf-8") as fw: - fw.write("set(xpu_ops_generated_headers\n") - for header in xpu_ops_headers: - fw.write(f' "{Path(os.path.join(dst_dir, header)).as_posix()}"\n') - fw.write(")\n") + output_file = os.path.join(src_dir, "xpu_ops_generated_headers.cmake") + + # Generate new content + new_content = "set(xpu_ops_generated_headers\n" + for header in xpu_ops_headers: + new_content += f' "{Path(os.path.join(dst_dir, header)).as_posix()}"\n' + new_content += ")\n" + + # Check if file exists and has same content + should_write = True + if os.path.exists(output_file): + try: + with open(output_file, "r", encoding="utf-8") as f: + existing_content = f.read() + should_write = existing_content != new_content + except (IOError, OSError): + # If we can't read the file, write it anyway + should_write = True + + if should_write: + with open(output_file, "w", encoding="utf-8") as fw: + fw.write(new_content) def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): r""" For XPU-specific ops headers, copy them to destination build and append XPU declarations to common headers. + Copies and appends are done only if leading to file changes to prevent unnecessary recompilations. """ if args.dry_run: return @@ -99,7 +137,16 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): # assert "xpu" in f, f"Error: The function signature or namespace in '{f}' is incorrect. Expected 'xpu' to be present." src = os.path.join(src_dir, f) dst = os.path.join(dst_dir, f) - shutil.copy(src, dst) + # Only copy if src and dst differ or dst does not exist + should_copy = True + if os.path.exists(dst): + try: + with open(src, "rb") as fsrc, open(dst, "rb") as fdst: + should_copy = fsrc.read() != fdst.read() + except (IOError, OSError): + should_copy = True + if should_copy: + shutil.copy(src, dst) for f in common_headers: src = os.path.join(src_dir, f) @@ -118,6 +165,7 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): with open(dst, "r+", encoding="utf-8") as f: dst_lines = f.readlines() dst_text = "".join(dst_lines) + old_content = "".join(dst_lines) missing_declarations = [] insertion_index = None for index, line in enumerate(dst_lines): @@ -133,9 +181,10 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): break assert (insertion_index is not None), f"Error: No TORCH_API declaration found in {dst}." - f.seek(0) - f.writelines(dst_lines) - f.truncate() + if old_content != "".join(dst_lines): + f.seek(0) + f.writelines(dst_lines) + f.truncate() def main(): From 3996d7122645ea0d5825ef2199f03e0d7ba0cd27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Thu, 9 Oct 2025 12:06:28 +0000 Subject: [PATCH 4/4] Calculate in declared precision --- .../xpu/sycl/UnarySpecialOpsKernels.cpp | 33 +++++++------------ 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp index f192b93b05..cd6c277889 100644 --- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp @@ -130,33 +130,24 @@ void exp2_kernel(TensorIteratorBase& iter) { } template -struct Logit0Functor { - using T_ACC = acc_type_device; +struct LogitFunctor { scalar_t operator()(scalar_t x) const { - const T_ACC x_acc = static_cast(x); - // suppress compiler optimization on data type promotion. - const T_ACC div = static_cast(static_cast(x_acc / (T_ACC(1) - x_acc))); - volatile T_ACC res = std::log(div); - return res; + return std::log(x / (1 - x)); } }; template -struct Logit1Functor { +struct LogitEpsFunctor { using T_ACC = acc_type_device; scalar_t operator()(scalar_t x) const { - const T_ACC x_acc = static_cast(x); - T_ACC z = x_acc < lo_ ? lo_ : (x_acc > hi_ ? hi_ : x_acc); - // suppress compiler optimization on data type promotion. - const T_ACC div = static_cast(static_cast(z / (T_ACC(1) - z))); - volatile T_ACC res = std::log(div); - return res; + scalar_t x_clamped = x < low_ ? low_ : (x > high_ ? high_ : x); + return std::log(x_clamped / (1 - x_clamped)); } - Logit1Functor(const T_ACC lo, const T_ACC hi) : lo_(lo), hi_(hi) {} + LogitEpsFunctor(const T_ACC low, const T_ACC high) : low_(low), high_(high) {} private: - T_ACC lo_; - T_ACC hi_; + scalar_t low_; + scalar_t high_; }; void logit_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar) { @@ -169,11 +160,11 @@ void logit_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar) { using T_ACC = acc_type_device; const T_ACC eps = eps_scalar.to(); if (eps < T_ACC(0)) { - gpu_kernel(iter, Logit0Functor()); + gpu_kernel(iter, LogitFunctor()); } else { - const T_ACC lo = eps; - const T_ACC hi = T_ACC(1) - eps; - gpu_kernel(iter, Logit1Functor(lo, hi)); + const T_ACC low = eps; + const T_ACC high = T_ACC(1) - eps; + gpu_kernel(iter, LogitEpsFunctor(low, high)); } }); }