From 9403eed06b80a557e79b62b60542a8059977e721 Mon Sep 17 00:00:00 2001
From: "Deng, Weishi" <weishi.deng@intel.com>
Date: Tue, 4 Nov 2025 14:20:46 +0800
Subject: [PATCH 1/4] Enlarge the M/N range that selects 2-pass reduction in
 _layer_norm_backward_kernel

---
 src/ATen/native/xpu/sycl/LayerNormKernels.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp
index e4a8444c94..0aff3e603c 100644
--- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp
+++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp
@@ -1063,8 +1063,8 @@ void _layer_norm_backward_kernel(
       norm_config_global_size / syclMaxSubGroupSize() * 2 <= thread_slots;
   // cuda uses condition M > 64 * 1024 && N / 32 < sm_count / 2 to parallelize
   // in the M dimension
-  if (use_two_stage_col_reduction && M > 64 * 1024 &&
-      N / 32 < syclGpuEuCount() / syclGpuEUCountPerSubslice() / 2) {
+  if (use_two_stage_col_reduction && M > 20 * 1024 &&
+      N / 32 < syclGpuEuCount() / syclGpuEUCountPerSubslice()) {
     const size_t local_size_x = 8;
     const size_t SIMD = 32;
     // workgroup size is 256

From d07225223b5573b367a1188826bfccbe5718d438 Mon Sep 17 00:00:00 2001
From: "Deng, Weishi" <weishi.deng@intel.com>
Date: Tue, 4 Nov 2025 14:39:54 +0800
Subject: [PATCH 2/4] Derive the 2-pass M threshold from the subslice count

---
 src/ATen/native/xpu/sycl/LayerNormKernels.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp
index 0aff3e603c..f86825b3d7 100644
--- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp
+++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp
@@ -1063,8 +1063,9 @@ void _layer_norm_backward_kernel(
       norm_config_global_size / syclMaxSubGroupSize() * 2 <= thread_slots;
   // cuda uses condition M > 64 * 1024 && N / 32 < sm_count / 2 to parallelize
   // in the M dimension
-  if (use_two_stage_col_reduction && M > 20 * 1024 &&
-      N / 32 < syclGpuEuCount() / syclGpuEUCountPerSubslice()) {
+  int subslice_count = syclGpuEuCount() / syclGpuEUCountPerSubslice();
+  if (use_two_stage_col_reduction && M > subslice_count * 1024 &&
+      N / 32 < subslice_count) {
     const size_t local_size_x = 8;
     const size_t SIMD = 32;
     // workgroup size is 256

From 98ccf83b6b656a504f166fcaa0775b708854076e Mon Sep 17 00:00:00 2001
From: "Weishi.Deng" <weishi.deng@intel.com>
Date: Wed, 5 Nov 2025 16:30:21 +0800
Subject: [PATCH 3/4] Rename the subslice_count declaration to xe_core_count

Co-authored-by: Eikan Wang <eikan.wang@intel.com>
---
 src/ATen/native/xpu/sycl/LayerNormKernels.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp
index f86825b3d7..6745a52493 100644
--- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp
+++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp
@@ -1063,7 +1063,7 @@ void _layer_norm_backward_kernel(
       norm_config_global_size / syclMaxSubGroupSize() * 2 <= thread_slots;
   // cuda uses condition M > 64 * 1024 && N / 32 < sm_count / 2 to parallelize
   // in the M dimension
-  int subslice_count = syclGpuEuCount() / syclGpuEUCountPerSubslice();
+  int xe_core_count = syclGpuEuCount() / syclGpuEUCountPerSubslice();
   if (use_two_stage_col_reduction && M > subslice_count * 1024 &&
       N / 32 < subslice_count) {
     const size_t local_size_x = 8;

From 7f6f9a8a7c832eea350fa64a9f254d44fb260c61 Mon Sep 17 00:00:00 2001
From: "Deng, Weishi" <weishi.deng@intel.com>
Date: Wed, 5 Nov 2025 16:42:31 +0800
Subject: [PATCH 4/4] Use xe_core_count in the 2-pass condition and relax the
 N bound to tile_n < 2 * xe_core_count

---
 src/ATen/native/xpu/sycl/LayerNormKernels.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp
index 6745a52493..c681880058 100644
--- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp
+++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp
@@ -1064,8 +1064,9 @@ void _layer_norm_backward_kernel(
   // cuda uses condition M > 64 * 1024 && N / 32 < sm_count / 2 to parallelize
   // in the M dimension
   int xe_core_count = syclGpuEuCount() / syclGpuEUCountPerSubslice();
-  if (use_two_stage_col_reduction && M > subslice_count * 1024 &&
-      N / 32 < subslice_count) {
+  int tile_n = N / 32;
+  if (use_two_stage_col_reduction && M > xe_core_count * 1024 &&
+      tile_n < xe_core_count * 2) {
     const size_t local_size_x = 8;
     const size_t SIMD = 32;
     // workgroup size is 256