From 9403eed06b80a557e79b62b60542a8059977e721 Mon Sep 17 00:00:00 2001 From: "Deng, Weishi" Date: Tue, 4 Nov 2025 14:20:46 +0800 Subject: [PATCH 1/4] enlarge the range for 2-pass reduction --- src/ATen/native/xpu/sycl/LayerNormKernels.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp index e4a8444c94..0aff3e603c 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp @@ -1063,8 +1063,8 @@ void _layer_norm_backward_kernel( norm_config_global_size / syclMaxSubGroupSize() * 2 <= thread_slots; // cuda uses condition M > 64 * 1024 && N / 32 < sm_count / 2 to parallelize // in the M dimension - if (use_two_stage_col_reduction && M > 64 * 1024 && - N / 32 < syclGpuEuCount() / syclGpuEUCountPerSubslice() / 2) { + if (use_two_stage_col_reduction && M > 20 * 1024 && + N / 32 < syclGpuEuCount() / syclGpuEUCountPerSubslice()) { const size_t local_size_x = 8; const size_t SIMD = 32; // workgroup size is 256 From d07225223b5573b367a1188826bfccbe5718d438 Mon Sep 17 00:00:00 2001 From: "Deng, Weishi" Date: Tue, 4 Nov 2025 14:39:54 +0800 Subject: [PATCH 2/4] update config logic --- src/ATen/native/xpu/sycl/LayerNormKernels.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp index 0aff3e603c..f86825b3d7 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp @@ -1063,8 +1063,9 @@ void _layer_norm_backward_kernel( norm_config_global_size / syclMaxSubGroupSize() * 2 <= thread_slots; // cuda uses condition M > 64 * 1024 && N / 32 < sm_count / 2 to parallelize // in the M dimension - if (use_two_stage_col_reduction && M > 20 * 1024 && - N / 32 < syclGpuEuCount() / syclGpuEUCountPerSubslice()) { + int subslice_count = syclGpuEuCount() / syclGpuEUCountPerSubslice(); + if (use_two_stage_col_reduction && M > subslice_count * 1024 && + N / 32 < subslice_count) { const size_t local_size_x = 8; const size_t SIMD = 32; // workgroup size is 256 From 98ccf83b6b656a504f166fcaa0775b708854076e Mon Sep 17 00:00:00 2001 From: "Weishi.Deng" Date: Wed, 5 Nov 2025 16:30:21 +0800 Subject: [PATCH 3/4] Update src/ATen/native/xpu/sycl/LayerNormKernels.cpp Co-authored-by: Eikan Wang --- src/ATen/native/xpu/sycl/LayerNormKernels.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp index f86825b3d7..6745a52493 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp @@ -1063,7 +1063,7 @@ void _layer_norm_backward_kernel( norm_config_global_size / syclMaxSubGroupSize() * 2 <= thread_slots; // cuda uses condition M > 64 * 1024 && N / 32 < sm_count / 2 to parallelize // in the M dimension - int subslice_count = syclGpuEuCount() / syclGpuEUCountPerSubslice(); + int xe_core_count = syclGpuEuCount() / syclGpuEUCountPerSubslice(); if (use_two_stage_col_reduction && M > subslice_count * 1024 && N / 32 < subslice_count) { const size_t local_size_x = 8; From 7f6f9a8a7c832eea350fa64a9f254d44fb260c61 Mon Sep 17 00:00:00 2001 From: "Deng, Weishi" Date: Wed, 5 Nov 2025 16:42:31 +0800 Subject: [PATCH 4/4] update condition for 2 pass --- src/ATen/native/xpu/sycl/LayerNormKernels.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp index 6745a52493..c681880058 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp @@ -1064,8 +1064,9 @@ void _layer_norm_backward_kernel( // cuda uses condition M > 64 * 1024 && N / 32 < sm_count / 2 to parallelize // in the M dimension int xe_core_count = syclGpuEuCount() / syclGpuEUCountPerSubslice(); - if (use_two_stage_col_reduction && M > subslice_count * 1024 && - N / 32 < subslice_count) { + int tile_n = N / 32; + if (use_two_stage_col_reduction && M > xe_core_count * 1024 && + tile_n < xe_core_count * 2) { const size_t local_size_x = 8; const size_t SIMD = 32; // workgroup size is 256