Revert "[ATen] Fix CUDA reduction warp shuffle order (pytorch#164790)"

pytorchmergebot · Chao1Han · commit 75204a75ddba · 2025-10-21T15:38:02.000+08:00
This reverts commit 8e1f409. Reverted pytorch#164790 on behalf of https://github.com/jeffdaily because it broke the CUDA and ROCm CI (see the revert comment on pytorch#164790).
diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh
@@ -655,17 +655,12 @@ struct ReduceOp {
     }
 
     __syncthreads();
-    // Warp-level reduction for remaining threads
-    // For non-power-of-2 sizes, we start from the next power-of-2 divided by 2
-    // and use a boundary check to avoid out-of-bounds access
-    for (size_t offset = warpSize / 2; offset > 0; offset >>= 1) {
+
+    for (int offset = 1; offset < dim_x; offset <<= 1) {
       #pragma unroll
       for (int i = 0; i < output_vec_size; i++) {
         arg_t other = ops.warp_shfl_down(value[i], offset);
-        // Only combine if the source thread (threadIdx.x + offset) is within bounds
-        if (threadIdx.x + offset < dim_x) {
-          value[i] = ops.combine(value[i], other);
-        }
+        value[i] = ops.combine(value[i], other);
       }
     }
     return value;
diff --git a/aten/src/ATen/native/cuda/reduction_template.cuh b/aten/src/ATen/native/cuda/reduction_template.cuh
@@ -466,13 +466,11 @@ struct ReduceJitOp {
 
     __syncthreads();
 
-    for (size_t offset = warpSize / 2; offset > 0; offset >>= 1) {
+    for (int offset = 1; offset < dim_x; offset <<= 1) {
       #pragma unroll
       for (int i = 0; i < output_vec_size; i++) {
         arg_t other = reducer::warp_shfl_down(value[i], offset);
-        if (threadIdx.x + offset < dim_x) {
-          value[i] = reducer::combine(value[i], other);
-        }
+        value[i] = reducer::combine(value[i], other);
       }
     }
     return value;

Original file line number	Diff line number	Diff line change
`@@ -466,13 +466,11 @@ struct ReduceJitOp {`
`466`	`466`
`467`	`467`	`__syncthreads();`
`468`	`468`
`469`		`- for (size_t offset = warpSize / 2; offset > 0; offset >>= 1) {`
	`469`	`+ for (int offset = 1; offset < dim_x; offset <<= 1) {`
`470`	`470`	`#pragma unroll`
`471`	`471`	`for (int i = 0; i < output_vec_size; i++) {`
`472`	`472`	`arg_t other = reducer::warp_shfl_down(value[i], offset);`
`473`		`- if (threadIdx.x + offset < dim_x) {`
`474`		`- value[i] = reducer::combine(value[i], other);`
`475`		`- }`
	`473`	`+ value[i] = reducer::combine(value[i], other);`
`476`	`474`	`}`
`477`	`475`	`}`
`478`	`476`	`return value;`