Revert "CUDA: fix race condition in MMQ ids_dst (ggml-org#13294)"

Nexesenex · Nexesenex · commit 18a38839b931 · 2025-05-09T01:33:33.000+02:00
This reverts commit 8afbd96.
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
@@ -2637,7 +2637,6 @@ static __global__ void mul_mat_q(
 
         ids_dst_shared[j] = j;
     }
-    __syncthreads();
 
     // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
 #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
@@ -2666,7 +2665,6 @@ static __global__ void mul_mat_q(
                 return;
             }
 
-            // __syncthreads(); // There is no previous tile that could cause a race condition.
 #pragma unroll
             for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
                 const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
@@ -2677,7 +2675,6 @@ static __global__ void mul_mat_q(
 
                 ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
             }
-            __syncthreads();
         }
 
         offset_y   += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
@@ -2744,7 +2741,6 @@ static __global__ void mul_mat_q(
                 continue;
             }
 
-            __syncthreads();
 #pragma unroll
             for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
                 const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
@@ -2755,7 +2751,6 @@ static __global__ void mul_mat_q(
 
                 ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
             }
-            __syncthreads();
         }
 
         offset_y   += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
@@ -2811,7 +2806,6 @@ static __global__ void mul_mat_q(
         }
 
         // The memory layout for the fixup buffer is always contiguous, therefore reset ids:
-        __syncthreads();
 #pragma unroll
         for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
             const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
@@ -2822,7 +2816,6 @@ static __global__ void mul_mat_q(
 
             ids_dst_shared[j] = j;
         }
-        __syncthreads();
     }
 
     offset_y   += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));

Original file line number	Diff line number	Diff line change
`@@ -2637,7 +2637,6 @@ static __global__ void mul_mat_q(`
`2637`	`2637`
`2638`	`2638`	`ids_dst_shared[j] = j;`
`2639`	`2639`	`}`
`2640`		`- __syncthreads();`
`2641`	`2640`
`2642`	`2641`	`// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:`
`2643`	`2642`	`#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) \|\| __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA`
`@@ -2666,7 +2665,6 @@ static __global__ void mul_mat_q(`
`2666`	`2665`	`return;`
`2667`	`2666`	`}`
`2668`	`2667`
`2669`		`- // __syncthreads(); // There is no previous tile that could cause a race condition.`
`2670`	`2668`	`#pragma unroll`
`2671`	`2669`	`for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {`
`2672`	`2670`	`const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;`
`@@ -2677,7 +2675,6 @@ static __global__ void mul_mat_q(`
`2677`	`2675`
`2678`	`2676`	`ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];`
`2679`	`2677`	`}`
`2680`		`- __syncthreads();`
`2681`	`2678`	`}`
`2682`	`2679`
`2683`	`2680`	`offset_y += (col_low + jtmmq_x)(sizeof(block_q8_1_mmq)/sizeof(int));`
`@@ -2744,7 +2741,6 @@ static __global__ void mul_mat_q(`
`2744`	`2741`	`continue;`
`2745`	`2742`	`}`
`2746`	`2743`
`2747`		`- __syncthreads();`
`2748`	`2744`	`#pragma unroll`
`2749`	`2745`	`for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {`
`2750`	`2746`	`const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;`
`@@ -2755,7 +2751,6 @@ static __global__ void mul_mat_q(`
`2755`	`2751`
`2756`	`2752`	`ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];`
`2757`	`2753`	`}`
`2758`		`- __syncthreads();`
`2759`	`2754`	`}`
`2760`	`2755`
`2761`	`2756`	`offset_y += (col_low + jtmmq_x)(sizeof(block_q8_1_mmq)/sizeof(int));`
`@@ -2811,7 +2806,6 @@ static __global__ void mul_mat_q(`
`2811`	`2806`	`}`
`2812`	`2807`
`2813`	`2808`	`// The memory layout for the fixup buffer is always contiguous, therefore reset ids:`
`2814`		`- __syncthreads();`
`2815`	`2809`	`#pragma unroll`
`2816`	`2810`	`for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {`
`2817`	`2811`	`const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;`
`@@ -2822,7 +2816,6 @@ static __global__ void mul_mat_q(`
`2822`	`2816`
`2823`	`2817`	`ids_dst_shared[j] = j;`
`2824`	`2818`	`}`
`2825`		`- __syncthreads();`
`2826`	`2819`	`}`
`2827`	`2820`
`2828`	`2821`	`offset_y += (col_low + jtmmq_x)(sizeof(block_q8_1_mmq)/sizeof(int));`