Skip to content

Commit 18a3883

Browse files
committed
Revert "CUDA: fix race condition in MMQ ids_dst (ggml-org#13294)"
This reverts commit 8afbd96.
1 parent 3eee3f0 commit 18a3883

File tree

1 file changed

+0
-7
lines changed

1 file changed

+0
-7
lines changed

ggml/src/ggml-cuda/mmq.cuh

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2637,7 +2637,6 @@ static __global__ void mul_mat_q(
26372637

26382638
ids_dst_shared[j] = j;
26392639
}
2640-
__syncthreads();
26412640

26422641
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
26432642
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
@@ -2666,7 +2665,6 @@ static __global__ void mul_mat_q(
26662665
return;
26672666
}
26682667

2669-
// __syncthreads(); // There is no previous tile that could cause a race condition.
26702668
#pragma unroll
26712669
for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
26722670
const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
@@ -2677,7 +2675,6 @@ static __global__ void mul_mat_q(
26772675

26782676
ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
26792677
}
2680-
__syncthreads();
26812678
}
26822679

26832680
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
@@ -2744,7 +2741,6 @@ static __global__ void mul_mat_q(
27442741
continue;
27452742
}
27462743

2747-
__syncthreads();
27482744
#pragma unroll
27492745
for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
27502746
const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
@@ -2755,7 +2751,6 @@ static __global__ void mul_mat_q(
27552751

27562752
ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
27572753
}
2758-
__syncthreads();
27592754
}
27602755

27612756
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
@@ -2811,7 +2806,6 @@ static __global__ void mul_mat_q(
28112806
}
28122807

28132808
// The memory layout for the fixup buffer is always contiguous, therefore reset ids:
2814-
__syncthreads();
28152809
#pragma unroll
28162810
for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
28172811
const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
@@ -2822,7 +2816,6 @@ static __global__ void mul_mat_q(
28222816

28232817
ids_dst_shared[j] = j;
28242818
}
2825-
__syncthreads();
28262819
}
28272820

28282821
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));

0 commit comments

Comments
 (0)