@@ -2637,7 +2637,6 @@ static __global__ void mul_mat_q(
2637
2637
2638
2638
ids_dst_shared[j] = j;
2639
2639
}
2640
- __syncthreads ();
2641
2640
2642
2641
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
2643
2642
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
@@ -2666,7 +2665,6 @@ static __global__ void mul_mat_q(
2666
2665
return ;
2667
2666
}
2668
2667
2669
- // __syncthreads(); // There is no previous tile that could cause a race condition.
2670
2668
#pragma unroll
2671
2669
for (int j0 = 0 ; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
2672
2670
const int j = j0 + threadIdx .y *WARP_SIZE + threadIdx .x ;
@@ -2677,7 +2675,6 @@ static __global__ void mul_mat_q(
2677
2675
2678
2676
ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
2679
2677
}
2680
- __syncthreads ();
2681
2678
}
2682
2679
2683
2680
offset_y += (col_low + jt*mmq_x)*(sizeof (block_q8_1_mmq)/sizeof (int ));
@@ -2744,7 +2741,6 @@ static __global__ void mul_mat_q(
2744
2741
continue ;
2745
2742
}
2746
2743
2747
- __syncthreads ();
2748
2744
#pragma unroll
2749
2745
for (int j0 = 0 ; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
2750
2746
const int j = j0 + threadIdx .y *WARP_SIZE + threadIdx .x ;
@@ -2755,7 +2751,6 @@ static __global__ void mul_mat_q(
2755
2751
2756
2752
ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
2757
2753
}
2758
- __syncthreads ();
2759
2754
}
2760
2755
2761
2756
offset_y += (col_low + jt*mmq_x)*(sizeof (block_q8_1_mmq)/sizeof (int ));
@@ -2811,7 +2806,6 @@ static __global__ void mul_mat_q(
2811
2806
}
2812
2807
2813
2808
// The memory layout for the fixup buffer is always contiguous, therefore reset ids:
2814
- __syncthreads ();
2815
2809
#pragma unroll
2816
2810
for (int j0 = 0 ; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
2817
2811
const int j = j0 + threadIdx .y *WARP_SIZE + threadIdx .x ;
@@ -2822,7 +2816,6 @@ static __global__ void mul_mat_q(
2822
2816
2823
2817
ids_dst_shared[j] = j;
2824
2818
}
2825
- __syncthreads ();
2826
2819
}
2827
2820
2828
2821
offset_y += (col_low + jt*mmq_x)*(sizeof (block_q8_1_mmq)/sizeof (int ));
0 commit comments