@@ -989,7 +989,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
989
989
990
990
static_assert (16 %K_QUANTS_PER_ITERATION == 0 , " 16 must be divisible by K_QUANTS_PER_ITERATION" );
991
991
992
- const int row = blockIdx .y *blockDim .y + threadIdx .y ;
992
+ const int row = blockIdx .x *blockDim .y + threadIdx .y ;
993
993
if (row > nrows) return ;
994
994
995
995
const int num_blocks_per_row = ncols / QK_K;
@@ -1093,7 +1093,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
1093
1093
1094
1094
static __global__ void dequantize_mul_mat_vec_q3_k (const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
1095
1095
1096
- const int row = blockIdx .y *blockDim .y + threadIdx .y ;
1096
+ const int row = blockIdx .x *blockDim .y + threadIdx .y ;
1097
1097
if (row > nrows) return ;
1098
1098
1099
1099
const int num_blocks_per_row = ncols / QK_K;
@@ -1197,7 +1197,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
1197
1197
1198
1198
static __global__ void dequantize_mul_mat_vec_q4_k (const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
1199
1199
1200
- const int row = blockIdx .y *blockDim .y + threadIdx .y ;
1200
+ const int row = blockIdx .x *blockDim .y + threadIdx .y ;
1201
1201
if (row > nrows) return ;
1202
1202
const int num_blocks_per_row = ncols / QK_K;
1203
1203
const int ib0 = row*num_blocks_per_row;
@@ -1451,7 +1451,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
1451
1451
1452
1452
static_assert (16 %K_QUANTS_PER_ITERATION == 0 , " 16 must be divisible by K_QUANTS_PER_ITERATION" );
1453
1453
1454
- const int row = blockIdx .y *blockDim .y + threadIdx .y ;
1454
+ const int row = blockIdx .x *blockDim .y + threadIdx .y ;
1455
1455
if (row > nrows) return ;
1456
1456
1457
1457
const int num_blocks_per_row = ncols / QK_K;
@@ -4261,7 +4261,7 @@ template <bool need_check> static __global__ void
4261
4261
4262
4262
template <int qk, int qi, typename block_q_t , int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
4263
4263
static __global__ void mul_mat_vec_q (const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
4264
- const int row = blockIdx .y *blockDim .y + threadIdx .y ;
4264
+ const int row = blockIdx .x *blockDim .y + threadIdx .y ;
4265
4265
4266
4266
if (row >= nrows) {
4267
4267
return ;
@@ -4301,7 +4301,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
4301
4301
static __global__ void dequantize_mul_mat_vec (const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
4302
4302
// qk = quantized weights per x block
4303
4303
// qr = number of quantized weights per data value in x block
4304
- const int row = blockIdx .y *blockDim .y + threadIdx .y ;
4304
+ const int row = blockIdx .x *blockDim .y + threadIdx .y ;
4305
4305
4306
4306
if (row >= nrows) {
4307
4307
return ;
@@ -4874,7 +4874,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
4874
4874
static void dequantize_mul_mat_vec_q4_0_cuda (const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4875
4875
GGML_ASSERT (ncols % GGML_CUDA_DMMV_X == 0 );
4876
4876
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1 ) / GGML_CUDA_MMV_Y;
4877
- const dim3 block_nums (1 , block_num_y, 1 );
4877
+ // the number of rows may exceed the maximum grid size in the y or z dimensions, so use the x dimension instead
4878
+ const dim3 block_nums (block_num_y, 1 , 1 );
4878
4879
const dim3 block_dims (WARP_SIZE, GGML_CUDA_MMV_Y, 1 );
4879
4880
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
4880
4881
<<<block_nums, block_dims, 0 , stream>>> (vx, y, dst, ncols, nrows);
@@ -4883,7 +4884,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
4883
4884
static void dequantize_mul_mat_vec_q4_1_cuda (const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4884
4885
GGML_ASSERT (ncols % GGML_CUDA_DMMV_X == 0 );
4885
4886
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1 ) / GGML_CUDA_MMV_Y;
4886
- const dim3 block_nums (1 , block_num_y , 1 );
4887
+ const dim3 block_nums (block_num_y, 1 , 1 );
4887
4888
const dim3 block_dims (WARP_SIZE, GGML_CUDA_MMV_Y, 1 );
4888
4889
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
4889
4890
<<<block_nums, block_dims, 0 , stream>>> (vx, y, dst, ncols, nrows);
@@ -4892,7 +4893,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
4892
4893
static void dequantize_mul_mat_vec_q5_0_cuda (const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4893
4894
GGML_ASSERT (ncols % GGML_CUDA_DMMV_X == 0 );
4894
4895
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1 ) / GGML_CUDA_MMV_Y;
4895
- const dim3 block_nums (1 , block_num_y , 1 );
4896
+ const dim3 block_nums (block_num_y, 1 , 1 );
4896
4897
const dim3 block_dims (WARP_SIZE, GGML_CUDA_MMV_Y, 1 );
4897
4898
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
4898
4899
<<<block_nums, block_dims, 0 , stream>>> (vx, y, dst, ncols, nrows);
@@ -4901,7 +4902,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
4901
4902
static void dequantize_mul_mat_vec_q5_1_cuda (const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4902
4903
GGML_ASSERT (ncols % GGML_CUDA_DMMV_X == 0 );
4903
4904
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1 ) / GGML_CUDA_MMV_Y;
4904
- const dim3 block_nums (1 , block_num_y , 1 );
4905
+ const dim3 block_nums (block_num_y, 1 , 1 );
4905
4906
const dim3 block_dims (WARP_SIZE, GGML_CUDA_MMV_Y, 1 );
4906
4907
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
4907
4908
<<<block_nums, block_dims, 0 , stream>>> (vx, y, dst, ncols, nrows);
@@ -4910,7 +4911,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
4910
4911
static void dequantize_mul_mat_vec_q8_0_cuda (const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4911
4912
GGML_ASSERT (ncols % GGML_CUDA_DMMV_X == 0 );
4912
4913
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1 ) / GGML_CUDA_MMV_Y;
4913
- const dim3 block_nums (1 , block_num_y , 1 );
4914
+ const dim3 block_nums (block_num_y, 1 , 1 );
4914
4915
const dim3 block_dims (WARP_SIZE, GGML_CUDA_MMV_Y, 1 );
4915
4916
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
4916
4917
<<<block_nums, block_dims, 0 , stream>>> (vx, y, dst, ncols, nrows);
@@ -4920,7 +4921,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
4920
4921
GGML_ASSERT (ncols % QK_K == 0 );
4921
4922
const int ny = 2 ; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
4922
4923
const int block_num_y = (nrows + ny - 1 ) / ny;
4923
- const dim3 block_nums (1 , block_num_y , 1 );
4924
+ const dim3 block_nums (block_num_y, 1 , 1 );
4924
4925
const dim3 block_dims (32 , ny, 1 );
4925
4926
dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0 , stream>>> (vx, y, dst, ncols, nrows);
4926
4927
}
@@ -4929,7 +4930,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
4929
4930
GGML_ASSERT (ncols % QK_K == 0 );
4930
4931
const int ny = 2 / K_QUANTS_PER_ITERATION;
4931
4932
const int block_num_y = (nrows + ny - 1 ) / ny;
4932
- const dim3 block_nums (1 , block_num_y , 1 );
4933
+ const dim3 block_nums (block_num_y, 1 , 1 );
4933
4934
const dim3 block_dims (32 , ny, 1 );
4934
4935
dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0 , stream>>> (vx, y, dst, ncols, nrows);
4935
4936
}
@@ -4938,7 +4939,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
4938
4939
GGML_ASSERT (ncols % QK_K == 0 );
4939
4940
const int ny = 2 / K_QUANTS_PER_ITERATION;
4940
4941
const int block_num_y = (nrows + ny - 1 ) / ny;
4941
- const dim3 block_nums (1 , block_num_y , 1 );
4942
+ const dim3 block_nums (block_num_y, 1 , 1 );
4942
4943
const dim3 block_dims (32 , ny, 1 );
4943
4944
dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0 , stream>>> (vx, y, dst, ncols, nrows);
4944
4945
}
@@ -4953,15 +4954,15 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
4953
4954
GGML_ASSERT (ncols % QK_K == 0 );
4954
4955
const int ny = 2 / K_QUANTS_PER_ITERATION;
4955
4956
const int block_num_y = (nrows + ny - 1 ) / ny;
4956
- const dim3 block_nums (1 , block_num_y , 1 );
4957
+ const dim3 block_nums (block_num_y, 1 , 1 );
4957
4958
const dim3 block_dims (32 , ny, 1 );
4958
4959
dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0 , stream>>> (vx, y, dst, ncols, nrows);
4959
4960
}
4960
4961
4961
4962
static void mul_mat_vec_q4_0_q8_1_cuda (const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4962
4963
GGML_ASSERT (ncols % QK4_0 == 0 );
4963
4964
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1 ) / GGML_CUDA_MMV_Y;
4964
- const dim3 block_nums (1 , block_num_y , 1 );
4965
+ const dim3 block_nums (block_num_y, 1 , 1 );
4965
4966
const dim3 block_dims (WARP_SIZE, GGML_CUDA_MMV_Y, 1 );
4966
4967
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
4967
4968
<<<block_nums, block_dims, 0 , stream>>> (vx, vy, dst, ncols, nrows);
@@ -4970,7 +4971,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
4970
4971
static void mul_mat_vec_q4_1_q8_1_cuda (const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4971
4972
GGML_ASSERT (ncols % QK4_1 == 0 );
4972
4973
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1 ) / GGML_CUDA_MMV_Y;
4973
- const dim3 block_nums (1 , block_num_y , 1 );
4974
+ const dim3 block_nums (block_num_y, 1 , 1 );
4974
4975
const dim3 block_dims (WARP_SIZE, GGML_CUDA_MMV_Y, 1 );
4975
4976
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
4976
4977
<<<block_nums, block_dims, 0 , stream>>> (vx, vy, dst, ncols, nrows);
@@ -4979,7 +4980,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
4979
4980
static void mul_mat_vec_q5_0_q8_1_cuda (const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4980
4981
GGML_ASSERT (ncols % QK5_0 == 0 );
4981
4982
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1 ) / GGML_CUDA_MMV_Y;
4982
- const dim3 block_nums (1 , block_num_y , 1 );
4983
+ const dim3 block_nums (block_num_y, 1 , 1 );
4983
4984
const dim3 block_dims (WARP_SIZE, GGML_CUDA_MMV_Y, 1 );
4984
4985
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
4985
4986
<<<block_nums, block_dims, 0 , stream>>> (vx, vy, dst, ncols, nrows);
@@ -4988,7 +4989,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
4988
4989
static void mul_mat_vec_q5_1_q8_1_cuda (const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4989
4990
GGML_ASSERT (ncols % QK5_1 == 0 );
4990
4991
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1 ) / GGML_CUDA_MMV_Y;
4991
- const dim3 block_nums (1 , block_num_y , 1 );
4992
+ const dim3 block_nums (block_num_y, 1 , 1 );
4992
4993
const dim3 block_dims (WARP_SIZE, GGML_CUDA_MMV_Y, 1 );
4993
4994
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
4994
4995
<<<block_nums, block_dims, 0 , stream>>> (vx, vy, dst, ncols, nrows);
@@ -4997,7 +4998,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
4997
4998
static void mul_mat_vec_q8_0_q8_1_cuda (const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4998
4999
GGML_ASSERT (ncols % QK8_0 == 0 );
4999
5000
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1 ) / GGML_CUDA_MMV_Y;
5000
- const dim3 block_nums (1 , block_num_y , 1 );
5001
+ const dim3 block_nums (block_num_y, 1 , 1 );
5001
5002
const dim3 block_dims (WARP_SIZE, GGML_CUDA_MMV_Y, 1 );
5002
5003
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
5003
5004
<<<block_nums, block_dims, 0 , stream>>> (vx, vy, dst, ncols, nrows);
@@ -5006,7 +5007,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
5006
5007
static void mul_mat_vec_q2_K_q8_1_cuda (const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
5007
5008
GGML_ASSERT (ncols % QK_K == 0 );
5008
5009
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1 ) / GGML_CUDA_MMV_Y;
5009
- const dim3 block_nums (1 , block_num_y , 1 );
5010
+ const dim3 block_nums (block_num_y, 1 , 1 );
5010
5011
const dim3 block_dims (WARP_SIZE, GGML_CUDA_MMV_Y, 1 );
5011
5012
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
5012
5013
<<<block_nums, block_dims, 0 , stream>>> (vx, vy, dst, ncols, nrows);
@@ -5015,7 +5016,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
5015
5016
static void mul_mat_vec_q3_K_q8_1_cuda (const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
5016
5017
GGML_ASSERT (ncols % QK_K == 0 );
5017
5018
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1 ) / GGML_CUDA_MMV_Y;
5018
- const dim3 block_nums (1 , block_num_y , 1 );
5019
+ const dim3 block_nums (block_num_y, 1 , 1 );
5019
5020
const dim3 block_dims (WARP_SIZE, GGML_CUDA_MMV_Y, 1 );
5020
5021
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
5021
5022
<<<block_nums, block_dims, 0 , stream>>> (vx, vy, dst, ncols, nrows);
@@ -5024,7 +5025,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
5024
5025
static void mul_mat_vec_q4_K_q8_1_cuda (const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
5025
5026
GGML_ASSERT (ncols % QK_K == 0 );
5026
5027
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1 ) / GGML_CUDA_MMV_Y;
5027
- const dim3 block_nums (1 , block_num_y , 1 );
5028
+ const dim3 block_nums (block_num_y, 1 , 1 );
5028
5029
const dim3 block_dims (WARP_SIZE, GGML_CUDA_MMV_Y, 1 );
5029
5030
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
5030
5031
<<<block_nums, block_dims, 0 , stream>>> (vx, vy, dst, ncols, nrows);
@@ -5033,7 +5034,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
5033
5034
static void mul_mat_vec_q5_K_q8_1_cuda (const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
5034
5035
GGML_ASSERT (ncols % QK_K == 0 );
5035
5036
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1 ) / GGML_CUDA_MMV_Y;
5036
- const dim3 block_nums (1 , block_num_y , 1 );
5037
+ const dim3 block_nums (block_num_y, 1 , 1 );
5037
5038
const dim3 block_dims (WARP_SIZE, GGML_CUDA_MMV_Y, 1 );
5038
5039
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
5039
5040
<<<block_nums, block_dims, 0 , stream>>> (vx, vy, dst, ncols, nrows);
@@ -5042,7 +5043,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
5042
5043
static void mul_mat_vec_q6_K_q8_1_cuda (const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
5043
5044
GGML_ASSERT (ncols % QK_K == 0 );
5044
5045
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1 ) / GGML_CUDA_MMV_Y;
5045
- const dim3 block_nums (1 , block_num_y , 1 );
5046
+ const dim3 block_nums (block_num_y, 1 , 1 );
5046
5047
const dim3 block_dims (WARP_SIZE, GGML_CUDA_MMV_Y, 1 );
5047
5048
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
5048
5049
<<<block_nums, block_dims, 0 , stream>>> (vx, vy, dst, ncols, nrows);
@@ -5061,7 +5062,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
5061
5062
static void convert_mul_mat_vec_f16_cuda (const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
5062
5063
GGML_ASSERT (ncols % GGML_CUDA_DMMV_X == 0 );
5063
5064
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1 ) / GGML_CUDA_MMV_Y;
5064
- const dim3 block_nums (1 , block_num_y , 1 );
5065
+ const dim3 block_nums (block_num_y, 1 , 1 );
5065
5066
const dim3 block_dims (WARP_SIZE, GGML_CUDA_MMV_Y, 1 );
5066
5067
dequantize_mul_mat_vec<1 , 1 , convert_f16>
5067
5068
<<<block_nums, block_dims, 0 , stream>>> (vx, y, dst, ncols, nrows);
0 commit comments