@@ -1232,19 +1232,23 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
+    const int ix = blockDim.x*blockIdx.x + threadIdx.x;
 
-    if (i >= k) {
+    if (ix >= kx_padded) {
         return;
     }
 
+    const int iy = blockDim.y*blockIdx.y + threadIdx.y;
+
+    const int i_padded = iy*kx_padded + ix;
+
     block_q8_1 * y = (block_q8_1 *) vy;
 
-    const int ib = i / QK8_1; // block index
-    const int iqs = i % QK8_1; // quant index
+    const int ib = i_padded / QK8_1; // block index
+    const int iqs = i_padded % QK8_1; // quant index
 
-    const float xi = i < ndata ? x[i] : 0.0f;
+    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
     float amax = fabsf(xi);
     float sum = xi;
 
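The kernel is now indexed in two dimensions: ix walks along the (padded) row and iy selects the row, so a single launch quantizes all of src1 at once, and the tail between kx and kx_padded is quantized as zeros. Below is a minimal host-side sketch of that index math, assuming QK8_1 == 32 as in ggml; the concrete kx/kx_padded values and the standalone main() are only illustrative.

#include <cstdio>

// Host-side mirror of the index math in quantize_q8_1 (sketch only).
static const int QK8_1 = 32; // block size of q8_1, as in ggml

int main() {
    const int kx        = 70; // hypothetical row length (ne10)
    const int kx_padded = 96; // hypothetical padded row length

    for (int iy = 0; iy < 2; ++iy) {                 // two rows of src1
        for (int ix = 0; ix < kx_padded; ix += 35) { // a few sample positions
            const int i_padded = iy*kx_padded + ix;  // flat index in the padded layout
            const int ib  = i_padded / QK8_1;        // destination block index
            const int iqs = i_padded % QK8_1;        // quant index inside the block
            const float xi = ix < kx ? 1.0f /* x[iy*kx + ix] */ : 0.0f; // padding quantizes zeros
            printf("iy=%d ix=%2d -> ib=%d iqs=%2d xi=%.1f\n", iy, ix, ib, iqs, xi);
        }
    }
    return 0;
}

Because kx_padded is a multiple of QK8_1, every row starts on a block boundary in the q8_1 output.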
@@ -1779,12 +1783,14 @@ static __global__ void mul_mat_q(
         const int iqsy = sizeof(int) * (tid_x % QI8_1);
 
         for (int i = 0; i < WARP_SIZE; i += 8) {
-            const block_q8_1 * __restrict__ by0 = &y[(col_y_0 + tid_y + i)*blocks_per_row + ib0 + iby0];
+            const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
+
+            const block_q8_1 * __restrict__ by0 = &y[col_y_eff*blocks_per_row + ib0 + iby0];
 
             tile_y_qs[(tid_y + i) * (2*WARP_SIZE) + tid_x] = *((int *) &by0->qs[iqsy]);
             tile_y_ds[(tid_y + i) * (2*WARP_SIZE/QI8_1) + iby0] = by0->ds;
 
-            const block_q8_1 * __restrict__ by1 = &y[(col_y_0 + tid_y + i)*blocks_per_row + ib0 + iby1];
+            const block_q8_1 * __restrict__ by1 = &y[col_y_eff*blocks_per_row + ib0 + iby1];
 
             tile_y_qs[(tid_y + i) * (2*WARP_SIZE) + tid_x + WARP_SIZE] = *((int *) &by1->qs[iqsy]);
             tile_y_ds[(tid_y + i) * (2*WARP_SIZE/QI8_1) + iby1] = by1->ds;
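The clamp above handles the case where the tile of y columns extends past ncols_y: min() pins those threads to the last valid column, so they load a duplicate of real data instead of reading out of bounds. A tiny standalone sketch of the effect, with made-up numbers:

#include <algorithm>
#include <cstdio>

int main() {
    const int ncols_y = 3; // hypothetical: y has only 3 columns
    const int col_y_0 = 0;
    const int tid_y   = 0;

    // The tile covers 8 column slots; clamping keeps every index in range,
    // re-reading the last real column for the out-of-range slots.
    for (int i = 0; i < 8; ++i) {
        const int col_y_eff = std::min(col_y_0 + tid_y + i, ncols_y - 1);
        printf("slot %d -> col_y_eff = %d\n", i, col_y_eff);
    }
    return 0;
}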
@@ -2215,9 +2221,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
-static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
+    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    const dim3 num_blocks(block_num_x, ky, 1);
+    const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
+    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
 }
 
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -2962,6 +2970,7 @@ inline void ggml_cuda_op_mul_mat_q(
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    GGML_ASSERT(ne10 % QK8_1 == 0);
 
     const int64_t ne0 = dst->ne[0];
 
@@ -2974,11 +2983,11 @@ inline void ggml_cuda_op_mul_mat_q(
     // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
     const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
 
-    int64_t padded_row_size = ne10*ne11 + MATRIX_ROW_PADDING - 1;
+    int64_t padded_row_size = ne10 + MATRIX_ROW_PADDING - 1;
     padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
     size_t as;
-    void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
-    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10*ne11, padded_row_size, cudaStream_main);
+    void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
+    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
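With the per-row quantization above, padded_row_size is now the length of a single padded src1 row rather than the whole flattened tensor, and the q8_1 scratch buffer holds ne11 such rows. A small worked example of that sizing, assuming MATRIX_ROW_PADDING == 512 and QK8_1 == 32 as in ggml-cuda.cu; the ne10/ne11 values are made up.

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t MATRIX_ROW_PADDING = 512; // mirrors ggml-cuda.cu
    const int64_t QK8_1              = 32;  // mirrors ggml
    const int64_t ne10 = 4097;              // hypothetical src1 row length (not a padding multiple)
    const int64_t ne11 = 8;                 // hypothetical number of src1 columns (batch size)

    // Round a single row up to a multiple of MATRIX_ROW_PADDING.
    int64_t padded_row_size = ne10 + MATRIX_ROW_PADDING - 1;
    padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;   // -> 4608

    // The scratch buffer now holds ne11 padded rows of q8_1 blocks.
    const int64_t blocks_per_row = padded_row_size / QK8_1;    // -> 144
    printf("padded_row_size=%lld, total blocks=%lld\n",
           (long long) padded_row_size, (long long) (blocks_per_row*ne11));
    return 0;
}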
@@ -3042,7 +3051,7 @@ inline void ggml_cuda_op_mul_mat_vec(
         padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
         size_t as;
         void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
-        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
 
         switch (src0->type) {
             case GGML_TYPE_Q4_0: