Commit a3b096b (parent bca8a68)

Fixed out-of-bounds memory accesses

1 file changed: ggml-cuda.cu (24 additions, 15 deletions)
@@ -1232,19 +1232,23 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
+    const int ix = blockDim.x*blockIdx.x + threadIdx.x;
 
-    if (i >= k) {
+    if (ix >= kx_padded) {
         return;
     }
 
+    const int iy = blockDim.y*blockIdx.y + threadIdx.y;
+
+    const int i_padded = iy*kx_padded + ix;
+
     block_q8_1 * y = (block_q8_1 *) vy;
 
-    const int ib = i / QK8_1; // block index
-    const int iqs = i % QK8_1; // quant index
+    const int ib = i_padded / QK8_1; // block index
+    const int iqs = i_padded % QK8_1; // quant index
 
-    const float xi = i < ndata ? x[i] : 0.0f;
+    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
     float amax = fabsf(xi);
     float sum = xi;
 
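
With the padded 2D layout, ix walks one row of src1 and iy selects the row; positions in [kx, kx_padded) are quantized as zeros instead of being read past the end of the row, which is the out-of-bounds access this hunk removes. Below is a minimal host-side sketch of the same index arithmetic, not part of the commit: the sizes are invented, and for brevity it pads to the next multiple of QK8_1 where the real caller pads to MATRIX_ROW_PADDING.

// Host-side sketch of quantize_q8_1's new index mapping (illustrative only).
#include <cstdio>

static const int QK8_1 = 32; // quants per block_q8_1, as in ggml

int main() {
    const int kx = 40, ky = 2;                              // made-up row length and row count
    const int kx_padded = (kx + QK8_1 - 1) / QK8_1 * QK8_1; // 64

    for (int iy = 0; iy < ky; ++iy) {
        for (int ix = 0; ix < kx_padded; ++ix) {
            const int i_padded = iy*kx_padded + ix; // flat index into the padded output
            const int ib  = i_padded / QK8_1;       // block index
            const int iqs = i_padded % QK8_1;       // quant index within the block
            if (ix < kx) {
                printf("block %2d quant %2d <- x[%d]\n", ib, iqs, iy*kx + ix);
            } else {
                printf("block %2d quant %2d <- 0.0f (padding)\n", ib, iqs);
            }
        }
    }
    return 0;
}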

@@ -1779,12 +1783,14 @@ static __global__ void mul_mat_q(
     const int iqsy = sizeof(int) * (tid_x % QI8_1);
 
     for (int i = 0; i < WARP_SIZE; i += 8) {
-        const block_q8_1 * __restrict__ by0 = &y[(col_y_0 + tid_y + i)*blocks_per_row + ib0 + iby0];
+        const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
+
+        const block_q8_1 * __restrict__ by0 = &y[col_y_eff*blocks_per_row + ib0 + iby0];
 
         tile_y_qs[(tid_y + i) * (2*WARP_SIZE) + tid_x] = *((int *) &by0->qs[iqsy]);
         tile_y_ds[(tid_y + i) * (2*WARP_SIZE/QI8_1) + iby0] = by0->ds;
 
-        const block_q8_1 * __restrict__ by1 = &y[(col_y_0 + tid_y + i)*blocks_per_row + ib0 + iby1];
+        const block_q8_1 * __restrict__ by1 = &y[col_y_eff*blocks_per_row + ib0 + iby1];
 
         tile_y_qs[(tid_y + i) * (2*WARP_SIZE) + tid_x + WARP_SIZE] = *((int *) &by1->qs[iqsy]);
         tile_y_ds[(tid_y + i) * (2*WARP_SIZE/QI8_1) + iby1] = by1->ds;
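
Instead of branching around out-of-range columns, the tile loader now clamps the column index to ncols_y-1: threads whose nominal column runs past the end simply re-read the last valid column, the duplicate quants sit unused in shared memory, and results for those columns are never written back. The same clamp-don't-branch pattern in isolation, as a hypothetical kernel that is not from ggml:

// Hypothetical illustration of clamped tile loading (not from the commit).
__global__ void load_tile_clamped(const float * src, float * tile, const int nrows, const int row0) {
    const int r     = row0 + threadIdx.y; // nominal row, may exceed nrows-1
    const int r_eff = min(r, nrows - 1);  // clamped, so the load stays in bounds
    // Every thread performs a valid load; redundant rows are harmless because
    // whatever is computed from them is discarded before the writeback.
    tile[threadIdx.y*blockDim.x + threadIdx.x] = src[r_eff*blockDim.x + threadIdx.x];
}
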
@@ -2215,9 +2221,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
-static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
+    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    const dim3 num_blocks(block_num_x, ky, 1);
+    const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
+    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
 }
 
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
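
The launcher now builds a 2D grid: the x dimension covers the padded row length in chunks of 256 threads and the y dimension assigns one grid row per src1 row. (CUDA_QUANTIZE_BLOCK_SIZE and CUDA_DEQUANTIZE_BLOCK_SIZE are both defined as 256 in this file, which is why mixing them between block_num_x and block_size still yields a consistent launch shape.) A worked example with invented sizes:

// Host-only sketch of the launch geometry (values assumed, not from the commit).
#include <cstdio>

#define CUDA_QUANTIZE_BLOCK_SIZE 256 // as defined in ggml-cuda.cu

int main() {
    const int kx_padded = 4608; // e.g. a 4544-wide row padded to MATRIX_ROW_PADDING (512)
    const int ky        = 3;    // three src1 rows

    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
    printf("grid = (%d, %d, 1), block = (%d, 1, 1)\n", block_num_x, ky, CUDA_QUANTIZE_BLOCK_SIZE);
    // grid = (18, 3, 1), block = (256, 1, 1): 18*256 = 4608 threads per row,
    // exactly covering kx_padded, one grid row per src1 row.
    return 0;
}
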
@@ -2962,6 +2970,7 @@ inline void ggml_cuda_op_mul_mat_q(
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    GGML_ASSERT(ne10 % QK8_1 == 0);
 
     const int64_t ne0 = dst->ne[0];
 
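The new assertion makes the kernel's layout assumption explicit: every src1 row must split into whole block_q8_1 blocks of QK8_1 values. Since MATRIX_ROW_PADDING is itself a multiple of QK8_1, the padded rows then also consist of whole blocks, so i_padded = iy*kx_padded + ix never lands mid-block of a neighboring row. A hypothetical compile-time restatement of that invariant (the constants' values are assumptions about this file, not part of the commit):

// Illustrative only; mirrors the invariant behind GGML_ASSERT(ne10 % QK8_1 == 0).
constexpr int kQK8_1            = 32;  // quants per block_q8_1 (ggml value)
constexpr int kMatrixRowPadding = 512; // assumed MATRIX_ROW_PADDING in ggml-cuda.cu
static_assert(kMatrixRowPadding % kQK8_1 == 0,
              "row padding must be a whole number of q8_1 blocks");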

@@ -2974,11 +2983,11 @@ inline void ggml_cuda_op_mul_mat_q(
     // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
     const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
 
-    int64_t padded_row_size = ne10*ne11 + MATRIX_ROW_PADDING - 1;
+    int64_t padded_row_size = ne10 + MATRIX_ROW_PADDING - 1;
     padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
     size_t as;
-    void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
-    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10*ne11, padded_row_size, cudaStream_main);
+    void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
+    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
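
The old code padded the flattened ne10*ne11 buffer once, so with ne11 > 1 the individual rows of src1 were not padded and the kernel's row-wise indexing could run past the allocation. The fix pads each row to padded_row_size and scales the pool allocation by ne11. A worked example of the new sizing, host-only and with invented dimensions (sizeof(block_q8_1) == 36 assumes ggml's layout of 32 int8 quants plus a half2 of scale data):

// Host-only sketch of the per-row allocation arithmetic (illustrative only).
#include <cstdio>

int main() {
    const long ne10 = 4000, ne11 = 2;    // src1: 4000 wide (a multiple of QK8_1), 2 rows
    const long MATRIX_ROW_PADDING = 512; // assumed value from ggml-cuda.cu
    const long QK8_1 = 32;
    const long sizeof_block_q8_1 = 36;   // 32 quants + 4 bytes of half2 scale data

    long padded_row_size = ne10 + MATRIX_ROW_PADDING - 1;    // 4511
    padded_row_size -= padded_row_size % MATRIX_ROW_PADDING; // 4096

    const long bytes = padded_row_size*ne11*sizeof_block_q8_1/QK8_1;
    printf("padded_row_size = %ld, allocation = %ld bytes\n", padded_row_size, bytes);
    // padded_row_size = 4096, allocation = 4096*2*36/32 = 9216 bytes.
    return 0;
}
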
@@ -3042,7 +3051,7 @@ inline void ggml_cuda_op_mul_mat_vec(
     padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
     size_t as;
     void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
-    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
+    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
