@@ -1174,16 +1174,12 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
 static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
-    if (i >= k) {
-        return;
-    }
-
     block_q8_1 * y = (block_q8_1 *) vy;
 
-    const int ib = i / QK8_0; // block index
-    const int iqs = i % QK8_0; // quant index
+    const int ib = i / QK8_1; // block index
+    const int iqs = i % QK8_1; // quant index
 
-    const float xi = x[i];
+    const float xi = i < k ? x[i] : 0.0f;
 
     float amax = fabsf(xi);
     float sum = xi;
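The hunk above drops the early return so that every thread of a padded launch writes a well-defined quant (out-of-range lanes clamp their input to 0.0f), and switches the block/quant index arithmetic from QK8_0 to QK8_1. Below is a minimal, self-contained sketch of the same clamp-instead-of-return pattern, not the llama.cpp kernel itself: the kernel name, the BLOCK constant, and the shared-memory reduction are made up for illustration.

#include <cstdio>
#include <cuda_runtime.h>

#define BLOCK 256

__global__ void block_absmax(const float * x, float * out, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    // clamp instead of "if (i >= k) return;": padded lanes contribute a neutral 0.0f,
    // so the per-block reduction and the output of the final partial block stay defined
    const float xi = i < k ? x[i] : 0.0f;

    __shared__ float smem[BLOCK];
    smem[threadIdx.x] = fabsf(xi);
    __syncthreads();

    // simple tree reduction over the block
    for (int s = blockDim.x/2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + s]);
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        out[blockIdx.x] = smem[0];
    }
}

int main() {
    const int k = 1000;                             // deliberately not a multiple of BLOCK
    const int num_blocks = (k + BLOCK - 1) / BLOCK; // padded launch: 4 blocks, last one partial

    float * x_d; float * out_d;
    cudaMalloc(&x_d, k*sizeof(float));
    cudaMalloc(&out_d, num_blocks*sizeof(float));
    cudaMemset(x_d, 0, k*sizeof(float));

    block_absmax<<<num_blocks, BLOCK>>>(x_d, out_d, k);
    cudaDeviceSynchronize();

    printf("launched %d blocks of %d threads for %d elements\n", num_blocks, BLOCK, k);
    cudaFree(x_d); cudaFree(out_d);
    return 0;
}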
@@ -2359,8 +2355,10 @@ inline void ggml_cuda_op_mul_mat_vec(
 #endif
 
     if (use_mul_mat_vec_q) {
+        int64_t padded_row_size = ne00 + CUDA_QUANTIZE_BLOCK_SIZE - 1;
+        padded_row_size -= padded_row_size % CUDA_QUANTIZE_BLOCK_SIZE;
         size_t as;
-        void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);
+        void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
         quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
 
         switch (src0->type) {
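Here padded_row_size rounds ne00 up to the next multiple of CUDA_QUANTIZE_BLOCK_SIZE, so the temporary q8_1 buffer is large enough for the quants written by the padded launch of quantize_q8_1. A tiny host-side sketch of the round-up idiom with concrete numbers, assuming CUDA_QUANTIZE_BLOCK_SIZE is 256 as the padding comment in the last hunk suggests:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t block = 256;   // stand-in for CUDA_QUANTIZE_BLOCK_SIZE
    const int64_t ne00  = 1000;  // hypothetical row length, not a multiple of 256

    int64_t padded_row_size = ne00 + block - 1; // 1255
    padded_row_size -= padded_row_size % block; // 1255 - 231 = 1024, the next multiple of 256

    printf("ne00 = %lld -> padded_row_size = %lld\n",
           (long long) ne00, (long long) padded_row_size);
    return 0;
}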
@@ -3105,7 +3103,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     int nrows = ggml_nrows(tensor);
+
+    const int64_t ne0 = tensor->ne[0];
+
     const size_t nb1 = tensor->nb[1];
+
     ggml_backend backend = tensor->backend;
     struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
@@ -3134,11 +3136,24 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         int64_t nrows_split = row_high - row_low;
 
         const size_t offset_split = row_low*nb1;
-        const size_t size = ggml_nbytes_split(tensor, nrows_split);
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+        if (ne0 % CUDA_QUANTIZE_BLOCK_SIZE != 0) {
+            size += (CUDA_QUANTIZE_BLOCK_SIZE - ne0 % CUDA_QUANTIZE_BLOCK_SIZE)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }
 
-        void * buf;
+        char * buf;
         CUDA_CHECK(cudaMalloc(&buf, size));
-        void * buf_host = (char *)data + offset_split;
+        char * buf_host = (char *)data + offset_split;
+
+        // set padding to 0 to avoid possible NaN values
+        if (size > original_size) {
+            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+        }
+
 
         cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
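The extra bytes added to the device allocation are the byte size of the missing elements in the last partial 256-element group (element count times ggml_type_size/ggml_blck_size, i.e. bytes per element for the tensor's quantization format), and the cudaMemset zeroes exactly that tail so reads past ne0 see zeros rather than garbage, as the comments above state. A worked example of the arithmetic with hypothetical numbers, assuming CUDA_QUANTIZE_BLOCK_SIZE == 256 and Q4_0's layout of 18 bytes per 32-element block:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t qbs       = 256;  // stand-in for CUDA_QUANTIZE_BLOCK_SIZE
    const int64_t ne0       = 4160; // hypothetical row length: 4160 % 256 == 64
    const size_t  type_size = 18;   // bytes per Q4_0 quant block (assumed layout)
    const size_t  blck_size = 32;   // elements per Q4_0 quant block

    size_t pad_bytes = 0;
    if (ne0 % qbs != 0) {
        // 256 - 64 = 192 padding elements -> 192*18/32 = 108 extra bytes
        pad_bytes = (qbs - ne0 % qbs) * type_size / blck_size;
    }

    printf("ne0 = %lld -> %zu padding bytes, zeroed with cudaMemset\n",
           (long long) ne0, pad_bytes);
    return 0;
}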