
Commit 52f90f2

Fixed OpenLLaMA 3b CUDA mul_mat_vec_q
1 parent 061f5f8 commit 52f90f2

File tree: 1 file changed

ggml-cuda.cu: 26 additions & 11 deletions
@@ -1174,16 +1174,12 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
 static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
-    if (i >= k) {
-        return;
-    }
-
     block_q8_1 * y = (block_q8_1 *) vy;
 
-    const int ib = i / QK8_0; // block index
-    const int iqs = i % QK8_0; // quant index
+    const int ib = i / QK8_1; // block index
+    const int iqs = i % QK8_1; // quant index
 
-    const float xi = x[i];
+    const float xi = i < k ? x[i] : 0.0f;
     float amax = fabsf(xi);
     float sum = xi;

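The early return is replaced by zero-filling: the launch rounds the thread count up, so threads with i >= k exist whenever the row length is not a multiple of the launch block size, and they now write a quant of 0 instead of leaving their slot uninitialized. The trailing, partially filled block_q8_1 therefore gets a well-defined scale and sum, and zeros do not perturb either reduction (amax or sum). The QK8_0 -> QK8_1 swap is a cleanup rather than a behavior change, since both constants equal 32 in ggml. Below is a minimal host-side sketch of the same zero-padded scheme; the struct is a hypothetical stand-in for ggml's block_q8_1, not its actual layout:

    #include <math.h>
    #include <stdint.h>

    #define QK8_1 32

    /* Hypothetical stand-in for ggml's block_q8_1, for illustration only. */
    typedef struct {
        float  d;          /* per-block scale               */
        float  s;          /* per-block sum of input values */
        int8_t qs[QK8_1];  /* 8-bit quants                  */
    } block_q8_1_sketch;

    /* Quantize k values into k_padded/QK8_1 complete blocks; out-of-range
     * inputs count as 0.0f, exactly like `i < k ? x[i] : 0.0f` above. */
    static void quantize_q8_1_padded(const float * x, block_q8_1_sketch * y,
                                     int k, int k_padded) {
        for (int ib = 0; ib < k_padded / QK8_1; ++ib) {
            float amax = 0.0f;
            float sum  = 0.0f;
            for (int iqs = 0; iqs < QK8_1; ++iqs) {
                const int i = ib*QK8_1 + iqs;
                const float xi = i < k ? x[i] : 0.0f;  /* zero-fill padding */
                amax = fmaxf(amax, fabsf(xi));
                sum += xi;
            }
            const float d = amax / 127.0f;
            for (int iqs = 0; iqs < QK8_1; ++iqs) {
                const int i = ib*QK8_1 + iqs;
                const float xi = i < k ? x[i] : 0.0f;
                y[ib].qs[iqs] = d != 0.0f ? (int8_t) roundf(xi / d) : 0;
            }
            y[ib].d = d;
            y[ib].s = sum;
        }
    }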
@@ -2359,8 +2355,10 @@ inline void ggml_cuda_op_mul_mat_vec(
 #endif
 
     if (use_mul_mat_vec_q) {
+        int64_t padded_row_size = ne00 + CUDA_QUANTIZE_BLOCK_SIZE - 1;
+        padded_row_size -= padded_row_size % CUDA_QUANTIZE_BLOCK_SIZE;
         size_t as;
-        void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);
+        void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
         quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
 
         switch (src0->type) {
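These two added lines round ne00 up to the next multiple of CUDA_QUANTIZE_BLOCK_SIZE (256), sizing the q8_1 scratch buffer so quantize_q8_1 can write complete blocks past the logical end of the row. This is the OpenLLaMA 3B trigger: its row size of ne00 = 3200 is not a multiple of 256, so the old ne00-sized allocation was overrun. A small sketch of the round-up arithmetic, with 3200 as the worked example:

    #include <stdint.h>
    #include <stdio.h>

    /* Round n up to the next multiple of block_size; same arithmetic as the
     * two added lines above (add block_size - 1, then cut the remainder). */
    static int64_t pad_to_block(int64_t n, int64_t block_size) {
        int64_t padded = n + block_size - 1;
        return padded - padded % block_size;
    }

    int main(void) {
        /* OpenLLaMA 3B: ne00 = 3200, CUDA_QUANTIZE_BLOCK_SIZE = 256 */
        printf("%lld\n", (long long) pad_to_block(3200, 256)); /* 3328 = 13*256 */
        /* q8_1 scratch: 3328/32 = 104 blocks instead of 3200/32 = 100 */
        return 0;
    }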
@@ -3105,7 +3103,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     int nrows = ggml_nrows(tensor);
+
+    const int64_t ne0 = tensor->ne[0];
+
     const size_t nb1 = tensor->nb[1];
+
     ggml_backend backend = tensor->backend;
     struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
@@ -3134,11 +3136,24 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         int64_t nrows_split = row_high - row_low;
 
         const size_t offset_split = row_low*nb1;
-        const size_t size = ggml_nbytes_split(tensor, nrows_split);
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+        if (ne0 % CUDA_QUANTIZE_BLOCK_SIZE != 0) {
+            size += (CUDA_QUANTIZE_BLOCK_SIZE - ne0 % CUDA_QUANTIZE_BLOCK_SIZE)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }
 
-        void * buf;
+        char * buf;
         CUDA_CHECK(cudaMalloc(&buf, size));
-        void * buf_host = (char*)data + offset_split;
+        char * buf_host = (char*)data + offset_split;
+
+        // set padding to 0 to avoid possible NaN values
+        if (size > original_size) {
+            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+        }
 
         cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);

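The weight buffers get the matching treatment on the device side: each split's allocation is padded by the leftover (256 - ne0 % 256) elements, converted to bytes via the tensor type's bytes-per-block over elements-per-block, and the padding is zeroed so the mul_mat_vec_q kernels read zeros rather than stale memory that could contain NaNs. buf also changes from void * to char * so that buf + original_size is legal byte-wise pointer arithmetic. A sketch of the size computation, assuming a q4_0 tensor (blocks of 32 elements in 18 bytes) and ne0 = 3200:

    #include <stddef.h>
    #include <stdint.h>

    /* Mirror of the padding logic above. type_size and blck_size are per-type
     * constants (ggml_type_size / ggml_blck_size); for q4_0 they are 18 and 32. */
    static size_t padded_split_size(size_t size, int64_t ne0,
                                    size_t type_size, int64_t blck_size,
                                    int64_t quantize_block /* 256 */) {
        if (ne0 % quantize_block != 0) {
            size += (quantize_block - ne0 % quantize_block) * type_size / blck_size;
        }
        return size;
    }

    /* Example: 3200 % 256 = 128, so 256 - 128 = 128 padding elements,
     * i.e. 128 * 18 / 32 = 72 extra bytes, allocated and then memset to 0. */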