@@ -90,11 +90,11 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
90
90
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
91
91
92
92
// dmmv = dequantize_mul_mat_vec
93
- #ifndef GGML_CUDA_DMMV_BLOCK_X
94
- #define GGML_CUDA_DMMV_BLOCK_X 32 // can by set by compiler option LLAMA_CUDA_BY
93
+ #ifndef GGML_CUDA_DMMV_X
94
+ #define GGML_CUDA_DMMV_X 32 // can by set by compiler option LLAMA_CUDA_BY
95
95
#endif
96
- #ifndef GGML_CUDA_DMMV_BLOCK_Y
97
- #define GGML_CUDA_DMMV_BLOCK_Y 1 // can by set by compiler option LLAMA_CUDA_BY
96
+ #ifndef GGML_CUDA_DMMV_Y
97
+ #define GGML_CUDA_DMMV_Y 1 // can by set by compiler option LLAMA_CUDA_BY
98
98
#endif
99
99
100
100
static __global__ void mul_f32 (const float * x, const float * y, float * dst, const int kx, const int ky) {
@@ -217,7 +217,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
217
217
const int row = blockIdx .x *blockDim .y + threadIdx .y ;
218
218
const int tid = threadIdx .x ;
219
219
220
- const int iter_stride = 2 *GGML_CUDA_DMMV_BLOCK_X ;
220
+ const int iter_stride = 2 *GGML_CUDA_DMMV_X ;
221
221
const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
222
222
const int y_offset = qr == 1 ? 1 : qk/2 ;
223
223
@@ -289,43 +289,43 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu
289
289
}
290
290
291
291
static void dequantize_mul_mat_vec_q4_0_cuda (const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
292
- GGML_ASSERT (ncols % GGML_CUDA_DMMV_BLOCK_X == 0 );
293
- GGML_ASSERT (nrows % GGML_CUDA_DMMV_BLOCK_Y == 0 );
294
- const dim3 block_dims (WARP_SIZE, GGML_CUDA_DMMV_BLOCK_Y , 1 );
292
+ GGML_ASSERT (ncols % GGML_CUDA_DMMV_X == 0 );
293
+ GGML_ASSERT (nrows % GGML_CUDA_DMMV_Y == 0 );
294
+ const dim3 block_dims (WARP_SIZE, GGML_CUDA_DMMV_Y , 1 );
295
295
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
296
- <<<nrows/GGML_CUDA_DMMV_BLOCK_Y , block_dims, 0 , stream>>> (vx, y, dst, ncols);
296
+ <<<nrows/GGML_CUDA_DMMV_Y , block_dims, 0 , stream>>> (vx, y, dst, ncols);
297
297
}
298
298
299
299
static void dequantize_mul_mat_vec_q4_1_cuda (const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
300
- GGML_ASSERT (ncols % GGML_CUDA_DMMV_BLOCK_X == 0 );
301
- GGML_ASSERT (nrows % GGML_CUDA_DMMV_BLOCK_Y == 0 );
302
- const dim3 block_dims (WARP_SIZE, GGML_CUDA_DMMV_BLOCK_Y , 1 );
300
+ GGML_ASSERT (ncols % GGML_CUDA_DMMV_X == 0 );
301
+ GGML_ASSERT (nrows % GGML_CUDA_DMMV_Y == 0 );
302
+ const dim3 block_dims (WARP_SIZE, GGML_CUDA_DMMV_Y , 1 );
303
303
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
304
- <<<nrows/GGML_CUDA_DMMV_BLOCK_Y , block_dims, 0 , stream>>> (vx, y, dst, ncols);
304
+ <<<nrows/GGML_CUDA_DMMV_Y , block_dims, 0 , stream>>> (vx, y, dst, ncols);
305
305
}
306
306
307
307
static void dequantize_mul_mat_vec_q5_0_cuda (const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
308
- GGML_ASSERT (ncols % GGML_CUDA_DMMV_BLOCK_X == 0 );
309
- GGML_ASSERT (nrows % GGML_CUDA_DMMV_BLOCK_Y == 0 );
310
- const dim3 block_dims (WARP_SIZE, GGML_CUDA_DMMV_BLOCK_Y , 1 );
308
+ GGML_ASSERT (ncols % GGML_CUDA_DMMV_X == 0 );
309
+ GGML_ASSERT (nrows % GGML_CUDA_DMMV_Y == 0 );
310
+ const dim3 block_dims (WARP_SIZE, GGML_CUDA_DMMV_Y , 1 );
311
311
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
312
- <<<nrows/GGML_CUDA_DMMV_BLOCK_Y , block_dims, 0 , stream>>> (vx, y, dst, ncols);
312
+ <<<nrows/GGML_CUDA_DMMV_Y , block_dims, 0 , stream>>> (vx, y, dst, ncols);
313
313
}
314
314
315
315
static void dequantize_mul_mat_vec_q5_1_cuda (const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
316
- GGML_ASSERT (ncols % GGML_CUDA_DMMV_BLOCK_X == 0 );
317
- GGML_ASSERT (nrows % GGML_CUDA_DMMV_BLOCK_Y == 0 );
318
- const dim3 block_dims (WARP_SIZE, GGML_CUDA_DMMV_BLOCK_Y , 1 );
316
+ GGML_ASSERT (ncols % GGML_CUDA_DMMV_X == 0 );
317
+ GGML_ASSERT (nrows % GGML_CUDA_DMMV_Y == 0 );
318
+ const dim3 block_dims (WARP_SIZE, GGML_CUDA_DMMV_Y , 1 );
319
319
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
320
- <<<nrows/GGML_CUDA_DMMV_BLOCK_Y , block_dims, 0 , stream>>> (vx, y, dst, ncols);
320
+ <<<nrows/GGML_CUDA_DMMV_Y , block_dims, 0 , stream>>> (vx, y, dst, ncols);
321
321
}
322
322
323
323
static void dequantize_mul_mat_vec_q8_0_cuda (const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
324
- GGML_ASSERT (ncols % GGML_CUDA_DMMV_BLOCK_X == 0 );
325
- GGML_ASSERT (nrows % GGML_CUDA_DMMV_BLOCK_Y == 0 );
326
- const dim3 block_dims (WARP_SIZE, GGML_CUDA_DMMV_BLOCK_Y , 1 );
324
+ GGML_ASSERT (ncols % GGML_CUDA_DMMV_X == 0 );
325
+ GGML_ASSERT (nrows % GGML_CUDA_DMMV_Y == 0 );
326
+ const dim3 block_dims (WARP_SIZE, GGML_CUDA_DMMV_Y , 1 );
327
327
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
328
- <<<nrows/GGML_CUDA_DMMV_BLOCK_Y , block_dims, 0 , stream>>> (vx, y, dst, ncols);
328
+ <<<nrows/GGML_CUDA_DMMV_Y , block_dims, 0 , stream>>> (vx, y, dst, ncols);
329
329
}
330
330
331
331
static void convert_fp16_to_fp32_cuda (const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -334,11 +334,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
334
334
}
335
335
336
336
static void convert_mul_mat_vec_f16_cuda (const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
337
- GGML_ASSERT (ncols % GGML_CUDA_DMMV_BLOCK_X == 0 );
338
- GGML_ASSERT (nrows % GGML_CUDA_DMMV_BLOCK_Y == 0 );
339
- const dim3 block_dims (WARP_SIZE, GGML_CUDA_DMMV_BLOCK_Y , 1 );
337
+ GGML_ASSERT (ncols % GGML_CUDA_DMMV_X == 0 );
338
+ GGML_ASSERT (nrows % GGML_CUDA_DMMV_Y == 0 );
339
+ const dim3 block_dims (WARP_SIZE, GGML_CUDA_DMMV_Y , 1 );
340
340
dequantize_mul_mat_vec<1 , 1 , convert_f16>
341
- <<<nrows/GGML_CUDA_DMMV_BLOCK_Y , block_dims, 0 , stream>>> (vx, y, dst, ncols);
341
+ <<<nrows/GGML_CUDA_DMMV_Y , block_dims, 0 , stream>>> (vx, y, dst, ncols);
342
342
}
343
343
344
344
static to_fp32_cuda_t ggml_get_to_fp32_cuda (ggml_type type) {
0 commit comments