Skip to content

Commit ae0b2a9

Browse files
Added q4_1 via template
1 parent ace05c1 commit ae0b2a9

File tree

1 file changed

+57
-31
lines changed

1 file changed

+57
-31
lines changed

ggml-cuda.cu

Lines changed: 57 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
3232
} \
3333
} while (0)
3434

35+
// Signature of a per-format dequantization routine: reads from the quantized
// data vx at block index ib and byte index iqs, and writes two dequantized
// values through v0 and v1.
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
3536
// Signature of a host-side conversion launcher operating on `stream`.
// NOTE(review): presumably x is the quantized input, y the float output and
// k the element count -- confirm against the dequantize_row_*_cuda functions.
typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
37+
// Signature of a host-side launcher for the fused dequantize + matrix-vector
// multiply: vx is the quantized matrix data, y the float vector, dst the float
// output, ncols/nrows the matrix dimensions, and stream the CUDA stream used
// for the launch.
typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream);
3638

3739
#define QK4_0 32
3840
typedef struct {
@@ -73,6 +75,37 @@ typedef struct {
7375
} block_q8_0;
7476
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
7577

78+
// Threads per block for the dequantize_mul_mat_vec kernel; used both as the
// kernel's block_size template argument and as the launch block dimension.
// The launchers assert that ncols is a multiple of this value.
#define CUDA_DMMV_BLOCK_SIZE 32
79+
80+
// Dequantizes one packed byte of a q4_0 block into two floats.
//
//   vx  - pointer to an array of block_q4_0
//   ib  - index of the block to read
//   iqs - index of the byte inside that block's qs array
//   v0  - receives (low nibble  - 8) * d
//   v1  - receives (high nibble - 8) * d
static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
    const block_q4_0 & blk = static_cast<const block_q4_0 *>(vx)[ib];

    const float   scale  = blk.d;
    const uint8_t packed = blk.qs[iqs];

    // each byte carries two 4-bit quants: low nibble first, high nibble second
    const int lo = packed & 0xF;
    const int hi = packed >> 4;

    // re-center the unsigned nibble by 8 before applying the block scale
    v0 = (lo - 8)*scale;
    v1 = (hi - 8)*scale;
}
93+
94+
// Dequantizes one packed byte of a q4_1 block into two floats.
//
//   vx  - pointer to an array of block_q4_1
//   ib  - index of the block to read
//   iqs - index of the byte inside that block's qs array
//   v0  - receives low nibble  * d + m
//   v1  - receives high nibble * d + m
static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
    const block_q4_1 & blk = static_cast<const block_q4_1 *>(vx)[ib];

    const float scale  = blk.d;
    const float offset = blk.m;

    const uint8_t packed = blk.qs[iqs];

    // each byte carries two 4-bit quants: low nibble first, high nibble second
    const int lo = packed & 0xF;
    const int hi = packed >> 4;

    // scale the unsigned nibble, then add the block offset m
    v0 = lo*scale + offset;
    v1 = hi*scale + offset;
}
108+
76109
static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
77110
static const int qk = QK4_0;
78111

@@ -173,10 +206,7 @@ static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
173206
}
174207
}
175208

176-
template <int block_size> static __global__ void dequantize_mul_mat_q4_0(const void * vx, const float * y, float * dst, const int ncols) {
177-
const block_q4_0 * x = (const block_q4_0 *) vx;
178-
const int qk = QK4_0;
179-
209+
template <int block_size, int qk, dequantize_kernel_t dequantize_kernel> static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) {
180210
const int row = blockIdx.x;
181211
const int tid = threadIdx.x;
182212

@@ -190,17 +220,8 @@ template <int block_size> static __global__ void dequantize_mul_mat_q4_0(const v
190220
const int iybs = col - col%qk; // y block start index
191221

192222
// dequantize
193-
const float d = x[ib].d;
194-
195-
const uint8_t * pp = x[ib].qs;
196-
197-
const uint8_t vui = pp[iqs];
198-
199-
const int8_t vi0 = vui & 0xF;
200-
const int8_t vi1 = vui >> 4;
201-
202-
const float v0 = (vi0 - 8)*d;
203-
const float v1 = (vi1 - 8)*d;
223+
float v0, v1;
224+
dequantize_kernel(vx, ib, iqs, v0, v1);
204225

205226
// matrix multiplication
206227
tmp[tid] += v0 * y[iybs + iqs + 0];
@@ -244,21 +265,14 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStre
244265
dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
245266
}
246267

247-
static void dequantize_mul_mat_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
248-
// static int block_size = -1;
249-
// if (block_size == -1) {
250-
// int min_grid_size, max_block_size = 1;
251-
// CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &max_block_size, dequantize_mul_mat_q4_0<256>, 0, 0));
252-
// max_block_size = min(max_block_size, GGML_CUDA_MAX_BLOCK_SIZE);
253-
// block_size = 1;
254-
// while (block_size*2 <= max_block_size && block_size*2 % ncols == 0) {
255-
// block_size *= 2;
256-
// }
257-
// }
258-
// dequantize_mul_mat_q4_0<<<nrows, block_size, 0, stream>>>(vx, y, dst, ncols);
259-
const int block_size = 32;
260-
GGML_ASSERT(ncols % block_size == 0);
261-
dequantize_mul_mat_q4_0<block_size><<<nrows, block_size, 0, stream>>>(vx, y, dst, ncols);
268+
// Launches the dequantize_mul_mat_vec kernel instantiated for q4_0 data.
//
// The grid has nrows blocks of CUDA_DMMV_BLOCK_SIZE threads, uses no dynamic
// shared memory and runs on `stream`. ncols must be a multiple of
// CUDA_DMMV_BLOCK_SIZE (asserted). Launch errors are not checked here.
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);

    const dim3 grid_dims(nrows);
    const dim3 block_dims(CUDA_DMMV_BLOCK_SIZE);

    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_0, dequantize_q4_0>
        <<<grid_dims, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
272+
273+
// Launches the dequantize_mul_mat_vec kernel instantiated for q4_1 data.
//
// The grid has nrows blocks of CUDA_DMMV_BLOCK_SIZE threads, uses no dynamic
// shared memory and runs on `stream`. ncols must be a multiple of
// CUDA_DMMV_BLOCK_SIZE (asserted). Launch errors are not checked here.
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);

    const dim3 grid_dims(nrows);
    const dim3 block_dims(CUDA_DMMV_BLOCK_SIZE);

    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_1, dequantize_q4_1>
        <<<grid_dims, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
263277

264278
// TODO: optimize
@@ -293,6 +307,17 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
293307
}
294308
}
295309

310+
// Maps a ggml tensor type to its dequantize + mat-vec launcher.
//
// Returns the q4_0 or q4_1 launcher for GGML_TYPE_Q4_0 / GGML_TYPE_Q4_1 and
// nullptr for every other type; callers must handle the nullptr case.
static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_type type) {
    if (type == GGML_TYPE_Q4_0) {
        return dequantize_mul_mat_vec_q4_0_cuda;
    }
    if (type == GGML_TYPE_Q4_1) {
        return dequantize_mul_mat_vec_q4_1_cuda;
    }
    // no fused kernel available for this type
    return nullptr;
}
320+
296321
// buffer pool for cuda
297322
#define MAX_CUDA_BUFFERS 256
298323

@@ -610,6 +635,7 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
610635
char * d_Q = (char *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size);
611636

612637
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type);
638+
dequantize_mul_mat_vec_cuda_t dmmv = ggml_get_dequantize_mul_mat_vec_cuda(type);
613639
GGML_ASSERT(to_fp32_cuda != nullptr);
614640

615641
for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -641,7 +667,7 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
641667
CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
642668

643669
// compute
644-
dequantize_mul_mat_q4_0_cuda(c_Q, c_Y, c_D, ne00, ne01, cudaStream);
670+
dmmv(c_Q, c_Y, c_D, ne00, ne01, cudaStream);
645671
CUDA_CHECK(cudaGetLastError());
646672

647673
} else {

0 commit comments

Comments
 (0)