@@ -106,6 +106,24 @@ static __device__ void dequantize_q4_1(const void * vx, const int ib, const int
106
106
v1 = vi1*d + m;
107
107
}
108
108
109
+ static __device__ void dequantize_q5_0 (const void * vx, const int ib, const int iqs, float & v0, float & v1){
110
+ const block_q5_0 * x = (const block_q5_0 *) vx;
111
+
112
+ const float d = x[ib].d ;
113
+
114
+ uint32_t qh;
115
+ memcpy (&qh, x[ib].qh , sizeof (qh));
116
+
117
+ const uint8_t xh_0 = ((qh >> (iqs + 0 )) << 4 ) & 0x10 ;
118
+ const uint8_t xh_1 = ((qh >> (iqs + 12 )) ) & 0x10 ;
119
+
120
+ const int32_t x0 = ((x[ib].qs [iqs] & 0xf ) | xh_0) - 16 ;
121
+ const int32_t x1 = ((x[ib].qs [iqs] >> 4 ) | xh_1) - 16 ;
122
+
123
+ v0 = x0*d;
124
+ v1 = x1*d;
125
+ }
126
+
109
127
static __global__ void dequantize_block_q4_0 (const void * vx, float * y) {
110
128
static const int qk = QK4_0;
111
129
@@ -277,6 +295,11 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, f
277
295
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_1, dequantize_q4_1><<<nrows, CUDA_DMMV_BLOCK_SIZE, 0 , stream>>> (vx, y, dst, ncols);
278
296
}
279
297
298
+ static void dequantize_mul_mat_vec_q5_0_cuda (const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
299
+ GGML_ASSERT (ncols % CUDA_DMMV_BLOCK_SIZE == 0 );
300
+ dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_0, dequantize_q5_0><<<nrows, CUDA_DMMV_BLOCK_SIZE, 0 , stream>>> (vx, y, dst, ncols);
301
+ }
302
+
280
303
// TODO: optimize
281
304
static __global__ void convert_fp16_to_fp32 (const void * vx, float * y) {
282
305
const half * x = (const half *) vx;
@@ -315,6 +338,8 @@ static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_t
315
338
return dequantize_mul_mat_vec_q4_0_cuda;
316
339
case GGML_TYPE_Q4_1:
317
340
return dequantize_mul_mat_vec_q4_1_cuda;
341
+ case GGML_TYPE_Q5_0:
342
+ return dequantize_mul_mat_vec_q5_0_cuda;
318
343
default :
319
344
return nullptr ;
320
345
}
0 commit comments