
Commit cb5fad4

CUDA: refactor and optimize IQ MMVQ (#8215)
* CUDA: refactor and optimize IQ MMVQ
* uint -> uint32_t
* __dp4a -> ggml_cuda_dp4a
* remove MIN_CC_DP4A checks
* change default
* try CI fix
1 parent dae57a1 commit cb5fad4


8 files changed: +409 -490 lines


ggml/src/ggml-common.h

Lines changed: 7 additions & 7 deletions
@@ -106,19 +106,19 @@ typedef sycl::half2 ggml_half2;
 #define QR6_K 2
 
 #define QI2_XXS (QK_K / (4*QR2_XXS))
-#define QR2_XXS 8
+#define QR2_XXS 4
 
 #define QI2_XS (QK_K / (4*QR2_XS))
-#define QR2_XS 8
+#define QR2_XS 4
 
 #define QI2_S (QK_K / (4*QR2_S))
-#define QR2_S 8
+#define QR2_S 4
 
 #define QI3_XXS (QK_K / (4*QR3_XXS))
-#define QR3_XXS 8
+#define QR3_XXS 4
 
 #define QI3_XS (QK_K / (4*QR3_XS))
-#define QR3_XS 8
+#define QR3_XS 4
 
 #define QI1_S (QK_K / (4*QR1_S))
 #define QR1_S 8
@@ -130,10 +130,10 @@ typedef sycl::half2 ggml_half2;
 #define QR4_NL 2
 
 #define QI4_XS (QK_K / (4*QR4_XS))
-#define QR4_XS 8
+#define QR4_XS 2
 
 #define QI3_S (QK_K / (4*QR3_S))
-#define QR3_S 8
+#define QR3_S 4
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
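
For context: each QI macro gives the number of 32-bit integers in a quantized block's data, derived from the block size and the QR ratio as QI = QK_K / (4*QR), so halving a QR value doubles the corresponding QI. A worked example, assuming the usual QK_K = 256 (this snippet is for illustration only, not part of the diff):

    // With QK_K = 256 and the new QR2_XXS = 4:
    //   QI2_XXS = QK_K / (4*QR2_XXS) = 256 / 16 = 16 ints per block.
    // With the old QR2_XXS = 8 it was 256 / 32 = 8 ints per block.
    #define QK_K    256
    #define QR2_XXS 4
    #define QI2_XXS (QK_K / (4*QR2_XXS))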

ggml/src/ggml-cuda.cu

Lines changed: 5 additions & 7 deletions
@@ -1882,6 +1882,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     bool use_mul_mat_q = ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
 
+    // if mmvq is available it's a better choice than dmmv:
+#ifndef GGML_CUDA_FORCE_DMMV
+    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+#endif // GGML_CUDA_FORCE_DMMV
+
     bool any_gpus_with_slow_fp16 = false;
 
     if (split) {
@@ -1894,22 +1899,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             }
 
             const int cc            = ggml_cuda_info().devices[id].cc;
-            use_mul_mat_vec_q       = use_mul_mat_vec_q       && cc >= MIN_CC_DP4A;
             use_mul_mat_q           = use_mul_mat_q           && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
             any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
         }
     } else {
         const int cc            = ggml_cuda_info().devices[ctx.device].cc;
-        use_mul_mat_vec_q       = use_mul_mat_vec_q       && cc >= MIN_CC_DP4A;
         use_mul_mat_q           = use_mul_mat_q           && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
     }
 
-    // if mmvq is available it's a better choice than dmmv:
-#ifndef GGML_CUDA_FORCE_DMMV
-    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
-#endif // GGML_CUDA_FORCE_DMMV
-
     // debug helpers
     //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
    //printf("       %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
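
Why the dmmv check can move above the device loop: the per-device cc >= MIN_CC_DP4A gating on use_mul_mat_vec_q is gone (the compute-capability fallback now lives inside ggml_cuda_dp4a), so use_mul_mat_vec_q is already final before any device is inspected. A condensed sketch of the resulting selection logic (illustrative, not the full function):

    // use_mul_mat_vec_q now depends only on the tensor types, so the
    // preference for mmvq over dmmv can be applied once, up front:
    #ifndef GGML_CUDA_FORCE_DMMV
        use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
    #endif // GGML_CUDA_FORCE_DMMV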

ggml/src/ggml-cuda/common.cuh

Lines changed: 51 additions & 25 deletions
@@ -3,6 +3,7 @@
 #include "ggml.h"
 #include "ggml-cuda.h"
 
+#include <cstdint>
 #include <memory>
 
 #if defined(GGML_USE_HIPBLAS)
@@ -268,30 +269,15 @@ static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigne
     return c;
 }
 
-static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
-    c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(RDNA3)
-    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
-#elif defined(__gfx1010__) || defined(__gfx900__)
-    int tmp1;
-    int tmp2;
-    asm("\n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        "
-        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
-        : "v"(a), "v"(b)
-    );
-#else
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
-#endif
+static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+    unsigned int c;
+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
+    }
     return c;
 }
 
@@ -467,8 +453,48 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 }
 #endif // CUDART_VERSION < 12000
 
+static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(RDNA3)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+
+#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    return __dp4a(a, b, c);
+#else // __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int8_t * a8 = (const int8_t *) &a;
+    const int8_t * b8 = (const int8_t *) &b;
+    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+}
+
 // TODO: move to ggml-common.h
-static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
 
 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
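The new __vcmpne4 is the per-byte "not equal" counterpart of the __vcmpeq4 helper above: each result byte is 0xff where the input bytes differ and 0x00 where they match. A host-side sketch of the same semantics (vcmpne4_ref is an illustrative name, not part of ggml):

    #include <cstdint>
    #include <cstdio>

    // Per-byte inequality mask over two packed 4-byte values.
    static uint32_t vcmpne4_ref(uint32_t a, uint32_t b) {
        uint32_t c = 0;
        for (int i = 0; i < 4; ++i) {
            const uint8_t ba = (a >> (8*i)) & 0xff;
            const uint8_t bb = (b >> (8*i)) & 0xff;
            if (ba != bb) {
                c |= 0xffu << (8*i);
            }
        }
        return c;
    }

    int main() {
        // bytes 1 and 3 differ -> prints ff00ff00
        printf("%08x\n", vcmpne4_ref(0x01020304u, 0x11021304u));
        return 0;
    }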

ggml/src/ggml-cuda/fattn-common.cuh

Lines changed: 5 additions & 45 deletions
@@ -54,12 +54,11 @@ typedef float (*vec_dot_KQ_f32_t)(
 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
 
     const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
     GGML_UNUSED(Q_v);
 
-    half sum = 0.0f;
+    T sum = 0.0f;
 
 #pragma unroll
     for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
@@ -72,7 +71,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
         const int v = (get_int_from_uint8(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
         const int u = Q_q8[k_KQ_0/WARP_SIZE];
 
-        const int sumi = __dp4a(v, u, 0);
+        const int sumi = ggml_cuda_dp4a(v, u, 0);
 
 #ifdef FP16_AVAILABLE
         if (std::is_same<T, half>::value) {
@@ -90,19 +89,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
     }
 
     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
 
     const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
     GGML_UNUSED(Q_v);
@@ -120,7 +111,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
         const int v = (get_int_from_uint8_aligned(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
         const int u = Q_q8[k_KQ_0/WARP_SIZE];
 
-        const int sumi = __dp4a(v, u, 0);
+        const int sumi = ggml_cuda_dp4a(v, u, 0);
 
 #ifdef FP16_AVAILABLE
         if (std::is_same<T, half>::value) {
@@ -142,19 +133,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
     }
 
     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
 
     const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
     GGML_UNUSED(Q_v);
@@ -179,7 +162,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
 
         const int u = Q_q8[k_KQ_0/WARP_SIZE];
 
-        const int sumi = __dp4a(v, u, 0);
+        const int sumi = ggml_cuda_dp4a(v, u, 0);
 
 #ifdef FP16_AVAILABLE
         if (std::is_same<T, half>::value) {
@@ -197,19 +180,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
     }
 
     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
 
     const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
     GGML_UNUSED(Q_v);
@@ -234,7 +209,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
 
         const int u = Q_q8[k_KQ_0/WARP_SIZE];
 
-        const int sumi = __dp4a(v, u, 0);
+        const int sumi = ggml_cuda_dp4a(v, u, 0);
 
 #ifdef FP16_AVAILABLE
         if (std::is_same<T, half>::value) {
@@ -256,19 +231,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
     }
 
     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 template <typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
 
     const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
     GGML_UNUSED(Q_v);
@@ -297,13 +264,6 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
     }
 
     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 template <typename T, int D>
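
The #if __CUDA_ARCH__ >= MIN_CC_DP4A guards (and their NO_DEVICE_CODE fallbacks) can be dropped because ggml_cuda_dp4a, defined in common.cuh above, compiles on every architecture and falls back to scalar byte products where the hardware instruction is missing. A host-side sketch of the identity these kernels rely on (dp4a_ref is an illustrative name, not part of ggml):

    #include <cassert>
    #include <cstdint>

    // Scalar reference for dp4a: four packed int8 products added to c.
    // Mirrors the fallback branch of ggml_cuda_dp4a in the diff above.
    static int dp4a_ref(int a, int b, int c) {
        const int8_t * a8 = (const int8_t *) &a;
        const int8_t * b8 = (const int8_t *) &b;
        return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
    }

    int main() {
        const int a = 0x04030201;  // little-endian bytes 1, 2, 3, 4
        const int b = 0x08070605;  // little-endian bytes 5, 6, 7, 8
        assert(dp4a_ref(a, b, 0) == 1*5 + 2*6 + 3*7 + 4*8);  // 70
        return 0;
    }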

ggml/src/ggml-cuda/mmvq.cu

Lines changed: 16 additions & 10 deletions
@@ -28,16 +28,22 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
 
 static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
     return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
-        type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
-        type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
-        type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
-        type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
-        type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
-        type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
-        type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
-        type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
-        type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
-        type == GGML_TYPE_IQ4_NL ? VDR_Q4_K_Q8_1_MMVQ :
+        type == GGML_TYPE_Q4_1    ? VDR_Q4_1_Q8_1_MMVQ :
+        type == GGML_TYPE_Q5_0    ? VDR_Q5_0_Q8_1_MMVQ :
+        type == GGML_TYPE_Q5_1    ? VDR_Q5_1_Q8_1_MMVQ :
+        type == GGML_TYPE_Q8_0    ? VDR_Q8_0_Q8_1_MMVQ :
+        type == GGML_TYPE_Q2_K    ? VDR_Q2_K_Q8_1_MMVQ :
+        type == GGML_TYPE_Q3_K    ? VDR_Q3_K_Q8_1_MMVQ :
+        type == GGML_TYPE_Q4_K    ? VDR_Q4_K_Q8_1_MMVQ :
+        type == GGML_TYPE_Q5_K    ? VDR_Q5_K_Q8_1_MMVQ :
+        type == GGML_TYPE_Q6_K    ? VDR_Q6_K_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ2_XS  ? VDR_IQ2_XS_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ2_S   ? VDR_IQ2_S_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ3_S   ? VDR_IQ3_S_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ4_NL  ? VDR_IQ4_NL_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ4_XS  ? VDR_IQ4_XS_Q8_1_MMVQ :
         1;
 }
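
Because get_vdr_mmvq is constexpr, the per-type VDR (vec dot ratio: roughly, how many contiguous integers each thread processes per vec_dot call) can be baked into the kernel at compile time. The diff also gives each IQ type its own VDR_IQ*_Q8_1_MMVQ constant; previously IQ4_NL reused VDR_Q4_K_Q8_1_MMVQ. A sketch of the consumption pattern (mul_mat_vec_q_sketch is a hypothetical kernel name, not the actual mmvq.cu kernel):

    // Inside a kernel templated on the ggml_type, the lookup collapses
    // to a compile-time constant, so loops over vdr can fully unroll.
    template <ggml_type type>
    static __global__ void mul_mat_vec_q_sketch(/* ... */) {
        constexpr int vdr = get_vdr_mmvq(type);  // e.g. VDR_Q4_0_Q8_1_MMVQ
        // ... process vdr quantized values per vec_dot invocation ...
    }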
