Skip to content

Commit 129d844

Browse files
ikawrakow (Kawrakow)
and a co-author authored
Fix Q4_K and Q5_K for QK_K = 64 on CUDA (#2359)
* Fix Q4_K and Q5_K for QK_K = 64
* Very slightly better Q5_K bit fiddling
---------
Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent d5512b7 commit 129d844

File tree

1 file changed

+80
-3
lines changed

1 file changed

+80
-3
lines changed

ggml-cuda.cu

Lines changed: 80 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1564,12 +1564,14 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
15641564
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
15651565
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
15661566

1567-
// iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
1568-
const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
1569-
15701567
float sumf_d = 0.0f;
15711568
float sumf_m = 0.0f;
15721569

1570+
#ifndef GGML_QKK_64
1571+
1572+
// iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
1573+
const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
1574+
15731575
const float d = bq4_K->d;
15741576
const float dmin = bq4_K->dmin;
15751577

@@ -1614,6 +1616,43 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
16141616
}
16151617

16161618
return d*sumf_d - dmin*sumf_m;
1619+
1620+
#else
1621+
1622+
uint16_t aux16[2];
1623+
const uint8_t * s = (const uint8_t *)aux16;
1624+
1625+
const uint16_t * a = (const uint16_t *)bq4_K->scales;
1626+
aux16[0] = a[0] & 0x0f0f;
1627+
aux16[1] = (a[0] >> 4) & 0x0f0f;
1628+
1629+
const float dall = bq4_K->d[0];
1630+
const float dmin = bq4_K->d[1];
1631+
1632+
const float d8_1 = bq8_1[0].d;
1633+
const float d8_2 = bq8_1[1].d;
1634+
1635+
const int ui1 = *((const int *)bq8_1[0].qs + iqs);
1636+
const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
1637+
const int ui3 = *((const int *)bq8_1[1].qs + iqs);
1638+
const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
1639+
1640+
const int * q4 = (const int *)bq4_K->qs + iqs;
1641+
const int v1 = q4[0];
1642+
const int v2 = q4[4];
1643+
1644+
const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
1645+
const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
1646+
const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
1647+
const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
1648+
1649+
sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
1650+
sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
1651+
1652+
return dall * sumf_d - dmin * sumf_m;
1653+
1654+
#endif
1655+
16171656
#else
16181657
return 0.0f; // only to satisfy the compiler
16191658
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -1625,6 +1664,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
16251664
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
16261665
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
16271666

1667+
#ifndef GGML_QKK_64
1668+
16281669
const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
16291670
const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
16301671
const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
@@ -1680,6 +1721,42 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
16801721
}
16811722

16821723
return d*sumf_d - dmin*sumf_m;
1724+
1725+
#else
1726+
1727+
const int8_t * s = bq5_K->scales;
1728+
1729+
const float d = bq5_K->d;
1730+
1731+
const float d8_1 = bq8_1[0].d;
1732+
const float d8_2 = bq8_1[1].d;
1733+
1734+
const int ui1 = *((const int *)bq8_1[0].qs + iqs);
1735+
const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
1736+
const int ui3 = *((const int *)bq8_1[1].qs + iqs);
1737+
const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
1738+
1739+
const int * ql = (const int *)bq5_K->qs + iqs;
1740+
const int vl1 = ql[0];
1741+
const int vl2 = ql[4];
1742+
1743+
const int step = 4 * iqs; // 0, 4, 8, 12
1744+
const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
1745+
const int in = step%8; // 0, 4, 0, 4
1746+
const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
1747+
1748+
const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
1749+
const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
1750+
const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
1751+
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
1752+
1753+
const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
1754+
+ d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
1755+
1756+
return d * sumf_d;
1757+
1758+
#endif
1759+
16831760
#else
16841761
return 0.0f; // only to satisfy the compiler
16851762
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A

0 commit comments

Comments (0)