@@ -1564,12 +1564,14 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;
 
-    // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
-    const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
-
     float sumf_d = 0.0f;
     float sumf_m = 0.0f;
 
+#ifndef GGML_QKK_64
+
+    // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
+    const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
+
     const float    d = bq4_K->d;
     const float dmin = bq4_K->dmin;
 
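A side note on the comment moved into the new branch: with the usual ggml definitions the offset works out exactly as stated. The following host-side check is only illustrative; QR4_K == 2 and QI8_1 == 8 are assumed values, not part of this patch.

#include <stdio.h>

// Hypothetical stand-ins for the ggml constants; assumed values, not taken from this patch.
#define QR4_K 2
#define QI8_1 8

int main(void) {
    // iqs is in 0...15; bq8_offset = QR4_K * (iqs / (QI8_1/2)) -> 0, 2, 4, 6
    for (int iqs = 0; iqs < 16; ++iqs) {
        const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
        printf("iqs=%2d  bq8_offset=%d\n", iqs, bq8_offset);
    }
    return 0;
}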
@@ -1614,6 +1616,43 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     }
 
     return d*sumf_d - dmin*sumf_m;
+
+#else
+
+    uint16_t aux16[2];
+    const uint8_t * s = (const uint8_t *)aux16;
+
+    const uint16_t * a = (const uint16_t *)bq4_K->scales;
+    aux16[0] = a[0] & 0x0f0f;
+    aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+    const float dall = bq4_K->d[0];
+    const float dmin = bq4_K->d[1];
+
+    const float d8_1 = bq8_1[0].d;
+    const float d8_2 = bq8_1[1].d;
+
+    const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+    const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+    const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+    const int * q4 = (const int *)bq4_K->qs + iqs;
+    const int v1 = q4[0];
+    const int v2 = q4[4];
+
+    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
+    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
+    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
+
+    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
+    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
+
+    return dall * sumf_d - dmin * sumf_m;
+
+#endif
+
 #else
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
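A short aside on the new GGML_QKK_64 branch: __dp4a(a, b, c) accumulates a per-byte signed 8-bit dot product, so dot3/dot4 with the 0x01010101 operand are just byte sums of the q8 data (the min-correction term), and the aux16 trick splits the two packed scale bytes into four nibble values s[0..3]. The host-side sketch below illustrates both pieces under the assumption of little-endian lane order; dp4a_ref and unpack_scales are made-up names for illustration, not ggml API.

#include <stdint.h>
#include <string.h>

// Host-side emulation of the signed __dp4a intrinsic: treats each 32-bit word
// as four int8 lanes (little-endian assumed) and returns c + sum(a_k * b_k).
static int dp4a_ref(int a, int b, int c) {
    int8_t pa[4], pb[4];
    memcpy(pa, &a, 4);
    memcpy(pb, &b, 4);
    for (int k = 0; k < 4; ++k) {
        c += (int) pa[k] * (int) pb[k];
    }
    return c;
}

// Mirrors the aux16 trick above: the two packed scale bytes hold four 4-bit
// values; the low nibbles become s[0], s[1] and the high nibbles s[2], s[3].
static void unpack_scales(const uint8_t scales[2], uint8_t s[4]) {
    uint16_t a;
    memcpy(&a, scales, 2);
    uint16_t aux16[2];
    aux16[0] = a & 0x0f0f;         // low nibble of each byte
    aux16[1] = (a >> 4) & 0x0f0f;  // high nibble of each byte
    memcpy(s, aux16, 4);
}

On the host, dot1 from the patch then reads dp4a_ref(ui2, v2 & 0x0f0f0f0f, dp4a_ref(ui1, v1 & 0x0f0f0f0f, 0)), and s[0..3] come from unpack_scales.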
@@ -1625,6 +1664,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_K * bq5_K = (const block_q5_K *) vbq;
 
+#ifndef GGML_QKK_64
+
     const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
     const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
     const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
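For the unchanged (non-GGML_QKK_64) q5_K path that this hunk wraps, the byte offsets into qs and qh can be visualized the same way; QR5_K == 2 and QI8_1 == 8 are again assumed values, not shown in the diff.

#include <stdio.h>

// Hypothetical stand-ins for the ggml constants; assumed values, not taken from this patch.
#define QR5_K 2
#define QI8_1 8

int main(void) {
    // Byte offsets used by the non-GGML_QKK_64 q5_K path above.
    for (int iqs = 0; iqs < 16; ++iqs) {
        const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
        const int ql_off = 16 * bq8_offset + 4 * (iqs%4); // offset into bq5_K->qs
        const int qh_off = 4 * (iqs%4);                   // offset into bq5_K->qh
        printf("iqs=%2d  bq8_offset=%d  ql_off=%3d  qh_off=%2d\n",
               iqs, bq8_offset, ql_off, qh_off);
    }
    return 0;
}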
@@ -1680,6 +1721,42 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     }
 
     return d*sumf_d - dmin*sumf_m;
+
+#else
+
+    const int8_t * s = bq5_K->scales;
+
+    const float d = bq5_K->d;
+
+    const float d8_1 = bq8_1[0].d;
+    const float d8_2 = bq8_1[1].d;
+
+    const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+    const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+    const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+    const int * ql = (const int *)bq5_K->qs + iqs;
+    const int vl1 = ql[0];
+    const int vl2 = ql[4];
+
+    const int step = 4 * iqs; // 0, 4, 8, 12
+    const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
+    const int in = step%8; // 0, 4, 0, 4
+    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
+
+    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
+    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
+    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
+    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
+
+    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
+                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
+
+    return d * sumf_d;
+
+#endif
+
 #else
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
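For the q5_K GGML_QKK_64 branch just added, the index math and the bit juggling can be sanity-checked on the host. The sketch below only illustrates mechanics; the example words are invented, and nothing here is taken from the patch beyond the expressions themselves.

#include <stdio.h>
#include <stdint.h>

int main(void) {
    // Index arithmetic from the GGML_QKK_64 q5_K branch above (iqs is 0...3 here).
    for (int iqs = 0; iqs < 4; ++iqs) {
        const int step = 4 * iqs; // 0, 4, 8, 12
        const int im = step/8;    // 0, 0, 1, 1
        const int in = step%8;    // 0, 4, 0, 4
        printf("iqs=%d  step=%2d  im=%d  in=%d\n", iqs, step, im, in);
    }

    // Mechanics of one reconstruction step, on arbitrary example words:
    // (vh << 4) & 0x10101010 moves bit 0 of every byte of vh into bit 4 of that
    // byte, ^ 0x10101010 inverts that bit, and the final OR merges it with the
    // low nibble taken from vl1.
    const uint32_t vh  = 0x01000100u; // example high-bit word (invented, not from the patch)
    const uint32_t vl1 = 0x3a2b1c0du; // example packed low nibbles (invented)
    const uint32_t v1  = (((vh << 4) & 0x10101010u) ^ 0x10101010u) | (vl1 & 0x0f0f0f0fu);
    printf("v1 = 0x%08x\n", v1);
    return 0;
}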