@@ -1303,6 +1303,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(

static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

    int vi;
@@ -1313,7 +1314,9 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
    return vec_dot_q4_0_q8_1_impl(vi, ui0, ui1, __half2float(bq4_0->d), __half2float(bq8_1->d));
}

-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

@@ -1340,6 +1343,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric

static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

@@ -1376,6 +1380,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(

static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

@@ -1411,6 +1416,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(

static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

@@ -1430,7 +1436,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
}

static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
@@ -1466,7 +1472,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
}

static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
@@ -1519,7 +1525,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
}

static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
@@ -1557,7 +1563,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
}

static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
@@ -1601,7 +1607,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
}

static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
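
Two recurring elements in the hunks above, noted here as an illustration rather than as part of the diff: every vec_dot_q*_q8_1 variant now shares the exact same two-line signature ending in `const int & iqs`, which (an assumption, since the rest of the file is not shown in these hunks) would allow the quantization-specific dot product to be selected through a single function pointer or template parameter in a shared matrix-vector kernel. The `#if __CUDA_ARCH__ >= MIN_CC_DP4A` guard refers to the `__dp4a` integer intrinsic, available from compute capability 6.1 onward. The stand-alone sketch below is a minimal demo of what that intrinsic computes; it is not taken from ggml-cuda.cu.

#include <cstdio>
#include <cuda_runtime.h>

// __dp4a(a, b, c) treats a and b as four packed signed 8-bit lanes and
// returns c + a0*b0 + a1*b1 + a2*b2 + a3*b3 in a single instruction.
__global__ void dp4a_demo(int a, int b, int * out) {
    *out = __dp4a(a, b, 0);
}

int main() {
    const int a = 0x04030201; // lanes 1, 2, 3, 4 (least significant byte first)
    const int b = 0x08070605; // lanes 5, 6, 7, 8
    int * d_out = nullptr;
    cudaMalloc(&d_out, sizeof(int));
    dp4a_demo<<<1, 1>>>(a, b, d_out);   // build with nvcc -arch=sm_61 or newer
    int h_out = 0;
    cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("dp4a result: %d\n", h_out); // 1*5 + 2*6 + 3*7 + 4*8 = 70
    cudaFree(d_out);
    return 0;
}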