Skip to content

Commit 84c38ea

Browse files
Fixed k-quant kernels
1 parent 7fa2d80 commit 84c38ea

File tree

1 file changed: +12 -6 lines changed

ggml-cuda.cu

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1303,6 +1303,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
13031303

13041304
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
13051305
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1306+
13061307
const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
13071308

13081309
int vi;
@@ -1313,7 +1314,9 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
13131314
return vec_dot_q4_0_q8_1_impl(vi, ui0, ui1, __half2float(bq4_0->d), __half2float(bq8_1->d));
13141315
}
13151316

1316-
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1317+
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
1318+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1319+
13171320
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
13181321
const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
13191322

@@ -1340,6 +1343,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric
13401343

13411344
static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
13421345
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1346+
13431347
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
13441348
const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
13451349

@@ -1376,6 +1380,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
13761380

13771381
static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
13781382
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1383+
13791384
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
13801385
const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
13811386

@@ -1411,6 +1416,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
14111416

14121417
static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
14131418
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1419+
14141420
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
14151421
const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
14161422

@@ -1430,7 +1436,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
14301436
}
14311437

14321438
static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
1433-
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1439+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
14341440

14351441
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
14361442
const block_q2_K * bq2_K = (const block_q2_K *) vbq;
@@ -1466,7 +1472,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
14661472
}
14671473

14681474
static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
1469-
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1475+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
14701476

14711477
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
14721478
const block_q3_K * bq3_K = (const block_q3_K *) vbq;
@@ -1519,7 +1525,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
15191525
}
15201526

15211527
static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
1522-
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1528+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
15231529

15241530
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
15251531
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
@@ -1557,7 +1563,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
15571563
}
15581564

15591565
static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
1560-
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1566+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
15611567

15621568
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
15631569
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
@@ -1601,7 +1607,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
16011607
}
16021608

16031609
static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
1604-
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1610+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
16051611

16061612
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
16071613
const block_q6_K * bq6_K = (const block_q6_K *) vbq;

0 commit comments

Comments
 (0)