@@ -162,7 +162,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
162
162
typedef void (*allocate_tiles_cuda_t )(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
163
163
typedef void (*load_tiles_cuda_t )(
164
164
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
165
- int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row);
165
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
166
166
typedef float (*vec_dot_q_mul_mat_cuda_t )(
167
167
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
168
168
const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
@@ -1406,7 +1406,7 @@ static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 **
1406
1406
1407
1407
static __device__ __forceinline__ void load_tiles_q4_0 (
1408
1408
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1409
- int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
1409
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1410
1410
1411
1411
__builtin_assume (i_offset >= 0 );
1412
1412
__builtin_assume (i_offset < 8 );
@@ -1420,7 +1420,7 @@ static __device__ __forceinline__ void load_tiles_q4_0(
1420
1420
1421
1421
#pragma unroll
1422
1422
for (int i0 = 0 ; i0 < GGML_CUDA_MMQ_Y; i0 += 8 ) {
1423
- const int i = i0 + i_offset;
1423
+ const int i = min ( i0 + i_offset, i_max) ;
1424
1424
1425
1425
const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
1426
1426
@@ -1515,7 +1515,7 @@ static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 **
1515
1515
1516
1516
static __device__ __forceinline__ void load_tiles_q4_1 (
1517
1517
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1518
- int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
1518
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1519
1519
1520
1520
__builtin_assume (i_offset >= 0 );
1521
1521
__builtin_assume (i_offset < 8 );
@@ -1619,7 +1619,7 @@ static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 **
1619
1619
1620
1620
static __device__ __forceinline__ void load_tiles_q5_0 (
1621
1621
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1622
- int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
1622
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1623
1623
1624
1624
__builtin_assume (i_offset >= 0 );
1625
1625
__builtin_assume (i_offset < 8 );
@@ -1735,7 +1735,7 @@ static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 **
1735
1735
1736
1736
static __device__ __forceinline__ void load_tiles_q5_1 (
1737
1737
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1738
- int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
1738
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1739
1739
1740
1740
__builtin_assume (i_offset >= 0 );
1741
1741
__builtin_assume (i_offset < 8 );
@@ -1826,7 +1826,7 @@ static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 **
1826
1826
1827
1827
static __device__ __forceinline__ void load_tiles_q8_0 (
1828
1828
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1829
- int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
1829
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1830
1830
1831
1831
__builtin_assume (i_offset >= 0 );
1832
1832
__builtin_assume (i_offset < 8 );
@@ -1949,7 +1949,7 @@ static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 **
1949
1949
1950
1950
static __device__ __forceinline__ void load_tiles_q2_K (
1951
1951
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1952
- int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
1952
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1953
1953
1954
1954
__builtin_assume (i_offset >= 0 );
1955
1955
__builtin_assume (i_offset < 8 );
@@ -2101,7 +2101,7 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 **
2101
2101
2102
2102
static __device__ __forceinline__ void load_tiles_q3_K (
2103
2103
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2104
- int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
2104
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2105
2105
2106
2106
__builtin_assume (i_offset >= 0 );
2107
2107
__builtin_assume (i_offset < 8 );
@@ -2322,7 +2322,7 @@ static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 **
2322
2322
2323
2323
static __device__ __forceinline__ void load_tiles_q4_K (
2324
2324
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2325
- int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
2325
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2326
2326
2327
2327
__builtin_assume (i_offset >= 0 );
2328
2328
__builtin_assume (i_offset < 8 );
@@ -2550,7 +2550,7 @@ static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 **
2550
2550
2551
2551
static __device__ __forceinline__ void load_tiles_q5_K (
2552
2552
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2553
- int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
2553
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2554
2554
2555
2555
__builtin_assume (i_offset >= 0 );
2556
2556
__builtin_assume (i_offset < 8 );
@@ -2719,7 +2719,7 @@ static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 **
2719
2719
2720
2720
static __device__ __forceinline__ void load_tiles_q6_K (
2721
2721
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2722
- int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
2722
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2723
2723
2724
2724
__builtin_assume (i_offset >= 0 );
2725
2725
__builtin_assume (i_offset < 8 );
@@ -2849,7 +2849,7 @@ static __global__ void mul_mat_q(
2849
2849
for (int ib0 = 0 ; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
2850
2850
2851
2851
load_tiles (x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
2852
- tid_y, tid_x, blocks_per_row_x);
2852
+ tid_y, nrows_x-row_x_0- 1 , tid_x, blocks_per_row_x);
2853
2853
2854
2854
for (int ir = 0 ; ir < qr; ++ir) {
2855
2855
const int kqs = ir*WARP_SIZE + tid_x;
0 commit comments