Commit e0d0a0f

fix q8_0 for model with n_embd_head % 32 != 0
1 parent 91523fb commit e0d0a0f

4 files changed: +126 -66 lines
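
Summary of the change: when the source row length (e.g. n_embd_head = 80) is not a multiple of the q8_0 block size QK8_0 = 32, each row is now zero-padded up to the next block boundary before quantization, and the destination strides are derived from the padded length. Below is a minimal standalone sketch of the rounding expression used throughout the diff; the helper function and main are illustrative, not part of the commit.

    #include <stdint.h>
    #include <stdio.h>

    // Illustrative only: the ne00_padded expression used in ggml.c and
    // ggml-cuda.cu, rounding a row length up to the next multiple of the
    // destination type's block size.
    static int64_t round_up_to_block(int64_t ne00, int64_t blck_size) {
        return ((ne00 + blck_size - 1) / blck_size) * blck_size;
    }

    int main(void) {
        // e.g. n_embd_head = 80 is not a multiple of QK8_0 = 32
        printf("%lld\n", (long long) round_up_to_block(80, 32)); // prints 96
        return 0;
    }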

ggml-cuda.cu

Lines changed: 36 additions & 16 deletions
@@ -4044,7 +4044,7 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
     cpy_1(cx + x_offset, cdst + dst_offset);
 }
 
-template <bool first_incomplete, bool last_incomplete>
+template <bool first_incomplete, bool last_incomplete, bool save_unquantized>
 static __global__ void cpy_f32_q8_0(
     const char * cx, char * cdst, const int i_blck_0, const int ne00, const int ne01, const int ne02,
     const int nb00, const int nb01, const int nb02, const int nb11, const int nb12) {
@@ -4075,7 +4075,7 @@ static __global__ void cpy_f32_q8_0(
         val = *((float *) src);
     }
 
-    if (last_incomplete && i0 / QK8_0 == (i_blck_0 + ne00) / QK8_0) {
+    if (save_unquantized && last_incomplete && i0 / QK8_0 == (i_blck_0 + ne00) / QK8_0) {
         memcpy(&dst[1 + iqs/8].qs[sizeof(float) * (iqs % 8)], src, sizeof(float));
     }
 
@@ -5114,7 +5114,7 @@ static void ggml_cpy_f32_f16_cuda(
 
 static void ggml_cpy_f32_q8_0_cuda(
     const char * cx, char * cdst, const int i_blck_0, const int ne00, const int ne01, const int ne02,
-    const int nb00, const int nb01, const int nb02, const int nb11, const int nb12, cudaStream_t stream) {
+    const int nb00, const int nb01, const int nb02, const int nb11, const int nb12, const bool pad, cudaStream_t stream) {
 
     const int num_blocks_x = (i_blck_0 + ne00 + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(num_blocks_x, ne01, ne02);
@@ -5125,17 +5125,27 @@ static void ggml_cpy_f32_q8_0_cuda(
 
     if (first_incomplete && last_incomplete) {
         GGML_ASSERT(i_blck_0 + ne00 < QK8_0); // otherwise there would be a race condition
-        cpy_f32_q8_0<true, true><<<block_nums, block_dims, 0, stream>>>
+        GGML_ASSERT(pad == false);
+        cpy_f32_q8_0<true, true, false><<<block_nums, block_dims, 0, stream>>>
             (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12);
     } else if (first_incomplete && !last_incomplete) {
-        cpy_f32_q8_0<true, false><<<block_nums, block_dims, 0, stream>>>
+        GGML_ASSERT(pad == false);
+        cpy_f32_q8_0<true, false, false><<<block_nums, block_dims, 0, stream>>>
            (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12);
-    } else if (!first_incomplete && last_incomplete) {
-        cpy_f32_q8_0<false, true><<<block_nums, block_dims, 0, stream>>>
+    } else if (!first_incomplete && last_incomplete && pad) {
+        cpy_f32_q8_0<false, true, false><<<block_nums, block_dims, 0, stream>>>
            (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12);
-    } else if (!first_incomplete && !last_incomplete) {
-        cpy_f32_q8_0<false, false><<<block_nums, block_dims, 0, stream>>>
+    } else if (!first_incomplete && last_incomplete && !pad) {
+        cpy_f32_q8_0<false, true, true><<<block_nums, block_dims, 0, stream>>>
            (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12);
+    } else if (!first_incomplete && !last_incomplete && pad) {
+        cpy_f32_q8_0<false, false, true><<<block_nums, block_dims, 0, stream>>>
+            (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12);
+    } else if (!first_incomplete && !last_incomplete && !pad) {
+        cpy_f32_q8_0<false, false, true><<<block_nums, block_dims, 0, stream>>>
+            (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12);
+    } else {
+        GGML_ASSERT(false);
     }
 }
 
@@ -6626,9 +6636,6 @@ void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_te
 }
 
 void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const int64_t ne = ggml_nelements(src0);
-    GGML_ASSERT(ne == ggml_nelements(src1));
-
     GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
     GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
 
@@ -6652,6 +6659,16 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     const int64_t nb11 = src1->nb[1];
     const int64_t nb12 = src1->nb[2];
 
+    const int64_t blck_size = ggml_blck_size(src1->type);
+    const int64_t ne00_padded = ((ne00 + blck_size - 1) / blck_size) * blck_size;
+    const int64_t ne = ggml_nelements(src0);
+    const bool pad = dst->op_params[0] & 1;
+    if (pad) {
+        GGML_ASSERT(ne00_padded * ggml_nrows(src0) == ggml_nelements(src1));
+    } else {
+        GGML_ASSERT(ne == ggml_nelements(src1));
+    }
+
     CUDA_CHECK(cudaSetDevice(g_main_device));
     cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
@@ -6670,16 +6687,19 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
         GGML_ASSERT(nb10 == sizeof(block_q8_0));
 
-        const size_t * op_params = (const size_t *) src1->op_params;
-        const size_t i_blck_0 = op_params[1];
+        size_t i_blck_0 = 0;
+        if (src1->op == GGML_OP_VIEW) {
+            const size_t * op_params = (const size_t *) src1->op_params;
+            i_blck_0 = op_params[1];
+        }
 
         if (ggml_is_contiguous(src1)) {
             ggml_cpy_f32_q8_0_cuda(
                 src0_ddc, src1_ddc, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02,
-                ne00*sizeof(block_q8_0)/QK8_0, ne00*ne01*sizeof(block_q8_0)/QK8_0, cudaStream_main);
+                ne00_padded*sizeof(block_q8_0)/QK8_0, ne00_padded*ne01*sizeof(block_q8_0)/QK8_0, pad, cudaStream_main);
         } else {
            ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, i_blck_0, ne00, ne01, ne02,
-                nb00, nb01, nb02, nb11, nb12, cudaStream_main);
+                nb00, nb01, nb02, nb11, nb12, pad, cudaStream_main);
         }
 
     } else {
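
On the contiguous GPU path the destination row stride passed to ggml_cpy_f32_q8_0_cuda is now derived from ne00_padded rather than ne00. A hedged sketch of that arithmetic; the block_q8_0 layout is written out here for illustration and assumed to match ggml's scale-plus-32-bytes format.

    #include <stdint.h>
    #include <stdio.h>

    // Assumed q8_0 block layout (fp16 scale + 32 quantized bytes); this
    // mirrors ggml's block_q8_0 for illustration only.
    #define QK8_0 32
    typedef struct {
        uint16_t d;          // fp16 scale
        int8_t   qs[QK8_0];  // 32 quantized values
    } block_q8_0;

    int main(void) {
        const int64_t ne00        = 80;                                   // row length (n_embd_head)
        const int64_t ne00_padded = ((ne00 + QK8_0 - 1) / QK8_0) * QK8_0; // 96
        // Destination row stride as passed on the contiguous path above:
        // 3 whole q8_0 blocks per row instead of a fractional block count.
        const int64_t nb11 = ne00_padded * sizeof(block_q8_0) / QK8_0;
        printf("ne00_padded=%lld nb11=%lld\n", (long long) ne00_padded, (long long) nb11);
        return 0;
    }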

ggml.c

Lines changed: 39 additions & 8 deletions
@@ -6312,8 +6312,15 @@ static struct ggml_tensor * ggml_cpy_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
+        const bool inplace,
+        const bool pad) {
+    if (pad) {
+        const int64_t blck_size = ggml_blck_size(b->type);
+        const int64_t ne00_padded = ((a->ne[0] + blck_size - 1) / blck_size) * blck_size;
+        GGML_ASSERT(ne00_padded*ggml_nrows(a) == ggml_nelements(b));
+    } else {
+        GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
+    }
 
     bool is_node = false;
 
@@ -6329,6 +6336,8 @@ static struct ggml_tensor * ggml_cpy_impl(
         ggml_format_name(result, "%s (copy)", a->name);
     }
 
+    ggml_set_op_params_i32(result, 0, pad ? 1 : 0);
+
     result->op = GGML_OP_CPY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
@@ -6341,14 +6350,21 @@ struct ggml_tensor * ggml_cpy(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
-    return ggml_cpy_impl(ctx, a, b, false);
+    return ggml_cpy_impl(ctx, a, b, false, false);
 }
 
 struct ggml_tensor * ggml_cpy_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
-    return ggml_cpy_impl(ctx, a, b, true);
+    return ggml_cpy_impl(ctx, a, b, true, false);
+}
+
+struct ggml_tensor * ggml_cpy_pad(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b) {
+    return ggml_cpy_impl(ctx, a, b, false, true);
 }
 
 // ggml_cont
@@ -8233,6 +8249,8 @@ static void ggml_compute_forward_dup_f16(
 
     GGML_TENSOR_UNARY_OP_LOCALS;
 
+    GGML_ASSERT(dst->op_params[0] == 0);
+
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
@@ -8496,14 +8514,21 @@ static void ggml_compute_forward_dup_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
     GGML_TENSOR_UNARY_OP_LOCALS;
 
+    const bool pad = dst->op_params[0] & 1;
+    const int blck_size = ggml_blck_size(dst->type);
+    const int ne00_padded = ((ne00 + blck_size - 1) / blck_size) * blck_size;
+    if (pad) {
+        GGML_ASSERT(ggml_nelements(dst) == ne00_padded*ggml_nrows(src0));
+    } else {
+        GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+    }
+
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
@@ -8561,15 +8586,20 @@ static void ggml_compute_forward_dup_f32(
                 ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
 
                 size_t id = 0;
-                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
+                const size_t rs = nb0 * ne00_padded / blck_size;
                 char * dst_ptr = (char *) dst->data;
+                float src0_padded[ne00_padded];
 
                 for (int i03 = 0; i03 < ne03; i03++) {
                     for (int i02 = 0; i02 < ne02; i02++) {
                         id += rs * ir0;
                         for (int i01 = ir0; i01 < ir1; i01++) {
                             const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                            quantize_row_q(src0_ptr, dst_ptr + id, ne00);
+                            if (ne00 != ne00_padded) {
+                                memcpy(src0_padded, src0_ptr, ne00*sizeof(float));
+                                memset(src0_padded + ne00, 0, (ne00_padded - ne00) * sizeof(float));
+                            }
+                            quantize_row_q(ne00 == ne00_padded ? src0_ptr : src0_padded, dst_ptr + id, ne00_padded);
                             id += rs;
                         }
                         id += rs * (ne01 - ir1);
@@ -8737,6 +8767,7 @@ static void ggml_compute_forward_dup_f32(
                 }
             }
         } else if (type_traits[dst->type].from_float) {
+            GGML_ASSERT(!pad);
             GGML_ASSERT(ne00 == ne0);
             GGML_ASSERT(ne01 == ne1);
             GGML_ASSERT(ne02 == ne2);
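
The CPU path in ggml_compute_forward_dup_f32 handles a short final block by copying the row into a scratch buffer, zero-filling the tail, and quantizing the full padded length; since q8_0 derives each block's scale from that block's maximum absolute value, the appended zeros leave the quantized real elements unchanged. A standalone sketch of that step, under the assumption that the callback has ggml's from_float signature (const float *, void *, int); names are illustrative, not the commit's code.

    #include <string.h>

    // Illustrative pad-then-quantize step.
    typedef void (*ggml_from_float_t)(const float * x, void * y, int k);

    static void quantize_padded_row(const float * src, int ne00, int ne00_padded,
                                    float * scratch, void * dst,
                                    ggml_from_float_t quantize_row_q) {
        const float * in = src;
        if (ne00 != ne00_padded) {
            memcpy(scratch, src, ne00 * sizeof(float));                       // real values
            memset(scratch + ne00, 0, (ne00_padded - ne00) * sizeof(float));  // zero tail
            in = scratch;
        }
        quantize_row_q(in, dst, ne00_padded);  // always quantize whole blocks
    }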

ggml.h

Lines changed: 6 additions & 0 deletions
@@ -1053,6 +1053,12 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    // a -> b, pad row size of a to a multiple of block size of b, return view(b)
+    GGML_API struct ggml_tensor * ggml_cpy_pad(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // make contiguous
     GGML_API struct ggml_tensor * ggml_cont(
             struct ggml_context * ctx,
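
A hypothetical usage sketch of the new API (not taken from the commit): the destination is allocated with the padded row length so each source row maps onto whole q8_0 blocks, and ggml_cpy_pad relaxes the element-count check accordingly. Here ctx is assumed to be an initialized ggml_context, and the tensor sizes are made up for illustration.

    const int64_t n_embd_head = 80;                              // not a multiple of QK8_0 = 32
    const int64_t n_padded    = ((n_embd_head + 31) / 32) * 32;  // 96
    const int64_t n_tokens    = 4;

    struct ggml_tensor * src = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,  n_embd_head, n_tokens);
    struct ggml_tensor * dst = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, n_padded,    n_tokens);

    // Each 80-element row of src is zero-padded to 96 values and quantized
    // into three q8_0 blocks per row of dst.
    struct ggml_tensor * cur = ggml_cpy_pad(ctx, src, dst);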
