
Commit 5c6576e

q4_0c: quantize support
1 parent 63bead4 commit 5c6576e

File tree

5 files changed: +50 -9 lines changed

examples/quantize/quantize.cpp

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ int main(int argc, char ** argv) {
    if (argc < 4) {
        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
        fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
+       fprintf(stderr, " type = %d - q4_0c\n", LLAMA_FTYPE_MOSTLY_Q4_0C);
        fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
        fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
        fprintf(stderr, " type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);

ggml.c

Lines changed: 37 additions & 4 deletions
@@ -627,11 +627,17 @@ static_assert(sizeof(block_q8_0) == 2*sizeof(float) + QK8_0, "wrong q8_0 block s

 #define QK4_0C (4*32)
 #define QK4_0C_MUL (QK4_0C / QK4_0)
-// TODO: nicer description - pseudostruct?
-// q4_0c : (uint8_t[QK4_0C/2]) qs[nb] || float d[n]
+#define Q4_0C_QSIZE (QK4_0C/2 + 4*sizeof(float))
+// typedef struct {
+//     uint8_t qs[QK4_0C/2][nb];
+//     float   d[nb];
+// } block_q4_0c

 #define QK8_0C 32
-// q8_0c : uint8_t qs[n] || float d[n]
+// typedef struct {
+//     uint8_t qs[QK8_0C][nb];
+//     float   d[nb];
+// } block_q8_0c

 // reference implementation for deterministic creation of model files
 static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
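For context: the pseudostruct comments describe a layout where all packed nibbles of a row come first and the per-32-element scales follow, at the same density as q4_0 (Q4_0C_QSIZE = 80 bytes per 128 values, i.e. 20 bytes per 32 weights); the histogram loop in ggml_quantize_q4_0c further down reads exactly the first k/2 bytes of a row as nibbles, which is consistent with that reading. A minimal addressing sketch under this assumption; the helper names are illustrative, not part of the commit:

    /* Sketch only: byte bookkeeping for one q4_0c row of k floats (k % QK4_0C == 0),
       assuming the layout implied by the comments above: k/2 nibble bytes first,
       then k/QK4_0 float scales. Helper names are illustrative. */
    #include <stddef.h>
    #include <stdint.h>

    #define QK4_0        32
    #define QK4_0C       (4*32)
    #define Q4_0C_QSIZE  (QK4_0C/2 + 4*sizeof(float))   /* 64 + 16 = 80 bytes per 128 values */

    /* Bytes occupied by one quantized row: same density as q4_0 (20 bytes / 32 values). */
    static size_t q4_0c_row_size(int k) {
        return (size_t)(k / QK4_0C) * Q4_0C_QSIZE;
    }

    /* Nibbles come first ... */
    static uint8_t * q4_0c_row_qs(uint8_t * row)        { return row; }
    /* ... and the per-32-element scales follow after the k/2 quant bytes. */
    static float   * q4_0c_row_d (uint8_t * row, int k) { return (float *)(row + k/2); }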
@@ -12325,6 +12331,27 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK4_0*sizeof(block_q4_0));
 }

+size_t ggml_quantize_q4_0c(const float * src, void * dst, int n, int k, int64_t * hist) {
+    assert(k % QK4_0C == 0);
+    const int nb = k / QK4_0;
+
+    for (int j = 0; j < n; j += k) {
+        uint8_t * restrict y = (uint8_t *)dst + sizeof(block_q4_0)*j/QK4_0;
+
+        quantize_row_q4_0c_reference(src + j, y, k);
+
+        for (int i = 0; i < nb*QK4_0/2; i++) {
+            const uint8_t vi0 = y[i] & 0xF;
+            const uint8_t vi1 = y[i] >> 4;
+
+            hist[vi0]++;
+            hist[vi1]++;
+        }
+    }
+
+    return (n/QK4_0*sizeof(block_q4_0));
+}
+
 size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK4_1 == 0);
     const int nb = k / QK4_1;
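The new entry point mirrors ggml_quantize_q4_0: n is the total number of elements, k the row width, and hist a 16-bin histogram filled from the emitted nibbles. A hedged usage sketch; the 4096-wide row and the buffer sizing are illustrative:

    /* Sketch: quantizing a single row with the new entry point. sizeof(block_q4_0)
       is taken as 20 bytes (float scale + 16 nibble bytes), matching the
       static_assert in ggml.c; the row width is illustrative. */
    #include <stdint.h>
    #include <stdlib.h>

    size_t ggml_quantize_q4_0c(const float * src, void * dst, int n, int k, int64_t * hist);

    static size_t quantize_one_row(const float * row) {
        enum { K = 4096 };                            /* must be a multiple of QK4_0C (128) */
        int64_t hist[16] = {0};                       /* one bin per 4-bit quant value      */
        uint8_t * buf = malloc((size_t)K / 32 * 20);  /* 20 bytes per 32 elements           */
        size_t written = ggml_quantize_q4_0c(row, buf, /*n=*/K, /*k=*/K, hist);
        free(buf);
        return written;                               /* == K/32 * 20 */
    }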
@@ -12395,7 +12422,7 @@ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK4_3*sizeof(block_q4_3));
 }

-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int k, int64_t * hist) {
     size_t result = 0;
     switch (type) {
         case GGML_TYPE_Q4_0:
@@ -12404,6 +12431,12 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                 block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
                 result = ggml_quantize_q4_0(src + start, block, n, n, hist);
             } break;
+        case GGML_TYPE_Q4_0C:
+            {
+                GGML_ASSERT(start % QK4_0C == 0);
+                uint8_t * dst_off = (uint8_t *) dst + Q4_0C_QSIZE * start / QK4_0C;
+                result = ggml_quantize_q4_0c(src + start, dst_off, n, k, hist);
+            } break;
         case GGML_TYPE_Q4_1:
            {
                GGML_ASSERT(start % QK4_1 == 0);
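The Q4_0C branch scales the element offset directly into a byte offset, so a chunk boundary has to coincide with a row boundary for the quants-then-scales regions of earlier rows to be fully packed before it; the chunk-size rounding added in llama.cpp (below) guarantees that. A worked example of the offset arithmetic, with an illustrative row width:

    /* Sketch: destination offset for a chunk that starts at the second row of a
       4096-wide tensor, using the same formula as the GGML_TYPE_Q4_0C case above. */
    #include <assert.h>
    #include <stddef.h>

    #define QK4_0C      (4*32)
    #define Q4_0C_QSIZE (QK4_0C/2 + 4*sizeof(float))          /* 80 bytes per 128 values */

    static size_t q4_0c_chunk_offset(void) {
        const int start  = 4096;                              /* chunk begins at row 1 of a 4096-wide tensor */
        const size_t off = Q4_0C_QSIZE * start / QK4_0C;      /* 80 * 4096 / 128 = 2560 bytes               */
        assert(start % QK4_0C == 0);                          /* mirrors the GGML_ASSERT above               */
        return off;                                           /* exactly one packed row                      */
    }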

ggml.h

Lines changed: 2 additions & 1 deletion
@@ -812,11 +812,12 @@ enum ggml_opt_result ggml_opt(
 //

 size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_0c(const float * src, void * dst, int n, int k, int64_t * hist);
 size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
 size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
 size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);

-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int k, int64_t * hist);

 //
 // system info

llama.cpp

Lines changed: 9 additions & 4 deletions
@@ -480,6 +480,7 @@ struct llama_file_loader {
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
                case GGML_TYPE_Q4_0:
+               case GGML_TYPE_Q4_0C:
                case GGML_TYPE_Q4_1:
                case GGML_TYPE_Q4_2:
                case GGML_TYPE_Q4_3:
@@ -554,6 +555,7 @@ struct llama_file_saver {
            case GGML_TYPE_F32:
            case GGML_TYPE_F16:
            case GGML_TYPE_Q4_0:
+           case GGML_TYPE_Q4_0C:
            case GGML_TYPE_Q4_1:
            case GGML_TYPE_Q4_2:
            case GGML_TYPE_Q4_3:
@@ -842,6 +844,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
        case LLAMA_FTYPE_ALL_F32: return "all F32";
        case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
        case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
+       case LLAMA_FTYPE_MOSTLY_Q4_0C: return "mostly Q4_0C";
        case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
        case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                      return "mostly Q4_1, some F16";
@@ -1579,6 +1582,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    ggml_type quantized_type;
    switch (ftype) {
        case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
+       case LLAMA_FTYPE_MOSTLY_Q4_0C: quantized_type = GGML_TYPE_Q4_0C; break;
        case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
        case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
@@ -1658,15 +1662,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            new_data = work.addr;
            std::vector<int64_t> hist_cur(1 << 4, 0);

-           int chunk_size = 32 * 512;
+           int row_size = tensor.ne.at(0);
+           int chunk_size = ceil(32 * 512 * 1.0 / row_size) * row_size;
            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
            if (nthread_use < 2) {
-               new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+               new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, row_size, hist_cur.data());
            } else {
                size_t counter = 0;
                new_size = 0;
-               auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+               auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size, row_size] () {
                    std::vector<int64_t> local_hist;
                    size_t local_size = 0;
                    while (true) {
@@ -1682,7 +1687,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                        lock.unlock();
                        size_t last = std::min(nelements, first + chunk_size);
                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
-                       local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                       local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, row_size, local_hist.data());
                    }
                };
                if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
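The chunk-size change above rounds the previous fixed value of 32 * 512 up to a whole number of rows, so every chunk handed to ggml_quantize_chunk starts on a row boundary, which the Q4_0C offset arithmetic in ggml.c relies on. An equivalent integer-only sketch of that rounding, with illustrative row widths:

    /* Sketch: integer form of the ceil() expression above. Every chunk covers a
       whole number of rows, so row-contiguous formats such as q4_0c never
       straddle a chunk boundary. */
    static int chunk_size_for(int row_size) {
        const int target = 32 * 512;                             /* previous fixed chunk size */
        const int rows   = (target + row_size - 1) / row_size;   /* ceil(target / row_size)   */
        return rows * row_size;
    }
    /* e.g. row_size 4096  -> 4 rows -> chunk_size 16384
            row_size 11008 -> 2 rows -> chunk_size 22016 */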

llama.h

Lines changed: 1 addition & 0 deletions
@@ -74,6 +74,7 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
        LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+       LLAMA_FTYPE_MOSTLY_Q4_0C = 7, // except 1d tensors
    };

    LLAMA_API struct llama_context_params llama_context_default_params();
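With the new ftype value in place, the format can be requested as type 7 on the quantize example's command line, or programmatically as sketched below. The file names are illustrative, and the call assumes the llama_model_quantize(fname_inp, fname_out, ftype, nthread) entry point present on this branch:

    /* Sketch: requesting the q4_0c format from application code. File names are
       illustrative; nthread follows the [nthread] argument of the quantize example. */
    #include "llama.h"

    static int quantize_to_q4_0c(void) {
        return llama_model_quantize("ggml-model-f32.bin",
                                    "ggml-model-q4_0c.bin",
                                    LLAMA_FTYPE_MOSTLY_Q4_0C,
                                    /*nthread=*/4);
    }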
