Skip to content

Commit da030ed

Browse files
committed
Faster quantize
1 parent f31b6f4 commit da030ed

File tree

4 files changed: 70 additions (+70) and 42 deletions (-42)

examples/quantize/quantize.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,12 +154,23 @@ int main(int argc, char ** argv) {
154154
if (argc > arg_idx) {
155155
try {
156156
params.nthread = std::stoi(argv[arg_idx]);
157+
++arg_idx;
157158
}
158159
catch (const std::exception & e) {
159160
fprintf(stderr, "%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
160161
return 1;
161162
}
162163
}
164+
if (argc > arg_idx) {
165+
try {
166+
params.collect_histo = std::stoi(argv[arg_idx]);
167+
++arg_idx;
168+
}
169+
catch (const std::exception & e) {
170+
fprintf(stderr, "%s: invalid collect_histo '%s' (%s)\n", __func__, argv[arg_idx], e.what());
171+
return 1;
172+
}
173+
}
163174

164175
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
165176

ggml.c

Lines changed: 49 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -19366,13 +19366,15 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
1936619366

1936719367
quantize_row_q4_0_reference(src + b, y, k);
1936819368

19369-
for (int i = 0; i < nb; i++) {
19370-
for (int j = 0; j < QK4_0; j += 2) {
19371-
const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
19372-
const uint8_t vi1 = y[i].qs[j/2] >> 4;
19373-
19374-
hist[vi0]++;
19375-
hist[vi1]++;
19369+
if (hist) {
19370+
for (int i = 0; i < nb; i++) {
19371+
for (int j = 0; j < QK4_0; j += 2) {
19372+
const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
19373+
const uint8_t vi1 = y[i].qs[j/2] >> 4;
19374+
19375+
hist[vi0]++;
19376+
hist[vi1]++;
19377+
}
1937619378
}
1937719379
}
1937819380
}
@@ -19389,13 +19391,15 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
1938919391

1939019392
quantize_row_q4_1_reference(src + b, y, k);
1939119393

19392-
for (int i = 0; i < nb; i++) {
19393-
for (int j = 0; j < QK4_1; j += 2) {
19394-
const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
19395-
const uint8_t vi1 = y[i].qs[j/2] >> 4;
19394+
if (hist) {
19395+
for (int i = 0; i < nb; i++) {
19396+
for (int j = 0; j < QK4_1; j += 2) {
19397+
const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
19398+
const uint8_t vi1 = y[i].qs[j/2] >> 4;
1939619399

19397-
hist[vi0]++;
19398-
hist[vi1]++;
19400+
hist[vi0]++;
19401+
hist[vi1]++;
19402+
}
1939919403
}
1940019404
}
1940119405
}
@@ -19412,20 +19416,22 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
1941219416

1941319417
quantize_row_q5_0_reference(src + b, y, k);
1941419418

19415-
for (int i = 0; i < nb; i++) {
19416-
uint32_t qh;
19417-
memcpy(&qh, &y[i].qh, sizeof(qh));
19419+
if (hist) {
19420+
for (int i = 0; i < nb; i++) {
19421+
uint32_t qh;
19422+
memcpy(&qh, &y[i].qh, sizeof(qh));
1941819423

19419-
for (int j = 0; j < QK5_0; j += 2) {
19420-
const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
19421-
const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
19424+
for (int j = 0; j < QK5_0; j += 2) {
19425+
const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
19426+
const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
1942219427

19423-
// cast to 16 bins
19424-
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
19425-
const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
19428+
// cast to 16 bins
19429+
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
19430+
const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
1942619431

19427-
hist[vi0]++;
19428-
hist[vi1]++;
19432+
hist[vi0]++;
19433+
hist[vi1]++;
19434+
}
1942919435
}
1943019436
}
1943119437
}
@@ -19442,20 +19448,22 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
1944219448

1944319449
quantize_row_q5_1_reference(src + b, y, k);
1944419450

19445-
for (int i = 0; i < nb; i++) {
19446-
uint32_t qh;
19447-
memcpy(&qh, &y[i].qh, sizeof(qh));
19451+
if (hist) {
19452+
for (int i = 0; i < nb; i++) {
19453+
uint32_t qh;
19454+
memcpy(&qh, &y[i].qh, sizeof(qh));
1944819455

19449-
for (int j = 0; j < QK5_1; j += 2) {
19450-
const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
19451-
const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
19456+
for (int j = 0; j < QK5_1; j += 2) {
19457+
const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
19458+
const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
1945219459

19453-
// cast to 16 bins
19454-
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
19455-
const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
19460+
// cast to 16 bins
19461+
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
19462+
const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
1945619463

19457-
hist[vi0]++;
19458-
hist[vi1]++;
19464+
hist[vi0]++;
19465+
hist[vi1]++;
19466+
}
1945919467
}
1946019468
}
1946119469
}
@@ -19472,11 +19480,13 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
1947219480

1947319481
quantize_row_q8_0_reference(src + b, y, k);
1947419482

19475-
for (int i = 0; i < nb; i++) {
19476-
for (int j = 0; j < QK8_0; ++j) {
19477-
const int8_t vi = y[i].qs[j];
19483+
if (hist) {
19484+
for (int i = 0; i < nb; i++) {
19485+
for (int j = 0; j < QK8_0; ++j) {
19486+
const int8_t vi = y[i].qs[j];
1947819487

19479-
hist[vi/16 + 8]++;
19488+
hist[vi/16 + 8]++;
19489+
}
1948019490
}
1948119491
}
1948219492
}

llama.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4726,6 +4726,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
47264726
}
47274727

47284728
int nthread = params->nthread;
4729+
bool collect_histo = params->collect_histo;
47294730

47304731
if (nthread <= 0) {
47314732
nthread = std::thread::hardware_concurrency();
@@ -4808,6 +4809,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
48084809
// placeholder for the meta data
48094810
::zeros(fout, meta_size);
48104811

4812+
std::vector<float> f32_conv_buf;
4813+
48114814
for (int i = 0; i < ml->n_tensors; ++i) {
48124815
struct ggml_tensor * tensor = ml->get_tensor_meta(i);
48134816

@@ -4947,7 +4950,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
49474950
const size_t nelements = ggml_nelements(tensor);
49484951

49494952
float * f32_data;
4950-
std::vector<float> f32_conv_buf;
49514953

49524954
if (tensor->type == GGML_TYPE_F32) {
49534955
f32_data = (float *) tensor->data;
@@ -4963,7 +4965,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
49634965

49644966
work.resize(nelements * 4); // upper bound on size
49654967
new_data = work.data();
4966-
std::vector<int64_t> hist_cur(1 << 4, 0);
4968+
std::vector<int64_t> hist_cur;
4969+
if (collect_histo) {
4970+
hist_cur.resize(1 << 4, 0);
4971+
}
49674972

49684973
static const int chunk_size = 32 * 512;
49694974
const int nchunk = (nelements + chunk_size - 1)/chunk_size;
@@ -4990,7 +4995,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
49904995
}
49914996
lock.unlock();
49924997
size_t last = std::min(nelements, first + chunk_size);
4993-
if (local_hist.empty()) {
4998+
if (local_hist.empty() && !hist_cur.empty()) {
49944999
local_hist.resize(hist_cur.size(), 0);
49955000
}
49965001
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
@@ -5379,6 +5384,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
53795384
/*.allow_requantize =*/ false,
53805385
/*.quantize_output_tensor =*/ true,
53815386
/*.only_copy =*/ false,
5387+
/*.collect_histo =*/ false,
53825388
};
53835389

53845390
return result;

llama.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ extern "C" {
165165
bool allow_requantize; // allow quantizing non-f32/f16 tensors
166166
bool quantize_output_tensor; // quantize output.weight
167167
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
168+
bool collect_histo; // collect quant histogram when quantizing?
168169
} llama_model_quantize_params;
169170

170171
// grammar types

0 commit comments

Comments
 (0)