Commit 38de86a

ikawrakow and ggerganov authored
llama : multi-threaded quantization (#1075)
* Multi-threading quantization. Not much gain for simple quantizations, but it will be important for quantizations that require more CPU cycles.

* Multi-threading for quantize-stats. It now does the job in ~14 seconds on my Mac for Q4_0, Q4_1 and Q4_2. Single-threaded it was taking more than 2 minutes after adding the more elaborate version of Q4_2.

* Reviewer comments

* Avoiding compiler confusion. After changing chunk_size to const int as suggested by @ggerganov, clang and GCC started to warn that I don't need to capture it in the lambda, so I removed it from the capture list. But that makes the MSVC build fail, so it is now a constexpr to keep every compiler happy.

* Still fighting with lambda captures in MSVC

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
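The same threading pattern is used in both quantize-stats and llama.cpp below: a shared counter guarded by a mutex hands out fixed-size chunks (32*512 elements) to worker threads, each worker accumulates into thread-local state, and the local result is merged under the lock once the counter runs past the end. A minimal, self-contained sketch of that pattern (not the commit's code; process_chunk and the function names are placeholders standing in for the quantize/dequantize/error-stats work):

#include <algorithm>
#include <cstddef>
#include <mutex>
#include <thread>
#include <vector>

// Placeholder for the per-chunk work (in the commit: quantize, dequantize, update error stats).
static double process_chunk(const std::vector<float> & data, size_t first, size_t last) {
    double sum = 0.0;
    for (size_t i = first; i < last; ++i) sum += data[i];
    return sum;
}

static double process_parallel(const std::vector<float> & data, int max_thread) {
    const size_t chunk_size = 32 * 512;
    const size_t n = data.size();
    const int num_chunks = (int)((n + chunk_size - 1) / chunk_size);
    const int nthread = std::max(1, std::min(max_thread, num_chunks));

    std::mutex mutex;
    size_t counter = 0;   // next chunk offset, guarded by mutex
    double total = 0.0;   // merged result, guarded by mutex

    auto compute = [&]() {
        double local = 0.0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            size_t first = counter;
            counter += chunk_size;
            if (first >= n) {
                total += local;   // merge the thread-local result once, under the lock
                break;
            }
            lock.unlock();
            size_t last = std::min(n, first + chunk_size);
            local += process_chunk(data, first, last);
        }
    };

    std::vector<std::thread> workers(nthread - 1);
    for (auto & w : workers) w = std::thread(compute);
    compute();                    // the calling thread works too
    for (auto & w : workers) w.join();
    return total;
}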
1 parent e0305ea commit 38de86a

6 files changed, +182 -60 lines changed

examples/quantize-stats/quantize-stats.cpp

Lines changed: 101 additions & 34 deletions

@@ -15,6 +15,8 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include <thread>
+#include <mutex>
 
 struct quantize_stats_params {
     std::string model = "models/7B/ggml-model-f16.bin";
@@ -27,7 +29,6 @@ struct quantize_stats_params {
     std::vector<enum ggml_type> include_types;
 };
 
-const int64_t SCRATCH_ELEMENTS = 32*32;
 const size_t HISTOGRAM_BUCKETS = 150;
 const double HISTOGRAM_RANGE = 0.03;
 
@@ -90,6 +91,13 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou
     stats.num_samples += nelements;
 }
 
+void combine_error_stats(error_stats & into, const error_stats & from) {
+    into.num_samples += from.num_samples;
+    into.total_error += from.total_error;
+    if (from.max_error > into.max_error) into.max_error = from.max_error;
+    for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
+}
+
 double find_quantile(const error_stats & stats, double quantile) {
     double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
 
@@ -130,47 +138,98 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
+void test_roundtrip_on_chunk(
+        const ggml_tensor * layer,
+        int64_t offset,
+        int64_t chunk_size,
+        const quantize_fns_t & qfns,
+        bool use_reference,
+        float * input_scratch,
+        char * quantized_scratch,
+        float * output_scratch,
+        error_stats & stats) {
+
+    if (layer->type == GGML_TYPE_F16) {
+        for (int i = 0; i < chunk_size; i++) {
+            input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+        }
+    } else {
+        input_scratch = ggml_get_data_f32(layer) + offset;
+    }
+
+    if (use_reference) {
+        qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+    } else {
+        qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+    }
+    qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
+
+    update_error_stats(chunk_size, input_scratch, output_scratch, stats);
+}
+
 // Run quantization function for a single layer and update error stats
 void test_roundtrip_on_layer(
         std::string & name,
         bool print_layer_stats,
         const quantize_fns_t & qfns,
         bool use_reference,
         const ggml_tensor * layer,
-        float * input_scratch,
-        char *quantized_scratch,
-        float * output_scratch,
-        error_stats & total_error) {
+        std::vector<float> & input_scratch,
+        std::vector<char> & quantized_scratch,
+        std::vector<float> & output_scratch,
+        error_stats & total_error,
+        int max_thread = 0) {
 
     assert(tensor_is_contiguous(layer));
     error_stats layer_error {};
-    int64_t nelements = ggml_nelements(layer);
-
-    for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) {
-        int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset);
+    uint64_t nelements = ggml_nelements(layer);
 
-        if (layer->type == GGML_TYPE_F16) {
-            for (int i = 0; i < chunk_size; i++) {
-                input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+    float* input_scratch_ptr = nullptr;
+    if (layer->type == GGML_TYPE_F16) {
+        if (input_scratch.size() < nelements) input_scratch.resize(nelements);
+        input_scratch_ptr = input_scratch.data();
+    }
+    if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
+    if (output_scratch.size() < nelements) output_scratch.resize(nelements);
+
+    if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
+    int chunk_size = 32*512;
+    int num_chunks = (nelements + chunk_size - 1)/chunk_size;
+
+    if (num_chunks < 2 || max_thread < 2) {
+        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+                output_scratch.data(), print_layer_stats ? layer_error : total_error);
+    } else {
+        auto & stats = print_layer_stats ? layer_error : total_error;
+        std::mutex mutex;
+        uint64_t counter = 0;
+        auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+             &quantized_scratch, &output_scratch, chunk_size] () {
+            error_stats local_stats {};
+            while (true) {
+                std::unique_lock<std::mutex> lock(mutex);
+                uint64_t offset = counter; counter += chunk_size;
+                if (offset >= nelements) {
+                    combine_error_stats(stats, local_stats);
+                    break;
+                }
+                lock.unlock();
+                uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
+                test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+                        quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
             }
-        } else {
-            input_scratch = ggml_get_data_f32(layer) + offset;
-        }
-
-        if (use_reference) {
-            qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
-        } else {
-            qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
-        }
-        qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
-
-        update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
-        if (print_layer_stats) {
-            update_error_stats(chunk_size, input_scratch, output_scratch, layer_error);
-        }
+        };
+        int nthread = std::min(num_chunks, max_thread);
+        std::vector<std::thread> workers(nthread-1);
+        for (auto& w : workers) w = std::thread(compute);
+        compute();
+        for (auto& w : workers) w.join();
    }
+
    if (print_layer_stats) {
        print_error_stats(name, layer_error, false);
+        combine_error_stats(total_error, layer_error);
    }
 }
 
@@ -181,6 +240,7 @@ int main(int argc, char ** argv) {
 
     // read command line
 
+    int max_thread = 0;
     bool invalid_param = false;
     std::string arg;
     for (int i = 1; i < argc; i++) {
@@ -230,6 +290,12 @@ int main(int argc, char ** argv) {
                 fprintf(stderr, "error: %s not in list of types\n", argv[i]);
                 invalid_param = true;
             }
+        } else if (arg == "-n" || arg == "--num-threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            max_thread = atoi(argv[i]);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             quantize_stats_print_usage(argc, argv);
@@ -295,9 +361,9 @@ int main(int argc, char ** argv) {
     }
     printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
     // allocate scratch space
-    std::vector<float> input_scratch(SCRATCH_ELEMENTS);
-    std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
-    std::vector<float> output_scratch(SCRATCH_ELEMENTS);
+    std::vector<float> input_scratch;
+    std::vector<char> quantized_scratch;
+    std::vector<float> output_scratch;
 
     // loop throught quantization types
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
@@ -328,10 +394,11 @@ int main(int argc, char ** argv) {
                    qfns,
                    params.reference,
                    kv_tensor.second,
-                    input_scratch.data(),
-                    quantized_scratch.data(),
-                    output_scratch.data(),
-                    global_stats
+                    input_scratch,
+                    quantized_scratch,
+                    output_scratch,
+                    global_stats,
+                    max_thread
            );
        }
 

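As a usage note for the change above: quantize-stats now accepts -n / --num-threads, e.g. ./quantize-stats --num-threads 8 (the binary name here assumes the example's default build output; the model falls back to the default models/7B/ggml-model-f16.bin shown in quantize_stats_params). Leaving the flag out keeps max_thread at 0, so test_roundtrip_on_layer falls back to std::thread::hardware_concurrency().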
examples/quantize/quantize.cpp

Lines changed: 4 additions & 3 deletions

@@ -10,8 +10,8 @@
 int main(int argc, char ** argv) {
     ggml_time_init();
 
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
+    if (argc < 4) {
+        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
         fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
         fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
         fprintf(stderr, "  type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
@@ -30,6 +30,7 @@ int main(int argc, char ** argv) {
     const std::string fname_out = argv[2];
 
     const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
+    int nthread = argc > 4 ? atoi(argv[4]) : 0;
 
     const int64_t t_main_start_us = ggml_time_us();
 
@@ -39,7 +40,7 @@
     {
         const int64_t t_start_us = ggml_time_us();
 
-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) {
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }

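Per the updated usage string, the quantize example now accepts an optional trailing thread count. A sketch of an invocation (model paths are placeholders; type 2 assumes the value the usage message prints for LLAMA_FTYPE_MOSTLY_Q4_0):

./quantize models/7B/ggml-model-f16.bin models/7B/ggml-model-q4_0.bin 2 8

Omitting the last argument leaves nthread at 0, which llama_model_quantize_internal (below) maps to std::thread::hardware_concurrency().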
ggml.c

Lines changed: 27 additions & 0 deletions

@@ -12189,6 +12189,33 @@ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK4_3*sizeof(block_q4_3));
 }
 
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
+    size_t result = 0;
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            {
+                GGML_ASSERT(start % QK4_0 == 0);
+                block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
+                result = ggml_quantize_q4_0(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q4_1:
+            {
+                GGML_ASSERT(start % QK4_1 == 0);
+                block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
+                result = ggml_quantize_q4_1(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q4_2:
+            {
+                GGML_ASSERT(start % QK4_2 == 0);
+                block_q4_2 * block = (block_q4_2*)dst + start / QK4_2;
+                result = ggml_quantize_q4_2(src + start, block, n, n, hist);
+            } break;
+        default:
+            assert(false);
+    }
+    return result;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 int ggml_cpu_has_avx(void) {

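For reference, ggml_quantize_chunk quantizes a block-aligned slice [start, start + n) of a float buffer and returns the number of bytes written for that slice, so callers can split a tensor and simply sum the per-slice results. A hedged caller-side sketch (the two-way split and the helper name are made up for illustration; only ggml_quantize_chunk, GGML_TYPE_Q4_0 and the 16-bucket histogram convention come from the sources in this commit):

#include <cstdint>
#include <vector>
#include "ggml.h"

// Quantize an n-element float buffer to Q4_0 in two slices and sum the byte counts.
// Assumes n is a multiple of the Q4_0 block size (32), so both slices stay block-aligned.
size_t quantize_in_two_parts(const float * src, void * dst, int n) {
    std::vector<int64_t> hist(1 << 4, 0);   // shared histogram is fine single-threaded;
                                            // worker threads would each use a local one
    const int half = (n / 2 / 32) * 32;     // block-aligned split point
    size_t total = 0;
    total += ggml_quantize_chunk(GGML_TYPE_Q4_0, src, dst, 0,    half,     hist.data());
    total += ggml_quantize_chunk(GGML_TYPE_Q4_0, src, dst, half, n - half, hist.data());
    return total;                           // total compressed size in bytes
}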
ggml.h

Lines changed: 2 additions & 0 deletions

@@ -813,6 +813,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
 size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
 size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
 
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+
 //
 // system info
 //

llama.cpp

Lines changed: 45 additions & 22 deletions

@@ -24,6 +24,9 @@
 #include <memory>
 #include <algorithm>
 #include <initializer_list>
+#include <thread>
+#include <atomic>
+#include <mutex>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -1572,7 +1575,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
 // quantization
 //
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
@@ -1582,6 +1585,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         default: throw format("invalid output file type %d\n", ftype);
     };
 
+    if (nthread <= 0) {
+        nthread = std::thread::hardware_concurrency();
+    }
+
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
@@ -1590,6 +1597,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
 
+    std::vector<std::thread> workers;
+    std::mutex mutex;
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
@@ -1643,25 +1653,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.addr;
             std::vector<int64_t> hist_cur(1 << 4, 0);
 
-            switch (new_type) {
-                case GGML_TYPE_Q4_0:
-                    {
-                        new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_1:
-                    {
-                        new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_2:
-                    {
-                        new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_3:
-                    {
-                        new_size = ggml_quantize_q4_3(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                default:
-                    LLAMA_ASSERT(false);
+            int chunk_size = 32 * 512;
+            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+            if (nthread_use < 2) {
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+            } else {
+                size_t counter = 0;
+                new_size = 0;
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                    std::vector<int64_t> local_hist;
+                    size_t local_size = 0;
+                    while (true) {
+                        std::unique_lock<std::mutex> lock(mutex);
+                        size_t first = counter; counter += chunk_size;
+                        if (first >= nelements) {
+                            if (!local_hist.empty()) {
+                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                                new_size += local_size;
+                            }
+                            break;
+                        }
+                        lock.unlock();
+                        size_t last = std::min(nelements, first + chunk_size);
+                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                    }
+                };
+                if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+                compute();
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
             }
 
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -1783,9 +1805,10 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        enum llama_ftype ftype) {
+        enum llama_ftype ftype,
+        int nthread) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype);
+        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
         return 0;
     } catch (const std::string & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());

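On the public API side, llama_model_quantize now takes an nthread argument; per llama_model_quantize_internal above, any value <= 0 falls back to std::thread::hardware_concurrency(). A minimal caller sketch (the file names are placeholders):

#include <cstdio>
#include "llama.h"

int main() {
    // nthread = 0 lets the library pick hardware_concurrency();
    // a positive value caps the number of quantization worker threads.
    if (llama_model_quantize("ggml-model-f16.bin", "ggml-model-q4_0.bin",
                             LLAMA_FTYPE_MOSTLY_Q4_0, /*nthread =*/ 0)) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}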