 #include <string>
 #include <unordered_map>
 #include <vector>
+#include <thread>
+#include <mutex>
 
 struct quantize_stats_params {
     std::string model = "models/7B/ggml-model-f16.bin";
@@ -27,7 +29,6 @@ struct quantize_stats_params {
     std::vector<enum ggml_type> include_types;
 };
 
-const int64_t SCRATCH_ELEMENTS = 32*32;
 const size_t HISTOGRAM_BUCKETS = 150;
 const double HISTOGRAM_RANGE = 0.03;
 
@@ -90,6 +91,13 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou
     stats.num_samples += nelements;
 }
 
+void combine_error_stats(error_stats & into, const error_stats & from) {
+    into.num_samples += from.num_samples;
+    into.total_error += from.total_error;
+    if (from.max_error > into.max_error) into.max_error = from.max_error;
+    for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
+}
+
 double find_quantile(const error_stats & stats, double quantile) {
     double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
 
@@ -130,47 +138,98 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
+void test_roundtrip_on_chunk(
+        const ggml_tensor * layer,
+        int64_t offset,
+        int64_t chunk_size,
+        const quantize_fns_t & qfns,
+        bool use_reference,
+        float * input_scratch,
+        char * quantized_scratch,
+        float * output_scratch,
+        error_stats & stats) {
+
+    if (layer->type == GGML_TYPE_F16) {
+        for (int i = 0; i < chunk_size; i++) {
+            input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+        }
+    } else {
+        input_scratch = ggml_get_data_f32(layer) + offset;
+    }
+
+    if (use_reference) {
+        qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+    } else {
+        qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+    }
+    qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
+
+    update_error_stats(chunk_size, input_scratch, output_scratch, stats);
+}
+
+
 // Run quantization function for a single layer and update error stats
 void test_roundtrip_on_layer(
         std::string & name,
         bool print_layer_stats,
         const quantize_fns_t & qfns,
         bool use_reference,
         const ggml_tensor * layer,
-        float * input_scratch,
-        char *quantized_scratch,
-        float * output_scratch,
-        error_stats & total_error) {
+        std::vector<float> & input_scratch,
+        std::vector<char> & quantized_scratch,
+        std::vector<float> & output_scratch,
+        error_stats & total_error,
+        int max_thread = 0) {
 
     assert(tensor_is_contiguous(layer));
     error_stats layer_error {};
-    int64_t nelements = ggml_nelements(layer);
-
-    for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) {
-        int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset);
+    uint64_t nelements = ggml_nelements(layer);
 
-        if (layer->type == GGML_TYPE_F16) {
-            for (int i = 0; i < chunk_size; i++) {
-                input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+    float * input_scratch_ptr = nullptr;
+    if (layer->type == GGML_TYPE_F16) {
+        if (input_scratch.size() < nelements) input_scratch.resize(nelements);
+        input_scratch_ptr = input_scratch.data();
+    }
+    if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
+    if (output_scratch.size() < nelements) output_scratch.resize(nelements);
+
+    if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
+    int chunk_size = 32*512;
+    int num_chunks = (nelements + chunk_size - 1)/chunk_size;
+
+    if (num_chunks < 2 || max_thread < 2) {
+        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+                output_scratch.data(), print_layer_stats ? layer_error : total_error);
+    } else {
+        auto & stats = print_layer_stats ? layer_error : total_error;
+        std::mutex mutex;
+        uint64_t counter = 0;
+        auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+                &quantized_scratch, &output_scratch, chunk_size] () {
+            error_stats local_stats {};
+            while (true) {
+                std::unique_lock<std::mutex> lock(mutex);
+                uint64_t offset = counter; counter += chunk_size;
+                if (offset >= nelements) {
+                    combine_error_stats(stats, local_stats);
+                    break;
+                }
+                lock.unlock();
+                uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
+                test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+                        quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
             }
-        } else {
-            input_scratch = ggml_get_data_f32(layer) + offset;
-        }
-
-        if (use_reference) {
-            qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
-        } else {
-            qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
-        }
-        qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
-
-        update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
-        if (print_layer_stats) {
-            update_error_stats(chunk_size, input_scratch, output_scratch, layer_error);
-        }
+        };
+        int nthread = std::min(num_chunks, max_thread);
+        std::vector<std::thread> workers(nthread-1);
+        for (auto & w : workers) w = std::thread(compute);
+        compute();
+        for (auto & w : workers) w.join();
     }
+
     if (print_layer_stats) {
         print_error_stats(name, layer_error, false);
+        combine_error_stats(total_error, layer_error);
     }
 }
 
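The hunk above replaces the fixed-size serial loop with worker threads that pull chunk offsets from a shared, mutex-protected counter and accumulate error statistics thread-locally. As a rough illustration of that pattern in isolation, here is a minimal, self-contained sketch (not part of the patch; hypothetical names, simplified stats, and it sums an array rather than round-tripping a tensor):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

struct stats { uint64_t samples = 0; double total = 0.0; };

// Merge thread-local results into the shared accumulator (same role as combine_error_stats).
static void combine(stats & into, const stats & from) {
    into.samples += from.samples;
    into.total   += from.total;
}

int main() {
    const std::vector<float> data(1000000, 0.5f);  // stand-in for a layer's elements
    const uint64_t n = data.size();
    const uint64_t chunk_size = 4096;

    stats total;
    std::mutex mutex;
    uint64_t counter = 0;                          // next offset to hand out

    auto compute = [&]() {
        stats local;                               // accumulate without contention
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            const uint64_t offset = counter;
            counter += chunk_size;
            if (offset >= n) {                     // no work left: publish local results once
                combine(total, local);
                break;
            }
            lock.unlock();                         // release the lock while processing the chunk
            const uint64_t chunk = std::min(chunk_size, n - offset);
            for (uint64_t i = offset; i < offset + chunk; ++i) {
                local.total += data[i];
                local.samples++;
            }
        }
    };

    const unsigned nthread = std::max(1u, std::thread::hardware_concurrency());
    std::vector<std::thread> workers(nthread - 1);
    for (auto & w : workers) w = std::thread(compute);
    compute();                                     // the calling thread participates too
    for (auto & w : workers) w.join();

    printf("samples = %llu, total = %.1f\n", (unsigned long long) total.samples, total.total);
    return 0;
}

Merging the local stats only when a worker runs out of work keeps the critical section down to bumping the shared counter; the diff continues below.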
@@ -181,6 +240,7 @@ int main(int argc, char ** argv) {
 
     // read command line
 
+    int max_thread = 0;
     bool invalid_param = false;
     std::string arg;
     for (int i = 1; i < argc; i++) {
@@ -230,6 +290,12 @@ int main(int argc, char ** argv) {
                 fprintf(stderr, "error: %s not in list of types\n", argv[i]);
                 invalid_param = true;
             }
+        } else if (arg == "-n" || arg == "--num-threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            max_thread = atoi(argv[i]);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             quantize_stats_print_usage(argc, argv);
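The new -n/--num-threads option caps the number of worker threads; leaving it at the default of 0 (or any value below 1) falls back to std::thread::hardware_concurrency(), as handled in test_roundtrip_on_layer above. Presumably the tool is then run as something like `quantize-stats --num-threads 8` (hypothetical invocation; the exact binary name and path depend on the build).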
@@ -295,9 +361,9 @@ int main(int argc, char ** argv) {
     }
     printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
     // allocate scratch space
-    std::vector<float> input_scratch(SCRATCH_ELEMENTS);
-    std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
-    std::vector<float> output_scratch(SCRATCH_ELEMENTS);
+    std::vector<float> input_scratch;
+    std::vector<char> quantized_scratch;
+    std::vector<float> output_scratch;
 
     // loop throught quantization types
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
@@ -328,10 +394,11 @@ int main(int argc, char ** argv) {
                     qfns,
                     params.reference,
                     kv_tensor.second,
-                    input_scratch.data(),
-                    quantized_scratch.data(),
-                    output_scratch.data(),
-                    global_stats
+                    input_scratch,
+                    quantized_scratch,
+                    output_scratch,
+                    global_stats,
+                    max_thread
             );
         }
 