
Commit 65c11d4

llama-bench threadpool CLI params
1 parent 5c9222d commit 65c11d4

4 files changed: +114 −47 lines changed

common/common.cpp

Lines changed: 6 additions & 4 deletions
@@ -218,15 +218,15 @@ void gpt_params_handle_model_default(gpt_params & params) {
     }
 }

-static void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr) {
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
     int32_t n_set = 0;

     if (cpuparams.n_threads < 0) {
         // Assuming everything about cpuparams is invalid
         if (role_model != nullptr) {
             cpuparams = *role_model;
         } else {
-            cpuparams.n_threads = cpu_get_num_math();
+            cpuparams.n_threads = std::thread::hardware_concurrency();
         }
     }

@@ -235,11 +235,13 @@ static void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role
             n_set++;
         }
     }
+
     if (n_set == 0) {
         // You hit the jackpot!
         memset(&cpuparams.cpumask[0], 1, GGML_N_CORES_MAX);
         n_set = GGML_N_CORES_MAX;
     }
+
     if (n_set < cpuparams.n_threads) {
         // Not enough set bits, may experience performance issues.
         fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);

@@ -313,7 +315,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     return result;
 }

-static bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_N_CORES_MAX]) {
+bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_N_CORES_MAX]) {
     size_t dash_loc = range.find('-');
     if (dash_loc == std::string::npos) {
         fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");

@@ -350,7 +352,7 @@ static bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_N_C
     return true;
 }

-static bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_N_CORES_MAX]) {
+bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_N_CORES_MAX]) {
     // Discard potential 0x prefix
     size_t start_i = 0;
     if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
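
With the static qualifiers dropped, these helpers can now be driven from other tools. A minimal sketch of the intended call sequence, assuming only the signatures shown above (make_pinned_params and its defaults are hypothetical, not part of the commit):

    #include "common.h"  // cpu_params, parse_cpu_mask, postprocess_cpu_params

    // Hypothetical caller (illustration only): pin work to cores 0-7.
    static cpu_params make_pinned_params() {
        cpu_params cp;
        cp.n_threads = -1;                          // unset: let postprocessing pick a count
        if (parse_cpu_mask("0xFF", cp.cpumask)) {   // mark cores 0-7 in the boolean mask
            cp.mask_valid = true;
        }
        postprocess_cpu_params(cp);                 // no role model, so n_threads falls back
                                                    // to std::thread::hardware_concurrency()
        return cp;
    }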

common/common.h

Lines changed: 4 additions & 0 deletions
@@ -198,6 +198,10 @@ bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
 bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
 void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);

+bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_N_CORES_MAX]);
+bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_N_CORES_MAX]);
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+
 std::string gpt_params_get_system_info(const gpt_params & params);

 //
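
These declarations export the helpers above (previously static inside common.cpp) through the common API, which is what lets llama-bench call them in the next file.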

examples/llama-bench/llama-bench.cpp

Lines changed: 103 additions & 42 deletions
@@ -186,11 +186,18 @@ struct cmd_params {
     std::vector<bool> use_mmap;
     std::vector<bool> embeddings;
     ggml_numa_strategy numa;
+    cpu_params cpuparams;
     int reps;
     bool verbose;
     output_formats output_format;
 };

+int32_t n_threads = -1;
+bool cpumask[GGML_N_CORES_MAX] = { false }; // CPU affinity mask.
+bool mask_valid = false;                    // Default: any CPU
+int32_t priority = 0;                       // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+bool strict_cpu = false;                    // Use strict CPU placement
+bool poll = false;                          // Use polling (busywait) to wait for work
 static const cmd_params cmd_params_defaults = {
     /* model */ {"models/7B/ggml-model-q4_0.gguf"},
     /* n_prompt */ {512},
@@ -210,6 +217,7 @@ static const cmd_params cmd_params_defaults = {
     /* use_mmap */ {true},
     /* embeddings */ {false},
     /* numa */ GGML_NUMA_STRATEGY_DISABLED,
+    /* cpuparams */ {int32_t(std::thread::hardware_concurrency()), {false}, false, 1, false, false},
     /* reps */ 5,
     /* verbose */ false,
     /* output_format */ MARKDOWN
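
For orientation: the aggregate initializer above fills cpu_params in declaration order, presumably mirroring the six fields listed before cmd_params_defaults: n_threads = hardware_concurrency(), cpumask = {false}, mask_valid = false, priority = 1 (medium), strict_cpu = false, poll = false.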
@@ -236,6 +244,11 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -fa, --flash-attn <0|1>             (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
     printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
     printf("  --numa <distribute|isolate|numactl> (default: disabled)\n");
+    printf("  -mt, --max-threads <n>              (default: %d)\n", cmd_params_defaults.cpuparams.n_threads);
+    printf("  -C, --cpu-mask <hex>                (default: 0x0)\n");
+    printf("  --cpu-strict <0|1>                  (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu);
+    printf("  --priority <0|1|2|3>                (default: %d)\n", cmd_params_defaults.cpuparams.priority);
+    printf("  --poll <0|1>                        (default: %d)\n", cmd_params_defaults.cpuparams.poll);
     printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
     printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
     printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
@@ -272,7 +285,7 @@ static ggml_type ggml_type_from_name(const std::string & s) {
 }


-static cmd_params parse_cmd_params(int argc, char ** argv) {
+static cmd_params parse_cmd_params(int argc, char** argv) {
     cmd_params params;
     std::string arg;
     bool invalid_param = false;
@@ -292,28 +305,32 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
         if (arg == "-h" || arg == "--help") {
             print_usage(argc, argv);
             exit(0);
-        } else if (arg == "-m" || arg == "--model") {
+        }
+        else if (arg == "-m" || arg == "--model") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<std::string>(argv[i], split_delim);
             params.model.insert(params.model.end(), p.begin(), p.end());
-        } else if (arg == "-p" || arg == "--n-prompt") {
+        }
+        else if (arg == "-p" || arg == "--n-prompt") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
-        } else if (arg == "-n" || arg == "--n-gen") {
+        }
+        else if (arg == "-n" || arg == "--n-gen") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
-        } else if (arg == "-pg") {
+        }
+        else if (arg == "-pg") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -323,29 +340,32 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
-        } else if (arg == "-b" || arg == "--batch-size") {
+            params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
+        }
+        else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
-        } else if (arg == "-ub" || arg == "--ubatch-size") {
+        }
+        else if (arg == "-ub" || arg == "--ubatch-size") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
-        } else if (arg == "-ctk" || arg == "--cache-type-k") {
+        }
+        else if (arg == "-ctk" || arg == "--cache-type-k") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<std::string>(argv[i], split_delim);
             std::vector<ggml_type> types;
-            for (const auto & t : p) {
+            for (const auto& t : p) {
                 ggml_type gt = ggml_type_from_name(t);
                 if (gt == GGML_TYPE_COUNT) {
                     invalid_param = true;
@@ -354,14 +374,15 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 types.push_back(gt);
             }
             params.type_k.insert(params.type_k.end(), types.begin(), types.end());
-        } else if (arg == "-ctv" || arg == "--cache-type-v") {
+        }
+        else if (arg == "-ctv" || arg == "--cache-type-v") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<std::string>(argv[i], split_delim);
             std::vector<ggml_type> types;
-            for (const auto & t : p) {
+            for (const auto& t : p) {
                 ggml_type gt = ggml_type_from_name(t);
                 if (gt == GGML_TYPE_COUNT) {
                     invalid_param = true;
@@ -370,66 +391,104 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 types.push_back(gt);
             }
             params.type_v.insert(params.type_v.end(), types.begin(), types.end());
-        } else if (arg == "-t" || arg == "--threads") {
+        }
+        else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
-        } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
+        }
+        else if (arg == "-ngl" || arg == "--n-gpu-layers") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
-        } else if (arg == "-sm" || arg == "--split-mode") {
+        }
+        else if (arg == "-sm" || arg == "--split-mode") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<std::string>(argv[i], split_delim);
             std::vector<llama_split_mode> modes;
-            for (const auto & m : p) {
+            for (const auto& m : p) {
                 llama_split_mode mode;
                 if (m == "none") {
                     mode = LLAMA_SPLIT_MODE_NONE;
-                } else if (m == "layer") {
+                }
+                else if (m == "layer") {
                     mode = LLAMA_SPLIT_MODE_LAYER;
-                } else if (m == "row") {
+                }
+                else if (m == "row") {
                     mode = LLAMA_SPLIT_MODE_ROW;
-                } else {
+                }
+                else {
                     invalid_param = true;
                     break;
                 }
                 modes.push_back(mode);
             }
             params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
-        } else if (arg == "-mg" || arg == "--main-gpu") {
+        }
+        else if (arg == "-mg" || arg == "--main-gpu") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.main_gpu = split<int>(argv[i], split_delim);
-        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
+        }
+        else if (arg == "-nkvo" || arg == "--no-kv-offload") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<bool>(argv[i], split_delim);
             params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
-        } else if (arg == "--numa") {
+        }
+        else if (arg == "--numa") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
-            } else {
+            }
+            else {
                 std::string value(argv[i]);
-                /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
-                else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
-                else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+                /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
+                else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+                else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
                 else { invalid_param = true; break; }
             }
+
+        }
+        else if (arg == "-mt" || arg == "--max-threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cpuparams.n_threads = std::stoi(argv[i]);
+        }
+        else if (arg == "-C" || arg == "--cpu-mask") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string mask = argv[i];
+            params.cpuparams.mask_valid = true;
+            invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
+        }
+        else if (arg == "--prio") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cpuparams.priority = std::stoul(argv[i]);
+        } else if (arg == "--cpu-strict") {
+            params.cpuparams.strict_cpu = true;
+        } else if (arg == "--poll") {
+            params.cpuparams.poll = true;
         } else if (arg == "-fa" || arg == "--flash-attn") {
             if (++i >= argc) {
                 invalid_param = true;
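
The body of parse_cpu_mask is truncated in the first file's diff; as a rough illustration of the hex-to-boolmask expansion it performs, here is a standalone sketch (an assumption-laden rewrite, not the committed implementation; in particular the digit-to-core order is a guess):

    #include <string>
    // Assumes GGML_N_CORES_MAX from ggml.h is visible.

    // Sketch: expand a hex string such as "0xFF" into per-core booleans,
    // mapping the rightmost hex digit to cores 0-3 (order is an assumption).
    static bool sketch_parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_N_CORES_MAX]) {
        size_t start_i = (mask.rfind("0x", 0) == 0) ? 2 : 0;  // discard potential 0x prefix
        size_t core = 0;
        for (size_t i = mask.length(); i > start_i && core < GGML_N_CORES_MAX; i--) {
            const char c = mask[i - 1];
            int nibble;
            if      (c >= '0' && c <= '9') { nibble = c - '0'; }
            else if (c >= 'a' && c <= 'f') { nibble = c - 'a' + 10; }
            else if (c >= 'A' && c <= 'F') { nibble = c - 'A' + 10; }
            else { return false; }                            // reject invalid hex digits
            for (int b = 0; b < 4 && core < GGML_N_CORES_MAX; b++, core++) {
                boolmask[core] = boolmask[core] || (((nibble >> b) & 1) != 0);
            }
        }
        return true;
    }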
@@ -1303,6 +1362,23 @@ int main(int argc, char ** argv) {
     llama_model * lmodel = nullptr;
     const cmd_params_instance * prev_inst = nullptr;

+    postprocess_cpu_params(params.cpuparams);
+
+    struct ggml_threadpool_params tpp;
+    tpp.n_threads = params.cpuparams.n_threads;
+    tpp.mask_specified = params.cpuparams.mask_valid;
+    tpp.strict_cpu = params.cpuparams.strict_cpu;
+    tpp.prio = params.cpuparams.priority;
+    tpp.poll = params.cpuparams.poll;
+
+    std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_N_CORES_MAX);
+
+    struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp);
+    if (!threadpool) {
+        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+        exit(1);
+    }
+
     for (const auto & inst : params_instances) {
         // keep the same model between tests when possible
         if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
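
With the CLI-derived cpuparams post-processed once up front, the threadpool is now created a single time before the benchmark loop instead of once per test configuration; the two hunks below delete the old per-test construction and move the matching ggml_release_threadpool call outside the loop.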
@@ -1329,21 +1405,6 @@ int main(int argc, char ** argv) {

         llama_kv_cache_clear(ctx);

-        struct ggml_threadpool_params tpp;
-        tpp.n_threads = t.n_threads;
-
-        // TODO: expose these via cli opts
-        tpp.mask_specified = false;
-        tpp.strict_cpu = false;
-        tpp.prio = 1;
-        tpp.poll = false;
-
-        struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
-        if (!threadpool) {
-            LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
-            exit(1);
-        }
-
         llama_set_n_threads(ctx, t.n_threads, t.n_threads);
         llama_attach_threadpool(ctx, threadpool);

@@ -1378,8 +1439,8 @@ int main(int argc, char ** argv) {

         llama_free(ctx);

-        ggml_release_threadpool(threadpool);
     }
+    ggml_release_threadpool(threadpool);

     llama_free_model(lmodel);

ggml.h

Lines changed: 1 addition & 1 deletion
@@ -274,7 +274,7 @@
 #define GGML_UNREACHABLE() ((void) 0)
 #endif

-#define GGML_N_CORES_MAX 512
+#define GGML_N_CORES_MAX 16

 // used to copy the number of elements and stride in bytes of tensors into local variables.
 // main purpose is to reduce code duplication and improve readability.
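
Lowering GGML_N_CORES_MAX from 512 to 16 shrinks every cpumask[GGML_N_CORES_MAX] buffer used above, but it also caps affinity masks at 16 cores, so machines with more logical cores can no longer be fully described via -C.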
