@@ -186,11 +186,18 @@ struct cmd_params {
     std::vector<bool> use_mmap;
     std::vector<bool> embeddings;
     ggml_numa_strategy numa;
+    cpu_params cpuparams;
     int reps;
     bool verbose;
     output_formats output_format;
 };
 
+    int32_t n_threads = -1;
+    bool cpumask[GGML_N_CORES_MAX] = { false }; // CPU affinity mask.
+    bool mask_valid = false;  // Default: any CPU
+    int32_t priority = 0;     // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool strict_cpu = false;  // Use strict CPU placement
+    bool poll = false;        // Use polling (busywait) to wait for work
 static const cmd_params cmd_params_defaults = {
     /* model         */ {"models/7B/ggml-model-q4_0.gguf"},
     /* n_prompt      */ {512},
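Note: the cpumask added above is a per-logical-CPU boolean map rather than a packed bitmask; cpumask[i] == true means worker threads may be placed on logical CPU i. A minimal sketch of the intended semantics (illustrative only, not part of this commit):

    bool cpumask[GGML_N_CORES_MAX] = { false };
    for (int i = 0; i < 4; i++) {
        cpumask[i] = true; // allow scheduling on logical CPUs 0..3 only
    }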
@@ -210,6 +217,7 @@ static const cmd_params cmd_params_defaults = {
     /* use_mmap      */ {true},
     /* embeddings    */ {false},
     /* numa          */ GGML_NUMA_STRATEGY_DISABLED,
+    /* cpuparams     */ {int32_t(std::thread::hardware_concurrency()), {false}, false, 1, false, false},
     /* reps          */ 5,
     /* verbose       */ false,
     /* output_format */ MARKDOWN
@@ -236,6 +244,11 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -fa, --flash-attn <0|1>             (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
     printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
     printf("  --numa <distribute|isolate|numactl> (default: disabled)\n");
+    printf("  -mt, --max-threads <n>              (default: %d)\n", cmd_params_defaults.cpuparams.n_threads);
+    printf("  -C, --cpu-mask <hex>                (default: 0x0)\n");
+    printf("  --cpu-strict <0|1>                  (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu);
+    printf("  --priority <0|1|2|3>                (default: %d)\n", cmd_params_defaults.cpuparams.priority);
+    printf("  --poll <0|1>                        (default: %d)\n", cmd_params_defaults.cpuparams.poll);
     printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
     printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
     printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
@@ -272,7 +285,7 @@ static ggml_type ggml_type_from_name(const std::string & s) {
 }
 
 
-static cmd_params parse_cmd_params(int argc, char ** argv) {
+static cmd_params parse_cmd_params(int argc, char ** argv) {
     cmd_params params;
     std::string arg;
     bool invalid_param = false;
@@ -292,28 +305,32 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
         if (arg == "-h" || arg == "--help") {
             print_usage(argc, argv);
             exit(0);
-        } else if (arg == "-m" || arg == "--model") {
+        }
+        else if (arg == "-m" || arg == "--model") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<std::string>(argv[i], split_delim);
             params.model.insert(params.model.end(), p.begin(), p.end());
-        } else if (arg == "-p" || arg == "--n-prompt") {
+        }
+        else if (arg == "-p" || arg == "--n-prompt") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
-        } else if (arg == "-n" || arg == "--n-gen") {
+        }
+        else if (arg == "-n" || arg == "--n-gen") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
-        } else if (arg == "-pg") {
+        }
+        else if (arg == "-pg") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -323,29 +340,32 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
-        } else if (arg == "-b" || arg == "--batch-size") {
+            params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
+        }
+        else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
-        } else if (arg == "-ub" || arg == "--ubatch-size") {
+        }
+        else if (arg == "-ub" || arg == "--ubatch-size") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
-        } else if (arg == "-ctk" || arg == "--cache-type-k") {
+        }
+        else if (arg == "-ctk" || arg == "--cache-type-k") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<std::string>(argv[i], split_delim);
             std::vector<ggml_type> types;
-            for (const auto & t : p) {
+            for (const auto & t : p) {
                 ggml_type gt = ggml_type_from_name(t);
                 if (gt == GGML_TYPE_COUNT) {
                     invalid_param = true;
@@ -354,14 +374,15 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 types.push_back(gt);
             }
             params.type_k.insert(params.type_k.end(), types.begin(), types.end());
-        } else if (arg == "-ctv" || arg == "--cache-type-v") {
+        }
+        else if (arg == "-ctv" || arg == "--cache-type-v") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<std::string>(argv[i], split_delim);
             std::vector<ggml_type> types;
-            for (const auto & t : p) {
+            for (const auto & t : p) {
                 ggml_type gt = ggml_type_from_name(t);
                 if (gt == GGML_TYPE_COUNT) {
                     invalid_param = true;
@@ -370,66 +391,104 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 types.push_back(gt);
             }
             params.type_v.insert(params.type_v.end(), types.begin(), types.end());
-        } else if (arg == "-t" || arg == "--threads") {
+        }
+        else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
-        } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
+        }
+        else if (arg == "-ngl" || arg == "--n-gpu-layers") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
-        } else if (arg == "-sm" || arg == "--split-mode") {
+        }
+        else if (arg == "-sm" || arg == "--split-mode") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<std::string>(argv[i], split_delim);
             std::vector<llama_split_mode> modes;
-            for (const auto & m : p) {
+            for (const auto & m : p) {
                 llama_split_mode mode;
                 if (m == "none") {
                     mode = LLAMA_SPLIT_MODE_NONE;
-                } else if (m == "layer") {
+                }
+                else if (m == "layer") {
                     mode = LLAMA_SPLIT_MODE_LAYER;
-                } else if (m == "row") {
+                }
+                else if (m == "row") {
                     mode = LLAMA_SPLIT_MODE_ROW;
-                } else {
+                }
+                else {
                     invalid_param = true;
                     break;
                 }
                 modes.push_back(mode);
             }
             params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
-        } else if (arg == "-mg" || arg == "--main-gpu") {
+        }
+        else if (arg == "-mg" || arg == "--main-gpu") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.main_gpu = split<int>(argv[i], split_delim);
-        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
+        }
+        else if (arg == "-nkvo" || arg == "--no-kv-offload") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<bool>(argv[i], split_delim);
             params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
-        } else if (arg == "--numa") {
+        }
+        else if (arg == "--numa") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
-            } else {
+            }
+            else {
                 std::string value(argv[i]);
-                /* */ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
-                else if (value == "isolate")    { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
-                else if (value == "numactl")    { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+                /* */ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
+                else if (value == "isolate")    { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+                else if (value == "numactl")    { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
                 else { invalid_param = true; break; }
             }
+
+        }
+        else if (arg == "-mt" || arg == "--max-threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cpuparams.n_threads = std::stoi(argv[i]);
+        }
+        else if (arg == "-C" || arg == "--cpu-mask") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string mask = argv[i];
+            params.cpuparams.mask_valid = true;
+            invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
+        }
+        else if (arg == "--prio") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cpuparams.priority = std::stoul(argv[i]);
+        } else if (arg == "--cpu-strict") {
+            params.cpuparams.strict_cpu = true;
+        } else if (arg == "--poll") {
+            params.cpuparams.poll = true;
         } else if (arg == "-fa" || arg == "--flash-attn") {
             if (++i >= argc) {
                 invalid_param = true;
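parse_cpu_mask is not defined in this file's diff. A sketch of what such a helper could look like, assuming each hex digit of the mask maps onto four entries of the boolean array, least-significant digit first (illustrative only, not the commit's actual implementation; relies on this file's GGML_N_CORES_MAX and <string>):

    static bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_N_CORES_MAX]) {
        size_t start = (mask.rfind("0x", 0) == 0) ? 2 : 0; // skip an optional "0x" prefix
        int cpu = 0;
        // walk hex digits from least-significant (rightmost) to most-significant
        for (size_t i = mask.size(); i > start && cpu < GGML_N_CORES_MAX; i--) {
            char c = mask[i - 1];
            int v;
            if      (c >= '0' && c <= '9') { v = c - '0'; }
            else if (c >= 'a' && c <= 'f') { v = c - 'a' + 10; }
            else if (c >= 'A' && c <= 'F') { v = c - 'A' + 10; }
            else { return false; } // reject non-hex characters
            for (int b = 0; b < 4 && cpu < GGML_N_CORES_MAX; b++, cpu++) {
                boolmask[cpu] = (v >> b) & 1; // bit i of the mask enables logical CPU i
            }
        }
        return true;
    }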
@@ -1303,6 +1362,23 @@ int main(int argc, char ** argv) {
     llama_model * lmodel = nullptr;
     const cmd_params_instance * prev_inst = nullptr;
 
+    postprocess_cpu_params(params.cpuparams);
+
+    struct ggml_threadpool_params tpp;
+    tpp.n_threads      = params.cpuparams.n_threads;
+    tpp.mask_specified = params.cpuparams.mask_valid;
+    tpp.strict_cpu     = params.cpuparams.strict_cpu;
+    tpp.prio           = params.cpuparams.priority;
+    tpp.poll           = params.cpuparams.poll;
+
+    std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_N_CORES_MAX);
+
+    struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
+    if (!threadpool) {
+        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+        exit(1);
+    }
+
     for (const auto & inst : params_instances) {
         // keep the same model between tests when possible
         if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
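postprocess_cpu_params is likewise defined outside this diff; given the -1 sentinel on n_threads, it presumably resolves the thread count before the pool is built. A plausible sketch under that assumption:

    static void postprocess_cpu_params(cpu_params & cpuparams) {
        if (cpuparams.n_threads < 0) {
            // resolve the "unset" sentinel to the machine's logical CPU count
            cpuparams.n_threads = int32_t(std::thread::hardware_concurrency());
        }
    }

Note also that the memcpy above copies GGML_N_CORES_MAX bytes, which is only correct if tpp.cpumask is likewise a bool array of that length.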
@@ -1329,21 +1405,6 @@ int main(int argc, char ** argv) {
 
         llama_kv_cache_clear(ctx);
 
-        struct ggml_threadpool_params tpp;
-        tpp.n_threads = t.n_threads;
-
-        // TODO: expose these via cli opts
-        tpp.mask_specified = false;
-        tpp.strict_cpu     = false;
-        tpp.prio           = 1;
-        tpp.poll           = false;
-
-        struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
-        if (!threadpool) {
-            LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
-            exit(1);
-        }
-
         llama_set_n_threads(ctx, t.n_threads, t.n_threads);
         llama_attach_threadpool(ctx, threadpool);
 
@@ -1378,8 +1439,8 @@ int main(int argc, char ** argv) {
 
         llama_free(ctx);
 
-        ggml_release_threadpool(threadpool);
     }
+    ggml_release_threadpool(threadpool);
 
     llama_free_model(lmodel);
 
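Net effect of the last three hunks: the threadpool is no longer built and torn down once per test configuration. It is now created a single time from the CLI-derived cpuparams before the benchmark loop, attached to each context inside the loop, and released once after it:

    // resulting lifecycle (sketch of the code paths above)
    // before the loop:  ggml_create_threadpool(&tpp)             - once, from cpuparams
    // per test:         llama_attach_threadpool(ctx, threadpool) - reuse the same pool
    // after the loop:   ggml_release_threadpool(threadpool)      - once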