
Commit 20c1cf2

slaren authored and arthw committed
llama : accept a list of devices to use to offload a model (ggml-org#10497)
* llama : accept a list of devices to use to offload a model
* accept `--dev none` to completely disable offloading
* fix dev list with dl backends
* rename env parameter to LLAMA_ARG_DEVICE for consistency
1 parent e2b3e41 commit 20c1cf2
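In practice: `--list-devices` prints the available GPU devices and exits; `-dev`/`--device` (or the `LLAMA_ARG_DEVICE` environment variable) takes a comma-separated list of device names to offload to; `--device none` disables offloading entirely; and `-devd`/`--device-draft` selects the devices used for the draft model in the server and speculative examples. Device names such as `CUDA0` depend on which backends are built and are used below only as illustrations.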


9 files changed: +104, -27 lines


common/arg.cpp

Lines changed: 56 additions & 4 deletions
@@ -298,6 +298,27 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
     print_options(specific_options);
 }
 
+static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
+    std::vector<ggml_backend_dev_t> devices;
+    auto dev_names = string_split<std::string>(value, ',');
+    if (dev_names.empty()) {
+        throw std::invalid_argument("no devices specified");
+    }
+    if (dev_names.size() == 1 && dev_names[0] == "none") {
+        devices.push_back(nullptr);
+    } else {
+        for (const auto & device : dev_names) {
+            auto * dev = ggml_backend_dev_by_name(device.c_str());
+            if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+                throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
+            }
+            devices.push_back(dev);
+        }
+        devices.push_back(nullptr);
+    }
+    return devices;
+}
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     auto ctx_arg = common_params_parser_init(params, ex, print_usage);
     const common_params params_org = ctx_arg.params; // the example can modify the default params
@@ -324,6 +345,9 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
     }
 
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    // load dynamic backends
+    ggml_backend_load_all();
+
     common_params_context ctx_arg(params);
     ctx_arg.print_usage = print_usage;
     ctx_arg.ex = ex;
@@ -1312,6 +1336,30 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_NUMA"));
+    add_opt(common_arg(
+        {"-dev", "--device"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.devices = parse_device_list(value);
+        }
+    ).set_env("LLAMA_ARG_DEVICE"));
+    add_opt(common_arg(
+        {"--list-devices"},
+        "print list of available devices and exit",
+        [](common_params &) {
+            printf("Available devices:\n");
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                auto * dev = ggml_backend_dev_get(i);
+                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    size_t free, total;
+                    ggml_backend_dev_memory(dev, &free, &total);
+                    printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+                }
+            }
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -1336,10 +1384,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         } else if (arg_next == "layer") {
             params.split_mode = LLAMA_SPLIT_MODE_LAYER;
         } else if (arg_next == "row") {
-#ifdef GGML_USE_SYCL
-            fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
-            exit(1);
-#endif // GGML_USE_SYCL
             params.split_mode = LLAMA_SPLIT_MODE_ROW;
         } else {
             throw std::invalid_argument("invalid value");
@@ -2042,6 +2086,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.n_ctx = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-devd", "--device-draft"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.speculative.devices = parse_device_list(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
         "number of layers to store in VRAM for the draft model",

common/common.cpp

Lines changed: 4 additions & 4 deletions
@@ -377,9 +377,6 @@ void common_init() {
 #endif
 
     LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
-
-    // load dynamic backends
-    ggml_backend_load_all();
 }
 
 std::string common_params_get_system_info(const common_params & params) {
@@ -982,9 +979,12 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_l
     }
 }
 
-struct llama_model_params common_model_params_to_llama(const common_params & params) {
+struct llama_model_params common_model_params_to_llama(common_params & params) {
     auto mparams = llama_model_default_params();
 
+    if (!params.devices.empty()) {
+        mparams.devices = params.devices.data();
+    }
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }

common/common.h

Lines changed: 9 additions & 5 deletions
@@ -156,6 +156,7 @@ struct common_params_sampling {
 };
 
 struct common_params_speculative {
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
     int32_t n_ctx = 0; // draft context size
     int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
     int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
@@ -178,9 +179,6 @@ struct common_params {
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n = 1; // group-attention factor
     int32_t grp_attn_w = 512; // group-attention width
     int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
@@ -193,6 +191,13 @@ struct common_params {
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = 0.1f; // KV cache defragmentation threshold
 
+    // offload params
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
@@ -201,7 +206,6 @@ struct common_params {
 
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
-    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
@@ -462,7 +466,7 @@ struct common_init_result {
 
 struct common_init_result common_init_from_params(common_params & params);
 
-struct llama_model_params common_model_params_to_llama (const common_params & params);
+struct llama_model_params common_model_params_to_llama (      common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

examples/server/server.cpp

Lines changed: 1 addition & 0 deletions
@@ -692,6 +692,7 @@ struct server_context {
 
         auto params_dft = params_base;
 
+        params_dft.devices = params_base.speculative.devices;
         params_dft.model = params_base.speculative.model;
         params_dft.n_ctx = params_base.speculative.n_ctx;
         params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;

examples/speculative-simple/speculative-simple.cpp

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ int main(int argc, char ** argv) {
     ctx_tgt = llama_init_tgt.context;
 
     // load the draft model
+    params.devices = params.speculative.devices;
     params.model = params.speculative.model;
     params.n_ctx = params.speculative.n_ctx;
     params.n_batch = params.speculative.n_ctx > 0 ? params.speculative.n_ctx : params.n_batch;

examples/speculative/speculative.cpp

Lines changed: 1 addition & 0 deletions
@@ -71,6 +71,7 @@ int main(int argc, char ** argv) {
     ctx_tgt = llama_init_tgt.context;
 
     // load the draft model
+    params.devices = params.speculative.devices;
     params.model = params.speculative.model;
     params.n_gpu_layers = params.speculative.n_gpu_layers;
     if (params.speculative.cpuparams.n_threads > 0) {

ggml/src/ggml-backend-reg.cpp

Lines changed: 11 additions & 2 deletions
@@ -253,6 +253,15 @@ void ggml_backend_device_register(ggml_backend_dev_t device) {
 }
 
 // Backend (reg) enumeration
+static bool striequals(const char * a, const char * b) {
+    for (; *a && *b; a++, b++) {
+        if (std::tolower(*a) != std::tolower(*b)) {
+            return false;
+        }
+    }
+    return *a == *b;
+}
+
 size_t ggml_backend_reg_count() {
     return get_reg().backends.size();
 }
@@ -265,7 +274,7 @@ ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
 ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
     for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
         ggml_backend_reg_t reg = ggml_backend_reg_get(i);
-        if (std::strcmp(ggml_backend_reg_name(reg), name) == 0) {
+        if (striequals(ggml_backend_reg_name(reg), name)) {
             return reg;
         }
     }
@@ -285,7 +294,7 @@ ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
 ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
     for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
         ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-        if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
+        if (striequals(ggml_backend_dev_name(dev), name)) {
            return dev;
         }
     }
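With `striequals`, both `ggml_backend_reg_by_name` and `ggml_backend_dev_by_name` now match names case-insensitively, so `--device cuda0` resolves the same device as `--device CUDA0`. A small sketch, assuming a build that registers a device named `CUDA0`:

    #include <cstdio>

    #include "ggml-backend.h"

    int main() {
        ggml_backend_load_all(); // load dynamic backends so their devices are registered

        // lookup is case-insensitive after this change
        ggml_backend_dev_t dev = ggml_backend_dev_by_name("cuda0");
        if (dev) {
            printf("found: %s (%s)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
        } else {
            printf("no device named cuda0 in this build\n");
        }
        return 0;
    }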

include/llama.h

Lines changed: 3 additions & 0 deletions
@@ -272,6 +272,9 @@ extern "C" {
     };
 
     struct llama_model_params {
+        // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
+        ggml_backend_dev_t * devices;
+
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
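A minimal sketch of driving the new field from the C API directly, without the common helpers: pick a GPU device, terminate the list with NULL, and pass it through `llama_model_params` (the path `model.gguf` is a placeholder; leaving `devices` as NULL keeps the old behavior of using every available GPU):

    #include <cstdio>

    #include "ggml-backend.h"
    #include "llama.h"

    int main() {
        ggml_backend_load_all(); // register the available backends and their devices

        // NULL-terminated list: offload only to the first GPU device found
        ggml_backend_dev_t devices[2] = { nullptr, nullptr };
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
                devices[0] = dev;
                break;
            }
        }

        llama_model_params mparams = llama_model_default_params();
        mparams.devices      = devices; // if no GPU was found this is {NULL}, i.e. no offloading
        mparams.n_gpu_layers = 99;

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (!model) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }
        llama_free_model(model);
        return 0;
    }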

src/llama.cpp

Lines changed: 18 additions & 12 deletions
@@ -19373,6 +19373,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
 //
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
+        /*.devices =*/ nullptr,
         /*.n_gpu_layers =*/ 0,
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
@@ -19585,19 +19586,24 @@ struct llama_model * llama_load_model_from_file(
     }
 
     // create list of devices to use with this model
-    // currently, we use all available devices
-    // TODO: rework API to give user more control over device selection
-    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-        switch (ggml_backend_dev_type(dev)) {
-            case GGML_BACKEND_DEVICE_TYPE_CPU:
-            case GGML_BACKEND_DEVICE_TYPE_ACCEL:
-                // skip CPU backends since they are handled separately
-                break;
+    if (params.devices) {
+        for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
+            model->devices.push_back(*dev);
+        }
+    } else {
+        // use all available devices
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            switch (ggml_backend_dev_type(dev)) {
+                case GGML_BACKEND_DEVICE_TYPE_CPU:
+                case GGML_BACKEND_DEVICE_TYPE_ACCEL:
+                    // skip CPU backends since they are handled separately
+                    break;
 
-            case GGML_BACKEND_DEVICE_TYPE_GPU:
-                model->devices.push_back(dev);
-                break;
+                case GGML_BACKEND_DEVICE_TYPE_GPU:
+                    model->devices.push_back(dev);
+                    break;
+            }
         }
     }