Commit 82e3b03

rpc : make RPC servers come first in the device list (#9296)

* rpc : make RPC servers come first in the device list
* rpc : disable options for non-RPC builds
* rpc : rpc_count always zero for non-RPC builds
1 parent 9379d3c commit 82e3b03
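In short, the change flips the device ordering: in builds with the RPC backend, device indices 0 .. rpc_count-1 now refer to the RPC servers in the order they were given via --rpc, and local GPU devices follow after them; in non-RPC builds rpc_count is simply zero, so nothing shifts. A minimal sketch of that convention, using hypothetical names (device_layout, is_rpc, to_local) that are not part of the commit:

// Illustrative sketch of the device-index convention after this commit (names are hypothetical).
// RPC devices come first; local GPUs are addressed by index - rpc_count.
#include <cstddef>
#include <string>
#include <vector>

struct device_layout {
    std::vector<std::string> rpc_servers; // endpoints passed via --rpc, in order
    std::size_t local_gpu_count = 0;      // CUDA/Vulkan/SYCL/... devices

    std::size_t device_count() const { return rpc_servers.size() + local_gpu_count; }

    // indices [0, rpc_count) map to RPC servers
    bool is_rpc(std::size_t device) const { return device < rpc_servers.size(); }

    // remaining indices map to local backends after subtracting rpc_count
    std::size_t to_local(std::size_t device) const { return device - rpc_servers.size(); }
};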

File tree

3 files changed: +50 −33 lines

common/common.cpp

Lines changed: 4 additions & 0 deletions

@@ -1234,11 +1234,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #endif // GGML_USE_CUDA_SYCL_VULKAN
         return true;
     }
+#ifdef GGML_USE_RPC
     if (arg == "--rpc") {
         CHECK_ARG
         params.rpc_servers = argv[i];
         return true;
     }
+#endif
     if (arg == "--no-mmap") {
         params.use_mmap = false;
         return true;

@@ -1929,7 +1931,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });

     options.push_back({ "backend" });
+#ifdef GGML_USE_RPC
     options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
+#endif

     if (llama_supports_mlock()) {
         options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });

examples/llama-bench/llama-bench.cpp

Lines changed: 4 additions & 0 deletions

@@ -299,7 +299,9 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
     printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
     printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+#ifdef GGML_USE_RPC
     printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
+#endif
     printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());

@@ -482,12 +484,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<int>(argv[i], split_delim);
             params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+#ifdef GGML_USE_RPC
         } else if (arg == "-rpc" || arg == "--rpc") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.rpc_servers.push_back(argv[i]);
+#endif
         } else if (arg == "-sm" || arg == "--split-mode") {
             if (++i >= argc) {
                 invalid_param = true;

src/llama.cpp

Lines changed: 42 additions & 33 deletions
@@ -3346,37 +3346,41 @@ static size_t llama_get_device_count(const llama_model & model) {
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     ggml_backend_buffer_type_t buft = nullptr;

-#if defined(GGML_USE_RPC)
-    int dev_count = (int)llama_get_device_count(model);
+#ifdef GGML_USE_RPC
     int rpc_count = (int)model.rpc_servers.size();
-    if (gpu >= dev_count - rpc_count) {
-        const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+#else
+    int rpc_count = 0;
+#endif
+    int local_gpu = gpu - rpc_count;
+#if defined(GGML_USE_RPC)
+    if (gpu < rpc_count) {
+        const char * endpoint = model.rpc_servers[gpu].c_str();
         return ggml_backend_rpc_buffer_type(endpoint);
     }
 #endif
 #if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
-    buft = ggml_backend_cuda_buffer_type(gpu);
+    buft = ggml_backend_cuda_buffer_type(local_gpu);
 #elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(gpu);
+    buft = ggml_backend_vk_buffer_type(local_gpu);
 #elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_buffer_type(gpu);
+    buft = ggml_backend_sycl_buffer_type(local_gpu);
 #elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type(gpu);
+    buft = ggml_backend_kompute_buffer_type(local_gpu);
     if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
     }
 #elif defined(GGML_USE_CANN)
-    buft = ggml_backend_cann_buffer_type(gpu);
+    buft = ggml_backend_cann_buffer_type(local_gpu);
 #endif

     if (buft == nullptr) {
         buft = llama_default_buffer_type_cpu(true);
     }
     return buft;
     GGML_UNUSED(model);
-    GGML_UNUSED(gpu);
+    GGML_UNUSED(local_gpu);
 }

 static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
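To see why the endpoint lookup simplifies, it helps to work one hypothetical configuration through the old and new index math (2 local GPUs, 3 RPC servers); the numbers below are an illustration, not values from the commit:

#include <cassert>

int main() {
    // Hypothetical configuration: 3 RPC servers listed via --rpc, 2 local GPUs.
    const int rpc_count = 3;
    const int dev_count = 5;

    // Old layout [gpu0, gpu1, rpc0, rpc1, rpc2]: device 3 was the second RPC server.
    assert(3 >= dev_count - rpc_count);     // old RPC check
    assert(3 - dev_count + rpc_count == 1); // old endpoint index: rpc_servers[1]

    // New layout [rpc0, rpc1, rpc2, gpu0, gpu1]: device 1 is the second RPC server,
    // and device 3 is a local GPU addressed as local_gpu = gpu - rpc_count.
    assert(1 < rpc_count);                  // new RPC check
    assert(3 - rpc_count == 0);             // new local index: GPU 0
    return 0;
}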
@@ -3403,42 +3407,46 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
 }

 static size_t llama_get_device_memory(const llama_model & model, int device) {
-#if defined(GGML_USE_RPC)
-    int dev_count = (int)llama_get_device_count(model);
+#ifdef GGML_USE_RPC
     int rpc_count = (int)model.rpc_servers.size();
-    if (device >= dev_count - rpc_count) {
+#else
+    int rpc_count = 0;
+#endif
+    int local_device = device - rpc_count;
+#if defined(GGML_USE_RPC)
+    if (device < rpc_count) {
         size_t total;
         size_t free;
-        const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+        const char * endpoint = model.rpc_servers[device].c_str();
         ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
         return free;
     }
 #endif
 #if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
-    ggml_backend_cuda_get_device_memory(device, &free, &total);
+    ggml_backend_cuda_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(GGML_USE_SYCL)
     size_t total;
     size_t free;
-    ggml_backend_sycl_get_device_memory(device, &free, &total);
+    ggml_backend_sycl_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
+    ggml_backend_vk_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(GGML_USE_CANN)
     size_t total;
     size_t free;
-    ggml_backend_cann_get_device_memory(device, &free, &total);
+    ggml_backend_cann_get_device_memory(local_device, &free, &total);
     return free;
 #else
     return 1;
 #endif
     GGML_UNUSED(model);
-    GGML_UNUSED(device);
+    GGML_UNUSED(local_device);
 }

 //
@@ -18186,6 +18194,20 @@ struct llama_context * llama_new_context_with_model(

     if (!hparams.vocab_only) {
         // initialize backends
+#if defined(GGML_USE_RPC)
+        if (model->n_gpu_layers > 0) {
+            for (const auto & endpoint : model->rpc_servers) {
+                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        }
+#endif
+
 #if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
@@ -18310,19 +18332,6 @@ struct llama_context * llama_new_context_with_model(
         }
 #endif

-#if defined(GGML_USE_RPC)
-        if (model->n_gpu_layers > 0) {
-            for (const auto & endpoint : model->rpc_servers) {
-                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-#endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
