
rpc : make RPC servers come first in the device list #9296

Merged
merged 3 commits on Sep 4, 2024
4 changes: 4 additions & 0 deletions common/common.cpp
@@ -1234,11 +1234,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
#endif // GGML_USE_CUDA_SYCL_VULKAN
return true;
}
+ #ifdef GGML_USE_RPC
if (arg == "--rpc") {
CHECK_ARG
params.rpc_servers = argv[i];
return true;
}
+ #endif
if (arg == "--no-mmap") {
params.use_mmap = false;
return true;
@@ -1929,7 +1931,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });

options.push_back({ "backend" });
+ #ifdef GGML_USE_RPC
options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
+ #endif

if (llama_supports_mlock()) {
options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
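
For context, the hunks above only store the raw comma-separated server list in params.rpc_servers; a third-party application that uses the C API directly would forward the same list through the model parameters, which is the case the review discussion below is concerned with. A minimal sketch of that flow, assuming llama_model_params still exposes a rpc_servers string in this revision of llama.h (the model path and endpoints are placeholders):

```cpp
// Minimal sketch: forward a comma-separated RPC server list to the model
// loader through the C API. Assumes llama_model_params has a `rpc_servers`
// field in this revision of llama.h; verify against the actual header.
#include "llama.h"

#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s MODEL.gguf HOST1:PORT[,HOST2:PORT,...]\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;      // offload as many layers as possible
    mparams.rpc_servers  = argv[2]; // e.g. "192.168.1.2:50052,192.168.1.3:50052"

    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model '%s'\n", argv[1]);
        llama_backend_free();
        return 1;
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```
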
4 changes: 4 additions & 0 deletions examples/llama-bench/llama-bench.cpp
@@ -299,7 +299,9 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+ #ifdef GGML_USE_RPC
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
+ #endif
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@@ -482,12 +484,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = string_split<int>(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+ #ifdef GGML_USE_RPC
} else if (arg == "-rpc" || arg == "--rpc") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rpc_servers.push_back(argv[i]);
+ #endif
} else if (arg == "-sm" || arg == "--split-mode") {
if (++i >= argc) {
invalid_param = true;
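
Note that llama-bench pushes each --rpc argument as a single comma-separated string into params.rpc_servers; the individual endpoints still have to be split out before they can be contacted. An illustrative splitter, not the exact llama.cpp code:

```cpp
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

// Split a comma-separated endpoint list such as "host1:50052,host2:50052".
static std::vector<std::string> split_rpc_servers(const std::string & arg) {
    std::vector<std::string> endpoints;
    std::stringstream ss(arg);
    std::string item;
    while (std::getline(ss, item, ',')) {
        if (!item.empty()) {
            endpoints.push_back(item);
        }
    }
    return endpoints;
}

int main() {
    for (const auto & endpoint : split_rpc_servers("host1:50052,host2:50052")) {
        printf("%s\n", endpoint.c_str());
    }
    return 0;
}
```
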
75 changes: 42 additions & 33 deletions src/llama.cpp
@@ -3346,37 +3346,41 @@ static size_t llama_get_device_count(const llama_model & model) {
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
ggml_backend_buffer_type_t buft = nullptr;

- #if defined(GGML_USE_RPC)
- int dev_count = (int)llama_get_device_count(model);
+ #ifdef GGML_USE_RPC
int rpc_count = (int)model.rpc_servers.size();
- if (gpu >= dev_count - rpc_count) {
- const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+ #else
+ int rpc_count = 0;
+ #endif
+ int local_gpu = gpu - rpc_count;

Member:
I don't think this will work correctly if a list of RPC servers is given in a build without the RPC backend (they should be ignored). The device ids should be from 0 to llama_get_device_count() - 1.

Collaborator Author:
I think giving a list of RPC servers in a non-RPC build should produce an error, i.e. the --rpc command line option must not be available.

Member:
Fixing that in the examples is good, but it still needs to be handled properly for 3rd party applications that use the llama.cpp API directly. llama_model::rpc_servers should probably only exist in builds with the RPC backend.

Collaborator Author:
I have added ifdefs for rpc_count. If we want to remove the existence of llama_model::rpc_servers, we will need more ifdefs.

+ #if defined(GGML_USE_RPC)
+ if (gpu < rpc_count) {
+ const char * endpoint = model.rpc_servers[gpu].c_str();
return ggml_backend_rpc_buffer_type(endpoint);
}
#endif
#if defined(GGML_USE_METAL)
buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUDA)
- buft = ggml_backend_cuda_buffer_type(gpu);
+ buft = ggml_backend_cuda_buffer_type(local_gpu);
#elif defined(GGML_USE_VULKAN)
- buft = ggml_backend_vk_buffer_type(gpu);
+ buft = ggml_backend_vk_buffer_type(local_gpu);
#elif defined(GGML_USE_SYCL)
- buft = ggml_backend_sycl_buffer_type(gpu);
+ buft = ggml_backend_sycl_buffer_type(local_gpu);
#elif defined(GGML_USE_KOMPUTE)
- buft = ggml_backend_kompute_buffer_type(gpu);
+ buft = ggml_backend_kompute_buffer_type(local_gpu);
if (buft == nullptr) {
- LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+ LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
}
#elif defined(GGML_USE_CANN)
- buft = ggml_backend_cann_buffer_type(gpu);
+ buft = ggml_backend_cann_buffer_type(local_gpu);
#endif

if (buft == nullptr) {
buft = llama_default_buffer_type_cpu(true);
}
return buft;
GGML_UNUSED(model);
- GGML_UNUSED(gpu);
+ GGML_UNUSED(local_gpu);
}

static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
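
The hunk above is the core of the change: llama_default_buffer_type_offload now treats device ids 0 .. rpc_count-1 as RPC servers and maps the remaining ids onto the local backend via local_gpu = gpu - rpc_count (previously the RPC servers occupied the tail of the device range). A standalone sketch of that numbering, with hypothetical endpoints and device counts:

```cpp
// Illustrative sketch of the device numbering this hunk introduces:
// RPC servers occupy indices [0, rpc_count), local GPUs follow.
#include <cstdio>
#include <string>
#include <vector>

struct device_ref {
    bool        is_rpc;
    int         local_index; // index into the local backend, valid if !is_rpc
    std::string endpoint;    // RPC endpoint, valid if is_rpc
};

static device_ref resolve_device(const std::vector<std::string> & rpc_servers, int device) {
    const int rpc_count = (int) rpc_servers.size();
    if (device < rpc_count) {
        return { true, -1, rpc_servers[device] };
    }
    return { false, device - rpc_count, "" };
}

int main() {
    // two hypothetical RPC servers plus two local GPUs, for illustration
    const std::vector<std::string> rpc_servers = { "192.168.1.2:50052", "192.168.1.3:50052" };
    for (int dev = 0; dev < 4; dev++) {
        const device_ref ref = resolve_device(rpc_servers, dev);
        if (ref.is_rpc) {
            printf("device %d -> RPC server %s\n", dev, ref.endpoint.c_str());
        } else {
            printf("device %d -> local GPU %d\n", dev, ref.local_index);
        }
    }
    return 0;
}
```
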
@@ -3403,42 +3407,46 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
}

static size_t llama_get_device_memory(const llama_model & model, int device) {
- #if defined(GGML_USE_RPC)
- int dev_count = (int)llama_get_device_count(model);
+ #ifdef GGML_USE_RPC
int rpc_count = (int)model.rpc_servers.size();
- if (device >= dev_count - rpc_count) {
+ #else
+ int rpc_count = 0;
+ #endif
+ int local_device = device - rpc_count;
+ #if defined(GGML_USE_RPC)
+ if (device < rpc_count) {
size_t total;
size_t free;
- const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+ const char * endpoint = model.rpc_servers[device].c_str();
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
return free;
}
#endif
#if defined(GGML_USE_CUDA)
size_t total;
size_t free;
- ggml_backend_cuda_get_device_memory(device, &free, &total);
+ ggml_backend_cuda_get_device_memory(local_device, &free, &total);
return free;
#elif defined(GGML_USE_SYCL)
size_t total;
size_t free;
- ggml_backend_sycl_get_device_memory(device, &free, &total);
+ ggml_backend_sycl_get_device_memory(local_device, &free, &total);
return free;
#elif defined(GGML_USE_VULKAN)
size_t total;
size_t free;
- ggml_backend_vk_get_device_memory(device, &free, &total);
+ ggml_backend_vk_get_device_memory(local_device, &free, &total);
return free;
#elif defined(GGML_USE_CANN)
size_t total;
size_t free;
- ggml_backend_cann_get_device_memory(device, &free, &total);
+ ggml_backend_cann_get_device_memory(local_device, &free, &total);
return free;
#else
return 1;
#endif
GGML_UNUSED(model);
- GGML_UNUSED(device);
+ GGML_UNUSED(local_device);
}

//
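
llama_get_device_memory applies the same numbering and, for RPC devices, reports the free memory of the remote server via ggml_backend_rpc_get_device_memory. A small sketch of querying each configured endpoint up front; it assumes a build with GGML_USE_RPC and that the function is declared in ggml-rpc.h (endpoints below are placeholders):

```cpp
// Sketch: report free/total memory of each configured RPC server.
// Assumes a build with GGML_USE_RPC and the ggml-rpc.h header providing
// ggml_backend_rpc_get_device_memory (same signature used in the hunk above).
#include "ggml-rpc.h"

#include <cstdio>
#include <string>
#include <vector>

static void report_rpc_memory(const std::vector<std::string> & rpc_servers) {
    for (const auto & endpoint : rpc_servers) {
        size_t free_mem  = 0;
        size_t total_mem = 0;
        ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free_mem, &total_mem);
        printf("%s: %zu MiB free of %zu MiB\n",
               endpoint.c_str(), free_mem / (1024 * 1024), total_mem / (1024 * 1024));
    }
}

int main() {
    report_rpc_memory({ "192.168.1.2:50052", "192.168.1.3:50052" }); // hypothetical endpoints
    return 0;
}
```
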
@@ -18186,6 +18194,20 @@ struct llama_context * llama_new_context_with_model(

if (!hparams.vocab_only) {
// initialize backends
+ #if defined(GGML_USE_RPC)
+ if (model->n_gpu_layers > 0) {
+ for (const auto & endpoint : model->rpc_servers) {
+ ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+ if (backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+ llama_free(ctx);
+ return nullptr;
+ }
+ ctx->backends.push_back(backend);
+ }
+ }
+ #endif
+
#if defined(GGML_USE_METAL)
if (model->n_gpu_layers > 0) {
ctx->backend_metal = ggml_backend_metal_init();
@@ -18310,19 +18332,6 @@ struct llama_context * llama_new_context_with_model(
}
#endif

- #if defined(GGML_USE_RPC)
- if (model->n_gpu_layers > 0) {
- for (const auto & endpoint : model->rpc_servers) {
- ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
- if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(backend);
- }
- }
- #endif
ctx->backend_cpu = ggml_backend_cpu_init();
if (ctx->backend_cpu == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
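
The last two hunks move the RPC backend initialization from the end of llama_new_context_with_model to the front, so the RPC backends are pushed into ctx->backends before the local GPU backend, mirroring the new device order. A schematic sketch of the resulting ordering; the backend names and endpoints are placeholders, not the real initializer calls:

```cpp
// Schematic sketch of the backend initialization order after this change:
// RPC backends are pushed first, then the local GPU backend(s), and the
// CPU backend is appended last.
#include <cstdio>
#include <string>
#include <vector>

int main() {
    const std::vector<std::string> rpc_servers = { "192.168.1.2:50052", "192.168.1.3:50052" }; // hypothetical
    std::vector<std::string> backends;

    // 1. RPC backends come first, matching device indices [0, rpc_count).
    for (const auto & endpoint : rpc_servers) {
        backends.push_back("RPC:" + endpoint);
    }
    // 2. The local GPU backend(s) follow (device indices rpc_count, rpc_count + 1, ...).
    backends.push_back("CUDA:0");
    // 3. The CPU backend is appended last.
    backends.push_back("CPU");

    for (size_t i = 0; i < backends.size(); i++) {
        printf("backends[%zu] = %s\n", i, backends[i].c_str());
    }
    return 0;
}
```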