
Commit 6c276de

llama : offload to RPC in addition to other backends

Parent: 7846540

2 files changed: +53 -38 lines

ggml-backend.c (3 additions, 1 deletion)

```diff
@@ -321,7 +321,9 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
         ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
     } else if (ggml_backend_buffer_is_host(dst->buffer)) {
         ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
-    } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
+    }
+    bool same_backend = strcmp(ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)) == 0;
+    if (!same_backend || !ggml_backend_buffer_copy_tensor(src, dst)) {
 #ifndef NDEBUG
         fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
 #endif
```
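For context: when the source and destination tensors live on buffers from different backends (for example a CUDA buffer and an RPC buffer), or when `ggml_backend_buffer_copy_tensor` cannot handle the pair, the copy has to be staged through host memory, which is what the "slow copy" warning refers to. Below is a minimal sketch of such a staged copy using only the generic ggml-backend tensor accessors; it is an illustration of the fallback idea, not the literal fallback body in ggml-backend.c.

```cpp
// Hedged sketch: stage a cross-backend tensor copy through a host buffer.
// ggml_backend_tensor_get/set work for any backend, which is why this
// path is slower than a direct backend-to-backend copy.
#include "ggml-backend.h"
#include <cstdint>
#include <vector>

static void slow_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
    const size_t nbytes = ggml_nbytes(src);
    std::vector<uint8_t> staging(nbytes);
    ggml_backend_tensor_get(src, staging.data(), 0, nbytes); // src backend -> host
    ggml_backend_tensor_set(dst, staging.data(), 0, nbytes); // host -> dst backend
}
```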

llama.cpp (50 additions, 37 deletions)

```diff
@@ -2369,13 +2369,34 @@ struct llama_context {
     struct llama_control_vector cvec;
 };
 
+static size_t llama_get_device_count(const llama_model & model) {
+    size_t count = 1;
+#if defined(GGML_USE_CUDA)
+    count = ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    count = ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    count = ggml_backend_vk_get_device_count();
+#endif
+#if defined(GGML_USE_RPC)
+    count += model.rpc_servers.size();
+#endif
+    return count;
+    GGML_UNUSED(model);
+}
+
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     ggml_backend_buffer_type_t buft = nullptr;
 
-#ifdef GGML_USE_RPC
-    std::string endpoint = model.rpc_servers[gpu];
-    buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (gpu >= dev_count - rpc_count) {
+        const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+        return ggml_backend_rpc_buffer_type(endpoint);
+    }
+#endif
+#if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
     buft = ggml_backend_cuda_buffer_type(gpu);
```
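The effect of the new helper is that RPC endpoints are appended after the local devices instead of replacing them: with `dev_count` total devices and `rpc_count` endpoints, indices `0 .. dev_count - rpc_count - 1` are local GPUs and the remaining indices map onto `model.rpc_servers`. A small self-contained sketch of that index math (a standalone illustration with made-up device counts and endpoints, not llama.cpp code):

```cpp
// Illustration of the device index layout introduced by this commit.
// Example: 2 local GPUs + 2 RPC servers -> dev_count = 4, rpc_count = 2,
// so devices 0-1 are local and devices 2-3 map to rpc_servers[0..1].
#include <cstdio>
#include <string>
#include <vector>

int main() {
    const int local_gpus = 2; // e.g. what ggml_backend_cuda_get_device_count() would return
    const std::vector<std::string> rpc_servers = { // hypothetical endpoints
        "192.168.1.10:50052", "192.168.1.11:50052",
    };
    const int rpc_count = (int) rpc_servers.size();
    const int dev_count = local_gpus + rpc_count; // what llama_get_device_count() now returns

    for (int gpu = 0; gpu < dev_count; gpu++) {
        if (gpu >= dev_count - rpc_count) {
            // same index math as llama_default_buffer_type_offload()
            printf("device %d -> RPC %s\n", gpu, rpc_servers[gpu - dev_count + rpc_count].c_str());
        } else {
            printf("device %d -> local GPU %d\n", gpu, gpu);
        }
    }
    return 0;
}
```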
```diff
@@ -2423,29 +2444,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
     GGML_UNUSED(tensor_split);
 }
 
-static size_t llama_get_device_count(const llama_model & model) {
-#if defined(GGML_USE_RPC)
-    return model.rpc_servers.size();
-#elif defined(GGML_USE_CUDA)
-    return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    return ggml_backend_vk_get_device_count();
-#else
-    return 1;
-#endif
-    GGML_UNUSED(model);
-}
-
 static size_t llama_get_device_memory(const llama_model & model, int device) {
 #if defined(GGML_USE_RPC)
-    size_t total;
-    size_t free;
-    std::string endpoint = model.rpc_servers[device];
-    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
-    return free;
-#elif defined(GGML_USE_CUDA)
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (device >= dev_count - rpc_count) {
+        size_t total;
+        size_t free;
+        const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+        ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
+        return free;
+    }
+#endif
+#if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &free, &total);
```
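For RPC devices, `llama_get_device_memory` now returns the free memory reported by the remote server, so remote hosts can take part in the same memory-proportional layer split as local GPUs when no explicit tensor split is supplied. A rough standalone sketch of that kind of proportional split (illustrative only, with made-up memory sizes; the actual splitting logic lives in llama.cpp's tensor-loading code):

```cpp
// Illustrative sketch: distribute n_layers across devices in proportion to
// each device's reported free memory (values here are hypothetical).
#include <cstdio>
#include <vector>

int main() {
    const int n_layers = 32;
    const std::vector<size_t> free_mem = {
        16ull << 30, // local GPU: 16 GiB free
        24ull << 30, // RPC server A: 24 GiB free
         8ull << 30, // RPC server B: 8 GiB free
    };

    size_t total = 0;
    for (size_t m : free_mem) total += m;

    int assigned = 0;
    for (size_t i = 0; i < free_mem.size(); i++) {
        const bool last = (i + 1 == free_mem.size());
        const int layers = last
            ? n_layers - assigned // give the remainder to the last device
            : (int) ((double) free_mem[i] / (double) total * n_layers);
        assigned += layers;
        printf("device %zu: %d layers\n", i, layers);
    }
    return 0;
}
```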
```diff
@@ -16146,7 +16157,7 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-    if (params.rpc_servers != nullptr) {
+    if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
         // split the servers set them into model->rpc_servers
         std::string servers(params.rpc_servers);
         size_t pos = 0;
```
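The added `params.rpc_servers[0] != '\0'` check treats an empty string the same as a null pointer, so an empty endpoint list no longer registers a bogus RPC device. The splitting loop itself is outside this hunk; below is a minimal sketch of how a comma-separated endpoint string can be split into a vector (a hypothetical helper, not the exact llama.cpp loop):

```cpp
// Hedged sketch: split "host1:port1,host2:port2" into individual endpoints.
#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string> split_rpc_servers(const std::string & servers) {
    std::vector<std::string> out;
    size_t pos = 0;
    size_t next;
    while ((next = servers.find(',', pos)) != std::string::npos) {
        out.push_back(servers.substr(pos, next - pos));
        pos = next + 1;
    }
    if (pos < servers.size()) {
        out.push_back(servers.substr(pos)); // last (or only) endpoint
    }
    return out;
}

int main() {
    for (const auto & ep : split_rpc_servers("192.168.1.10:50052,192.168.1.11:50052")) {
        printf("endpoint: %s\n", ep.c_str());
    }
    return 0;
}
```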
```diff
@@ -16304,17 +16315,7 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
-#if defined(GGML_USE_RPC)
-        for (auto & server : model->rpc_servers) {
-            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        }
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
             if (ctx->backend_metal == nullptr) {
```
```diff
@@ -16406,6 +16407,18 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
+#endif
+#if defined(GGML_USE_RPC)
+        for (int i = 0; i < (int)model->rpc_servers.size(); i++) {
+            const char * endpoint = model->rpc_servers[i].c_str();
+            ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
 #endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
```
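With this change, RPC backends are initialized in addition to whatever local GPU backend the build was compiled with, rather than replacing it. A hedged sketch of how the endpoints reach this code through the public C API; the model path and endpoint addresses are made up, and the field `rpc_servers` is the comma-separated list that `llama_load_model_from_file` splits above:

```cpp
// Minimal usage sketch (assumed API shape from llama.h of this period):
// pass RPC endpoints via llama_model_params and load the model as usual.
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                                       // offload as many layers as possible
    mparams.rpc_servers  = "192.168.1.10:50052,192.168.1.11:50052";  // hypothetical endpoints

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    // ... run inference ...
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```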
