@@ -3346,37 +3346,41 @@ static size_t llama_get_device_count(const llama_model & model) {
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     ggml_backend_buffer_type_t buft = nullptr;
 
-#if defined(GGML_USE_RPC)
-    int dev_count = (int)llama_get_device_count(model);
+#ifdef GGML_USE_RPC
     int rpc_count = (int)model.rpc_servers.size();
-    if (gpu >= dev_count - rpc_count) {
-        const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+#else
+    int rpc_count = 0;
+#endif
+    int local_gpu = gpu - rpc_count;
+#if defined(GGML_USE_RPC)
+    if (gpu < rpc_count) {
+        const char * endpoint = model.rpc_servers[gpu].c_str();
         return ggml_backend_rpc_buffer_type(endpoint);
     }
 #endif
 #if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
-    buft = ggml_backend_cuda_buffer_type(gpu);
+    buft = ggml_backend_cuda_buffer_type(local_gpu);
 #elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(gpu);
+    buft = ggml_backend_vk_buffer_type(local_gpu);
 #elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_buffer_type(gpu);
+    buft = ggml_backend_sycl_buffer_type(local_gpu);
 #elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type(gpu);
+    buft = ggml_backend_kompute_buffer_type(local_gpu);
     if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
     }
 #elif defined(GGML_USE_CANN)
-    buft = ggml_backend_cann_buffer_type(gpu);
+    buft = ggml_backend_cann_buffer_type(local_gpu);
 #endif
 
     if (buft == nullptr) {
         buft = llama_default_buffer_type_cpu(true);
     }
     return buft;
     GGML_UNUSED(model);
-    GGML_UNUSED(gpu);
+    GGML_UNUSED(local_gpu);
 }
 
 static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
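
// A minimal sketch (not part of the diff) of the device indexing this hunk introduces:
// RPC servers now occupy the first rpc_count device slots and local backends follow,
// so the local index is gpu - rpc_count. The helper describe_device is hypothetical
// and exists only to illustrate the mapping.
#include <string>
static std::string describe_device(const llama_model & model, int gpu) {
    int rpc_count = (int) model.rpc_servers.size();           // RPC devices come first
    if (gpu < rpc_count) {
        return "rpc:" + model.rpc_servers[gpu];                // endpoint string, indices [0, rpc_count)
    }
    return "local gpu " + std::to_string(gpu - rpc_count);     // same local_gpu as in the hunk above
}
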
@@ -3403,42 +3407,46 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
 }
 
 static size_t llama_get_device_memory(const llama_model & model, int device) {
-#if defined(GGML_USE_RPC)
-    int dev_count = (int)llama_get_device_count(model);
+#ifdef GGML_USE_RPC
     int rpc_count = (int)model.rpc_servers.size();
-    if (device >= dev_count - rpc_count) {
+#else
+    int rpc_count = 0;
+#endif
+    int local_device = device - rpc_count;
+#if defined(GGML_USE_RPC)
+    if (device < rpc_count) {
         size_t total;
         size_t free;
-        const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+        const char * endpoint = model.rpc_servers[device].c_str();
         ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
         return free;
     }
 #endif
 #if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
-    ggml_backend_cuda_get_device_memory(device, &free, &total);
+    ggml_backend_cuda_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(GGML_USE_SYCL)
     size_t total;
     size_t free;
-    ggml_backend_sycl_get_device_memory(device, &free, &total);
+    ggml_backend_sycl_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
+    ggml_backend_vk_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(GGML_USE_CANN)
     size_t total;
     size_t free;
-    ggml_backend_cann_get_device_memory(device, &free, &total);
+    ggml_backend_cann_get_device_memory(local_device, &free, &total);
     return free;
 #else
     return 1;
 #endif
     GGML_UNUSED(model);
-    GGML_UNUSED(device);
+    GGML_UNUSED(local_device);
 }
 
 //
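
// A minimal sketch (not part of the diff) of how a caller could walk the full device
// list after this change: indices below rpc_count resolve to RPC servers, the rest to
// local GPUs, and llama_get_device_memory() hides that split. The helper
// log_device_memory and its log format are illustrative only.
static void log_device_memory(const llama_model & model) {
    int n_dev = (int) llama_get_device_count(model);
    for (int i = 0; i < n_dev; ++i) {
        size_t free = llama_get_device_memory(model, i);       // RPC devices first, then local GPUs
        LLAMA_LOG_INFO("device %d: %zu bytes free\n", i, free);
    }
}
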
@@ -18186,6 +18194,20 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
+#if defined(GGML_USE_RPC)
+        if (model->n_gpu_layers > 0) {
+            for (const auto & endpoint : model->rpc_servers) {
+                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        }
+#endif
+
 #if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
@@ -18310,19 +18332,6 @@ struct llama_context * llama_new_context_with_model(
         }
 #endif
 
-#if defined(GGML_USE_RPC)
-        if (model->n_gpu_layers > 0) {
-            for (const auto & endpoint : model->rpc_servers) {
-                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-#endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
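
// A minimal sketch (not part of the diff) of the backend order produced by moving the
// RPC initialization ahead of the other GPU backends, presumably so that ctx->backends
// lines up with the RPC-first device indexing used earlier in the file. The helper
// log_backend_order is hypothetical and only prints the resulting order.
static void log_backend_order(const llama_context * ctx) {
    for (size_t i = 0; i < ctx->backends.size(); ++i) {
        // with this change: RPC backends first, then local GPU backend(s), CPU backend last
        LLAMA_LOG_INFO("backend %zu: %s\n", i, ggml_backend_name(ctx->backends[i]));
    }
}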