@@ -3346,37 +3346,37 @@ static size_t llama_get_device_count(const llama_model & model) {
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     ggml_backend_buffer_type_t buft = nullptr;
 
-#if defined(GGML_USE_RPC)
-    int dev_count = (int)llama_get_device_count(model);
     int rpc_count = (int)model.rpc_servers.size();
-    if (gpu >= dev_count - rpc_count) {
-        const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+    int local_gpu = gpu - rpc_count;
+#if defined(GGML_USE_RPC)
+    if (gpu < rpc_count) {
+        const char * endpoint = model.rpc_servers[gpu].c_str();
         return ggml_backend_rpc_buffer_type(endpoint);
     }
 #endif
 #if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
-    buft = ggml_backend_cuda_buffer_type(gpu);
+    buft = ggml_backend_cuda_buffer_type(local_gpu);
 #elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(gpu);
+    buft = ggml_backend_vk_buffer_type(local_gpu);
 #elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_buffer_type(gpu);
+    buft = ggml_backend_sycl_buffer_type(local_gpu);
 #elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type(gpu);
+    buft = ggml_backend_kompute_buffer_type(local_gpu);
     if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
     }
 #elif defined(GGML_USE_CANN)
-    buft = ggml_backend_cann_buffer_type(gpu);
+    buft = ggml_backend_cann_buffer_type(local_gpu);
 #endif
 
     if (buft == nullptr) {
         buft = llama_default_buffer_type_cpu(true);
     }
     return buft;
     GGML_UNUSED(model);
-    GGML_UNUSED(gpu);
+    GGML_UNUSED(local_gpu);
 }
 
 static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
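The hunk above switches llama_default_buffer_type_offload() from treating RPC servers as the last devices to treating them as the first: an index gpu below rpc_count selects an RPC buffer type, and every other backend receives local_gpu = gpu - rpc_count. Below is a minimal standalone sketch of that mapping; the helper names and device counts are made up for illustration and are not part of the patch.

#include <cstdio>

// Hypothetical helpers mirroring the `gpu < rpc_count` test and the
// `gpu - rpc_count` remap used in the hunk above.
static bool is_rpc_device(int gpu, int rpc_count) {
    return gpu < rpc_count;
}

static int local_index(int gpu, int rpc_count) {
    return gpu - rpc_count; // only meaningful when !is_rpc_device()
}

int main() {
    const int rpc_count = 2; // e.g. two --rpc endpoints were given
    for (int gpu = 0; gpu < 5; gpu++) {
        if (is_rpc_device(gpu, rpc_count)) {
            printf("device %d -> rpc_servers[%d]\n", gpu, gpu);
        } else {
            printf("device %d -> local GPU %d\n", gpu, local_index(gpu, rpc_count));
        }
    }
    return 0;
}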
@@ -3403,42 +3403,42 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
 }
 
 static size_t llama_get_device_memory(const llama_model & model, int device) {
-#if defined(GGML_USE_RPC)
-    int dev_count = (int)llama_get_device_count(model);
     int rpc_count = (int)model.rpc_servers.size();
-    if (device >= dev_count - rpc_count) {
+    int local_device = device - rpc_count;
+#if defined(GGML_USE_RPC)
+    if (device < rpc_count) {
         size_t total;
         size_t free;
-        const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+        const char * endpoint = model.rpc_servers[device].c_str();
         ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
         return free;
     }
 #endif
 #if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
-    ggml_backend_cuda_get_device_memory(device, &free, &total);
+    ggml_backend_cuda_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(GGML_USE_SYCL)
     size_t total;
     size_t free;
-    ggml_backend_sycl_get_device_memory(device, &free, &total);
+    ggml_backend_sycl_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
+    ggml_backend_vk_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(GGML_USE_CANN)
     size_t total;
     size_t free;
-    ggml_backend_cann_get_device_memory(device, &free, &total);
+    ggml_backend_cann_get_device_memory(local_device, &free, &total);
     return free;
 #else
     return 1;
 #endif
     GGML_UNUSED(model);
-    GGML_UNUSED(device);
+    GGML_UNUSED(local_device);
 }
 
 //
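llama_get_device_memory() gets the same treatment as the buffer-type helper: the old code probed the tail of the device list (device >= dev_count - rpc_count), while the new code probes the head (device < rpc_count) and shifts local queries down by rpc_count. A small sketch contrasting the two layouts, with made-up device counts:

#include <cstdio>

int main() {
    const int rpc_count   = 2; // hypothetical number of --rpc endpoints
    const int local_count = 3; // hypothetical number of local GPUs
    const int dev_count   = local_count + rpc_count;

    for (int device = 0; device < dev_count; device++) {
        // old layout: local GPUs first, RPC servers appended at the end
        const bool old_is_rpc = device >= dev_count - rpc_count;
        // new layout: RPC servers first, local GPUs shifted up by rpc_count
        const bool new_is_rpc = device < rpc_count;
        printf("device %d: old layout -> %s, new layout -> %s\n",
               device,
               old_is_rpc ? "rpc" : "local",
               new_is_rpc ? "rpc" : "local");
    }
    return 0;
}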
@@ -18188,6 +18188,20 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
+#if defined(GGML_USE_RPC)
+        if (model->n_gpu_layers > 0) {
+            for (const auto & endpoint : model->rpc_servers) {
+                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        }
+#endif
+
 #if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
@@ -18312,19 +18326,6 @@ struct llama_context * llama_new_context_with_model(
         }
 #endif
 
-#if defined(GGML_USE_RPC)
-        if (model->n_gpu_layers > 0) {
-            for (const auto & endpoint : model->rpc_servers) {
-                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-#endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
            LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);