@@ -2407,7 +2407,7 @@ static bool llama_model_load(
         llama_model & model,
         int n_ctx,
         int n_batch,
-        int n_gpu_layers,
+        int * n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
         const bool mul_mat_q,
@@ -2438,8 +2438,23 @@ static bool llama_model_load(
             return true;
         }
 
+#ifdef GGML_USE_KOMPUTE
+        if (ggml_vk_has_device() && *n_gpu_layers > 0 && (
+            !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
+            || !(
+                model.ftype == LLAMA_FTYPE_ALL_F32 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
+            )
+        )) {
+            // disable Vulkan due to unsupported model architecture or quantization type
+            *n_gpu_layers = 0;
+        }
+#endif
+
         llm_load_tensors(
-            *ml, model, n_batch, n_gpu_layers,
+            *ml, model, n_batch, *n_gpu_layers,
             main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
             use_mlock, progress_callback, progress_callback_user_data);
     } catch (const std::exception & err) {
@@ -6354,7 +6369,7 @@ struct llama_model * llama_load_model_from_file(
         };
     }
 
-    if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers,
+    if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, &params.n_gpu_layers,
                 params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,
                 params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
                 params.progress_callback, params.progress_callback_user_data)) {
@@ -6502,12 +6517,7 @@ struct llama_context * llama_new_context_with_model(
 #undef LLAMA_METAL_CHECK_BUF
         }
 #elif defined(GGML_USE_KOMPUTE)
-        if (ggml_vk_has_device() && params.n_gpu_layers > 0
-            && (model->arch == LLM_ARCH_LLAMA || model->arch == LLM_ARCH_FALCON)
-            && (model->ftype == LLAMA_FTYPE_ALL_F32
-                || model->ftype == LLAMA_FTYPE_MOSTLY_F16
-                || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
-                || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) {
+        if (ggml_vk_has_device() && params.n_gpu_layers > 0) {
             // this allocates all Vulkan resources and memory buffers
             ctx->ctx_kompute = ggml_vk_init();
 
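
Taken together, these hunks move the Kompute (Vulkan) support check out of llama_new_context_with_model and into llama_model_load: n_gpu_layers is now passed by pointer so it can be zeroed at load time when the backend cannot handle the model, and context creation only has to test params.n_gpu_layers > 0. As a rough sketch of the gating condition only (the helper name vk_model_is_supported is hypothetical; the enum values are the ones used in the diff):

// Hypothetical predicate equivalent to the condition added in llama_model_load:
// the Kompute backend is only used for LLaMA or Falcon models stored as
// F32, F16, Q4_0 or Q4_1; anything else falls back to CPU-only inference.
static bool vk_model_is_supported(const llama_model & model) {
    const bool arch_ok  = model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON;
    const bool ftype_ok = model.ftype == LLAMA_FTYPE_ALL_F32     ||
                          model.ftype == LLAMA_FTYPE_MOSTLY_F16  ||
                          model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
                          model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1;
    return arch_ok && ftype_ok;
}

// Usage equivalent to the #ifdef GGML_USE_KOMPUTE block above:
//     if (ggml_vk_has_device() && *n_gpu_layers > 0 && !vk_model_is_supported(model)) {
//         *n_gpu_layers = 0; // disable Vulkan offload, fall back to CPU
//     }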