Commit 89b7127

llama : decide to disable Vulkan before loading tensors (ggml-org#7)
1 parent 1c17010

File tree

1 file changed: llama.cpp (19 additions, 9 deletions)
@@ -2407,7 +2407,7 @@ static bool llama_model_load(
         llama_model & model,
         int n_ctx,
         int n_batch,
-        int n_gpu_layers,
+        int * n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
         const bool mul_mat_q,
@@ -2438,8 +2438,23 @@ static bool llama_model_load(
             return true;
         }

+#ifdef GGML_USE_KOMPUTE
+        if (ggml_vk_has_device() && *n_gpu_layers > 0 && (
+            !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
+            || !(
+                model.ftype == LLAMA_FTYPE_ALL_F32 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
+            )
+        )) {
+            // disable Vulkan due to unsupported model architecture or quantization type
+            *n_gpu_layers = 0;
+        }
+#endif
+
        llm_load_tensors(
-            *ml, model, n_batch, n_gpu_layers,
+            *ml, model, n_batch, *n_gpu_layers,
            main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
            use_mlock, progress_callback, progress_callback_user_data);
    } catch (const std::exception & err) {
@@ -6354,7 +6369,7 @@ struct llama_model * llama_load_model_from_file(
        };
    }

-    if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers,
+    if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, &params.n_gpu_layers,
                params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,
                params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
                params.progress_callback, params.progress_callback_user_data)) {
@@ -6502,12 +6517,7 @@ struct llama_context * llama_new_context_with_model(
 #undef LLAMA_METAL_CHECK_BUF
    }
 #elif defined(GGML_USE_KOMPUTE)
-    if (ggml_vk_has_device() && params.n_gpu_layers > 0
-        && (model->arch == LLM_ARCH_LLAMA || model->arch == LLM_ARCH_FALCON)
-        && (model->ftype == LLAMA_FTYPE_ALL_F32
-            || model->ftype == LLAMA_FTYPE_MOSTLY_F16
-            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
-            || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) {
+    if (ggml_vk_has_device() && params.n_gpu_layers > 0) {
        // this allocates all Vulkan resources and memory buffers
        ctx->ctx_kompute = ggml_vk_init();

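The shape of this change: previously the architecture/ftype gate sat in llama_new_context_with_model, i.e. after llm_load_tensors had already offloaded layers to the GPU; the commit moves the decision into llama_model_load, which now takes n_gpu_layers by pointer and zeroes it up front, so the check at context creation reduces to ggml_vk_has_device() && params.n_gpu_layers > 0. Below is a minimal standalone sketch of this gate-before-allocate pattern; the ModelInfo struct, vk_supports helper, and load_model function are hypothetical stand-ins for illustration, not the real llama.cpp API.

#include <cstdio>

// Hypothetical stand-ins for model.arch / model.ftype in the real code.
enum Arch  { ARCH_LLAMA, ARCH_FALCON, ARCH_OTHER };
enum Ftype { F32, F16, Q4_0, Q4_1, Q8_0 };

struct ModelInfo { Arch arch; Ftype ftype; };

// Mirrors the commit's allow-list: only LLaMA/Falcon in F32, F16,
// Q4_0 or Q4_1 may be offloaded; everything else falls back to CPU.
static bool vk_supports(const ModelInfo & m) {
    const bool arch_ok  = m.arch == ARCH_LLAMA || m.arch == ARCH_FALCON;
    const bool ftype_ok = m.ftype == F32 || m.ftype == F16 ||
                          m.ftype == Q4_0 || m.ftype == Q4_1;
    return arch_ok && ftype_ok;
}

// Out-parameter pattern from the diff: the loader may clear the
// caller's layer count *before* any tensors are placed on the GPU.
static void load_model(const ModelInfo & m, int * n_gpu_layers) {
    if (*n_gpu_layers > 0 && !vk_supports(m)) {
        *n_gpu_layers = 0; // decide before loading tensors
    }
    std::printf("loading with %d GPU layers\n", *n_gpu_layers);
}

int main() {
    int n_gpu_layers = 32;
    ModelInfo m { ARCH_OTHER, Q8_0 };
    load_model(m, &n_gpu_layers); // prints: loading with 0 GPU layers
    return 0;
}

The out-parameter is what lets the caller observe that offload was declined (llama_load_model_from_file now passes &params.n_gpu_layers), rather than discovering it only after tensors were placed on an unusable device.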
