Commit dd8f089

kompute : fix fallback to CPU (ggml-org#5201)

Authored by cebtenzzre; committed by hodlen. 1 parent: a03c999.

File tree

1 file changed (+2, -2 lines)


llama.cpp

Lines changed: 2 additions & 2 deletions
@@ -4136,7 +4136,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
     }
 
 #ifdef GGML_USE_KOMPUTE
-    if (ggml_vk_has_device() && params.n_gpu_layers > 0 && (
+    if (params.n_gpu_layers > 0 && (
         !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
         || !(
             model.ftype == LLAMA_FTYPE_ALL_F32 ||
@@ -4145,8 +4145,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
             model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
         )
     )) {
-        // disable Vulkan due to unsupported model architecture or quantization type
         // TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
+        LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
         params.n_gpu_layers = 0;
     }
 #endif
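
What the change does, as read from the diff: the compatibility check is no longer gated on ggml_vk_has_device(), so any request for GPU offload (params.n_gpu_layers > 0) of an unsupported architecture or quantization now triggers the CPU fallback, and the fallback is now reported via LLAMA_LOG_WARN rather than happening silently. Below is a minimal, self-contained C++ sketch of this load-time fallback pattern; the identifiers (Arch, Ftype, backend_supports, maybe_fall_back_to_cpu) are hypothetical stand-ins for illustration, not the llama.cpp API.

// Sketch of the load-time CPU-fallback pattern in the diff above.
// All identifiers here are hypothetical; only the control flow
// mirrors the patched llama.cpp check.
#include <cstdio>

enum class Arch  { LLAMA, FALCON, OTHER };
enum class Ftype { ALL_F32, MOSTLY_F16, MOSTLY_Q4_0, MOSTLY_Q4_1, OTHER };

struct Params {
    int n_gpu_layers; // > 0 requests GPU offload
};

// Same arch/quantization whitelist as the patched condition.
static bool backend_supports(Arch arch, Ftype ftype) {
    const bool arch_ok  = arch == Arch::LLAMA || arch == Arch::FALCON;
    const bool ftype_ok = ftype == Ftype::ALL_F32     || ftype == Ftype::MOSTLY_F16 ||
                          ftype == Ftype::MOSTLY_Q4_0 || ftype == Ftype::MOSTLY_Q4_1;
    return arch_ok && ftype_ok;
}

// The essence of the fix: decide the fallback from the offload request
// alone, not from whether a device happens to be initialized already.
static void maybe_fall_back_to_cpu(Params & params, Arch arch, Ftype ftype) {
    if (params.n_gpu_layers > 0 && !backend_supports(arch, ftype)) {
        std::fprintf(stderr, "%s: disabling GPU offload: unsupported model arch or quantization\n", __func__);
        params.n_gpu_layers = 0; // load fully on CPU instead of failing
    }
}

int main() {
    Params params{32}; // ask for 32 GPU layers
    maybe_fall_back_to_cpu(params, Arch::OTHER, Ftype::MOSTLY_F16);
    return params.n_gpu_layers == 0 ? 0 : 1; // fallback expected: exit 0
}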
