
Commit 8406b09

ggml : re-enable BLAS for CPU when src0 != F32 + remove redundant full offload checks in llama.cpp (#4240)
* ggml : use blas even if src0 is not F32

* llama : use n_threads_batch only when n_tokens >= 32

ggml-ci

* llama : revert n_threads_batch logic

ggml-ci
1 parent b38a16d commit 8406b09

File tree

2 files changed: +2 additions, -12 deletions

ggml.c

Lines changed: 1 addition & 1 deletion
@@ -9373,7 +9373,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     // TODO: find the optimal values for these
     if (ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) &&
-        src0->type == GGML_TYPE_F32 &&
+      //src0->type == GGML_TYPE_F32 &&
         src1->type == GGML_TYPE_F32 &&
         (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
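
Dropping the F32 requirement on src0 is safe because ggml's BLAS branch converts a non-F32 src0 into an F32 scratch buffer before calling sgemm; only src1 still has to arrive as F32. Below is a minimal sketch of that dequantize-then-GEMM pattern; `mul_mat_blas` and the `to_float` callback are illustrative names, not the actual ggml API.

// Illustrative sketch only: dequantize src0 to F32, then run a plain SGEMM.
#include <cblas.h>
#include <cstdint>
#include <vector>

// Shape names mirror the diff: ne00 = src0 row size (K), ne01 = src0 rows (N),
// ne11 = src1 rows (M). to_float stands in for the per-type conversion
// routine that a quantized or F16 src0 would need.
void mul_mat_blas(const void * src0, int64_t ne00, int64_t ne01,
                  const float * src1, int64_t ne11,
                  float * dst,
                  void (*to_float)(const void * x, float * y, int64_t n)) {
    std::vector<float> src0_f32((size_t) (ne00 * ne01));
    to_float(src0, src0_f32.data(), ne00 * ne01);  // src0 -> F32 scratch

    // dst (M x N) = src1 (M x K) * src0_f32^T (N x K stored row-major)
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                (int) ne11, (int) ne01, (int) ne00,
                1.0f, src1,            (int) ne00,
                      src0_f32.data(), (int) ne00,
                0.0f, dst,             (int) ne01);
}

The one-off conversion only pays off on large matrices, which is why the size thresholds (ne0, ne1, ne10 all >= 32) remain the gate for taking the BLAS path.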

llama.cpp

Lines changed: 1 addition & 11 deletions
@@ -5550,18 +5550,8 @@ static int llama_decode_internal(
         n_threads = std::min(4, n_threads);
     }

-    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
-    const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA    ||
-        model.arch == LLM_ARCH_BAICHUAN ||
-        model.arch == LLM_ARCH_FALCON   ||
-        model.arch == LLM_ARCH_REFACT   ||
-        model.arch == LLM_ARCH_MPT      ||
-        model.arch == LLM_ARCH_STARCODER ||
-        model.arch == LLM_ARCH_STABLELM;
-
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
-    if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+    if (ggml_cpu_has_cublas() && fully_offloaded) {
         n_threads = 1;
     }

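With the per-architecture whitelist removed, the remaining rule is simply: if cuBLAS is available and the whole model is offloaded, run the CPU side single-threaded. A condensed restatement of the surviving logic (`pick_n_threads` is a hypothetical helper; in llama.cpp the check runs inline in `llama_decode_internal`):

// Hypothetical helper restating the surviving condition.
int pick_n_threads(int n_threads, bool has_cublas,
                   int n_gpu_layers, int n_layer) {
    // Matches the diff's fully_offloaded check; the extra 3 appears to
    // cover the non-repeating tensors beyond the transformer layers.
    const bool fully_offloaded = n_gpu_layers >= n_layer + 3;
    if (has_cublas && fully_offloaded) {
        return 1;  // GPU does the work; extra CPU threads only add overhead
    }
    return n_threads;
}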