Skip to content

Commit 9330ff0

Browse files
committed
Reduce memory usage and allocate enough memory for large contexts
1 parent f4f5362 commit 9330ff0

File tree

5 files changed

+372
-80
lines changed

5 files changed

+372
-80
lines changed

ggml.c

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5846,7 +5846,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
5846 5846
const struct ggml_tensor * src0,
5847 5847
const struct ggml_tensor * src1,
5848 5848
struct ggml_tensor * dst) {
5849-
UNUSED(src0);
5849+
const int ne00 = src0->ne[0];
5850+
const int ne01 = src0->ne[1];
5850 5851

5851 5852
const int ne10 = src1->ne[0];
5852 5853

@@ -5856,7 +5857,14 @@ static bool ggml_compute_forward_mul_mat_use_blas(
5856 5857
// TODO: find the optimal values for these
5857 5858
if (ggml_is_contiguous(src0) &&
5858 5859
ggml_is_contiguous(src1) && ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
5859-
//printf("BLAS: %d %d %d\n", ne0, ne1, ne10);
5860+
5861+
// disable BLAS for Q4_0 and Q4_1
5862+
// looks like there is no benefit and we only waste a lot of memory
5863+
if (src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1) {
5864+
return false;
5865+
}
5866+
5867+
//printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);
5860 5868
return true;
5861 5869
}
5862 5870

0 commit comments

Comments
 (0)