
Commit e50ab5a

llama : increase inference graph size up to 4096 nodes
1 parent b1592ea

File tree (1 file changed: +11 −9 lines)

llama.cpp

Lines changed: 11 additions & 9 deletions
@@ -91,6 +91,8 @@
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif
 
+#define LLAMA_MAX_NODES 4096
+
 //
 // logging
 //
@@ -3580,7 +3582,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_llama() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
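The same one-line substitution is applied to each graph builder below: the default-sized graph is replaced by a custom-sized one. As a point of reference, a minimal sketch of the two call sites, assuming the ggml API of this period, where ggml_new_graph(ctx) is shorthand for a graph with GGML_DEFAULT_GRAPH_SIZE (2048) nodes and no gradient storage:

    // before: node capacity capped at GGML_DEFAULT_GRAPH_SIZE (2048)
    struct ggml_cgraph * gf_old = ggml_new_graph(ctx0);
    // equivalent to: ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, false);

    // after: capacity raised to LLAMA_MAX_NODES (4096); the final argument
    // remains false because inference graphs allocate no gradient tensors
    struct ggml_cgraph * gf_new = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
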
@@ -3692,7 +3694,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_baichuan() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -3812,7 +3814,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_falcon() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -3934,7 +3936,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_starcoder() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -4033,7 +4035,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_persimmon() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_rot = n_embd_head / 2;
 
@@ -4243,7 +4245,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_refact() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4334,7 +4336,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_bloom() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4428,7 +4430,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_mpt() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -8169,7 +8171,7 @@ struct llama_context * llama_new_context_with_model(
         {
             static const size_t tensor_alignment = 32;
             // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
-            ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
+            ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
 
             // create measure allocator
             ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
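Doubling the node capacity also doubles the per-node metadata the compute buffer must hold, hence the matching resize above; without it, building a 4096-node graph would overflow a buffer sized for GGML_DEFAULT_GRAPH_SIZE. A minimal sketch of the sizing arithmetic, using only the ggml calls already present in this diff (the byte counts are runtime values, so the program simply prints them):

    #include <stdio.h>
    #include "ggml.h"

    #define LLAMA_MAX_NODES 4096

    int main(void) {
        // per-node metadata: one ggml_object plus one ggml_tensor struct
        const size_t per_node = ggml_tensor_overhead();
        // fixed bookkeeping cost of the ggml_cgraph itself
        const size_t graph    = ggml_graph_overhead();
        printf("compute buffer: %zu bytes for %d nodes\n",
               per_node*LLAMA_MAX_NODES + graph, LLAMA_MAX_NODES);
        return 0;
    }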
