Commit 6df465a

llama : run all KQV ops on the CPU with no KV offload (#5049)
ggml-ci
1 parent: 77bc1bb

File tree

2 files changed: +99, -80 lines


ggml-backend.c

Lines changed: 20 additions & 14 deletions
```diff
@@ -1191,6 +1191,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                 ggml_tallocr_t src_allocr = node_allocr(src);
                 GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
                 if (src_allocr != node_allocr) {
+                    // create a copy of the input in the split's backend
+                    size_t id = hash_id(src);
+                    if (sched->node_copies[id][cur_backend_id] == NULL) {
+                        ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
+                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
+
+                        sched->node_copies[id][cur_backend_id] = tensor_copy;
+                        node_allocr(tensor_copy) = cur_allocr;
+                        SET_CAUSE(tensor_copy, "4.cpy");
+
+                        int n_inputs = sched->splits[cur_split].n_inputs++;
+                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                        sched->splits[cur_split].inputs[n_inputs] = src;
+                    }
+                    node->src[j] = sched->node_copies[id][cur_backend_id];
+
+#if 0
                     // check if the input is already in the split
                     bool found = false;
                     for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
@@ -1206,19 +1224,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                         GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
                         sched->splits[cur_split].inputs[n_inputs] = src;
                     }
-
-                    // create a copy of the input in the split's backend
-                    size_t id = hash_id(src);
-                    if (sched->node_copies[id][cur_backend_id] == NULL) {
-                        ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
-                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
-                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
-
-                        sched->node_copies[id][cur_backend_id] = tensor_copy;
-                        node_allocr(tensor_copy) = cur_allocr;
-                        SET_CAUSE(tensor_copy, "4.cpy");
-                    }
-                    node->src[j] = sched->node_copies[id][cur_backend_id];
+#endif
                 }
             }
         }
@@ -1333,7 +1339,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         uint64_t compute_start_us = ggml_time_us();
         if (!sched->callback_eval) {
             ggml_backend_graph_compute(split_backend, &split->graph);
-           //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
         } else {
             // similar to ggml_backend_compare_graph_backend
             for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
```
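In short, `sched_split_graph` now creates the per-backend copy of a cross-backend input up front, keyed by `hash_id(src)` and the current backend id, and registers the input with the split only on the first miss; the old linear scan over the split's input list is kept under `#if 0`. Below is a minimal sketch of that cache-then-register pattern with hypothetical names (`copy_cache`, `get_or_create_copy`) and simplified types standing in for the real scheduler state; the real code indexes a hash table, while the sketch uses a linear scan to stay self-contained:

```c
#include <stddef.h>
#include <string.h>

// Hypothetical simplified mirror of sched->node_copies: one cached copy per
// (tensor, backend) pair. Opaque void pointers stand in for ggml tensors.
#define MAX_TENSORS  256
#define MAX_BACKENDS 4

struct copy_cache {
    const void * src [MAX_TENSORS];                // original tensor
    void       * copy[MAX_TENSORS][MAX_BACKENDS];  // one cached copy per backend
    int          n;
};

// Return the cached copy of `src` for `backend_id`, creating it on first use.
// This mirrors the new control flow: copy creation (and, in the real code,
// registering the input with the current split) happens only on a cache miss,
// instead of rescanning the split's input list on every reference.
static void * get_or_create_copy(struct copy_cache * cc, const void * src,
                                 int backend_id, void * (*make_copy)(const void *)) {
    for (int i = 0; i < cc->n; i++) {
        if (cc->src[i] != src) {
            continue;
        }
        if (cc->copy[i][backend_id] == NULL) {
            cc->copy[i][backend_id] = make_copy(src); // first use on this backend
        }
        return cc->copy[i][backend_id];
    }
    int i = cc->n++;                                  // first sighting of this tensor
    cc->src[i] = src;
    memset(cc->copy[i], 0, sizeof cc->copy[i]);
    cc->copy[i][backend_id] = make_copy(src);
    return cc->copy[i][backend_id];
}
```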

llama.cpp

Lines changed: 79 additions & 66 deletions
```diff
@@ -4315,6 +4315,7 @@ static struct ggml_tensor * llm_build_kqv(
           const llama_model & model,
         const llama_hparams & hparams,
         const llama_kv_cache & kv,
+         struct ggml_cgraph * graph,
          struct ggml_tensor * wo,
          struct ggml_tensor * wo_b,
          struct ggml_tensor * q_cur,
@@ -4393,6 +4394,8 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
     cb(cur, "kqv_merged_cont", il);

+    ggml_build_forward_expand(graph, cur);
+
     cur = ggml_mul_mat(ctx, wo, cur);
     if (wo_b) {
         cb(cur, "kqv_wo", il);
```
```diff
@@ -4405,6 +4408,44 @@ static struct ggml_tensor * llm_build_kqv(
     return cur;
 }

+static struct ggml_tensor * llm_build_kv(
+        struct ggml_context * ctx,
+          const llama_model & model,
+        const llama_hparams & hparams,
+        const llama_kv_cache & kv,
+         struct ggml_cgraph * graph,
+         struct ggml_tensor * wo,
+         struct ggml_tensor * wo_b,
+         struct ggml_tensor * k_cur,
+         struct ggml_tensor * v_cur,
+         struct ggml_tensor * q_cur,
+         struct ggml_tensor * kq_mask,
+                    int64_t   n_ctx,
+                    int32_t   n_tokens,
+                    int32_t   kv_head,
+                    int32_t   n_kv,
+                    float     max_alibi_bias,
+                    float     kq_scale,
+         const llm_build_cb & cb,
+                    int       il) {
+
+    // these nodes are added to the graph together so that they are not reordered
+    // by doing so, the number of splits in the graph is reduced
+    ggml_build_forward_expand(graph, k_cur);
+    ggml_build_forward_expand(graph, v_cur);
+    ggml_build_forward_expand(graph, q_cur);
+
+    llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
+
+    struct ggml_tensor * cur;
+    cur = llm_build_kqv(ctx, model, hparams, kv, graph,
+            wo, wo_b,
+            q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
+    cb(cur, "kqv_out", il);
+
+    return cur;
+}
+
 struct llm_build_context {
     const llama_model    & model;
     const llama_hparams  & hparams;
```
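All of the call-site hunks that follow apply the same mechanical rewrite, so the pattern is worth seeing once in isolation (identifiers as in the diff, surrounding layer code elided):

```c
// before: two separate helpers; the scheduler could interleave other nodes
// between the KV store and the attention ops
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
        model.layers[il].wo, model.layers[il].bo,
        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);

// after: one fused call; llm_build_kv expands K/V/Q into the graph first,
// then records the KV store and the attention ops back to back
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
        model.layers[il].wo, model.layers[il].bo,
        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
```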
```diff
@@ -4562,12 +4603,6 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
             }

-            // these nodes are added to the graph together so that they are not reordered
-            // by doing so, the number of splits in the graph is reduced
-            ggml_build_forward_expand(gf, Qcur);
-            ggml_build_forward_expand(gf, Kcur);
-            ggml_build_forward_expand(gf, Vcur);
-
             Qcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                 hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
@@ -4582,11 +4617,9 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);

-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }

@@ -4763,14 +4796,13 @@ struct llm_build_context {
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);

-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

             // apply ALiBi for 13B model
             const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;

-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }

@@ -4892,11 +4924,9 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);

-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }

@@ -4993,11 +5023,9 @@ struct llm_build_context {

             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }

@@ -5200,12 +5228,9 @@ struct llm_build_context {
             );
             cb(Vcur, "Vcur", il);

-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            // TODO: not tested, could be broken
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }

@@ -5292,11 +5317,9 @@ struct llm_build_context {
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             cb(Qcur, "Qcur", il);

-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }

@@ -5390,11 +5413,9 @@ struct llm_build_context {

             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }

@@ -5485,11 +5506,9 @@ struct llm_build_context {

             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }

@@ -5597,11 +5616,9 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);

-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }

@@ -5714,11 +5731,9 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);

-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }

@@ -5837,11 +5852,9 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);

-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }

@@ -5966,11 +5979,9 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);

-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
             cb(cur, "kqv_out", il);
         }

@@ -6071,11 +6082,9 @@ struct llm_build_context {
                     ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Kcur, "Kcur", il);

-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }
         struct ggml_tensor * sa_out = cur;
@@ -6172,11 +6181,9 @@ struct llm_build_context {

             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }

@@ -6283,11 +6290,9 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);

-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }

@@ -6355,6 +6360,14 @@ static struct ggml_cgraph * llama_build_graph(
             ggml_set_name(cur, name);
         }

+
+        if (!lctx.cparams.offload_kqv) {
+            if (strcmp(name, "kqv_merged_cont") == 0) {
+                // all nodes between the KV store and the attention output are run on the CPU
+                ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
+            }
+        }
+
         //
         // allocate input tensors and set input data
         //
```
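When `offload_kqv` is disabled, the graph-build callback pins the `kqv_merged_cont` node to the CPU backend; combined with the `ggml_build_forward_expand` boundaries above, the scheduler then runs the whole range from the KV store to the attention output on the CPU. A minimal sketch of the same pinning pattern as a free function, assuming `sched` and `backend_cpu` were created elsewhere (e.g. via `ggml_backend_sched_new` and `ggml_backend_cpu_init`); the helper name `pin_kqv_to_cpu` is hypothetical:

```c
#include <string.h>
#include "ggml-backend.h"

// Pin the attention output to the CPU backend. Assigning this one node forces
// the scheduler to place it, and the ops grouped behind it in the graph, into
// a CPU split instead of the offloaded backend.
static void pin_kqv_to_cpu(ggml_backend_sched_t sched, ggml_backend_t backend_cpu,
                           struct ggml_tensor * cur, bool offload_kqv) {
    if (!offload_kqv && strcmp(cur->name, "kqv_merged_cont") == 0) {
        ggml_backend_sched_set_node_backend(sched, cur, backend_cpu);
    }
}
```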
