Commit 66aaac9

llama : update session save/load

1 parent e262947 commit 66aaac9

File tree

2 files changed, +50 -43 lines changed

llama.cpp

Lines changed: 49 additions & 42 deletions
@@ -1563,6 +1563,8 @@ static bool llama_kv_cache_init(
 
     const int i_gpu_start = n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start);
 
+    GGML_UNUSED(offload);
+
     for (int i = 0; i < (int) n_layer; i++) {
         ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx);
         ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx);
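Note: GGML_UNUSED is ggml's cast-to-void helper for suppressing unused-parameter warnings; here it covers the new offload flag until llama_kv_cache_init actually acts on it. A minimal sketch of the idiom (the macro matches ggml.h; the surrounding function is hypothetical):

// GGML_UNUSED evaluates its argument once and discards the value, which
// keeps -Wunused-parameter quiet without changing behavior.
#define GGML_UNUSED(x) (void)(x)

// Hypothetical example: a parameter reserved for later use.
static int example_init(int n_layer, bool offload) {
    GGML_UNUSED(offload); // not wired up yet, mirroring the change above
    return n_layer > 0 ? 0 : -1;
}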
@@ -5406,7 +5408,7 @@ static struct ggml_cgraph * llama_build_graph(
         //
         // TODO: will be removed with backend v2
 
-#define LLAMA_OFFLOAD_DEBUG
+//#define LLAMA_OFFLOAD_DEBUG
 
         if (!do_offload) {
             return;
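Note: with the #define commented out, everything guarded by #ifdef LLAMA_OFFLOAD_DEBUG compiles away; re-enabling the debug output is a one-line edit. A sketch of the toggle pattern (the logging function and message are illustrative, not the exact code in llama_build_graph):

#include <cstdio>

//#define LLAMA_OFFLOAD_DEBUG // uncomment to restore the debug prints

// Illustrative guard: the fprintf is compiled in only when the define is active.
static void report_offload(const char * tensor_name, bool offloaded) {
#ifdef LLAMA_OFFLOAD_DEBUG
    fprintf(stderr, "%s: %s\n", tensor_name, offloaded ? "offloaded" : "kept on CPU");
#endif
    (void) tensor_name; (void) offloaded; // silence -Wunused when disabled
}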
@@ -9297,40 +9299,45 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         data_ctx->write(&kv_used, sizeof(kv_used));
 
         if (kv_buf_size) {
-#pragma message("TODO: implement KV cache saving")
-#if 0
-            const size_t elt_size = ggml_element_size(kv_self.k);
+            const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
 
-            ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+            ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
             ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
-            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
-            std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
-            kout3d->data = kout3d_data.data();
+            std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
+            std::vector<std::vector<uint8_t>> vout2d_data(n_layer);
+
+            for (int il = 0; il < (int) n_layer; ++il) {
+                ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+                kout2d_data[il].resize(ggml_nbytes(kout2d));
+                kout2d->data = kout2d_data[il].data();
 
-            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
-            std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
-            vout3d->data = vout3d_data.data();
+                ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+                vout2d_data[il].resize(ggml_nbytes(vout2d));
+                vout2d->data = vout2d_data[il].data();
 
-            ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                n_embd, kv_head, n_layer,
-                elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+                ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
+                    n_embd, kv_head,
+                    elt_size*n_embd, 0);
 
-            ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-                kv_head, n_embd, n_layer,
-                elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+                ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
+                    kv_head, n_embd,
+                    elt_size*n_ctx, 0);
+
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
+            }
 
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
             ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
             ggml_free(cpy_ctx);
 
-            // our data is now in the kout3d_data and vout3d_data buffers
+            // our data is now in the kout2d_data and vout2d_data buffers
             // write them to file
-            data_ctx->write(kout3d_data.data(), kout3d_data.size());
-            data_ctx->write(vout3d_data.data(), vout3d_data.size());
-#endif
+            for (uint32_t il = 0; il < n_layer; ++il) {
+                data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
+                data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
+            }
         }
 
         for (uint32_t i = 0; i < kv_size; ++i) {
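Note: the KV cache is now serialized layer by layer: for each layer il, the first kv_head used cells of k_l[il] are copied into a contiguous buffer and written, followed by the matching slice of v_l[il] (V is stored transposed, hence the elt_size*n_ctx row stride in the view). As a reading aid only, assuming K and V share one element size as they do in this cache, the payload size works out to:

#include <cstddef>

// Sketch: byte count of the KV payload produced by the save loop above.
// Per layer, kout2d is n_embd x kv_head and vout2d is kv_head x n_embd,
// both copied out densely, so each contributes the same number of bytes.
static size_t kv_payload_bytes(size_t n_layer, size_t n_embd,
                               size_t kv_head, size_t elt_size) {
    const size_t k_bytes = n_embd * kv_head * elt_size; // one kout2d buffer
    const size_t v_bytes = kv_head * n_embd * elt_size; // one vout2d buffer
    return n_layer * (k_bytes + v_bytes); // written in order K0, V0, K1, V1, ...
}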
@@ -9430,35 +9437,35 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         if (kv_buf_size) {
             GGML_ASSERT(kv_self.buf.size == kv_buf_size);
 
-#pragma message("TODO: implement KV cache loading")
-#if 0
-            const size_t elt_size = ggml_element_size(kv_self.k);
+            const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
 
-            ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+            ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
             ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
-            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
-            kin3d->data = (void *) inp;
-            inp += ggml_nbytes(kin3d);
+            for (int il = 0; il < n_layer; ++il) {
+                ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+                kin2d->data = (void *) inp;
+                inp += ggml_nbytes(kin2d);
+
+                ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+                vin2d->data = (void *) inp;
+                inp += ggml_nbytes(vin2d);
 
-            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
-            vin3d->data = (void *) inp;
-            inp += ggml_nbytes(vin3d);
+                ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
+                    n_embd, kv_head,
+                    elt_size*n_embd, 0);
 
-            ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                n_embd, kv_head, n_layer,
-                elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+                ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
+                    kv_head, n_embd,
+                    elt_size*n_ctx, 0);
 
-            ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-                kv_head, n_embd, n_layer,
-                elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
+            }
 
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
             ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
             ggml_free(cpy_ctx);
-#endif
         }
 
         ctx->kv_self.head = kv_head;
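Note: the loader mirrors the writer, consuming the same per-layer order from inp, so the public state API is unchanged; only the internal layout (and session version) differs. A minimal round-trip sketch against the existing llama.h entry points (error handling elided; ctx is assumed to be an already-initialized llama_context):

#include "llama.h"

#include <cstdint>
#include <vector>

// Snapshot the full context state (RNG, logits, embeddings, KV cache) and
// restore it later; these calls drive the save/load paths patched above.
static std::vector<uint8_t> snapshot_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx));
    llama_copy_state_data(ctx, buf.data()); // runs the KV saving loop
    return buf;
}

static void restore_state(llama_context * ctx, std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data()); // runs the KV loading loop
}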

llama.h

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 2
+#define LLAMA_SESSION_VERSION 3
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
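Note: the version bump is what protects old session files: the payload moved from whole-cache 3D tensors to per-layer 2D slabs, so a version-2 session must be rejected rather than reinterpreted. A sketch of the magic/version guard (field handling is illustrative; the real check lives in llama.cpp's session loading code):

#include <cstdint>
#include <cstdio>

// Illustrative header check mirroring the magic/version handshake.
static bool session_header_ok(uint32_t magic, uint32_t version) {
    if (magic != 0x6767736eu) { // LLAMA_SESSION_MAGIC, 'ggsn'
        fprintf(stderr, "unknown session magic %08x\n", (unsigned) magic);
        return false;
    }
    if (version != 3) { // LLAMA_SESSION_VERSION after this commit
        fprintf(stderr, "unsupported session version %u (want 3)\n", (unsigned) version);
        return false; // version-2 files (pre per-layer layout) are rejected
    }
    return true;
}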
