@@ -1563,6 +1563,8 @@ static bool llama_kv_cache_init(
 
     const int i_gpu_start = n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start);
 
+    GGML_UNUSED(offload);
+
     for (int i = 0; i < (int) n_layer; i++) {
         ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx);
         ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx);
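For scale, the context lines of this hunk show that llama_kv_cache_init allocates one K and one V tensor per layer, each holding n_embd*n_ctx elements of type wtype. A minimal size sketch, not part of the patch (the helper name is hypothetical; ggml_type_size/ggml_blck_size are used so quantized cache types are covered too):

// Illustrative only: rough size of the KV cache buffer allocated by the loop above.
static size_t kv_cache_size_est(int n_layer, int n_embd, int n_ctx, enum ggml_type wtype) {
    // bytes per row of n_embd elements (ggml_blck_size is 1 for F16/F32, >1 for quantized types)
    const size_t row_bytes = (size_t) n_embd * ggml_type_size(wtype) / ggml_blck_size(wtype);
    // one K and one V tensor per layer, n_ctx rows each
    // e.g. n_layer = 32, n_embd = 4096, n_ctx = 4096, F16: 2*32*4096*4096*2 bytes = 2 GiB
    return 2u * (size_t) n_layer * (size_t) n_ctx * row_bytes;
}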
@@ -5406,7 +5408,7 @@ static struct ggml_cgraph * llama_build_graph(
     //
     // TODO: will be removed with backend v2
 
-#define LLAMA_OFFLOAD_DEBUG
+//#define LLAMA_OFFLOAD_DEBUG
 
     if (!do_offload) {
         return;
@@ -9297,40 +9299,45 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         data_ctx->write(&kv_used, sizeof(kv_used));
 
         if (kv_buf_size) {
-            #pragma message("TODO: implement KV cache saving")
-            #if 0
-            const size_t elt_size = ggml_element_size(kv_self.k);
+            const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
 
-            ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+            ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
             ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
-            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
-            std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
-            kout3d->data = kout3d_data.data();
+            std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
+            std::vector<std::vector<uint8_t>> vout2d_data(n_layer);
+
+            for (int il = 0; il < (int) n_layer; ++il) {
+                ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+                kout2d_data[il].resize(ggml_nbytes(kout2d));
+                kout2d->data = kout2d_data[il].data();
 
-            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
-            std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
-            vout3d->data = vout3d_data.data();
+                ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+                vout2d_data[il].resize(ggml_nbytes(vout2d));
+                vout2d->data = vout2d_data[il].data();
 
-            ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                    n_embd, kv_head, n_layer,
-                    elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+                ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
+                        n_embd, kv_head,
+                        elt_size*n_embd, 0);
 
-            ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-                    kv_head, n_embd, n_layer,
-                    elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+                ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
+                        kv_head, n_embd,
+                        elt_size*n_ctx, 0);
+
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
+            }
 
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
             ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
             ggml_free(cpy_ctx);
 
-            // our data is now in the kout3d_data and vout3d_data buffers
+            // our data is now in the kout2d_data and vout2d_data buffers
             // write them to file
-            data_ctx->write(kout3d_data.data(), kout3d_data.size());
-            data_ctx->write(vout3d_data.data(), vout3d_data.size());
-            #endif
+            for (uint32_t il = 0; il < n_layer; ++il) {
+                data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
+                data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
+            }
         }
 
         for (uint32_t i = 0; i < kv_size; ++i) {
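In the rewritten save path above, each layer's K data is copied through a contiguous n_embd x kv_head staging tensor, while the V data is gathered from its transposed layout (hence the elt_size*n_ctx row stride in the v2d view) into a kv_head x n_embd staging tensor, and both are written out layer by layer. A sketch of the payload size this produces, assuming all layers share one element type (the helper is hypothetical, not part of the change):

// Illustrative only: bytes written by the per-layer loop above for the K/V payload.
static size_t kv_state_payload_size(size_t elt_size, uint32_t n_layer, uint32_t n_embd, uint32_t kv_head) {
    // per layer: one n_embd x kv_head K block plus one kv_head x n_embd V block, both contiguous
    return (size_t) n_layer * 2u * (size_t) n_embd * (size_t) kv_head * elt_size;
}

The load path in llama_set_state_data below consumes exactly this layout, reading the K block and then the V block for each layer in turn.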
@@ -9430,35 +9437,35 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         if (kv_buf_size) {
             GGML_ASSERT(kv_self.buf.size == kv_buf_size);
 
-            #pragma message("TODO: implement KV cache loading")
-            #if 0
-            const size_t elt_size = ggml_element_size(kv_self.k);
+            const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
 
-            ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+            ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
             ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
-            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
-            kin3d->data = (void *) inp;
-            inp += ggml_nbytes(kin3d);
+            for (int il = 0; il < n_layer; ++il) {
+                ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+                kin2d->data = (void *) inp;
+                inp += ggml_nbytes(kin2d);
+
+                ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+                vin2d->data = (void *) inp;
+                inp += ggml_nbytes(vin2d);
 
-            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
-            vin3d->data = (void *) inp;
-            inp += ggml_nbytes(vin3d);
+                ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
+                        n_embd, kv_head,
+                        elt_size*n_embd, 0);
 
-            ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                    n_embd, kv_head, n_layer,
-                    elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+                ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
+                        kv_head, n_embd,
+                        elt_size*n_ctx, 0);
 
-            ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-                    kv_head, n_embd, n_layer,
-                    elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
+            }
 
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
             ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
             ggml_free(cpy_ctx);
-            #endif
         }
 
         ctx->kv_self.head = kv_head;
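These two internal paths back the public session-state API in llama.h. A usage sketch, with placeholder contexts and no error handling (see examples/save-load-state for the full flow):

// Hedged usage sketch only; ctx and ctx2 are placeholder llama_context pointers.
std::vector<uint8_t> state(llama_get_state_size(ctx));            // upper bound on the state size
const size_t n_copied = llama_copy_state_data(ctx, state.data()); // runs the save path patched above
// ... later, in a context created from the same model with the same parameters ...
const size_t n_set = llama_set_state_data(ctx2, state.data());    // runs the load path patched above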