
Commit 6b6cf19

talk-llama : sync llama.cpp
ggml-ci
1 parent: 05501c2

8 files changed: +256 additions, -63 deletions

examples/talk-llama/llama-arch.cpp

Lines changed: 3 additions & 0 deletions
@@ -1481,6 +1481,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_GATE_EXPS,  "blk.%d.ffn_gate_exps" },
             { LLM_TENSOR_FFN_DOWN_EXPS,  "blk.%d.ffn_down_exps" },
             { LLM_TENSOR_FFN_UP_EXPS,    "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,   "blk.%d.ffn_up_shexp" },
         },
     },
     {
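
The three added entries register name templates for the shared-expert FFN tensors (`ffn_gate_shexp`, `ffn_down_shexp`, `ffn_up_shexp`). Each `%d` placeholder is substituted with the block index when tensor names are resolved; the following is a minimal, self-contained sketch of that substitution, using `snprintf` directly rather than llama.cpp's own formatting helpers:

```cpp
#include <cstdio>
#include <string>

// Expand a per-layer tensor-name template such as "blk.%d.ffn_gate_shexp"
// for a given block index. Illustrative only; llama.cpp has its own helpers
// for building tensor names from these templates.
static std::string tensor_name(const char * tmpl, int il) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), tmpl, il);
    return std::string(buf);
}

int main() {
    // prints "blk.0.ffn_gate_shexp" - the shared-expert gate tensor of block 0
    std::printf("%s\n", tensor_name("blk.%d.ffn_gate_shexp", 0).c_str());
    return 0;
}
```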

examples/talk-llama/llama-context.cpp

Lines changed: 4 additions & 2 deletions
@@ -1704,10 +1704,12 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
         }
     }
 
-    LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
     llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
 
-    kv_self->state_write(io);
+    if (kv_self != nullptr) {
+        LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
+        kv_self->state_write(io);
+    }
 
     return io.n_bytes();
 }
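
The change makes the KV-self section of the saved state optional: when the context has no memory module, `memory.get()` yields a null pointer and the section is skipped instead of being dereferenced. Below is a small, hypothetical sketch of the same guard pattern with stand-in `writer`/`component` types (not llama.cpp API), illustrating that the serialized byte count simply excludes the missing section:

```cpp
#include <cstdio>

// Stand-in types for illustration only.
struct writer {
    size_t n_bytes = 0;
    void write(const void * /*data*/, size_t n) { n_bytes += n; }
};

struct component {
    void state_write(writer & io) const {
        int dummy = 42;
        io.write(&dummy, sizeof(dummy));
    }
};

// Serialize the state; the optional component is written only if it exists.
size_t state_write_data(writer & io, const component * comp) {
    // ... other state would be written here ...
    if (comp != nullptr) {
        comp->state_write(io);
    }
    return io.n_bytes;
}

int main() {
    writer io;
    std::printf("bytes written without component: %zu\n", state_write_data(io, nullptr));
    return 0;
}
```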

examples/talk-llama/llama-kv-cache.cpp

Lines changed: 8 additions & 0 deletions
@@ -441,6 +441,13 @@ void llama_kv_cache_unified::defrag_sched(float thold) {
 
 void llama_kv_cache_unified::set_full() {
     n = size;
+
+    // when simulating a full KV cache, the specific value of the "head" pointer is not important because it does not
+    // affect the shapes of the tensors in the compute graph - it only affects the offsets of the K/V views.
+    // we should only guarantee that the head position won't cause out-of-bounds view of the K, V tensors, so
+    // setting it to 0 is the simplest way to achieve that
+    // ref: https://github.com/ggml-org/llama.cpp/issues/13359
+    head = 0;
 }
 
 llama_sbatch llama_kv_cache_unified::sbatch_init(
@@ -1712,6 +1719,7 @@ void llama_kv_cache_recurrent::defrag_sched(float thold) {
 
 void llama_kv_cache_recurrent::set_full() {
     n = size;
+    head = 0;
 }
 
 llama_sbatch llama_kv_cache_recurrent::sbatch_init(
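
As the new comment explains, `set_full()` simulates a worst-case (full) cache: `n` determines the shapes of the K/V views while `head` only shifts their offset, so with `n == size` the only head value guaranteed to keep the view in bounds is 0. A toy sketch of that bounds argument (illustrative only, not the actual cache layout):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// A view of n cells starting at 'head'. The view's shape depends only on n;
// 'head' only changes where the view starts inside the underlying buffer.
struct kv_view {
    const float * data;
    uint32_t      n;
};

kv_view make_view(const std::vector<float> & cells, uint32_t head, uint32_t n) {
    assert(head + n <= cells.size()); // otherwise the view runs out of bounds
    return { cells.data() + head, n };
}

int main() {
    const uint32_t size = 8;
    std::vector<float> k_cells(size, 0.0f);

    uint32_t n    = size; // set_full(): n = size (worst-case shape)
    uint32_t head = 0;    // set_full(): head = 0, the only in-bounds offset when n == size

    kv_view v = make_view(k_cells, head, n);
    std::printf("view covers %u of %u cells at offset %u\n", v.n, size, head);
    return 0;
}
```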

examples/talk-llama/llama-kv-cache.h

Lines changed: 4 additions & 10 deletions
@@ -171,11 +171,8 @@ class llama_kv_cache_unified : public llama_kv_cache {
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
     void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
 
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
     uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build
@@ -343,11 +340,8 @@ class llama_kv_cache_recurrent : public llama_kv_cache {
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
     void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
 
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
    uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build

examples/talk-llama/llama-model-loader.cpp

Lines changed: 12 additions & 7 deletions
@@ -469,7 +469,7 @@ llama_model_loader::llama_model_loader(
 
     meta.reset(gguf_init_from_file(fname.c_str(), params));
     if (!meta) {
-        throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
+        throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
     }
 
     get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
@@ -528,7 +528,7 @@ llama_model_loader::llama_model_loader(
         };
         gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
         if (!ctx_gguf) {
-            throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname_split));
+            throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
         }
 
         // check idx
@@ -822,13 +822,18 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
     mappings.reserve(files.size());
     mmaps_used.reserve(files.size());
     for (const auto & file : files) {
-        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
-        if (!reg) {
-            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        bool is_numa = false;
+
+        auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (dev) {
+            auto * reg = ggml_backend_dev_backend_reg(dev);
+            auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
+            if (is_numa_fn) {
+                is_numa = is_numa_fn();
+            }
         }
 
-        auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
-        std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
+        std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa);
         mmaps_used.emplace_back(mapping->size(), 0);
         if (mlock_mmaps) {
             std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
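
The rewritten loop makes NUMA detection best-effort instead of aborting when no CPU backend is registered: the optional `ggml_backend_cpu_is_numa` entry point is resolved via `ggml_backend_reg_get_proc_address`, and `is_numa` stays `false` if either the CPU device or the symbol is unavailable. A self-contained sketch of this lookup idiom follows (it uses a plain function-pointer typedef instead of `decltype(ggml_is_numa)` so it only depends on `ggml-backend.h`):

```cpp
#include "ggml-backend.h"

// Best-effort query of the optional "ggml_backend_cpu_is_numa" entry point.
// Falls back to false when the CPU backend or the symbol is missing
// (e.g. in builds without the CPU backend).
static bool query_cpu_is_numa() {
    typedef bool (*is_numa_fn_t)(void);

    bool is_numa = false;

    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (dev) {
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        is_numa_fn_t is_numa_fn = (is_numa_fn_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
        if (is_numa_fn) {
            is_numa = is_numa_fn();
        }
    }

    return is_numa;
}
```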
