
Commit 6ac252e

compilade authored and arthw committed
llama : reduce useless copies when saving session (ggml-org#8916)
* llama : avoid useless copies in dummy session writer
* llama : avoid double tensor copy when saving session to buffer
1 parent b270371 commit 6ac252e

File tree

1 file changed, +28 -11 lines changed


src/llama.cpp

Lines changed: 28 additions & 11 deletions
@@ -17348,6 +17348,7 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
 // TODO: replace all non-fatal assertions with returned errors or exceptions
 struct llama_data_write {
     virtual void write(const void * src, size_t size) = 0;
+    virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0;
     virtual size_t get_size_written() = 0;
     virtual ~llama_data_write() = default;

@@ -17470,9 +17471,8 @@ struct llama_data_write {
             // Read each range of cells of k_size length each into tmp_buf and write out
             for (const auto & range : cell_ranges) {
                 const size_t range_size = range.second - range.first;
-                tmp_buf.resize(range_size * k_size_row);
-                ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
-                write(tmp_buf.data(), tmp_buf.size());
+                const size_t buf_size = range_size * k_size_row;
+                write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
             }
         }

@@ -17491,9 +17491,8 @@ struct llama_data_write {
             // Read each range of cells of v_size length each into tmp_buf and write out
             for (const auto & range : cell_ranges) {
                 const size_t range_size = range.second - range.first;
-                tmp_buf.resize(range_size * v_size_row);
-                ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
-                write(tmp_buf.data(), tmp_buf.size());
+                const size_t buf_size = range_size * v_size_row;
+                write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
             }
         }
     } else {
@@ -17519,9 +17518,8 @@ struct llama_data_write {
                 for (const auto & range : cell_ranges) {
                     const size_t range_size = range.second - range.first;
                     const size_t src_offset = (range.first + j * kv_size) * v_size_el;
-                    tmp_buf.resize(range_size * v_size_el);
-                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
-                    write(tmp_buf.data(), tmp_buf.size());
+                    const size_t buf_size = range_size * v_size_el;
+                    write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
                 }
             }
         }
@@ -17880,12 +17878,14 @@ struct llama_data_write_dummy : llama_data_write {

     llama_data_write_dummy() {}

-    // TODO: avoid unnecessary calls to ggml_backend_tensor_get in a dummy context
-
     void write(const void * /* src */, size_t size) override {
         size_written += size;
     }

+    void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
+        size_written += size;
+    }
+
     size_t get_size_written() override {
         return size_written;
     }
@@ -17908,6 +17908,16 @@ struct llama_data_write_buffer : llama_data_write {
         buf_size -= size;
     }

+    void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
+        if (size > buf_size) {
+            throw std::runtime_error("unexpectedly reached end of buffer");
+        }
+        ggml_backend_tensor_get(tensor, ptr, offset, size);
+        ptr += size;
+        size_written += size;
+        buf_size -= size;
+    }
+
     size_t get_size_written() override {
         return size_written;
     }
@@ -17943,6 +17953,7 @@ struct llama_data_read_buffer : llama_data_read {
 struct llama_data_write_file : llama_data_write {
     llama_file * file;
     size_t size_written = 0;
+    std::vector<uint8_t> temp_buffer;

     llama_data_write_file(llama_file * f) : file(f) {}

@@ -17951,6 +17962,12 @@ struct llama_data_write_file : llama_data_write {
         size_written += size;
     }

+    void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
+        temp_buffer.resize(size);
+        ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
+        write(temp_buffer.data(), temp_buffer.size());
+    }
+
     size_t get_size_written() override {
         return size_written;
     }
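
For context, below is a minimal, self-contained sketch of the pattern this commit introduces; it is not the actual llama.cpp code. fake_tensor and fake_tensor_get are hypothetical stand-ins for ggml_tensor and ggml_backend_tensor_get, and the writers are reduced to the parts relevant here: a dummy writer that sizes a session without reading any tensor data, and a buffer writer whose write_tensor_data reads backend data straight into the destination instead of going through an intermediate tmp_buf.

// Minimal sketch with assumed names; fake_tensor / fake_tensor_get stand in for
// ggml_tensor / ggml_backend_tensor_get.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <vector>

struct fake_tensor {
    std::vector<uint8_t> data; // pretend this lives in backend (e.g. GPU) memory
};

// Stand-in for ggml_backend_tensor_get: copy a slice of tensor data into dst.
static void fake_tensor_get(const fake_tensor * t, void * dst, size_t offset, size_t size) {
    std::memcpy(dst, t->data.data() + offset, size);
}

struct data_write {
    virtual void write(const void * src, size_t size) = 0;
    // New hook: each writer decides how (or whether) to fetch tensor data.
    virtual void write_tensor_data(const fake_tensor * tensor, size_t offset, size_t size) = 0;
    virtual size_t get_size_written() = 0;
    virtual ~data_write() = default;
};

// Size-only pass: never touches tensor data, so measuring a session copies nothing.
struct data_write_dummy : data_write {
    size_t size_written = 0;

    void write(const void * /* src */, size_t size) override { size_written += size; }
    void write_tensor_data(const fake_tensor * /* tensor */, size_t /* offset */, size_t size) override {
        size_written += size;
    }
    size_t get_size_written() override { return size_written; }
};

// Buffer pass: reads tensor data directly into the destination buffer,
// i.e. one copy (backend -> dst) instead of two (backend -> tmp_buf -> dst).
struct data_write_buffer : data_write {
    uint8_t * ptr;
    size_t    buf_size;
    size_t    size_written = 0;

    data_write_buffer(uint8_t * p, size_t n) : ptr(p), buf_size(n) {}

    void write(const void * src, size_t size) override {
        if (size > buf_size) { throw std::runtime_error("unexpectedly reached end of buffer"); }
        std::memcpy(ptr, src, size);
        ptr += size; size_written += size; buf_size -= size;
    }
    void write_tensor_data(const fake_tensor * tensor, size_t offset, size_t size) override {
        if (size > buf_size) { throw std::runtime_error("unexpectedly reached end of buffer"); }
        fake_tensor_get(tensor, ptr, offset, size); // straight into the destination
        ptr += size; size_written += size; buf_size -= size;
    }
    size_t get_size_written() override { return size_written; }
};

// Write one toy "session": a small header followed by a slice of a tensor.
static void write_session(data_write & out, const fake_tensor & kv) {
    const uint32_t magic = 0x73657373; // arbitrary header value for the sketch
    out.write(&magic, sizeof(magic));
    out.write_tensor_data(&kv, /*offset=*/0, kv.data.size());
}

int main() {
    fake_tensor kv;
    kv.data.assign(1024, 0x42);

    // Pass 1: dummy writer measures the session size without copying tensor data.
    data_write_dummy counter;
    write_session(counter, kv);

    // Pass 2: buffer writer fills the destination with a single copy per tensor range.
    std::vector<uint8_t> dst(counter.get_size_written());
    data_write_buffer writer(dst.data(), dst.size());
    write_session(writer, kv);

    assert(writer.get_size_written() == dst.size());
    return 0;
}

With write_tensor_data as a per-writer hook, the size-only pass does no backend reads at all, the in-memory buffer pass performs one copy per tensor range instead of two, and only the file writer in the diff above still needs a temporary host buffer (temp_buffer), since it writes from host memory.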
