Commit 345a686

llama : reduce useless copies when saving session (#8916)
* llama : avoid useless copies in dummy session writer
* llama : avoid double tensor copy when saving session to buffer
1 parent 3a14e00 commit 345a686
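The idea in brief: rather than always reading tensor data into a temporary buffer and then passing that buffer to write(), the session writer interface gains a write_tensor_data() method, so each writer decides how much copying is actually needed — the dummy writer only counts bytes, the buffer writer copies straight into its destination, and only the file writer still stages data through a reused temporary buffer. Below is a condensed, self-contained sketch of that pattern, not the library's code: the names data_write, data_write_dummy and data_write_buffer are simplified stand-ins, and plain memory stands in for the ggml tensor that the real overrides (shown in the diff below) read with ggml_backend_tensor_get.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Minimal sketch of the writer interface introduced by this commit (names simplified).
struct data_write {
    virtual void write(const void * src, size_t size) = 0;
    // Lets each writer pull tensor bytes in the cheapest way it can.
    virtual void write_tensor_data(const uint8_t * src, size_t offset, size_t size) = 0;
    virtual size_t get_size_written() = 0;
    virtual ~data_write() = default;
};

// Dummy writer: only measures the output, so it never touches the data at all.
struct data_write_dummy : data_write {
    size_t size_written = 0;
    void write(const void * /* src */, size_t size) override { size_written += size; }
    void write_tensor_data(const uint8_t * /* src */, size_t /* offset */, size_t size) override {
        size_written += size; // no copy needed just to compute the session size
    }
    size_t get_size_written() override { return size_written; }
};

// Buffer writer: copies tensor bytes once, straight into the destination buffer,
// instead of staging them through an intermediate vector first.
// (Bounds checks omitted here; the real buffer writer throws when the buffer runs out.)
struct data_write_buffer : data_write {
    uint8_t * ptr;
    size_t    buf_size;
    size_t    size_written = 0;
    data_write_buffer(uint8_t * dst, size_t n) : ptr(dst), buf_size(n) {}
    void write(const void * src, size_t size) override {
        std::memcpy(ptr, src, size);
        ptr += size; size_written += size; buf_size -= size;
    }
    void write_tensor_data(const uint8_t * src, size_t offset, size_t size) override {
        std::memcpy(ptr, src + offset, size); // single copy, no temporary vector
        ptr += size; size_written += size; buf_size -= size;
    }
    size_t get_size_written() override { return size_written; }
};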

File tree

1 file changed: +28 -11 lines changed


src/llama.cpp

+28 -11
@@ -17343,6 +17343,7 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
 // TODO: replace all non-fatal assertions with returned errors or exceptions
 struct llama_data_write {
     virtual void write(const void * src, size_t size) = 0;
+    virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0;
     virtual size_t get_size_written() = 0;
     virtual ~llama_data_write() = default;

@@ -17465,9 +17466,8 @@ struct llama_data_write {
             // Read each range of cells of k_size length each into tmp_buf and write out
             for (const auto & range : cell_ranges) {
                 const size_t range_size = range.second - range.first;
-                tmp_buf.resize(range_size * k_size_row);
-                ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
-                write(tmp_buf.data(), tmp_buf.size());
+                const size_t buf_size = range_size * k_size_row;
+                write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
             }
         }

@@ -17486,9 +17486,8 @@ struct llama_data_write {
                 // Read each range of cells of v_size length each into tmp_buf and write out
                 for (const auto & range : cell_ranges) {
                     const size_t range_size = range.second - range.first;
-                    tmp_buf.resize(range_size * v_size_row);
-                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
-                    write(tmp_buf.data(), tmp_buf.size());
+                    const size_t buf_size = range_size * v_size_row;
+                    write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
                 }
             }
         } else {
@@ -17514,9 +17513,8 @@ struct llama_data_write {
                     for (const auto & range : cell_ranges) {
                         const size_t range_size = range.second - range.first;
                         const size_t src_offset = (range.first + j * kv_size) * v_size_el;
-                        tmp_buf.resize(range_size * v_size_el);
-                        ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
-                        write(tmp_buf.data(), tmp_buf.size());
+                        const size_t buf_size = range_size * v_size_el;
+                        write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
                     }
                 }
             }
@@ -17875,12 +17873,14 @@ struct llama_data_write_dummy : llama_data_write {

     llama_data_write_dummy() {}

-    // TODO: avoid unnecessary calls to ggml_backend_tensor_get in a dummy context
-
     void write(const void * /* src */, size_t size) override {
         size_written += size;
     }

+    void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
+        size_written += size;
+    }
+
     size_t get_size_written() override {
         return size_written;
     }
@@ -17903,6 +17903,16 @@ struct llama_data_write_buffer : llama_data_write {
         buf_size -= size;
     }

+    void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
+        if (size > buf_size) {
+            throw std::runtime_error("unexpectedly reached end of buffer");
+        }
+        ggml_backend_tensor_get(tensor, ptr, offset, size);
+        ptr += size;
+        size_written += size;
+        buf_size -= size;
+    }
+
     size_t get_size_written() override {
         return size_written;
     }
@@ -17938,6 +17948,7 @@ struct llama_data_read_buffer : llama_data_read {
 struct llama_data_write_file : llama_data_write {
     llama_file * file;
     size_t size_written = 0;
+    std::vector<uint8_t> temp_buffer;

     llama_data_write_file(llama_file * f) : file(f) {}

@@ -17946,6 +17957,12 @@ struct llama_data_write_file : llama_data_write {
         size_written += size;
     }

+    void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
+        temp_buffer.resize(size);
+        ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
+        write(temp_buffer.data(), temp_buffer.size());
+    }
+
     size_t get_size_written() override {
         return size_written;
     }
