@@ -17348,6 +17348,7 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
// TODO: replace all non-fatal assertions with returned errors or exceptions
struct llama_data_write {
virtual void write(const void * src, size_t size) = 0;
+ virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0;
virtual size_t get_size_written() = 0;
virtual ~llama_data_write() = default;
@@ -17470,9 +17471,8 @@ struct llama_data_write {
// Read each range of cells of k_size length each into tmp_buf and write out
for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first;
- tmp_buf.resize(range_size * k_size_row);
- ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
- write(tmp_buf.data(), tmp_buf.size());
+ const size_t buf_size = range_size * k_size_row;
+ write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
}
}
@@ -17491,9 +17491,8 @@ struct llama_data_write {
// Read each range of cells of v_size length each into tmp_buf and write out
for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first;
- tmp_buf.resize(range_size * v_size_row);
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
- write(tmp_buf.data(), tmp_buf.size());
+ const size_t buf_size = range_size * v_size_row;
+ write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
}
}
} else {
@@ -17519,9 +17518,8 @@ struct llama_data_write {
for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first;
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
- tmp_buf.resize(range_size * v_size_el);
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
- write(tmp_buf.data(), tmp_buf.size());
+ const size_t buf_size = range_size * v_size_el;
+ write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
}
}
}
@@ -17880,12 +17878,14 @@ struct llama_data_write_dummy : llama_data_write {
llama_data_write_dummy() {}
- // TODO: avoid unnecessary calls to ggml_backend_tensor_get in a dummy context
-
void write(const void * /* src */, size_t size) override {
size_written += size;
}
+ void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
+ size_written += size;
+ }
+
size_t get_size_written() override {
return size_written;
}
@@ -17908,6 +17908,16 @@ struct llama_data_write_buffer : llama_data_write {
buf_size -= size;
}
+ void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
+ if (size > buf_size) {
+ throw std::runtime_error("unexpectedly reached end of buffer");
+ }
+ ggml_backend_tensor_get(tensor, ptr, offset, size);
+ ptr += size;
+ size_written += size;
+ buf_size -= size;
+ }
+
size_t get_size_written() override {
return size_written;
}
@@ -17943,6 +17953,7 @@ struct llama_data_read_buffer : llama_data_read {
struct llama_data_write_file : llama_data_write {
llama_file * file;
size_t size_written = 0;
+ std::vector<uint8_t> temp_buffer;
llama_data_write_file(llama_file * f) : file(f) {}
@@ -17951,6 +17962,12 @@ struct llama_data_write_file : llama_data_write {
size_written += size;
}
+ void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
+ temp_buffer.resize(size);
+ ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
+ write(temp_buffer.data(), temp_buffer.size());
+ }
+
size_t get_size_written() override {
return size_written;
}