@@ -17343,6 +17343,7 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
17343
17343
// TODO: replace all non-fatal assertions with returned errors or exceptions
17344
17344
struct llama_data_write {
17345
17345
virtual void write(const void * src, size_t size) = 0;
17346
+ virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0;
17346
17347
virtual size_t get_size_written() = 0;
17347
17348
virtual ~llama_data_write() = default;
17348
17349
@@ -17465,9 +17466,8 @@ struct llama_data_write {
17465
17466
// Write out each range of cells (k_size_row per cell) via write_tensor_data
17466
17467
for (const auto & range : cell_ranges) {
17467
17468
const size_t range_size = range.second - range.first;
17468
- tmp_buf.resize(range_size * k_size_row);
17469
- ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
17470
- write(tmp_buf.data(), tmp_buf.size());
17469
+ const size_t buf_size = range_size * k_size_row;
17470
+ write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
17471
17471
}
17472
17472
}
17473
17473
@@ -17486,9 +17486,8 @@ struct llama_data_write {
17486
17486
// Write out each range of cells (v_size_row per cell) via write_tensor_data
17487
17487
for (const auto & range : cell_ranges) {
17488
17488
const size_t range_size = range.second - range.first;
17489
- tmp_buf.resize(range_size * v_size_row);
17490
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
17491
- write(tmp_buf.data(), tmp_buf.size());
17489
+ const size_t buf_size = range_size * v_size_row;
17490
+ write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
17492
17491
}
17493
17492
}
17494
17493
} else {
@@ -17514,9 +17513,8 @@ struct llama_data_write {
17514
17513
for (const auto & range : cell_ranges) {
17515
17514
const size_t range_size = range.second - range.first;
17516
17515
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
17517
- tmp_buf.resize(range_size * v_size_el);
17518
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
17519
- write(tmp_buf.data(), tmp_buf.size());
17516
+ const size_t buf_size = range_size * v_size_el;
17517
+ write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
17520
17518
}
17521
17519
}
17522
17520
}
@@ -17875,12 +17873,14 @@ struct llama_data_write_dummy : llama_data_write {
17875
17873
17876
17874
// Default constructor; performs no work of its own.
llama_data_write_dummy() {}
17877
17875
17878
- // TODO: avoid unnecessary calls to ggml_backend_tensor_get in a dummy context
17879
-
17880
17876
// Size-counting write: the source pointer is left unnamed and never read;
// only the running total `size_written` is increased by `size`.
void write(const void * /* src */, size_t size) override {
17881
17877
size_written += size;
17882
17878
}
17883
17879
17880
// Size-counting variant of write_tensor_data: `tensor` and `offset` are
// unnamed and unused, so no tensor data is fetched here; the call only adds
// `size` to `size_written`.
+ void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
17881
+ size_written += size;
17882
+ }
17883
+
17884
17884
// Returns the total accumulated in `size_written` so far.
size_t get_size_written() override {
17885
17885
return size_written;
17886
17886
}
@@ -17903,6 +17903,16 @@ struct llama_data_write_buffer : llama_data_write {
17903
17903
buf_size -= size;
17904
17904
}
17905
17905
17906
// Fetches `size` units of `tensor` data starting at `offset` straight into
// the output position `ptr` (no intermediate buffer is used in this method).
// Throws std::runtime_error when `size` exceeds the remaining `buf_size`.
// NOTE(review): `offset`/`size` are presumably byte counts, as passed on to
// ggml_backend_tensor_get — confirm against the ggml backend API.
+ void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
17907
// Refuse the request before touching `ptr` if it would not fit.
+ if (size > buf_size) {
17908
+ throw std::runtime_error("unexpectedly reached end of buffer");
17909
+ }
17910
+ ggml_backend_tensor_get(tensor, ptr, offset, size);
17911
// Advance the write cursor and keep both counters in step with it.
+ ptr += size;
17912
+ size_written += size;
17913
+ buf_size -= size;
17914
+ }
17915
+
17906
17916
// Returns the total accumulated in `size_written` so far.
size_t get_size_written() override {
17907
17917
return size_written;
17908
17918
}
@@ -17938,6 +17948,7 @@ struct llama_data_read_buffer : llama_data_read {
17938
17948
struct llama_data_write_file : llama_data_write {
17939
17949
llama_file * file;
17940
17950
size_t size_written = 0;
17951
+ std::vector<uint8_t> temp_buffer;
17941
17952
17942
17953
// Stores the given llama_file pointer in `file`; nothing else is initialized here.
llama_data_write_file(llama_file * f) : file(f) {}
17943
17954
@@ -17946,6 +17957,12 @@ struct llama_data_write_file : llama_data_write {
17946
17957
size_written += size;
17947
17958
}
17948
17959
17960
// Stages tensor data through the member `temp_buffer` and then hands it to
// write(): the buffer is resized to `size`, filled by
// ggml_backend_tensor_get(tensor, ..., offset, size), and written in full.
// `temp_buffer` is a member rather than a local, so the same vector object is
// reused by every call.
+ void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
17961
+ temp_buffer.resize(size);
17962
+ ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
17963
// After the resize above, temp_buffer.size() equals `size`.
+ write(temp_buffer.data(), temp_buffer.size());
17964
+ }
17965
+
17949
17966
// Returns the total accumulated in `size_written` so far.
size_t get_size_written() override {
17950
17967
return size_written;
17951
17968
}
0 commit comments