@@ -93,6 +93,7 @@ enum rpc_cmd {
     RPC_CMD_INIT_TENSOR,
     RPC_CMD_GET_ALLOC_SIZE,
     RPC_CMD_HELLO,
+    RPC_CMD_LOAD_TENSOR,
     RPC_CMD_COUNT,
 };
 
@@ -161,6 +162,18 @@ struct rpc_msg_set_tensor_hash_rsp {
     uint8_t result;
 };
 
+struct rpc_msg_load_tensor_req {
+    uint64_t model_hash;
+    rpc_tensor tensor;
+    uint64_t file_offset;
+    uint64_t tensor_offset;
+    uint64_t size;
+};
+
+struct rpc_msg_load_tensor_rsp {
+    uint8_t result;
+};
+
 struct rpc_msg_get_tensor_req {
     rpc_tensor tensor;
     uint64_t offset;
@@ -213,6 +226,24 @@ struct ggml_backend_rpc_buffer_context {
 
 // RPC helper functions
 
+typedef uint64_t fnv_ctx_t;
+
+static void fnv_init(fnv_ctx_t * ctx) {
+    *ctx = 0xcbf29ce484222325ULL;
+}
+
+static void fnv_update(fnv_ctx_t * ctx, const uint8_t * data, size_t len) {
+    const uint64_t fnv_prime = 0x100000001b3ULL;
+    for (size_t i = 0; i < len; ++i) {
+        *ctx ^= data[i];
+        *ctx *= fnv_prime;
+    }
+}
+
+static void fnv_final(fnv_ctx_t * ctx, uint64_t * digest) {
+    *digest = *ctx;
+}
+
 // Computes FNV-1a hash of the data
 static uint64_t fnv_hash(const uint8_t * data, size_t len) {
     const uint64_t fnv_prime = 0x100000001b3ULL;
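For reference, a minimal standalone sketch (not part of the PR) showing that the new streaming `fnv_init`/`fnv_update`/`fnv_final` helpers produce the same FNV-1a digest regardless of how the input is chunked; the buffer contents and chunk sizes are arbitrary:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Same FNV-1a definition as in ggml-rpc.cpp, copied here to keep the check self-contained.
typedef uint64_t fnv_ctx_t;
static void fnv_init(fnv_ctx_t * ctx) { *ctx = 0xcbf29ce484222325ULL; }
static void fnv_update(fnv_ctx_t * ctx, const uint8_t * data, size_t len) {
    const uint64_t fnv_prime = 0x100000001b3ULL;
    for (size_t i = 0; i < len; ++i) {
        *ctx ^= data[i];
        *ctx *= fnv_prime;
    }
}
static void fnv_final(fnv_ctx_t * ctx, uint64_t * digest) { *digest = *ctx; }

int main() {
    std::vector<uint8_t> buf(1000);
    for (size_t i = 0; i < buf.size(); ++i) {
        buf[i] = (uint8_t) i;
    }
    // one-shot hash over the whole buffer
    fnv_ctx_t a; fnv_init(&a);
    fnv_update(&a, buf.data(), buf.size());
    uint64_t d1; fnv_final(&a, &d1);
    // streaming hash over two arbitrary chunks
    fnv_ctx_t b; fnv_init(&b);
    fnv_update(&b, buf.data(), 300);
    fnv_update(&b, buf.data() + 300, buf.size() - 300);
    uint64_t d2; fnv_final(&b, &d2);
    assert(d1 == d2); // chunking does not change the digest
    return 0;
}
```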
@@ -225,6 +256,87 @@ static uint64_t fnv_hash(const uint8_t * data, size_t len) {
     return hash;
 }
 
+static bool get_model_hash_from_file(const char * model_file, uint64_t * hash) {
+    // try loading the hash from model_file + '.rpc'
+    std::string rpc_file = std::string(model_file) + ".rpc";
+    // the hash file must exist, must be exactly 16 bytes and must be a valid hash written in hex
+    if (!fs::exists(rpc_file)) {
+        return false;
+    }
+    std::ifstream file(rpc_file, std::ios::binary);
+    if (!file.is_open()) {
+        return false;
+    }
+    std::string hash_str;
+    file.seekg(0, std::ios::end);
+    size_t file_size = file.tellg();
+    if (file_size != 16) {
+        return false;
+    }
+    file.seekg(0, std::ios::beg);
+    hash_str.resize(file_size);
+    file.read(&hash_str[0], file_size);
+    if ((size_t)file.gcount() != file_size) {
+        return false;
+    }
+    if (hash_str.find_first_not_of("0123456789abcdefABCDEF") != std::string::npos) {
+        return false;
+    }
+    *hash = std::stoull(hash_str, nullptr, 16);
+    return true;
+}
+
+static bool get_model_hash(const char * model_file, uint64_t * hash) {
+    // model path -> (hash_exist, hash_value)
+    static std::unordered_map<std::string, std::pair<bool, uint64_t>> model_hashes;
+    if (model_hashes.find(model_file) != model_hashes.end()) {
+        *hash = model_hashes[model_file].second;
+        return model_hashes[model_file].first;
+    }
+    if (get_model_hash_from_file(model_file, hash)) {
+        model_hashes[model_file] = {true, *hash};
+        return true;
+    }
+    model_hashes[model_file] = {false, 0};
+    return false;
+}
+
+static bool generate_model_hash(const char * model_file, uint64_t * hash) {
+    ggml_context * ctx = nullptr;
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &ctx,
+    };
+    gguf_context_ptr ctx_gguf { gguf_init_from_file(model_file, params) };
+    if (!ctx_gguf) {
+        return false;
+    }
+    fnv_ctx_t fnv_ctx;
+    fnv_init(&fnv_ctx);
+    size_t data_offset = gguf_get_data_offset(ctx_gguf.get());
+    fnv_update(&fnv_ctx, (const uint8_t *)&data_offset, sizeof(data_offset));
+    const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
+    for (int i = 0; i < n_tensors; ++i) {
+        const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
+        ggml_tensor * cur = ggml_get_tensor(ctx, name);
+        auto n_bytes = ggml_nbytes(cur);
+        fnv_update(&fnv_ctx, (const uint8_t *)cur->data, n_bytes);
+    }
+    fnv_final(&fnv_ctx, hash);
+    // save the model hash to model_file + '.rpc' in hex format
+    std::string hash_file = std::string(model_file) + ".rpc";
+    std::ofstream file(hash_file, std::ios::binary);
+    if (!file.is_open()) {
+        return false;
+    }
+    file << std::hex << std::setfill('0') << std::setw(16) << *hash;
+    if (!file) {
+        return false;
+    }
+    file.close();
+    return true;
+}
+
 static std::shared_ptr<socket_t> make_socket(sockfd_t fd) {
 #ifdef _WIN32
     if (fd == INVALID_SOCKET) {
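The `.rpc` sidecar written by `generate_model_hash` is simply the 64-bit FNV-1a digest rendered as exactly 16 hex characters with no newline, which is the only format `get_model_hash_from_file` accepts. A minimal standalone sketch of that round trip (the file name and hash value here are illustrative, not from the PR):

```cpp
#include <cassert>
#include <cstdint>
#include <fstream>
#include <iomanip>
#include <string>

int main() {
    const uint64_t hash = 0x0123456789abcdefULL; // arbitrary example digest
    const std::string path = "model.gguf.rpc";   // hypothetical sidecar name

    // write: same formatting as generate_model_hash() -> exactly 16 hex chars, no newline
    {
        std::ofstream out(path, std::ios::binary);
        out << std::hex << std::setfill('0') << std::setw(16) << hash;
    }

    // read back: same validation as get_model_hash_from_file() -> 16 hex chars, parsed with stoull
    std::ifstream in(path, std::ios::binary);
    std::string hash_str(16, '\0');
    in.read(&hash_str[0], 16);
    assert(in.gcount() == 16);
    assert(hash_str.find_first_not_of("0123456789abcdefABCDEF") == std::string::npos);
    assert(std::stoull(hash_str, nullptr, 16) == hash);
    return 0;
}
```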
@@ -605,6 +717,24 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
     return response.result;
 }
 
+bool ggml_backend_rpc_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const char * path, size_t file_offset, size_t tensor_offset, size_t size) {
+    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+    uint64_t hash;
+    if (!get_model_hash(path, &hash)) {
+        return false;
+    }
+    rpc_msg_load_tensor_req request;
+    request.model_hash    = hash;
+    request.tensor        = serialize_tensor(tensor);
+    request.file_offset   = file_offset;
+    request.tensor_offset = tensor_offset;
+    request.size          = size;
+    rpc_msg_load_tensor_rsp response;
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_LOAD_TENSOR, &request, sizeof(request), &response, sizeof(response));
+    GGML_ASSERT(status);
+    return response.result;
+}
+
 static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
     rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value};
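Since this new entry point is exported through `ggml_backend_rpc_get_proc_address` (see the last hunk), a caller such as a model loader would presumably resolve it at runtime rather than link it directly. A hedged sketch, assuming the RPC backend is registered under the name "RPC"; the typedef and wrapper names are illustrative, not part of the PR:

```cpp
#include "ggml-backend.h"

// Function-pointer type matching the signature introduced in this PR (illustrative name).
typedef bool (*load_tensor_fn)(ggml_backend_buffer_t buffer, ggml_tensor * tensor,
                               const char * path, size_t file_offset,
                               size_t tensor_offset, size_t size);

static bool rpc_load_tensor_from_file(ggml_backend_buffer_t buf, ggml_tensor * t,
                                      const char * model_path,
                                      size_t file_offset, size_t tensor_offset, size_t size) {
    ggml_backend_reg_t reg = ggml_backend_reg_by_name("RPC");
    if (reg == nullptr) {
        return false; // RPC backend not available
    }
    auto fn = (load_tensor_fn) ggml_backend_reg_get_proc_address(reg, "ggml_backend_rpc_buffer_load_tensor");
    if (fn == nullptr) {
        return false; // backend built without this PR
    }
    // ask the RPC server to read [file_offset, file_offset + size) from its local copy of the
    // model (matched by hash) and write it into the remote tensor at tensor_offset
    return fn(buf, t, model_path, file_offset, tensor_offset, size);
}
```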
@@ -854,8 +984,8 @@ void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, si
 
 class rpc_server {
 public:
-    rpc_server(ggml_backend_t backend, const char * cache_dir)
-        : backend(backend), cache_dir(cache_dir) {
+    rpc_server(ggml_backend_t backend, const std::unordered_map<uint64_t, std::string> & model_hashes, const char * cache_dir)
+        : backend(backend), cache_dir(cache_dir), model_hashes(model_hashes) {
     }
     ~rpc_server();
 
@@ -868,6 +998,7 @@ class rpc_server {
     bool buffer_clear(const rpc_msg_buffer_clear_req & request);
     bool set_tensor(const std::vector<uint8_t> & input);
     bool set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response);
+    bool load_tensor(const rpc_msg_load_tensor_req & request, rpc_msg_load_tensor_rsp & response);
     bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
     bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
     bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
@@ -886,6 +1017,7 @@ class rpc_server {
     ggml_backend_t backend;
     const char * cache_dir;
    std::unordered_set<ggml_backend_buffer_t> buffers;
+    const std::unordered_map<uint64_t, std::string> & model_hashes;
 };
 
 void rpc_server::hello(rpc_msg_hello_rsp & response) {
@@ -1104,6 +1236,18 @@ bool rpc_server::get_cached_file(uint64_t hash, std::vector<uint8_t> & data) {
     return true;
 }
 
+static bool read_model_data(const char * path, size_t file_offset, size_t size, std::vector<uint8_t> & data) {
+    FILE * f = fopen(path, "rb");
+    if (f == nullptr) {
+        return false;
+    }
+    fseek(f, file_offset, SEEK_SET);
+    data.resize(size);
+    size_t read_size = fread(data.data(), 1, size, f);
+    fclose(f);
+    return read_size == size;
+}
+
 bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response)
 {
     std::vector<uint8_t> cached_file;
@@ -1146,6 +1290,50 @@ bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rp
     return true;
 }
 
+bool rpc_server::load_tensor(const rpc_msg_load_tensor_req & request, rpc_msg_load_tensor_rsp & response) {
+    if (model_hashes.find(request.model_hash) == model_hashes.end()) {
+        response.result = 0;
+        return true;
+    }
+    std::string path = model_hashes.at(request.model_hash);
+    std::vector<uint8_t> model_data;
+    if (!read_model_data(path.c_str(), request.file_offset, request.size, model_data)) {
+        response.result = 0;
+        return true;
+    }
+    struct ggml_init_params params {
+        /*.mem_size   =*/ ggml_tensor_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context_ptr ctx_ptr { ggml_init(params) };
+    GGML_ASSERT(ctx_ptr != nullptr);
+    ggml_context * ctx = ctx_ptr.get();
+    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
+    if (tensor == nullptr) {
+        GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
+        return false;
+    }
+    GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void *)tensor->buffer, tensor->data, request.tensor_offset, request.size);
+
+    // sanitize tensor->data
+    {
+        const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
+        const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
+
+        if (request.tensor.data + request.tensor_offset < p0
+         || request.tensor.data + request.tensor_offset >= p1
+         || request.size > (p1 - request.tensor.data - request.tensor_offset)) {
+            GGML_LOG_ERROR("[%s] tensor data region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%" PRIu64 ") out of buffer bounds [0x%zx, 0x%zx)\n",
+                           __func__, request.tensor.data, request.tensor_offset, request.size, p0, p1);
+            return false;
+        }
+    }
+    ggml_backend_tensor_set(tensor, model_data.data(), request.tensor_offset, request.size);
+    response.result = 1;
+    return true;
+}
+
 bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
     struct ggml_init_params params {
         /*.mem_size   =*/ ggml_tensor_overhead(),
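The sanitization block in `load_tensor` enforces the same style of bounds check used elsewhere in `rpc_server`. Written as a standalone predicate (an illustrative refactor, not code from the PR), the accepted condition is:

```cpp
#include <cstddef>
#include <cstdint>

// true if [data + offset, data + offset + size) lies entirely within the buffer [p0, p1)
static bool region_in_buffer(uint64_t data, uint64_t offset, uint64_t size, size_t p0, size_t p1) {
    const uint64_t start = data + offset;
    return start >= p0 && start < p1 && size <= (uint64_t) p1 - start;
}
```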
@@ -1368,9 +1556,11 @@ rpc_server::~rpc_server() {
     }
 }
 
-static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
+static void rpc_serve_client(ggml_backend_t backend,
+                             const std::unordered_map<uint64_t, std::string> & model_hashes,
+                             const char * cache_dir,
                              sockfd_t sockfd, size_t free_mem, size_t total_mem) {
-    rpc_server server(backend, cache_dir);
+    rpc_server server(backend, model_hashes, cache_dir);
     uint8_t cmd;
     if (!recv_data(sockfd, &cmd, 1)) {
         return;
@@ -1514,6 +1704,20 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
                 }
                 break;
             }
+            case RPC_CMD_LOAD_TENSOR: {
+                rpc_msg_load_tensor_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                rpc_msg_load_tensor_rsp response;
+                if (!server.load_tensor(request, response)) {
+                    return;
+                }
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
+                break;
+            }
             case RPC_CMD_INIT_TENSOR: {
                 rpc_msg_init_tensor_req request;
                 if (!recv_msg(sockfd, &request, sizeof(request))) {
@@ -1590,7 +1794,7 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
 }
 
 void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
-                                   const char * cache_dir,
+                                   const char * model_file, const char * cache_dir,
                                    size_t free_mem, size_t total_mem) {
     printf("Starting RPC server v%d.%d.%d\n",
            RPC_PROTO_MAJOR_VERSION,
@@ -1600,6 +1804,21 @@ void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint
     printf("  local cache    : %s\n", cache_dir ? cache_dir : "n/a");
     printf("  backend memory : %zu MB\n", free_mem / (1024 * 1024));
 
+    std::unordered_map<uint64_t, std::string> model_hashes;
+    if (model_file != nullptr) {
+        uint64_t model_hash;
+        if (!get_model_hash(model_file, &model_hash)) {
+            printf("Generating model hash for file: %s\n", model_file);
+            if (!generate_model_hash(model_file, &model_hash)) {
+                fprintf(stderr, "Failed to generate model hash for file: %s\n", model_file);
+                return;
+            }
+        }
+        printf("  model file     : %s\n", model_file);
+        printf("  model hash     : %" PRIx64 "\n", model_hash);
+        model_hashes[model_hash] = model_file;
+    }
+
     std::string host;
     int port;
     if (!parse_endpoint(endpoint, host, port)) {
@@ -1628,7 +1847,7 @@ void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint
         }
         printf("Accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem);
         fflush(stdout);
-        rpc_serve_client(backend, cache_dir, client_socket->fd, free_mem, total_mem);
+        rpc_serve_client(backend, model_hashes, cache_dir, client_socket->fd, free_mem, total_mem);
         printf("Client connection closed\n");
         fflush(stdout);
     }
@@ -1762,6 +1981,9 @@ static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const ch
     if (std::strcmp(name, "ggml_backend_rpc_start_server") == 0) {
         return (void *)ggml_backend_rpc_start_server;
     }
+    if (std::strcmp(name, "ggml_backend_rpc_buffer_load_tensor") == 0) {
+        return (void *)ggml_backend_rpc_buffer_load_tensor;
+    }
     return NULL;
 
     GGML_UNUSED(reg);