Commit 52eab6a
llama : load tensors using pre-computed model hash
1 parent b486ba0

File tree

7 files changed: +286 −19 lines changed

ggml/include/ggml-rpc.h

Lines changed: 8 additions & 2 deletions
@@ -8,7 +8,7 @@ extern "C" {
 #endif

 #define RPC_PROTO_MAJOR_VERSION 2
-#define RPC_PROTO_MINOR_VERSION 0
+#define RPC_PROTO_MINOR_VERSION 1
 #define RPC_PROTO_PATCH_VERSION 0
 #define GGML_RPC_MAX_SERVERS    16

@@ -21,12 +21,18 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
 GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);

 GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
-                                                    const char * cache_dir,
+                                                    const char * model_file, const char * cache_dir,
                                                     size_t free_mem, size_t total_mem);

 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);

 GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
+GGML_BACKEND_API bool ggml_backend_rpc_buffer_load_tensor(ggml_backend_buffer_t buffer,
+                                                          ggml_tensor * tensor,
+                                                          const char * path,
+                                                          size_t file_offset,
+                                                          size_t tensor_offset,
+                                                          size_t size);

 #ifdef __cplusplus
 }
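The new ggml_backend_rpc_buffer_load_tensor entry point lets a client ask the server to read a tensor's bytes from the server's own copy of the model file instead of streaming them over the socket. A minimal client-side sketch of the intended call pattern (the helper name, model path, and offsets below are hypothetical, not part of this commit):

    #include "ggml-backend.h"
    #include "ggml-rpc.h"

    // Hypothetical helper: try the fast path (server reads from its local model
    // copy); if the server does not know the model hash, fall back to pushing
    // the bytes over the RPC connection. file_offset is where the tensor's data
    // starts inside the GGUF file on the server's disk.
    static void load_or_stream(ggml_tensor * tensor, const char * model_path,
                               size_t file_offset, const void * local_data, size_t size) {
        if (ggml_backend_rpc_buffer_load_tensor(tensor->buffer, tensor, model_path,
                                                file_offset, /*tensor_offset=*/0, size)) {
            return; // loaded server-side, no tensor data crossed the network
        }
        ggml_backend_tensor_set(tensor, local_data, 0, size); // slow path
    }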

ggml/src/ggml-rpc/ggml-rpc.cpp

Lines changed: 228 additions & 6 deletions
@@ -93,6 +93,7 @@ enum rpc_cmd {
     RPC_CMD_INIT_TENSOR,
     RPC_CMD_GET_ALLOC_SIZE,
     RPC_CMD_HELLO,
+    RPC_CMD_LOAD_TENSOR,
     RPC_CMD_COUNT,
 };

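Note that the new command is inserted before RPC_CMD_COUNT, so the wire values of the existing commands are unchanged; presumably this is why only RPC_PROTO_MINOR_VERSION is bumped (0 → 1) in ggml-rpc.h rather than the major version.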
@@ -161,6 +162,18 @@ struct rpc_msg_set_tensor_hash_rsp {
     uint8_t result;
 };

+struct rpc_msg_load_tensor_req {
+    uint64_t model_hash;
+    rpc_tensor tensor;
+    uint64_t file_offset;
+    uint64_t tensor_offset;
+    uint64_t size;
+};
+
+struct rpc_msg_load_tensor_rsp {
+    uint8_t result;
+};
+
 struct rpc_msg_get_tensor_req {
     rpc_tensor tensor;
     uint64_t offset;
@@ -213,6 +226,24 @@ struct ggml_backend_rpc_buffer_context {

 // RPC helper functions

+typedef uint64_t fnv_ctx_t;
+
+static void fnv_init(fnv_ctx_t * ctx) {
+    *ctx = 0xcbf29ce484222325ULL;
+}
+
+static void fnv_update(fnv_ctx_t * ctx, const uint8_t * data, size_t len) {
+    const uint64_t fnv_prime = 0x100000001b3ULL;
+    for (size_t i = 0; i < len; ++i) {
+        *ctx ^= data[i];
+        *ctx *= fnv_prime;
+    }
+}
+
+static void fnv_final(fnv_ctx_t * ctx, uint64_t * digest) {
+    *digest = *ctx;
+}
+
 // Computes FNV-1a hash of the data
 static uint64_t fnv_hash(const uint8_t * data, size_t len) {
     const uint64_t fnv_prime = 0x100000001b3ULL;
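The fnv_init/fnv_update/fnv_final triple is a streaming variant of the existing one-shot fnv_hash: feeding the data in chunks produces the same FNV-1a digest, which is what lets generate_model_hash below hash the data offset and each tensor's bytes incrementally. A standalone check (not part of the commit):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
        const uint8_t  data[] = "hello world";
        const size_t   len    = strlen((const char *)data);
        const uint64_t prime  = 0x100000001b3ULL;

        // one-shot, as in fnv_hash()
        uint64_t h1 = 0xcbf29ce484222325ULL;
        for (size_t i = 0; i < len; ++i) { h1 ^= data[i]; h1 *= prime; }

        // streaming, as in fnv_init/fnv_update/fnv_final: two chunks, same digest
        uint64_t h2 = 0xcbf29ce484222325ULL;
        for (size_t i = 0; i < 5; ++i)   { h2 ^= data[i]; h2 *= prime; }
        for (size_t i = 5; i < len; ++i) { h2 ^= data[i]; h2 *= prime; }

        printf("%016llx == %016llx\n", (unsigned long long)h1, (unsigned long long)h2);
        return 0;
    }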
@@ -225,6 +256,87 @@ static uint64_t fnv_hash(const uint8_t * data, size_t len) {
     return hash;
 }

+static bool get_model_hash_from_file(const char * model_file, uint64_t * hash) {
+    // try loading the hash from model_file + '.rpc'
+    std::string rpc_file = std::string(model_file) + ".rpc";
+    // the hash file must exist, must be exactly 16 bytes and must be a valid hash written in hex
+    if (!fs::exists(rpc_file)) {
+        return false;
+    }
+    std::ifstream file(rpc_file, std::ios::binary);
+    if (!file.is_open()) {
+        return false;
+    }
+    std::string hash_str;
+    file.seekg(0, std::ios::end);
+    size_t file_size = file.tellg();
+    if (file_size != 16) {
+        return false;
+    }
+    file.seekg(0, std::ios::beg);
+    hash_str.resize(file_size);
+    file.read(&hash_str[0], file_size);
+    if ((size_t)file.gcount() != file_size) {
+        return false;
+    }
+    if (hash_str.find_first_not_of("0123456789abcdefABCDEF") != std::string::npos) {
+        return false;
+    }
+    *hash = std::stoull(hash_str, nullptr, 16);
+    return true;
+}
+
+static bool get_model_hash(const char * model_file, uint64_t * hash) {
+    // model path -> (hash_exist, hash_value)
+    static std::unordered_map<std::string, std::pair<bool, uint64_t>> model_hashes;
+    if (model_hashes.find(model_file) != model_hashes.end()) {
+        *hash = model_hashes[model_file].second;
+        return model_hashes[model_file].first;
+    }
+    if (get_model_hash_from_file(model_file, hash)) {
+        model_hashes[model_file] = {true, *hash};
+        return true;
+    }
+    model_hashes[model_file] = {false, 0};
+    return false;
+}
+
+static bool generate_model_hash(const char * model_file, uint64_t * hash) {
+    ggml_context * ctx = nullptr;
+    struct gguf_init_params params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    gguf_context_ptr ctx_gguf { gguf_init_from_file(model_file, params) };
+    if (!ctx_gguf) {
+        return false;
+    }
+    fnv_ctx_t fnv_ctx;
+    fnv_init(&fnv_ctx);
+    size_t data_offset = gguf_get_data_offset(ctx_gguf.get());
+    fnv_update(&fnv_ctx, (const uint8_t *)&data_offset, sizeof(data_offset));
+    const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
+    for (int i = 0; i < n_tensors; ++i) {
+        const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
+        ggml_tensor * cur = ggml_get_tensor(ctx, name);
+        auto n_bytes = ggml_nbytes(cur);
+        fnv_update(&fnv_ctx, (const uint8_t *)cur->data, n_bytes);
+    }
+    fnv_final(&fnv_ctx, hash);
+    // save the model hash to model_file + '.rpc' in hex format
+    std::string hash_file = std::string(model_file) + ".rpc";
+    std::ofstream file(hash_file, std::ios::binary);
+    if (!file.is_open()) {
+        return false;
+    }
+    file << std::hex << std::setfill('0') << std::setw(16) << *hash;
+    if (!file) {
+        return false;
+    }
+    file.close();
+    return true;
+}
+
 static std::shared_ptr<socket_t> make_socket(sockfd_t fd) {
 #ifdef _WIN32
     if (fd == INVALID_SOCKET) {
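The .rpc sidecar holds exactly 16 lowercase hex characters with no trailing newline, and get_model_hash_from_file parses it back with std::stoull. A standalone round-trip of just the encoding, using an arbitrary example digest:

    #include <cstdint>
    #include <iomanip>
    #include <iostream>
    #include <sstream>
    #include <string>

    int main() {
        uint64_t hash = 0x00ab54a98ceb1f0aULL; // example digest with leading zeros

        // encode as generate_model_hash() does
        std::ostringstream out;
        out << std::hex << std::setfill('0') << std::setw(16) << hash;
        std::string sidecar = out.str(); // "00ab54a98ceb1f0a", always 16 chars

        // decode as get_model_hash_from_file() does
        uint64_t parsed = std::stoull(sidecar, nullptr, 16);

        std::cout << sidecar << (parsed == hash ? " round-trips" : " MISMATCH") << "\n";
        return 0;
    }

Note that generate_model_hash hashes the GGUF data offset plus every tensor's bytes, so the sidecar stays valid only as long as the model file itself is unchanged.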
@@ -605,6 +717,24 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
     return response.result;
 }

+bool ggml_backend_rpc_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const char * path, size_t file_offset, size_t tensor_offset, size_t size) {
+    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+    uint64_t hash;
+    if (!get_model_hash(path, &hash)) {
+        return false;
+    }
+    rpc_msg_load_tensor_req request;
+    request.model_hash = hash;
+    request.tensor = serialize_tensor(tensor);
+    request.file_offset = file_offset;
+    request.tensor_offset = tensor_offset;
+    request.size = size;
+    rpc_msg_load_tensor_rsp response;
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_LOAD_TENSOR, &request, sizeof(request), &response, sizeof(response));
+    GGML_ASSERT(status);
+    return response.result;
+}
+
 static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
     rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value};
@@ -854,8 +984,8 @@ void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, si

 class rpc_server {
 public:
-    rpc_server(ggml_backend_t backend, const char * cache_dir)
-        : backend(backend), cache_dir(cache_dir) {
+    rpc_server(ggml_backend_t backend, const std::unordered_map<uint64_t, std::string> & model_hashes, const char * cache_dir)
+        : backend(backend), cache_dir(cache_dir), model_hashes(model_hashes) {
     }
     ~rpc_server();

@@ -868,6 +998,7 @@ class rpc_server {
     bool buffer_clear(const rpc_msg_buffer_clear_req & request);
     bool set_tensor(const std::vector<uint8_t> & input);
     bool set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response);
+    bool load_tensor(const rpc_msg_load_tensor_req & request, rpc_msg_load_tensor_rsp & response);
     bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
     bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
     bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
@@ -886,6 +1017,7 @@ class rpc_server {
     ggml_backend_t backend;
     const char * cache_dir;
     std::unordered_set<ggml_backend_buffer_t> buffers;
+    const std::unordered_map<uint64_t, std::string> & model_hashes;
 };

 void rpc_server::hello(rpc_msg_hello_rsp & response) {
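The server keeps model_hashes as a const reference rather than a copy; this is safe here because the map is owned by ggml_backend_rpc_start_server, whose accept loop outlives every rpc_server created through rpc_serve_client.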
@@ -1104,6 +1236,18 @@ bool rpc_server::get_cached_file(uint64_t hash, std::vector<uint8_t> & data) {
     return true;
 }

+static bool read_model_data(const char * path, size_t file_offset, size_t size, std::vector<uint8_t> & data) {
+    FILE * f = fopen(path, "rb");
+    if (f == nullptr) {
+        return false;
+    }
+    fseek(f, file_offset, SEEK_SET);
+    data.resize(size);
+    size_t read_size = fread(data.data(), 1, size, f);
+    fclose(f);
+    return read_size == size;
+}
+
 bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response)
 {
     std::vector<uint8_t> cached_file;
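One portability caveat with read_model_data: fseek takes a long offset, so on platforms where long is 32 bits (notably Windows) a file_offset beyond 2 GiB would be truncated; fseeko or _fseeki64 are the usual wide-offset alternatives.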
@@ -1146,6 +1290,50 @@ bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rp
     return true;
 }

+bool rpc_server::load_tensor(const rpc_msg_load_tensor_req & request, rpc_msg_load_tensor_rsp & response) {
+    if (model_hashes.find(request.model_hash) == model_hashes.end()) {
+        response.result = 0;
+        return true;
+    }
+    std::string path = model_hashes.at(request.model_hash);
+    std::vector<uint8_t> model_data;
+    if (!read_model_data(path.c_str(), request.file_offset, request.size, model_data)) {
+        response.result = 0;
+        return true;
+    }
+    struct ggml_init_params params {
+        /*.mem_size   =*/ ggml_tensor_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context_ptr ctx_ptr { ggml_init(params) };
+    GGML_ASSERT(ctx_ptr != nullptr);
+    ggml_context * ctx = ctx_ptr.get();
+    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
+    if (tensor == nullptr) {
+        GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
+        return false;
+    }
+    GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, request.tensor_offset, request.size);
+
+    // sanitize tensor->data
+    {
+        const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
+        const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
+
+        if (request.tensor.data + request.tensor_offset < p0
+         || request.tensor.data + request.tensor_offset >= p1
+         || request.size > (p1 - request.tensor.data - request.tensor_offset)) {
+            GGML_LOG_ERROR("[%s] tensor data region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%" PRIu64 ") out of buffer bounds [0x%zx, 0x%zx)\n",
+                           __func__, request.tensor.data, request.tensor_offset, request.size, p0, p1);
+            return false;
+        }
+    }
+    ggml_backend_tensor_set(tensor, model_data.data(), request.tensor_offset, request.size);
+    response.result = 1;
+    return true;
+}
+
 bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
     struct ggml_init_params params {
         /*.mem_size   =*/ ggml_tensor_overhead(),
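To make the sanitization arithmetic concrete, here is the same predicate with made-up numbers (standalone, not part of the commit): the destination span [data + offset, data + offset + size) must lie entirely within the buffer [p0, p1).

    #include <cstdint>
    #include <cstdio>

    int main() {
        // all values hypothetical, for illustration only
        uint64_t p0     = 0x1000;  // ggml_backend_buffer_get_base()
        uint64_t p1     = 0x5000;  // p0 + ggml_backend_buffer_get_size()
        uint64_t data   = 0x2000;  // request.tensor.data
        uint64_t offset = 0x0100;  // request.tensor_offset
        uint64_t size   = 0x0F00;  // request.size

        bool rejected = data + offset <  p0
                     || data + offset >= p1
                     || size > (p1 - data - offset);

        // 0x2100 + 0xF00 = 0x3000, which is inside [0x1000, 0x5000) -> accepted
        printf("%s\n", rejected ? "rejected" : "accepted");
        return 0;
    }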
@@ -1368,9 +1556,11 @@ rpc_server::~rpc_server() {
     }
 }

-static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
+static void rpc_serve_client(ggml_backend_t backend,
+                             const std::unordered_map<uint64_t, std::string> & model_hashes,
+                             const char * cache_dir,
                              sockfd_t sockfd, size_t free_mem, size_t total_mem) {
-    rpc_server server(backend, cache_dir);
+    rpc_server server(backend, model_hashes, cache_dir);
     uint8_t cmd;
     if (!recv_data(sockfd, &cmd, 1)) {
         return;
@@ -1514,6 +1704,20 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
                 }
                 break;
             }
+            case RPC_CMD_LOAD_TENSOR: {
+                rpc_msg_load_tensor_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                rpc_msg_load_tensor_rsp response;
+                if (!server.load_tensor(request, response)) {
+                    return;
+                }
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
+                break;
+            }
             case RPC_CMD_INIT_TENSOR: {
                 rpc_msg_init_tensor_req request;
                 if (!recv_msg(sockfd, &request,sizeof(request))) {
@@ -1590,7 +1794,7 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
 }

 void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
-                                   const char * cache_dir,
+                                   const char * model_file, const char * cache_dir,
                                    size_t free_mem, size_t total_mem) {
     printf("Starting RPC server v%d.%d.%d\n",
            RPC_PROTO_MAJOR_VERSION,
@@ -1600,6 +1804,21 @@ void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint
     printf(" local cache : %s\n", cache_dir ? cache_dir : "n/a");
     printf(" backend memory : %zu MB\n", free_mem / (1024 * 1024));

+    std::unordered_map<uint64_t, std::string> model_hashes;
+    if (model_file != nullptr) {
+        uint64_t model_hash;
+        if (!get_model_hash(model_file, &model_hash)) {
+            printf("Generating model hash for file: %s\n", model_file);
+            if (!generate_model_hash(model_file, &model_hash)) {
+                fprintf(stderr, "Failed to generate model hash for file: %s\n", model_file);
+                return;
+            }
+        }
+        printf(" model file : %s\n", model_file);
+        printf(" model hash : %" PRIx64 "\n", model_hash);
+        model_hashes[model_hash] = model_file;
+    }
+
     std::string host;
     int port;
     if (!parse_endpoint(endpoint, host, port)) {
@@ -1628,7 +1847,7 @@ void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint
         }
         printf("Accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem);
         fflush(stdout);
-        rpc_serve_client(backend, cache_dir, client_socket->fd, free_mem, total_mem);
+        rpc_serve_client(backend, model_hashes, cache_dir, client_socket->fd, free_mem, total_mem);
         printf("Client connection closed\n");
         fflush(stdout);
     }
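With the extra parameter in place, starting a server that can satisfy RPC_CMD_LOAD_TENSOR from a local model copy could look like this (endpoint, path, and memory figures are hypothetical; the rpc-server CLI changes that wire this up are in the files of this commit not shown here):

    #include "ggml-rpc.h"

    void start_example(ggml_backend_t backend) {
        ggml_backend_rpc_start_server(backend,
                                      "0.0.0.0:50052",       // endpoint
                                      "/models/model.gguf",  // model_file, hashed at startup
                                      nullptr,               // cache_dir (optional)
                                      /*free_mem =*/  8ull << 30,
                                      /*total_mem=*/ 16ull << 30);
    }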
@@ -1762,6 +1981,9 @@ static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const ch
     if (std::strcmp(name, "ggml_backend_rpc_start_server") == 0) {
         return (void *)ggml_backend_rpc_start_server;
     }
+    if (std::strcmp(name, "ggml_backend_rpc_buffer_load_tensor") == 0) {
+        return (void *)ggml_backend_rpc_buffer_load_tensor;
+    }
     return NULL;

     GGML_UNUSED(reg);
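Registering the symbol here means callers can resolve it through the backend registry instead of linking against the RPC backend directly; a sketch of that lookup, assuming the backend registers under the name "RPC":

    #include "ggml-backend.h"

    typedef bool (*rpc_load_tensor_fn)(ggml_backend_buffer_t, ggml_tensor *,
                                       const char *, size_t, size_t, size_t);

    static rpc_load_tensor_fn resolve_rpc_load_tensor(void) {
        ggml_backend_reg_t reg = ggml_backend_reg_by_name("RPC");
        if (reg == nullptr) {
            return nullptr; // RPC backend not available
        }
        return (rpc_load_tensor_fn)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_rpc_buffer_load_tensor");
    }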

src/llama-mmap.cpp

Lines changed: 1 addition & 1 deletion
@@ -241,7 +241,7 @@ struct llama_file::impl {
     size_t size;
 };

-llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
+llama_file::llama_file(const char * fname, const char * mode) : fname(fname), pimpl(std::make_unique<impl>(fname, mode)) {}
 llama_file::~llama_file() = default;

 size_t llama_file::tell() const { return pimpl->tell(); }

src/llama-mmap.h

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@ struct llama_file {

     void write_raw(const void * ptr, size_t len) const;
     void write_u32(uint32_t val) const;
+    const char * fname;

 private:
     struct impl;
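Exposing fname on llama_file (set by the llama-mmap.cpp change above) gives the loader access to the model's path, presumably so it can pass it to ggml_backend_rpc_buffer_load_tensor; the loader-side changes that use it are in the remaining files of this commit, which are not shown here.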
