
Commit c3ed6ed

ggml : add RPC backend
The RPC backend proxies all operations to a remote server which runs a regular backend (CPU, CUDA, Metal, etc.).
1 parent 8960fe8 commit c3ed6ed
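
A hedged sketch of how the pieces below compose into a two-machine setup (the host, port, and model path are placeholders, not values from this commit): start rpc-server on the machine whose local backend should do the work, then point a client at it via the new --rpc flag:

    # on the remote machine: serve the local backend (CPU, CUDA or Metal) over TCP
    ./rpc-server 0.0.0.0 50052

    # on the local machine: proxy ggml operations to the remote server
    ./main -m model.gguf -p "Hello" --rpc 192.168.88.10:50052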

11 files changed: +980 -1 lines

CMakeLists.txt

Lines changed: 9 additions & 0 deletions
@@ -131,6 +131,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
+option(LLAMA_RPC "llama: use RPC" OFF)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 option(LLAMA_SYCL "llama: use SYCL" OFF)
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
@@ -494,6 +495,13 @@ if (LLAMA_MPI)
     endif()
 endif()
 
+if (LLAMA_RPC)
+    add_compile_definitions(GGML_USE_RPC)
+
+    set(GGML_HEADERS_RPC ggml-rpc.h)
+    set(GGML_SOURCES_RPC ggml-rpc.cpp)
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
@@ -1176,6 +1184,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_OPENCL}  ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL}   ${GGML_HEADERS_METAL}
             ${GGML_SOURCES_MPI}     ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_RPC}     ${GGML_HEADERS_RPC}
            ${GGML_SOURCES_EXTRA}   ${GGML_HEADERS_EXTRA}
             ${GGML_SOURCES_SYCL}    ${GGML_HEADERS_SYCL}
             ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
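
A configure-time example (an assumed invocation, not part of the diff): enabling the new option defines GGML_USE_RPC and pulls ggml-rpc.h / ggml-rpc.cpp into the ggml object library:

    cmake -B build -DLLAMA_RPC=ON
    cmake --build build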

common/common.cpp

Lines changed: 9 additions & 0 deletions
@@ -999,6 +999,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #endif // GGML_USE_CUDA_SYCL_VULKAN
         return true;
     }
+    if (arg == "--rpc") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.rpc_servers = argv[i];
+        return true;
+    }
     if (arg == "--no-mmap") {
         params.use_mmap = false;
         return true;
@@ -1507,6 +1515,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
         printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
         printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
     }
+    printf("  --rpc SERVERS         comma separated list of RPC servers\n");
     printf("  --verbose-prompt      print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
     printf("  --no-display-prompt   don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
     printf("  -gan N, --grp-attn-n N\n");
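
Since --rpc takes a single comma separated string, multiple servers can be listed in one argument. An illustrative call (addresses and model path are placeholders):

    ./main -m model.gguf -p "Hello" --rpc 192.168.88.10:50052,192.168.88.11:50052

Note that the parser only stores the raw string into params.rpc_servers; splitting and interpreting the list happens downstream, when the string is handed to llama_rpc_init (see examples/main/main.cpp below).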

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -80,6 +80,7 @@ struct gpt_params {
     float   yarn_beta_slow    = 1.0f;  // YaRN high correction dim
     int32_t yarn_orig_ctx     = 0;     // YaRN original context length
     float   defrag_thold      = -1.0f; // KV cache defragmentation threshold
+    std::string rpc_servers   = "";    // comma separated list of RPC servers
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data                 = nullptr;

examples/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
@@ -49,4 +49,7 @@ else()
         add_subdirectory(server)
     endif()
     add_subdirectory(export-lora)
+    if (LLAMA_RPC)
+        add_subdirectory(rpc)
+    endif()
 endif()

examples/main/main.cpp

Lines changed: 1 addition & 0 deletions
@@ -187,6 +187,7 @@ int main(int argc, char ** argv) {
     LOG("%s: llama backend init\n", __func__);
     llama_backend_init();
     llama_numa_init(params.numa);
+    llama_rpc_init(params.rpc_servers.empty() ? nullptr : params.rpc_servers.c_str());
 
     llama_model * model;
     llama_context * ctx;

examples/rpc/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+add_executable(rpc-server rpc-server.cpp)
+target_link_libraries(rpc-server PRIVATE ggml llama)

examples/rpc/rpc-server.cpp

Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include "ggml-rpc.h"
+#include <memory>
+#include <string>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+static ggml_backend_t create_backend() {
+    ggml_backend_t backend = NULL;
+#ifdef GGML_USE_CUDA
+    fprintf(stderr, "%s: using CUDA backend\n", __func__);
+    backend = ggml_backend_cuda_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+    }
+#endif
+
+#ifdef GGML_USE_METAL
+    fprintf(stderr, "%s: using Metal backend\n", __func__);
+    backend = ggml_backend_metal_init();
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+    }
+#endif
+
+    // if there are no GPU backends, fall back to the CPU backend
+    if (!backend) {
+        fprintf(stderr, "%s: using CPU backend\n", __func__);
+        backend = ggml_backend_cpu_init();
+    }
+    return backend;
+}
+
+static int create_server_socket(const char * host, int port) {
+    int sockfd = socket(AF_INET, SOCK_STREAM, 0);
+    if (sockfd < 0) {
+        return -1;
+    }
+
+    struct sockaddr_in serv_addr;
+    serv_addr.sin_family = AF_INET;
+    serv_addr.sin_addr.s_addr = inet_addr(host);
+    serv_addr.sin_port = htons(port);
+
+    if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) {
+        return -1;
+    }
+    if (listen(sockfd, 5) < 0) {
+        return -1;
+    }
+    return sockfd;
+}
+
+int main(int argc, char * argv[])
+{
+    if (argc < 3) {
+        fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
+        return 1;
+    }
+    const char * host = argv[1];
+    int port = std::stoi(argv[2]);
+
+    ggml_backend_t backend = create_backend();
+    if (!backend) {
+        fprintf(stderr, "Failed to create backend\n");
+        return 1;
+    }
+
+    printf("Starting RPC server on %s:%d\n", host, port);
+    int server_socket = create_server_socket(host, port);
+    if (server_socket < 0) {
+        fprintf(stderr, "Failed to create server socket\n");
+        return 1;
+    }
+    while (true) {
+        struct sockaddr_in cli_addr;
+        socklen_t clilen = sizeof(cli_addr);
+        int client_socket = accept(server_socket, (struct sockaddr *) &cli_addr, &clilen);
+        if (client_socket < 0) {
+            fprintf(stderr, "Failed to accept client connection\n");
+            return 1;
+        }
+        printf("Accepted client connection\n");
+        rpc_serve_client(backend, client_socket);
+        printf("Client connection closed\n");
+        close(client_socket);
+    }
+    return 0;
+}
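
To run the server (placeholder address and port; binding to 0.0.0.0 accepts connections on all interfaces):

    ./rpc-server 0.0.0.0 50052

Design-wise the server is deliberately minimal: create_backend() picks CUDA or Metal if compiled in and falls back to CPU otherwise, and the accept loop serves one client at a time — each connection is handled to completion by rpc_serve_client before the next accept.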
