
Commit d634efc

chraac and abetlen authored
feat: adding rpc_servers parameter to Llama class (#1477)
* passthru rpc_servers params wip
* enable llama rpc by default
* convert string to byte
* add rpc package
* Revert "enable llama rpc by default" (reverts commit 832c6dd)
* update readme
* Only set rpc_servers when provided
* Add rpc servers to server options

Co-authored-by: Andrei Betlen <[email protected]>
1 parent 6e0642c commit d634efc

File tree

5 files changed: +26 -0 lines changed


Makefile

Lines changed: 3 additions & 0 deletions
@@ -45,6 +45,9 @@ build.kompute:
 build.sycl:
 	CMAKE_ARGS="-DLLAMA_SYCL=on" python3 -m pip install --verbose -e .
 
+build.rpc:
+	CMAKE_ARGS="-DLLAMA_RPC=on" python3 -m pip install --verbose -e .
+
 build.sdist:
 	python3 -m build --sdist

README.md

Lines changed: 11 additions & 0 deletions
@@ -221,6 +221,17 @@ CMAKE_ARGS="-DLLAMA_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pi
 ```
 </details>
 
+<details>
+<summary>RPC</summary>
+
+To install with RPC support, set the `LLAMA_RPC=on` environment variable before installing:
+
+```bash
+CMAKE_ARGS="-DLLAMA_RPC=on" pip install llama-cpp-python
+```
+</details>
+
 
 ### Windows Notes

llama_cpp/llama.py

Lines changed: 7 additions & 0 deletions
@@ -72,6 +72,7 @@ def __init__(
         split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
+        rpc_servers: Optional[str] = None,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
@@ -150,6 +151,7 @@ def __init__(
             split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
             main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
+            rpc_servers: Comma separated list of RPC servers to use for offloading
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
@@ -221,6 +223,11 @@ def __init__(
         ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers
         self.model_params.split_mode = split_mode
         self.model_params.main_gpu = main_gpu
+        if rpc_servers is not None:
+            self.model_params.rpc_servers = rpc_servers.encode('utf-8')
+            self._rpc_servers = rpc_servers
+        else:
+            self._rpc_servers = None
         self.tensor_split = tensor_split
         self._c_tensor_split = None
         if self.tensor_split is not None:
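Taken together, these changes let a caller route layer offloading to one or more llama.cpp RPC backends by passing a comma-separated `host:port` list. A minimal sketch of the new parameter in use; the model path and backend addresses are placeholders, and it assumes the package was built with `-DLLAMA_RPC=on` as shown above:

```python
from llama_cpp import Llama

# Placeholder model path and RPC backends (hosts running llama.cpp's rpc-server tool).
llm = Llama(
    model_path="./models/example-7b.Q4_K_M.gguf",
    rpc_servers="192.168.1.10:50052,192.168.1.11:50052",  # comma separated host:port list
    n_gpu_layers=-1,  # request full offload; how layers map to the backends is decided by llama.cpp
)

out = llm("Q: Name the planets in the solar system. A:", max_tokens=32)
print(out["choices"][0]["text"])
```

If the underlying build was compiled without RPC support, the string is still encoded and assigned to `llama_model_params.rpc_servers`, but there are presumably no remote backends for it to reach.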

llama_cpp/server/model.py

Lines changed: 1 addition & 0 deletions
@@ -226,6 +226,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         use_mmap=settings.use_mmap,
         use_mlock=settings.use_mlock,
         kv_overrides=kv_overrides,
+        rpc_servers=settings.rpc_servers,
         # Context Params
         seed=settings.seed,
         n_ctx=settings.n_ctx,
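On the server side, the new setting flows from `ModelSettings` straight through `load_llama_from_model_settings` into the `Llama` constructor. A small sketch of that passthrough using the helpers touched by this commit; the model path and backend address are placeholders, and the remaining settings are left at their defaults:

```python
from llama_cpp.server.model import load_llama_from_model_settings
from llama_cpp.server.settings import ModelSettings

settings = ModelSettings(
    model="./models/example-7b.Q4_K_M.gguf",  # placeholder model path
    rpc_servers="192.168.1.10:50052",         # forwarded unchanged to Llama(rpc_servers=...)
    n_gpu_layers=-1,
)

llm = load_llama_from_model_settings(settings)
```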

llama_cpp/server/settings.py

Lines changed: 4 additions & 0 deletions
@@ -58,6 +58,10 @@ class ModelSettings(BaseSettings):
         default=None,
         description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.",
     )
+    rpc_servers: Optional[str] = Field(
+        default=None,
+        description="Comma separated list of RPC servers for offloading",
+    )
     # Context Params
     seed: int = Field(
         default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random."
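Because `ModelSettings` is a pydantic `BaseSettings` class, the field should also be settable without code, for example via the server's generated command-line flag (presumably `--rpc_servers`) or a config file. A sketch of environment-based configuration, assuming the pydantic default of populating fields from case-insensitive environment variables with no prefix:

```python
import os

from llama_cpp.server.settings import ModelSettings

# Assumption: no env_prefix is configured, so the field name doubles as the variable name.
os.environ["RPC_SERVERS"] = "192.168.1.10:50052"

settings = ModelSettings(model="./models/example-7b.Q4_K_M.gguf")  # placeholder model path
print(settings.rpc_servers)  # expected: 192.168.1.10:50052
```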

0 commit comments