llama_cpp/llama.py: 3 additions & 0 deletions
@@ -70,6 +70,7 @@ def __init__(
         split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
+        rpc_servers: Optional[str] = None,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
@@ -148,6 +149,7 @@ def __init__(
             split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
             main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
+            rpc_servers: Comma separated list of RPC servers to use for offloading
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
@@ -219,6 +221,7 @@ def __init__(
         )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers