
Commit f4090a0

Add NUMA support; low-level API users must now explicitly call llama_backend_init at the start of their programs.
1 parent c999325 commit f4090a0
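For low-level API users, the practical effect of this commit is that the backend is no longer initialized implicitly when `llama_cpp` is imported. Below is a minimal sketch of the new calling pattern; the model path is illustrative, and the context/teardown helpers (`llama_new_context_with_model`, `llama_free`, `llama_free_model`) are assumed to behave as in the existing low-level bindings rather than being part of this diff.

```python
import llama_cpp

# As of this commit the backend is no longer initialized on import, so this
# call must come first; numa=True would enable NUMA optimizations for the
# remainder of the process.
llama_cpp.llama_backend_init(numa=False)

params = llama_cpp.llama_context_default_params()
model = llama_cpp.llama_load_model_from_file(b"./models/7b/ggml-model.bin", params)
ctx = llama_cpp.llama_new_context_with_model(model, params)

# ... tokenize / eval / sample with the other low-level functions ...

llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
```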

5 files changed, 20 additions (+) and 9 deletions (−)


README.md

Lines changed: 1 addition & 0 deletions
````diff
@@ -180,6 +180,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize
 ```python
 >>> import llama_cpp
 >>> import ctypes
+>>> llama_cpp.llama_backend_init(numa=False) # Must be called once at the start of each program
 >>> params = llama_cpp.llama_context_default_params()
 # use bytes for char * params
 >>> model = llama_cpp.llama_load_model_from_file(b"./models/7b/ggml-model.bin", params)
````

examples/low_level_api/low_level_api_llama_cpp.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -4,6 +4,8 @@
 
 import llama_cpp
 
+llama_cpp.llama_backend_init(numa=False)
+
 N_THREADS = multiprocessing.cpu_count()
 MODEL_PATH = os.environ.get('MODEL', "../models/7B/ggml-model.bin")
 
```

llama_cpp/llama.py

Lines changed: 13 additions & 0 deletions
```diff
@@ -209,6 +209,8 @@ def __call__(
 class Llama:
     """High-level Python wrapper for a llama.cpp model."""
 
+    __backend_initialized = False
+
     def __init__(
         self,
         model_path: str,
@@ -234,6 +236,7 @@ def __init__(
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
+        numa: bool = False,
         verbose: bool = True,
         **kwargs  # type: ignore
     ):
@@ -261,6 +264,7 @@ def __init__(
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
+            numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
             verbose: Print verbose output to stderr.
             kwargs: Unused keyword arguments (for additional backwards compatibility).
@@ -272,6 +276,15 @@ def __init__(
         """
 
         self.verbose = verbose
+
+        if not Llama.__backend_initialized:
+            if self.verbose:
+                llama_cpp.llama_backend_init(numa)
+            else:
+                with suppress_stdout_stderr():
+                    llama_cpp.llama_backend_init(numa)
+            Llama.__backend_initialized = True
+
         self.model_path = model_path
 
         self.params = llama_cpp.llama_context_default_params()
```
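For the high-level API nothing changes unless a caller opts in: the first `Llama` constructed in a process performs the one-time `llama_backend_init` call, and because of the `__backend_initialized` class flag the NUMA choice made by that first instance applies for the rest of the program. A hedged usage sketch follows; the model paths are illustrative.

```python
from llama_cpp import Llama

# The first instance created in the process decides the NUMA setting, since
# llama_backend_init runs only once behind the __backend_initialized guard.
llm = Llama(model_path="./models/7b/ggml-model.bin", numa=True)

# Later instances reuse the already-initialized backend; omitting numa (or
# passing numa=False) here does not change the backend state.
other = Llama(model_path="./models/13b/ggml-model.bin")
```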

llama_cpp/llama_cpp.py

Lines changed: 0 additions & 9 deletions
```diff
@@ -1524,12 +1524,3 @@ def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p):
 _lib.llama_dump_timing_info_yaml.argtypes = [ctypes.c_void_p, llama_context_p]
 _lib.llama_dump_timing_info_yaml.restype = None
 
-
-###################################################################################################
-
-
-_llama_initialized = False
-
-if not _llama_initialized:
-    llama_backend_init(False)
-    _llama_initialized = True
```

llama_cpp/server/app.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -98,6 +98,10 @@ class Settings(BaseSettings):
         default=None,
         description="Path to a LoRA file to apply to the model.",
     )
+    numa: bool = Field(
+        default=False,
+        description="Enable NUMA support.",
+    )
     cache: bool = Field(
         default=False,
         description="Use a cache to reduce processing times for evaluated prompts.",
```
