
Commit f4090a0

Add NUMA support; low-level API users must now explicitly call llama_backend_init at the start of their programs.
1 parent c999325 commit f4090a0
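For low-level API users, the practical effect of this commit is that the backend is no longer initialized implicitly when `llama_cpp` is imported. Below is a minimal sketch of the new calling pattern; the model path is illustrative, and the context/teardown helpers (`llama_new_context_with_model`, `llama_free`, `llama_free_model`) are assumed to behave as in the existing low-level bindings rather than being part of this diff.

```python
import llama_cpp

# As of this commit the backend is no longer initialized on import, so this
# call must come first; numa=True would enable NUMA optimizations for the
# remainder of the process.
llama_cpp.llama_backend_init(numa=False)

params = llama_cpp.llama_context_default_params()
model = llama_cpp.llama_load_model_from_file(b"./models/7b/ggml-model.bin", params)
ctx = llama_cpp.llama_new_context_with_model(model, params)

# ... tokenize / eval / sample with the other low-level functions ...

llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
```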

5 files changed, 20 additions (+) and 9 deletions (−)


README.md

Lines changed: 1 addition & 0 deletions
````diff
@@ -180,6 +180,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize
 ```python
 >>> import llama_cpp
 >>> import ctypes
+>>> llama_cpp.llama_backend_init(numa=False) # Must be called once at the start of each program
 >>> params = llama_cpp.llama_context_default_params()
 # use bytes for char * params
 >>> model = llama_cpp.llama_load_model_from_file(b"./models/7b/ggml-model.bin", params)
````

examples/low_level_api/low_level_api_llama_cpp.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -4,6 +4,8 @@
 
 import llama_cpp
 
+llama_cpp.llama_backend_init(numa=False)
+
 N_THREADS = multiprocessing.cpu_count()
 MODEL_PATH = os.environ.get('MODEL', "../models/7B/ggml-model.bin")
 
```

llama_cpp/llama.py

Lines changed: 13 additions & 0 deletions
```diff
@@ -209,6 +209,8 @@ def __call__(
 class Llama:
     """High-level Python wrapper for a llama.cpp model."""
 
+    __backend_initialized = False
+
     def __init__(
         self,
         model_path: str,
@@ -234,6 +236,7 @@ def __init__(
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
+        numa: bool = False,
         verbose: bool = True,
         **kwargs  # type: ignore
     ):
@@ -261,6 +264,7 @@ def __init__(
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
+            numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
             verbose: Print verbose output to stderr.
             kwargs: Unused keyword arguments (for additional backwards compatibility).
@@ -272,6 +276,15 @@ def __init__(
         """
 
         self.verbose = verbose
+
+        if not Llama.__backend_initialized:
+            if self.verbose:
+                llama_cpp.llama_backend_init(numa)
+            else:
+                with suppress_stdout_stderr():
+                    llama_cpp.llama_backend_init(numa)
+            Llama.__backend_initialized = True
+
         self.model_path = model_path
 
         self.params = llama_cpp.llama_context_default_params()
```
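For the high-level API nothing changes unless a caller opts in: the first `Llama` constructed in a process performs the one-time `llama_backend_init` call, and because of the `__backend_initialized` class flag the NUMA choice made by that first instance applies for the rest of the program. A hedged usage sketch follows; the model paths are illustrative.

```python
from llama_cpp import Llama

# The first instance created in the process decides the NUMA setting, since
# llama_backend_init runs only once behind the __backend_initialized guard.
llm = Llama(model_path="./models/7b/ggml-model.bin", numa=True)

# Later instances reuse the already-initialized backend; omitting numa (or
# passing numa=False) here does not change the backend state.
other = Llama(model_path="./models/13b/ggml-model.bin")
```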

llama_cpp/llama_cpp.py

Lines changed: 0 additions & 9 deletions
```diff
@@ -1524,12 +1524,3 @@ def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p):
 _lib.llama_dump_timing_info_yaml.argtypes = [ctypes.c_void_p, llama_context_p]
 _lib.llama_dump_timing_info_yaml.restype = None
 
-
-###################################################################################################
-
-
-_llama_initialized = False
-
-if not _llama_initialized:
-    llama_backend_init(False)
-    _llama_initialized = True
```

llama_cpp/server/app.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -98,6 +98,10 @@ class Settings(BaseSettings):
         default=None,
         description="Path to a LoRA file to apply to the model.",
     )
+    numa: bool = Field(
+        default=False,
+        description="Enable NUMA support.",
+    )
     cache: bool = Field(
         default=False,
         description="Use a cache to reduce processing times for evaluated prompts.",
```
