From e38f5e21a00ef162ee0865a530f40b23c7c63cf1 Mon Sep 17 00:00:00 2001 From: Junpei Kawamoto Date: Wed, 5 Jun 2024 23:06:57 -0600 Subject: [PATCH 1/5] feat: add explicit methods to free model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit introduces a `close` method to both `Llama` and `_LlamaModel`, allowing users to explicitly free the model from RAM/VRAM. The previous implementation relied on the destructor of `_LlamaModel` to free the model. However, in Python, the timing of destructor calls is unclear—for instance, the `del` statement does not guarantee immediate invocation of the destructor. This commit provides an explicit method to release the model, which works immediately and allows the user to load another model without memory issues. Additionally, this commit implements a context manager in the `Llama` class, enabling the automatic closure of the `Llama` object when used with the `with` statement. --- llama_cpp/_internals.py | 5 ++++- llama_cpp/llama.py | 17 ++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index ba0429139..b3c4284fb 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -56,7 +56,10 @@ def __init__( if self.model is None: raise ValueError(f"Failed to load model from file: {path_model}") - def __del__(self): + def __del__(self) -> None: + self.close() + + def close(self) -> None: if self.model is not None and self._llama_free_model is not None: self._llama_free_model(self.model) self.model = None diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index bf3bd656d..796279829 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1,5 +1,6 @@ from __future__ import annotations +import contextlib import os import sys import uuid @@ -10,6 +11,7 @@ import fnmatch import warnings import multiprocessing +from types import TracebackType from typing import ( List, @@ -21,6 +23,7 @@ Deque, Callable, Dict, + Type, ) from collections import deque from pathlib import Path @@ -58,7 +61,7 @@ from ._utils import suppress_stdout_stderr -class Llama: +class Llama(contextlib.AbstractContextManager): """High-level Python wrapper for a llama.cpp model.""" __backend_initialized = False @@ -1940,6 +1943,18 @@ def pooling_type(self) -> str: """Return the pooling type.""" return self._ctx.pooling_type() + def close(self) -> None: + """Explicitly free the model from memory.""" + self._model.close() + + def __exit__( + self, + __exc_type: Optional[Type[BaseException]], + __exc_value: Optional[BaseException], + __traceback: Optional[TracebackType] + ) -> Optional[bool]: + return self.close() + @staticmethod def logits_to_logprobs( logits: Union[npt.NDArray[np.single], List], axis: int = -1 From eb823542390a0c85ad2199ca5c151538ccd57769 Mon Sep 17 00:00:00 2001 From: Junpei Kawamoto Date: Thu, 6 Jun 2024 21:25:36 -0600 Subject: [PATCH 2/5] feat: Implement ContextManager in _LlamaModel, _LlamaContext, and _LlamaBatch This commit enables automatic resource management by implementing the `ContextManager` protocol in `_LlamaModel`, `_LlamaContext`, and `_LlamaBatch`. This ensures that resources are properly managed and released within a `with` statement, enhancing robustness and safety in resource handling. 
--- llama_cpp/_internals.py | 43 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index b3c4284fb..c4b8dac42 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -1,12 +1,15 @@ from __future__ import annotations +import contextlib import os import ctypes +from types import TracebackType from typing import ( List, Optional, Sequence, + Type, ) from dataclasses import dataclass, field @@ -23,7 +26,7 @@ # Python wrappers over llama.h structs -class _LlamaModel: +class _LlamaModel(contextlib.AbstractContextManager): """Intermediate Python wrapper for a llama.cpp llama_model. NOTE: For stability it's recommended you use the Llama class instead.""" @@ -59,6 +62,14 @@ def __init__( def __del__(self) -> None: self.close() + def __exit__( + self, + __exc_type: Optional[Type[BaseException]], + __exc_value: Optional[BaseException], + __traceback: Optional[TracebackType] + ) -> Optional[bool]: + return self.close() + def close(self) -> None: if self.model is not None and self._llama_free_model is not None: self._llama_free_model(self.model) @@ -248,7 +259,7 @@ def default_params(): return llama_cpp.llama_model_default_params() -class _LlamaContext: +class _LlamaContext(contextlib.AbstractContextManager): """Intermediate Python wrapper for a llama.cpp llama_context. NOTE: For stability it's recommended you use the Llama class instead.""" @@ -277,7 +288,18 @@ def __init__( if self.ctx is None: raise ValueError("Failed to create llama_context") - def __del__(self): + def __del__(self) -> None: + self.close() + + def __exit__( + self, + __exc_type: Optional[Type[BaseException]], + __exc_value: Optional[BaseException], + __traceback: Optional[TracebackType] + ) -> Optional[bool]: + return self.close() + + def close(self) -> None: if self.ctx is not None and self._llama_free is not None: self._llama_free(self.ctx) self.ctx = None @@ -495,7 +517,7 @@ def default_params(): return llama_cpp.llama_context_default_params() -class _LlamaBatch: +class _LlamaBatch(contextlib.AbstractContextManager): _llama_batch_free = None def __init__( @@ -513,7 +535,18 @@ def __init__( self._n_tokens, self.embd, self.n_seq_max ) - def __del__(self): + def __del__(self) -> None: + self.close() + + def __exit__( + self, + __exc_type: Optional[Type[BaseException]], + __exc_value: Optional[BaseException], + __traceback: Optional[TracebackType] + ) -> Optional[bool]: + return self.close() + + def close(self) -> None: if self.batch is not None and self._llama_batch_free is not None: self._llama_batch_free(self.batch) self.batch = None From fa702b47e770bd0ed9e77ef19bc675d4a9075bc1 Mon Sep 17 00:00:00 2001 From: Junpei Kawamoto Date: Thu, 6 Jun 2024 22:48:17 -0600 Subject: [PATCH 3/5] feat: add ExitStack for Llama's internal class closure This update implements ExitStack to manage and close internal classes in Llama, enhancing efficient and safe resource management. 
--- llama_cpp/llama.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 796279829..2455d1174 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -349,9 +349,11 @@ def __init__( if not os.path.exists(model_path): raise ValueError(f"Model path does not exist: {model_path}") - self._model = _LlamaModel( + self._stack = contextlib.ExitStack() + + self._model = self._stack.enter_context(_LlamaModel( path_model=self.model_path, params=self.model_params, verbose=self.verbose - ) + )) # Override tokenizer self.tokenizer_ = tokenizer or LlamaTokenizer(self) @@ -363,18 +365,18 @@ def __init__( self.context_params.n_ctx = self._model.n_ctx_train() self.context_params.n_batch = self.n_batch - self._ctx = _LlamaContext( + self._ctx = self._stack.enter_context(_LlamaContext( model=self._model, params=self.context_params, verbose=self.verbose, - ) + )) - self._batch = _LlamaBatch( + self._batch = self._stack.enter_context(_LlamaBatch( n_tokens=self.n_batch, embd=0, n_seq_max=self.context_params.n_ctx, verbose=self.verbose, - ) + )) if self.lora_path: if self._model.apply_lora_from_file( @@ -1945,15 +1947,15 @@ def pooling_type(self) -> str: def close(self) -> None: """Explicitly free the model from memory.""" - self._model.close() + self._stack.close() def __exit__( self, - __exc_type: Optional[Type[BaseException]], - __exc_value: Optional[BaseException], - __traceback: Optional[TracebackType] + exc_type: Optional[Type[BaseException]], + exc_value: Optional[BaseException], + traceback: Optional[TracebackType] ) -> Optional[bool]: - return self.close() + return self._stack.__exit__(exc_type, exc_value, traceback) @staticmethod def logits_to_logprobs( From bb9d1027124b14c3d1072bb235f212fcea04ce11 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 13 Jun 2024 04:09:54 -0400 Subject: [PATCH 4/5] Use contextlib ExitStack and closing --- llama_cpp/_internals.py | 98 +++++++++++++++-------------------------- llama_cpp/llama.py | 24 ++++------ 2 files changed, 43 insertions(+), 79 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 419727432..ee990d474 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -1,17 +1,15 @@ from __future__ import annotations -import contextlib import os import ctypes -from types import TracebackType from typing import ( List, Optional, Sequence, - Type, ) from dataclasses import dataclass, field +from contextlib import ExitStack import numpy as np import numpy.typing as npt @@ -26,13 +24,10 @@ # Python wrappers over llama.h structs -class _LlamaModel(contextlib.AbstractContextManager): +class _LlamaModel: """Intermediate Python wrapper for a llama.cpp llama_model. 
NOTE: For stability it's recommended you use the Llama class instead.""" - _llama_free_model = None - # NOTE: this must be "saved" here to avoid exceptions when calling __del__ - def __init__( self, *, @@ -43,8 +38,7 @@ def __init__( self.path_model = path_model self.params = params self.verbose = verbose - - self._llama_free_model = llama_cpp._lib.llama_free_model # type: ignore + self._exit_stack = ExitStack() self.model = None @@ -59,22 +53,17 @@ def __init__( if self.model is None: raise ValueError(f"Failed to load model from file: {path_model}") - def __del__(self) -> None: - self.close() - - def __exit__( - self, - __exc_type: Optional[Type[BaseException]], - __exc_value: Optional[BaseException], - __traceback: Optional[TracebackType] - ) -> Optional[bool]: - return self.close() - - def close(self) -> None: - if self.model is not None and self._llama_free_model is not None: - self._llama_free_model(self.model) + def free_model(): + if self.model is None: + return + llama_cpp.llama_free_model(self.model) self.model = None + self._exit_stack.callback(free_model) + + def close(self): + self._exit_stack.close() + def vocab_type(self) -> int: assert self.model is not None return llama_cpp.llama_vocab_type(self.model) @@ -267,12 +256,10 @@ def default_params(): return llama_cpp.llama_model_default_params() -class _LlamaContext(contextlib.AbstractContextManager): +class _LlamaContext: """Intermediate Python wrapper for a llama.cpp llama_context. NOTE: For stability it's recommended you use the Llama class instead.""" - _llama_free = None - def __init__( self, *, @@ -283,35 +270,28 @@ def __init__( self.model = model self.params = params self.verbose = verbose + self._exit_stack = ExitStack() - self._llama_free = llama_cpp._lib.llama_free # type: ignore self.ctx = None assert self.model.model is not None - self.ctx = llama_cpp.llama_new_context_with_model( - self.model.model, self.params - ) + self.ctx = llama_cpp.llama_new_context_with_model(self.model.model, self.params) if self.ctx is None: raise ValueError("Failed to create llama_context") - def __del__(self) -> None: - self.close() - - def __exit__( - self, - __exc_type: Optional[Type[BaseException]], - __exc_value: Optional[BaseException], - __traceback: Optional[TracebackType] - ) -> Optional[bool]: - return self.close() - - def close(self) -> None: - if self.ctx is not None and self._llama_free is not None: - self._llama_free(self.ctx) + def free_ctx(): + if self.ctx is None: + return + llama_cpp.llama_free(self.ctx) self.ctx = None + self._exit_stack.callback(free_ctx) + + def close(self): + self._exit_stack.close() + def n_ctx(self) -> int: assert self.ctx is not None return llama_cpp.llama_n_ctx(self.ctx) @@ -525,9 +505,7 @@ def default_params(): return llama_cpp.llama_context_default_params() -class _LlamaBatch(contextlib.AbstractContextManager): - _llama_batch_free = None - +class _LlamaBatch: def __init__( self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True ): @@ -535,30 +513,24 @@ def __init__( self.embd = embd self.n_seq_max = n_seq_max self.verbose = verbose - - self._llama_batch_free = llama_cpp._lib.llama_batch_free # type: ignore + self._exit_stack = ExitStack() self.batch = None self.batch = llama_cpp.llama_batch_init( self._n_tokens, self.embd, self.n_seq_max ) - def __del__(self) -> None: - self.close() - - def __exit__( - self, - __exc_type: Optional[Type[BaseException]], - __exc_value: Optional[BaseException], - __traceback: Optional[TracebackType] - ) -> Optional[bool]: - return self.close() - - 
def close(self) -> None: - if self.batch is not None and self._llama_batch_free is not None: - self._llama_batch_free(self.batch) + def free_batch(): + if self.batch is None: + return + llama_cpp.llama_batch_free(self.batch) self.batch = None + self._exit_stack.callback(free_batch) + + def close(self): + self._exit_stack.close() + def n_tokens(self) -> int: assert self.batch is not None return self.batch.n_tokens diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 8b7776959..459b29f92 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1,6 +1,5 @@ from __future__ import annotations -import contextlib import os import sys import uuid @@ -10,6 +9,7 @@ import typing import fnmatch import warnings +import contextlib import multiprocessing from types import TracebackType @@ -61,7 +61,7 @@ from ._utils import suppress_stdout_stderr -class Llama(contextlib.AbstractContextManager): +class Llama: """High-level Python wrapper for a llama.cpp model.""" __backend_initialized = False @@ -355,9 +355,9 @@ def __init__( self._stack = contextlib.ExitStack() - self._model = self._stack.enter_context(_LlamaModel( + self._model = self._stack.enter_context(contextlib.closing(_LlamaModel( path_model=self.model_path, params=self.model_params, verbose=self.verbose - )) + ))) # Override tokenizer self.tokenizer_ = tokenizer or LlamaTokenizer(self) @@ -369,18 +369,18 @@ def __init__( self.context_params.n_ctx = self._model.n_ctx_train() self.context_params.n_batch = self.n_batch - self._ctx = self._stack.enter_context(_LlamaContext( + self._ctx = self._stack.enter_context(contextlib.closing(_LlamaContext( model=self._model, params=self.context_params, verbose=self.verbose, - )) + ))) - self._batch = self._stack.enter_context(_LlamaBatch( + self._batch = self._stack.enter_context(contextlib.closing(_LlamaBatch( n_tokens=self.n_batch, embd=0, n_seq_max=self.context_params.n_ctx, verbose=self.verbose, - )) + ))) if self.lora_path: if self._model.apply_lora_from_file( @@ -1968,14 +1968,6 @@ def close(self) -> None: """Explicitly free the model from memory.""" self._stack.close() - def __exit__( - self, - exc_type: Optional[Type[BaseException]], - exc_value: Optional[BaseException], - traceback: Optional[TracebackType] - ) -> Optional[bool]: - return self._stack.__exit__(exc_type, exc_value, traceback) - @staticmethod def logits_to_logprobs( logits: Union[npt.NDArray[np.single], List], axis: int = -1 From 0a4d4a4e02330eccf112cb80794b89d55ead7599 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 13 Jun 2024 04:10:22 -0400 Subject: [PATCH 5/5] Explicitly free model when closing resources on server --- llama_cpp/server/model.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index d4d4acbe3..ad39c1004 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -44,6 +44,8 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: if self._current_model is not None: return self._current_model + if self._current_model: + self._current_model.close() self._current_model = None settings = self._model_settings_dict[model] @@ -65,6 +67,7 @@ def __iter__(self): def free(self): if self._current_model: + self._current_model.close() del self._current_model @staticmethod
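
A minimal usage sketch of the API that results from the series (model paths and prompts below are placeholders, not taken from the patches): `Llama.close()` unwinds the internal `contextlib.ExitStack` introduced in patches 3 and 4, freeing the batch, context, and model immediately, and `contextlib.closing` provides the scoped `with` form since patch 4 drops the explicit `__exit__` again.

    import contextlib

    from llama_cpp import Llama

    # Explicit release: close() unwinds the internal ExitStack, freeing the
    # batch, context, and model right away instead of waiting for __del__,
    # so a second model can be loaded without exhausting RAM/VRAM.
    llm = Llama(model_path="models/first.gguf")   # placeholder path
    print(llm("Q: Name a prime number. A:", max_tokens=8)["choices"][0]["text"])
    llm.close()

    llm = Llama(model_path="models/second.gguf")  # placeholder path

    # Scoped alternative: contextlib.closing calls close() when the block
    # exits, which only requires that Llama expose a close() method.
    with contextlib.closing(llm):
        llm("Q: And another? A:", max_tokens=8)

This is the same pattern patch 5 applies on the server side: the model manager calls `close()` on the current model before replacing or deleting it, so the old model's memory is released before the next one is created.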