
Commit cf2ea15

feat: add explicit methods to free model
This commit introduces a `close` method on both `Llama` and `_LlamaModel`, allowing users to explicitly free the model from RAM/VRAM. The previous implementation relied on the destructor of `_LlamaModel` to free the model; in Python, however, the timing of destructor calls is not guaranteed. For instance, the `del` statement does not necessarily invoke the destructor immediately. This commit provides an explicit method that releases the model right away, so the user can load another model without memory issues. Additionally, this commit implements the context manager protocol in the `Llama` class, so a `Llama` object is closed automatically when used in a `with` statement.
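
Both usage patterns look like this (a minimal sketch; the model path and prompt are illustrative, not part of the commit):

    from llama_cpp import Llama

    # Explicit release: free the model from RAM/VRAM as soon as it is no
    # longer needed, then load another model without memory issues.
    llm = Llama(model_path="./model.gguf")
    llm.close()

    # Context manager: the model is closed automatically when the block
    # exits, even if an exception is raised inside it.
    with Llama(model_path="./model.gguf") as llm:
        output = llm("Q: Name the planets in the solar system. A: ", max_tokens=32)
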
1 parent d634efc commit cf2ea15

2 files changed, 20 insertions(+), 2 deletions(-)


llama_cpp/_internals.py

+4 -1

@@ -56,7 +56,10 @@ def __init__(
         if self.model is None:
             raise ValueError(f"Failed to load model from file: {path_model}")
 
-    def __del__(self):
+    def __del__(self) -> None:
+        self.close()
+
+    def close(self) -> None:
         if self.model is not None and self._llama_free_model is not None:
             self._llama_free_model(self.model)
             self.model = None
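
Note that `__del__` now simply delegates to `close()`, and `close()` sets `self.model = None` after freeing, so the method is idempotent: an explicit `close()` followed by the eventual destructor call frees the model exactly once. The same pattern in isolation (a minimal sketch with a stand-in `free` callback, not the library's actual code):

    class Handle:
        def __init__(self, free):
            self.resource = object()  # stands in for the llama.cpp model pointer
            self._free = free         # stands in for llama_free_model

        def close(self) -> None:
            # The None checks make repeated calls harmless.
            if self.resource is not None and self._free is not None:
                self._free(self.resource)
                self.resource = None

        def __del__(self) -> None:
            self.close()  # the destructor reuses the explicit path

    h = Handle(free=lambda r: print("freed"))
    h.close()  # prints "freed"
    h.close()  # no-op: the resource is already gone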

llama_cpp/llama.py

+16 -1

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import contextlib
 import os
 import sys
 import uuid
@@ -10,6 +11,7 @@
 import fnmatch
 import warnings
 import multiprocessing
+from types import TracebackType
 
 from typing import (
     List,
@@ -21,6 +23,7 @@
     Deque,
     Callable,
     Dict,
+    Type,
 )
 from collections import deque
 from pathlib import Path
@@ -58,7 +61,7 @@
 from ._utils import suppress_stdout_stderr
 
 
-class Llama:
+class Llama(contextlib.AbstractContextManager):
     """High-level Python wrapper for a llama.cpp model."""
 
     __backend_initialized = False
@@ -1940,6 +1943,18 @@ def pooling_type(self) -> str:
         """Return the pooling type."""
         return self._ctx.pooling_type()
 
+    def close(self) -> None:
+        """Explicitly free the model from memory."""
+        self._model.close()
+
+    def __exit__(
+        self,
+        __exc_type: Optional[Type[BaseException]],
+        __exc_value: Optional[BaseException],
+        __traceback: Optional[TracebackType]
+    ) -> Optional[bool]:
+        return self.close()
+
     @staticmethod
     def logits_to_logprobs(
         logits: Union[npt.NDArray[np.single], List], axis: int = -1
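
Subclassing `contextlib.AbstractContextManager` gives `Llama` a default `__enter__` that returns `self`, so only `__exit__` has to be defined. Since `close()` returns `None`, `__exit__` also returns `None`, which tells Python not to suppress exceptions raised inside the `with` block. A minimal sketch of the same protocol (an illustrative class, not library code):

    import contextlib

    class Resource(contextlib.AbstractContextManager):
        def close(self) -> None:
            print("closed")

        def __exit__(self, exc_type, exc_value, traceback):
            # Returning None (the result of close) lets exceptions propagate.
            return self.close()

    with Resource() as r:  # the inherited __enter__ returns self
        print("working")
    # prints "working", then "closed"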
