From e38f5e21a00ef162ee0865a530f40b23c7c63cf1 Mon Sep 17 00:00:00 2001 From: Junpei Kawamoto Date: Wed, 5 Jun 2024 23:06:57 -0600 Subject: [PATCH 1/5] feat: add explicit methods to free model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit introduces a `close` method to both `Llama` and `_LlamaModel`, allowing users to explicitly free the model from RAM/VRAM. The previous implementation relied on the destructor of `_LlamaModel` to free the model. However, in Python, the timing of destructor calls is unclear—for instance, the `del` statement does not guarantee immediate invocation of the destructor. This commit provides an explicit method to release the model, which works immediately and allows the user to load another model without memory issues. Additionally, this commit implements a context manager in the `Llama` class, enabling the automatic closure of the `Llama` object when used with the `with` statement. --- llama_cpp/_internals.py | 5 ++++- llama_cpp/llama.py | 17 ++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index ba0429139..b3c4284fb 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -56,7 +56,10 @@ def __init__( if self.model is None: raise ValueError(f"Failed to load model from file: {path_model}") - def __del__(self): + def __del__(self) -> None: + self.close() + + def close(self) -> None: if self.model is not None and self._llama_free_model is not None: self._llama_free_model(self.model) self.model = None diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index bf3bd656d..796279829 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1,5 +1,6 @@ from __future__ import annotations +import contextlib import os import sys import uuid @@ -10,6 +11,7 @@ import fnmatch import warnings import multiprocessing +from types import TracebackType from typing import ( List, @@ -21,6 +23,7 @@ Deque, Callable, Dict, + Type, ) from collections import deque from pathlib import Path @@ -58,7 +61,7 @@ from ._utils import suppress_stdout_stderr -class Llama: +class Llama(contextlib.AbstractContextManager): """High-level Python wrapper for a llama.cpp model.""" __backend_initialized = False @@ -1940,6 +1943,18 @@ def pooling_type(self) -> str: """Return the pooling type.""" return self._ctx.pooling_type() + def close(self) -> None: + """Explicitly free the model from memory.""" + self._model.close() + + def __exit__( + self, + __exc_type: Optional[Type[BaseException]], + __exc_value: Optional[BaseException], + __traceback: Optional[TracebackType] + ) -> Optional[bool]: + return self.close() + @staticmethod def logits_to_logprobs( logits: Union[npt.NDArray[np.single], List], axis: int = -1 From eb823542390a0c85ad2199ca5c151538ccd57769 Mon Sep 17 00:00:00 2001 From: Junpei Kawamoto Date: Thu, 6 Jun 2024 21:25:36 -0600 Subject: [PATCH 2/5] feat: Implement ContextManager in _LlamaModel, _LlamaContext, and _LlamaBatch This commit enables automatic resource management by implementing the `ContextManager` protocol in `_LlamaModel`, `_LlamaContext`, and `_LlamaBatch`. This ensures that resources are properly managed and released within a `with` statement, enhancing robustness and safety in resource handling. 
--- llama_cpp/_internals.py | 43 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index b3c4284fb..c4b8dac42 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -1,12 +1,15 @@ from __future__ import annotations +import contextlib import os import ctypes +from types import TracebackType from typing import ( List, Optional, Sequence, + Type, ) from dataclasses import dataclass, field @@ -23,7 +26,7 @@ # Python wrappers over llama.h structs -class _LlamaModel: +class _LlamaModel(contextlib.AbstractContextManager): """Intermediate Python wrapper for a llama.cpp llama_model. NOTE: For stability it's recommended you use the Llama class instead.""" @@ -59,6 +62,14 @@ def __init__( def __del__(self) -> None: self.close() + def __exit__( + self, + __exc_type: Optional[Type[BaseException]], + __exc_value: Optional[BaseException], + __traceback: Optional[TracebackType] + ) -> Optional[bool]: + return self.close() + def close(self) -> None: if self.model is not None and self._llama_free_model is not None: self._llama_free_model(self.model) @@ -248,7 +259,7 @@ def default_params(): return llama_cpp.llama_model_default_params() -class _LlamaContext: +class _LlamaContext(contextlib.AbstractContextManager): """Intermediate Python wrapper for a llama.cpp llama_context. NOTE: For stability it's recommended you use the Llama class instead.""" @@ -277,7 +288,18 @@ def __init__( if self.ctx is None: raise ValueError("Failed to create llama_context") - def __del__(self): + def __del__(self) -> None: + self.close() + + def __exit__( + self, + __exc_type: Optional[Type[BaseException]], + __exc_value: Optional[BaseException], + __traceback: Optional[TracebackType] + ) -> Optional[bool]: + return self.close() + + def close(self) -> None: if self.ctx is not None and self._llama_free is not None: self._llama_free(self.ctx) self.ctx = None @@ -495,7 +517,7 @@ def default_params(): return llama_cpp.llama_context_default_params() -class _LlamaBatch: +class _LlamaBatch(contextlib.AbstractContextManager): _llama_batch_free = None def __init__( @@ -513,7 +535,18 @@ def __init__( self._n_tokens, self.embd, self.n_seq_max ) - def __del__(self): + def __del__(self) -> None: + self.close() + + def __exit__( + self, + __exc_type: Optional[Type[BaseException]], + __exc_value: Optional[BaseException], + __traceback: Optional[TracebackType] + ) -> Optional[bool]: + return self.close() + + def close(self) -> None: if self.batch is not None and self._llama_batch_free is not None: self._llama_batch_free(self.batch) self.batch = None From fa702b47e770bd0ed9e77ef19bc675d4a9075bc1 Mon Sep 17 00:00:00 2001 From: Junpei Kawamoto Date: Thu, 6 Jun 2024 22:48:17 -0600 Subject: [PATCH 3/5] feat: add ExitStack for Llama's internal class closure This update implements ExitStack to manage and close internal classes in Llama, enhancing efficient and safe resource management. 
--- llama_cpp/llama.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 796279829..2455d1174 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -349,9 +349,11 @@ def __init__( if not os.path.exists(model_path): raise ValueError(f"Model path does not exist: {model_path}") - self._model = _LlamaModel( + self._stack = contextlib.ExitStack() + + self._model = self._stack.enter_context(_LlamaModel( path_model=self.model_path, params=self.model_params, verbose=self.verbose - ) + )) # Override tokenizer self.tokenizer_ = tokenizer or LlamaTokenizer(self) @@ -363,18 +365,18 @@ def __init__( self.context_params.n_ctx = self._model.n_ctx_train() self.context_params.n_batch = self.n_batch - self._ctx = _LlamaContext( + self._ctx = self._stack.enter_context(_LlamaContext( model=self._model, params=self.context_params, verbose=self.verbose, - ) + )) - self._batch = _LlamaBatch( + self._batch = self._stack.enter_context(_LlamaBatch( n_tokens=self.n_batch, embd=0, n_seq_max=self.context_params.n_ctx, verbose=self.verbose, - ) + )) if self.lora_path: if self._model.apply_lora_from_file( @@ -1945,15 +1947,15 @@ def pooling_type(self) -> str: def close(self) -> None: """Explicitly free the model from memory.""" - self._model.close() + self._stack.close() def __exit__( self, - __exc_type: Optional[Type[BaseException]], - __exc_value: Optional[BaseException], - __traceback: Optional[TracebackType] + exc_type: Optional[Type[BaseException]], + exc_value: Optional[BaseException], + traceback: Optional[TracebackType] ) -> Optional[bool]: - return self.close() + return self._stack.__exit__(exc_type, exc_value, traceback) @staticmethod def logits_to_logprobs( From bb9d1027124b14c3d1072bb235f212fcea04ce11 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 13 Jun 2024 04:09:54 -0400 Subject: [PATCH 4/5] Use contextlib ExitStack and closing --- llama_cpp/_internals.py | 98 +++++++++++++++-------------------------- llama_cpp/llama.py | 24 ++++------ 2 files changed, 43 insertions(+), 79 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 419727432..ee990d474 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -1,17 +1,15 @@ from __future__ import annotations -import contextlib import os import ctypes -from types import TracebackType from typing import ( List, Optional, Sequence, - Type, ) from dataclasses import dataclass, field +from contextlib import ExitStack import numpy as np import numpy.typing as npt @@ -26,13 +24,10 @@ # Python wrappers over llama.h structs -class _LlamaModel(contextlib.AbstractContextManager): +class _LlamaModel: """Intermediate Python wrapper for a llama.cpp llama_model. 
NOTE: For stability it's recommended you use the Llama class instead.""" - _llama_free_model = None - # NOTE: this must be "saved" here to avoid exceptions when calling __del__ - def __init__( self, *, @@ -43,8 +38,7 @@ def __init__( self.path_model = path_model self.params = params self.verbose = verbose - - self._llama_free_model = llama_cpp._lib.llama_free_model # type: ignore + self._exit_stack = ExitStack() self.model = None @@ -59,22 +53,17 @@ def __init__( if self.model is None: raise ValueError(f"Failed to load model from file: {path_model}") - def __del__(self) -> None: - self.close() - - def __exit__( - self, - __exc_type: Optional[Type[BaseException]], - __exc_value: Optional[BaseException], - __traceback: Optional[TracebackType] - ) -> Optional[bool]: - return self.close() - - def close(self) -> None: - if self.model is not None and self._llama_free_model is not None: - self._llama_free_model(self.model) + def free_model(): + if self.model is None: + return + llama_cpp.llama_free_model(self.model) self.model = None + self._exit_stack.callback(free_model) + + def close(self): + self._exit_stack.close() + def vocab_type(self) -> int: assert self.model is not None return llama_cpp.llama_vocab_type(self.model) @@ -267,12 +256,10 @@ def default_params(): return llama_cpp.llama_model_default_params() -class _LlamaContext(contextlib.AbstractContextManager): +class _LlamaContext: """Intermediate Python wrapper for a llama.cpp llama_context. NOTE: For stability it's recommended you use the Llama class instead.""" - _llama_free = None - def __init__( self, *, @@ -283,35 +270,28 @@ def __init__( self.model = model self.params = params self.verbose = verbose + self._exit_stack = ExitStack() - self._llama_free = llama_cpp._lib.llama_free # type: ignore self.ctx = None assert self.model.model is not None - self.ctx = llama_cpp.llama_new_context_with_model( - self.model.model, self.params - ) + self.ctx = llama_cpp.llama_new_context_with_model(self.model.model, self.params) if self.ctx is None: raise ValueError("Failed to create llama_context") - def __del__(self) -> None: - self.close() - - def __exit__( - self, - __exc_type: Optional[Type[BaseException]], - __exc_value: Optional[BaseException], - __traceback: Optional[TracebackType] - ) -> Optional[bool]: - return self.close() - - def close(self) -> None: - if self.ctx is not None and self._llama_free is not None: - self._llama_free(self.ctx) + def free_ctx(): + if self.ctx is None: + return + llama_cpp.llama_free(self.ctx) self.ctx = None + self._exit_stack.callback(free_ctx) + + def close(self): + self._exit_stack.close() + def n_ctx(self) -> int: assert self.ctx is not None return llama_cpp.llama_n_ctx(self.ctx) @@ -525,9 +505,7 @@ def default_params(): return llama_cpp.llama_context_default_params() -class _LlamaBatch(contextlib.AbstractContextManager): - _llama_batch_free = None - +class _LlamaBatch: def __init__( self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True ): @@ -535,30 +513,24 @@ def __init__( self.embd = embd self.n_seq_max = n_seq_max self.verbose = verbose - - self._llama_batch_free = llama_cpp._lib.llama_batch_free # type: ignore + self._exit_stack = ExitStack() self.batch = None self.batch = llama_cpp.llama_batch_init( self._n_tokens, self.embd, self.n_seq_max ) - def __del__(self) -> None: - self.close() - - def __exit__( - self, - __exc_type: Optional[Type[BaseException]], - __exc_value: Optional[BaseException], - __traceback: Optional[TracebackType] - ) -> Optional[bool]: - return self.close() - - 
def close(self) -> None: - if self.batch is not None and self._llama_batch_free is not None: - self._llama_batch_free(self.batch) + def free_batch(): + if self.batch is None: + return + llama_cpp.llama_batch_free(self.batch) self.batch = None + self._exit_stack.callback(free_batch) + + def close(self): + self._exit_stack.close() + def n_tokens(self) -> int: assert self.batch is not None return self.batch.n_tokens diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 8b7776959..459b29f92 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1,6 +1,5 @@ from __future__ import annotations -import contextlib import os import sys import uuid @@ -10,6 +9,7 @@ import typing import fnmatch import warnings +import contextlib import multiprocessing from types import TracebackType @@ -61,7 +61,7 @@ from ._utils import suppress_stdout_stderr -class Llama(contextlib.AbstractContextManager): +class Llama: """High-level Python wrapper for a llama.cpp model.""" __backend_initialized = False @@ -355,9 +355,9 @@ def __init__( self._stack = contextlib.ExitStack() - self._model = self._stack.enter_context(_LlamaModel( + self._model = self._stack.enter_context(contextlib.closing(_LlamaModel( path_model=self.model_path, params=self.model_params, verbose=self.verbose - )) + ))) # Override tokenizer self.tokenizer_ = tokenizer or LlamaTokenizer(self) @@ -369,18 +369,18 @@ def __init__( self.context_params.n_ctx = self._model.n_ctx_train() self.context_params.n_batch = self.n_batch - self._ctx = self._stack.enter_context(_LlamaContext( + self._ctx = self._stack.enter_context(contextlib.closing(_LlamaContext( model=self._model, params=self.context_params, verbose=self.verbose, - )) + ))) - self._batch = self._stack.enter_context(_LlamaBatch( + self._batch = self._stack.enter_context(contextlib.closing(_LlamaBatch( n_tokens=self.n_batch, embd=0, n_seq_max=self.context_params.n_ctx, verbose=self.verbose, - )) + ))) if self.lora_path: if self._model.apply_lora_from_file( @@ -1968,14 +1968,6 @@ def close(self) -> None: """Explicitly free the model from memory.""" self._stack.close() - def __exit__( - self, - exc_type: Optional[Type[BaseException]], - exc_value: Optional[BaseException], - traceback: Optional[TracebackType] - ) -> Optional[bool]: - return self._stack.__exit__(exc_type, exc_value, traceback) - @staticmethod def logits_to_logprobs( logits: Union[npt.NDArray[np.single], List], axis: int = -1 From 0a4d4a4e02330eccf112cb80794b89d55ead7599 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 13 Jun 2024 04:10:22 -0400 Subject: [PATCH 5/5] Explicitly free model when closing resources on server --- llama_cpp/server/model.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index d4d4acbe3..ad39c1004 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -44,6 +44,8 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: if self._current_model is not None: return self._current_model + if self._current_model: + self._current_model.close() self._current_model = None settings = self._model_settings_dict[model] @@ -65,6 +67,7 @@ def __iter__(self): def free(self): if self._current_model: + self._current_model.close() del self._current_model @staticmethod
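
A minimal usage sketch of the API that results from the series (model paths and prompts below are placeholders, not taken from the patches): `Llama.close()` unwinds the internal `contextlib.ExitStack` introduced in patches 3 and 4, freeing the batch, context, and model immediately, and `contextlib.closing` provides the scoped `with` form since patch 4 drops the explicit `__exit__` again.

    import contextlib

    from llama_cpp import Llama

    # Explicit release: close() unwinds the internal ExitStack, freeing the
    # batch, context, and model right away instead of waiting for __del__,
    # so a second model can be loaded without exhausting RAM/VRAM.
    llm = Llama(model_path="models/first.gguf")   # placeholder path
    print(llm("Q: Name a prime number. A:", max_tokens=8)["choices"][0]["text"])
    llm.close()

    llm = Llama(model_path="models/second.gguf")  # placeholder path

    # Scoped alternative: contextlib.closing calls close() when the block
    # exits, which only requires that Llama expose a close() method.
    with contextlib.closing(llm):
        llm("Q: And another? A:", max_tokens=8)

This is the same pattern patch 5 applies on the server side: the model manager calls `close()` on the current model before replacing or deleting it, so the old model's memory is released before the next one is created.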