
Commit cf2ea15

feat: add explicit methods to free model
This commit introduces a `close` method on both `Llama` and `_LlamaModel`, allowing users to explicitly free the model from RAM/VRAM. The previous implementation relied on the destructor of `_LlamaModel` to free the model; in Python, however, the timing of destructor calls is not guaranteed. For instance, the `del` statement does not necessarily invoke the destructor immediately. This commit provides an explicit method that releases the model right away, so the user can load another model without memory issues. Additionally, this commit implements the context manager protocol in the `Llama` class, so a `Llama` object is closed automatically when used in a `with` statement.
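
Both usage patterns look like this (a minimal sketch; the model path and prompt are illustrative, not part of the commit):

    from llama_cpp import Llama

    # Explicit release: free the model from RAM/VRAM as soon as it is no
    # longer needed, then load another model without memory issues.
    llm = Llama(model_path="./model.gguf")
    llm.close()

    # Context manager: the model is closed automatically when the block
    # exits, even if an exception is raised inside it.
    with Llama(model_path="./model.gguf") as llm:
        output = llm("Q: Name the planets in the solar system. A: ", max_tokens=32)
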
1 parent d634efc commit cf2ea15

2 files changed, 20 insertions(+), 2 deletions(-)


llama_cpp/_internals.py

+4 -1

@@ -56,7 +56,10 @@ def __init__(
         if self.model is None:
             raise ValueError(f"Failed to load model from file: {path_model}")
 
-    def __del__(self):
+    def __del__(self) -> None:
+        self.close()
+
+    def close(self) -> None:
         if self.model is not None and self._llama_free_model is not None:
             self._llama_free_model(self.model)
             self.model = None
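
Note that `__del__` now simply delegates to `close()`, and `close()` sets `self.model = None` after freeing, so the method is idempotent: an explicit `close()` followed by the eventual destructor call frees the model exactly once. The same pattern in isolation (a minimal sketch with a stand-in `free` callback, not the library's actual code):

    class Handle:
        def __init__(self, free):
            self.resource = object()  # stands in for the llama.cpp model pointer
            self._free = free         # stands in for llama_free_model

        def close(self) -> None:
            # The None checks make repeated calls harmless.
            if self.resource is not None and self._free is not None:
                self._free(self.resource)
                self.resource = None

        def __del__(self) -> None:
            self.close()  # the destructor reuses the explicit path

    h = Handle(free=lambda r: print("freed"))
    h.close()  # prints "freed"
    h.close()  # no-op: the resource is already gone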

llama_cpp/llama.py

+16 -1

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import contextlib
 import os
 import sys
 import uuid
@@ -10,6 +11,7 @@
 import fnmatch
 import warnings
 import multiprocessing
+from types import TracebackType
 
 from typing import (
     List,
@@ -21,6 +23,7 @@
     Deque,
     Callable,
     Dict,
+    Type,
 )
 from collections import deque
 from pathlib import Path
@@ -58,7 +61,7 @@
 from ._utils import suppress_stdout_stderr
 
 
-class Llama:
+class Llama(contextlib.AbstractContextManager):
     """High-level Python wrapper for a llama.cpp model."""
 
     __backend_initialized = False
@@ -1940,6 +1943,18 @@ def pooling_type(self) -> str:
         """Return the pooling type."""
         return self._ctx.pooling_type()
 
+    def close(self) -> None:
+        """Explicitly free the model from memory."""
+        self._model.close()
+
+    def __exit__(
+        self,
+        __exc_type: Optional[Type[BaseException]],
+        __exc_value: Optional[BaseException],
+        __traceback: Optional[TracebackType]
+    ) -> Optional[bool]:
+        return self.close()
+
     @staticmethod
     def logits_to_logprobs(
         logits: Union[npt.NDArray[np.single], List], axis: int = -1
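
Subclassing `contextlib.AbstractContextManager` gives `Llama` a default `__enter__` that returns `self`, so only `__exit__` has to be defined. Since `close()` returns `None`, `__exit__` also returns `None`, which tells Python not to suppress exceptions raised inside the `with` block. A minimal sketch of the same protocol (an illustrative class, not library code):

    import contextlib

    class Resource(contextlib.AbstractContextManager):
        def close(self) -> None:
            print("closed")

        def __exit__(self, exc_type, exc_value, traceback):
            # Returning None (the result of close) lets exceptions propagate.
            return self.close()

    with Resource() as r:  # the inherited __enter__ returns self
        print("working")
    # prints "working", then "closed"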
