@@ -15,7 +15,7 @@
     c_size_t,
 )
 import pathlib
-from typing import List
+from typing import List, Union
 
 
 # Load the library
@@ -105,6 +105,9 @@ def _load_shared_library(lib_base_name: str):
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_VERSION = c_int(1)
 
+# struct llama_model;
+llama_model_p = c_void_p
+
 # struct llama_context;
 llama_context_p = c_void_p
 
@@ -161,6 +164,7 @@ class llama_token_data_array(Structure):
 # // context pointer passed to the progress callback
 # void * progress_callback_user_data;
 
+
 # // Keep the booleans together to avoid misalignment during copy-by-value.
 # bool low_vram; // if true, reduce VRAM usage at the cost of performance
 # bool f16_kv; // use fp16 for KV cache
@@ -296,6 +300,41 @@ def llama_init_backend():
 _lib.llama_init_backend.restype = None
 
 
+# LLAMA_API struct llama_model * llama_load_model_from_file(
+#     const char * path_model,
+#     struct llama_context_params params);
+def llama_load_model_from_file(
+    path_model: bytes, params: llama_context_params
+) -> llama_model_p:
+    return _lib.llama_load_model_from_file(path_model, params)
+
+
+_lib.llama_load_model_from_file.argtypes = [c_char_p, llama_context_params]
+_lib.llama_load_model_from_file.restype = llama_model_p
+
+
+# LLAMA_API void llama_free_model(struct llama_model * model);
+def llama_free_model(model: llama_model_p):
+    return _lib.llama_free_model(model)
+
+
+_lib.llama_free_model.argtypes = [llama_model_p]
+_lib.llama_free_model.restype = None
+
+
+# LLAMA_API struct llama_context * llama_new_context_with_model(
+#     struct llama_model * model,
+#     struct llama_context_params params);
+def llama_new_context_with_model(
+    model: llama_model_p, params: llama_context_params
+) -> llama_context_p:
+    return _lib.llama_new_context_with_model(model, params)
+
+
+_lib.llama_new_context_with_model.argtypes = [llama_model_p, llama_context_params]
+_lib.llama_new_context_with_model.restype = llama_context_p
+
+
 # LLAMA_API int64_t llama_time_us();
 def llama_time_us() -> int:
     return _lib.llama_time_us()
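Taken together, the three bindings added in the hunk above expose llama.cpp's split between loading model weights and creating a context on top of them, so several contexts can share one loaded model. A rough usage sketch, not part of the diff: it assumes llama_init_backend(), llama_context_default_params(), and llama_free() are already bound elsewhere in this module, and the model path is a placeholder.

import llama_cpp

llama_cpp.llama_init_backend()
params = llama_cpp.llama_context_default_params()  # assumed existing binding
model = llama_cpp.llama_load_model_from_file(b"./models/7B/ggml-model.bin", params)  # placeholder path
if not model:
    raise RuntimeError("failed to load model")
ctx = llama_cpp.llama_new_context_with_model(model, params)
# ... evaluate and sample with ctx ...
llama_cpp.llama_free(ctx)          # assumed existing binding
llama_cpp.llama_free_model(model)  # new binding from this diff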
@@ -376,6 +415,31 @@ def llama_apply_lora_from_file(
 _lib.llama_apply_lora_from_file.restype = c_int
 
 
+# LLAMA_API int llama_model_apply_lora_from_file(
+#     const struct llama_model * model,
+#     const char * path_lora,
+#     const char * path_base_model,
+#     int n_threads);
+def llama_model_apply_lora_from_file(
+    model: llama_model_p,
+    path_lora: Union[c_char_p, bytes],
+    path_base_model: Union[c_char_p, bytes],
+    n_threads: c_int,
+) -> int:
+    return _lib.llama_model_apply_lora_from_file(
+        model, path_lora, path_base_model, n_threads
+    )
+
+
+_lib.llama_model_apply_lora_from_file.argtypes = [
+    llama_model_p,
+    c_char_p,
+    c_char_p,
+    c_int,
+]
+_lib.llama_model_apply_lora_from_file.restype = c_int
+
+
 # Returns the number of tokens in the KV cache
 # LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int:
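The new llama_model_apply_lora_from_file mirrors the existing llama_apply_lora_from_file but targets a loaded llama_model rather than a context. A hedged sketch of a call, continuing the example above; the paths and thread count are placeholders, and passing None for path_base_model maps to a NULL pointer in ctypes:

lora_rc = llama_cpp.llama_model_apply_lora_from_file(
    model,                             # model loaded in the sketch above
    b"./lora/ggml-adapter-model.bin",  # path_lora (placeholder)
    None,                              # path_base_model: None -> NULL
    4,                                 # n_threads (placeholder)
)
if lora_rc != 0:
    raise RuntimeError("failed to apply LoRA adapter")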