
Commit 7403e00

feat: Update llama.cpp
1 parent 7c4aead commit 7403e00

File tree

3 files changed: 139 additions & 33 deletions

CMakeLists.txt

Lines changed: 17 additions & 1 deletion
@@ -55,6 +55,9 @@ if (LLAMA_BUILD)
     set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
     set(CMAKE_SKIP_RPATH FALSE)
 
+    # Enable building of the common library
+    set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build llama.cpp common library" FORCE)
+
     # Building llama
     if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
         # Need to disable these llama.cpp flags on Apple x86_64,
@@ -106,7 +109,7 @@ if (LLAMA_BUILD)
         # Building llava
         add_subdirectory(vendor/llama.cpp/examples/llava)
         set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
-        # Set CUDA_ARCHITECTURES to OFF on windows
+        # Set CUDA_ARCHITECTURES to OFF on Windows
        if (WIN32)
             set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF)
         endif()
@@ -121,5 +124,18 @@ if (LLAMA_BUILD)
                 DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib
             )
         endif()
+
+        # Fix for llava build: Add include directory for llama.h
+        # Move these commands after the add_subdirectory call
+        target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
+        target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include)
+
+        if (BUILD_SHARED_LIBS)
+            target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
+            target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include)
+        endif()
+
+        target_include_directories(llama-llava-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
+        target_include_directories(llama-minicpmv-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
     endif()
 endif()
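For context, a minimal sketch (not part of this commit) of how one might check that the relocated include directories produced a loadable llava shared library after installation. It assumes the wheel places the bundled libraries under llama_cpp/lib, as the install() DESTINATION rule above suggests; the exact filename varies by platform.

    import ctypes
    import pathlib

    import llama_cpp

    # The install(...) rule copies the shared libraries into llama_cpp/lib.
    lib_dir = pathlib.Path(llama_cpp.__file__).parent / "lib"
    candidates = [
        p for p in lib_dir.glob("*llava*") if p.suffix in (".so", ".dylib", ".dll")
    ]

    if not candidates:
        print("no llava shared library found in", lib_dir)
    else:
        # If the include-directory fix took effect, the library was built and loads cleanly.
        llava = ctypes.CDLL(str(candidates[0]))
        print("loaded", candidates[0].name)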

llama_cpp/llama_cpp.py

Lines changed: 121 additions & 31 deletions
@@ -464,6 +464,8 @@ class llama_token_data(ctypes.Structure):
 
 
 # typedef struct llama_token_data_array {
+#     // TODO: consider SoA
+#     // NOTE: this pointer can be modified by the samplers
 #     llama_token_data * data;
 #     size_t size;
 #     int64_t selected; // this is the index in the data array (i.e. not the token id)
@@ -507,8 +509,11 @@ class llama_token_data_array(ctypes.Structure):
 # // - token  : the token ids of the input (used when embd is NULL)
 # // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
 # // - pos    : the positions of the respective token in the sequence
+# //            (if set to NULL, the token position will be tracked automatically by llama_decode)
 # // - seq_id : the sequence to which the respective token belongs
+# //            (if set to NULL, the sequence ID will be assumed to be 0)
 # // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
+# //            (if set to NULL, only the logits for last token will be returned)
 # //
 # typedef struct llama_batch {
 #     int32_t n_tokens;
@@ -519,16 +524,6 @@ class llama_token_data_array(ctypes.Structure):
 #     int32_t      *  n_seq_id;
 #     llama_seq_id ** seq_id;
 #     int8_t       *  logits; // TODO: rename this to "output"
-
-
-#     // NOTE: helpers for smooth API transition - can be deprecated in the future
-#     // for future-proof code, use the above fields instead and ignore everything below
-#     //
-#     // pos[i] = all_pos_0 + i*all_pos_1
-#     //
-#     llama_pos    all_pos_0;  // used if pos == NULL
-#     llama_pos    all_pos_1;  // used if pos == NULL
-#     llama_seq_id all_seq_id; // used if seq_id == NULL
 # } llama_batch;
 class llama_batch(ctypes.Structure):
     """Input data for llama_decode
@@ -563,9 +558,6 @@ class llama_batch(ctypes.Structure):
         ("n_seq_id", ctypes.POINTER(ctypes.c_int32)),
         ("seq_id", ctypes.POINTER(ctypes.POINTER(llama_seq_id))),
         ("logits", ctypes.POINTER(ctypes.c_int8)),
-        ("all_pos_0", llama_pos),
-        ("all_pos_1", llama_pos),
-        ("all_seq_id", llama_seq_id),
     ]
 
 
@@ -1170,6 +1162,12 @@ def llama_supports_gpu_offload() -> bool:
     ...
 
 
+# LLAMA_API bool llama_supports_rpc (void);
+@ctypes_function("llama_supports_rpc", [], ctypes.c_bool)
+def llama_supports_rpc() -> bool:
+    ...
+
+
 # LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
 @ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32)
 def llama_n_ctx(ctx: llama_context_p, /) -> int:
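A short usage sketch for the new capability probe, alongside the existing llama_supports_gpu_offload binding shown in this hunk; it only assumes llama_cpp is installed and its shared library loads.

    import llama_cpp

    # Compile-time capability flags of the loaded libllama build.
    print("GPU offload supported:", llama_cpp.llama_supports_gpu_offload())
    print("RPC backend supported:", llama_cpp.llama_supports_rpc())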
@@ -2255,30 +2253,26 @@ def llama_state_seq_load_file(
 # //
 
 
-# // Return batch for single sequence of tokens starting at pos_0
+# // Return batch for single sequence of tokens
+# // The sequence ID will be fixed to 0
+# // The position of the tokens will be tracked automatically by llama_decode
 # //
 # // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
 # //
 # LLAMA_API struct llama_batch llama_batch_get_one(
 #       llama_token * tokens,
-#           int32_t   n_tokens,
-#         llama_pos   pos_0,
-#      llama_seq_id   seq_id);
+#           int32_t   n_tokens);
 @ctypes_function(
     "llama_batch_get_one",
     [
         llama_token_p,
-        ctypes.c_int,
-        llama_pos,
-        llama_seq_id,
+        ctypes.c_int32,
     ],
     llama_batch,
 )
 def llama_batch_get_one(
     tokens: CtypesArray[llama_token],
     n_tokens: Union[ctypes.c_int, int],
-    pos_0: Union[llama_pos, int],
-    seq_id: llama_seq_id,
     /,
 ) -> llama_batch:
     """Return batch for single sequence of tokens starting at pos_0
@@ -2616,6 +2610,13 @@ def llama_token_eos(model: llama_model_p, /) -> int:
     ...
 
 
+# LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
+@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token)
+def llama_token_eot(model: llama_model_p, /) -> int:
+    """end-of-turn"""
+    ...
+
+
 # LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
 @ctypes_function("llama_token_cls", [llama_model_p_ctypes], llama_token)
 def llama_token_cls(model: llama_model_p, /) -> int:
@@ -2650,30 +2651,54 @@ def llama_add_eos_token(model: llama_model_p, /) -> bool:
 
 
 # // Codellama infill tokens
-# LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
+# DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
 @ctypes_function("llama_token_prefix", [llama_model_p_ctypes], llama_token)
 def llama_token_prefix(model: llama_model_p) -> int:
     """codellama infill tokens"""
     ...
 
 
-# LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
+# DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
 @ctypes_function("llama_token_middle", [llama_model_p_ctypes], llama_token)
 def llama_token_middle(model: llama_model_p, /) -> int:
     ...
 
 
-# LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
+# DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
 @ctypes_function("llama_token_suffix", [llama_model_p_ctypes], llama_token)
 def llama_token_suffix(model: llama_model_p, /) -> int:
     ...
 
 
-# LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
-@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token)
-def llama_token_eot(model: llama_model_p, /) -> int:
+# LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
+@ctypes_function("llama_token_fim_pre", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_pre(model: llama_model_p, /) -> int:
+    ...
+
+# LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
+@ctypes_function("llama_token_fim_suf", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_suf(model: llama_model_p, /) -> int:
     ...
 
+# LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
+@ctypes_function("llama_token_fim_mid", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_mid(model: llama_model_p, /) -> int:
+    ...
+
+# LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
+@ctypes_function("llama_token_fim_pad", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_pad(model: llama_model_p, /) -> int:
+    ...
+
+# LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
+@ctypes_function("llama_token_fim_rep", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_rep(model: llama_model_p, /) -> int:
+    ...
+
+# LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
+@ctypes_function("llama_token_fim_sep", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_sep(model: llama_model_p, /) -> int:
+    ...
 
 # //
 # // Tokenization
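A hedged sketch of how the new FIM token getters could be arranged into a fill-in-the-middle prompt with the low-level API. Here model is assumed to be a llama_model_p obtained elsewhere (for example via llama_load_model_from_file), the -1 sentinel for a missing special token is an assumption, and tokenization/decoding of the surrounding text is omitted.

    import llama_cpp

    def build_fim_prompt_tokens(model, prefix_tokens, suffix_tokens):
        """Arrange tokens as <FIM_PRE> prefix <FIM_SUF> suffix <FIM_MID> for infilling."""
        fim_pre = llama_cpp.llama_token_fim_pre(model)
        fim_suf = llama_cpp.llama_token_fim_suf(model)
        fim_mid = llama_cpp.llama_token_fim_mid(model)
        if -1 in (fim_pre, fim_suf, fim_mid):
            raise ValueError("model vocabulary does not define FIM special tokens")
        return [fim_pre, *prefix_tokens, fim_suf, *suffix_tokens, fim_mid]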
@@ -2786,6 +2811,23 @@ def llama_token_to_piece(
     ...
 
 
+# # // check if token0 is contained as a prefix in token1
+# # LLAMA_API bool llama_token_is_prefix(
+# #           const struct llama_model * model,
+# #                      llama_token token0,
+# #                      llama_token token1);
+# @ctypes_function(
+#     "llama_token_is_prefix",
+#     [llama_model_p_ctypes, llama_token, llama_token],
+#     ctypes.c_bool,
+# )
+# def llama_token_is_prefix(
+#     model: llama_model_p, token0: Union[llama_token, int], token1: Union[llama_token, int], /
+# ) -> bool:
+#     """Check if token0 is contained as a prefix in token1"""
+#     ...
+
+
 # /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
 # /// @param text The char pointer must be large enough to hold the resulting text.
 # /// @return Returns the number of chars/bytes on success, no more than text_len_max.
@@ -3099,20 +3141,22 @@ def llama_sampler_chain_remove(
 
 # // available samplers:
 #
-# LLAMA_API struct llama_sampler * llama_sampler_init_greedy   (void);
+# LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
 @ctypes_function("llama_sampler_init_greedy", [], llama_sampler_p_ctypes)
 def llama_sampler_init_greedy() -> llama_sampler_p:
     ...
 
 
-# LLAMA_API struct llama_sampler * llama_sampler_init_dist     (uint32_t seed);
+# LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
 @ctypes_function("llama_sampler_init_dist", [ctypes.c_uint32], llama_sampler_p_ctypes)
 def llama_sampler_init_dist(seed: int) -> llama_sampler_p:
     ...
 
 
 # /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-# LLAMA_API struct llama_sampler * llama_sampler_init_softmax  (void);
+# /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
+# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
+#     "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
 @ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes)
 def llama_sampler_init_softmax() -> llama_sampler_p:
     ...
@@ -3188,6 +3232,19 @@ def llama_sampler_init_temp_ext(
     ...
 
 
+# /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
+# LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
+@ctypes_function(
+    "llama_sampler_init_xtc",
+    [ctypes.c_float, ctypes.c_float, ctypes.c_size_t, ctypes.c_uint32],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_xtc(
+    p: float, t: float, min_keep: int, seed: int, /
+) -> llama_sampler_p:
+    ...
+
+
 # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
 # /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
 # /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
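For illustration, a hedged sketch that drops the new XTC sampler into a sampler chain. It assumes the chain helpers bound elsewhere in this file (llama_sampler_chain_default_params, llama_sampler_chain_init, llama_sampler_chain_add, llama_sampler_init_top_k, llama_sampler_free) behave as in upstream llama.cpp; the parameter values are arbitrary.

    import llama_cpp

    chain = llama_cpp.llama_sampler_chain_init(
        llama_cpp.llama_sampler_chain_default_params()
    )
    # Narrow the candidate set first, then let XTC probabilistically cut the top tokens.
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_k(40))
    # p: chance of applying XTC, t: probability threshold, min_keep candidates, RNG seed.
    llama_cpp.llama_sampler_chain_add(
        chain, llama_cpp.llama_sampler_init_xtc(0.5, 0.1, 1, 42)
    )
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_dist(42))
    # ... sample with the chain during decoding, then release it:
    llama_cpp.llama_sampler_free(chain)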
@@ -3301,6 +3358,39 @@ def llama_sampler_init_logit_bias(
     ...
 
 
+# // this sampler is meant to be used for fill-in-the-middle infilling
+# // it's supposed to be used after top_k + top_p sampling
+# //
+# // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
+# // 2. combine probs of tokens that have the same prefix
+# //
+# // example:
+# //
+# // - before:
+# //   "hel":   0.5
+# //   "hell":  0.2
+# //   "hello": 0.1
+# //   "dummy": 0.1
+# //
+# // - after:
+# //   "hel":   0.8
+# //   "dummy": 0.1
+# //
+# // 3. discard non-EOG tokens with low prob
+# // 4. if no tokens are left -> pick EOT
+# //
+# LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
+@ctypes_function(
+    "llama_sampler_init_infill",
+    [llama_model_p_ctypes],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_infill(model: llama_model_p, /) -> llama_sampler_p:
+    """This sampler is meant to be used for fill-in-the-middle infilling.
+    """
+    ...
+
+
 # // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
 # LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
 @ctypes_function(
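A brief hedged sketch for the new infill sampler, following the header note that it should run after top-k and top-p. As with the XTC sketch above, the chain helpers and llama_sampler_init_top_p are assumed to be bound as in upstream llama.cpp, and model is an existing llama_model_p.

    import llama_cpp

    def build_infill_sampler(model):
        """Top-k, then top-p, then the infill sampler, then a final distribution draw."""
        chain = llama_cpp.llama_sampler_chain_init(
            llama_cpp.llama_sampler_chain_default_params()
        )
        llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_k(40))
        llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_p(0.95, 1))
        # The infill sampler takes the model so it can recognize EOG/EOT tokens.
        llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_infill(model))
        llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_dist(1234))
        return chain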

vendor/llama.cpp (submodule updated)
