diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c4652d9f..1bea4d7d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -99,10 +99,16 @@ jobs: choco install ninja cmake - name: Install dependencies on ubuntu - if: startsWith(matrix.config.name, 'Ubuntu Latest GCC') + if: startsWith(matrix.config.name, 'Ubuntu GCC') run: | sudo apt-get update - sudo apt-get install ninja-build cmake libtbb-dev + sudo apt-get install ninja-build cmake libtbb-dev g++-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf + + which aarch64-linux-gnu-gcc + which aarch64-linux-gnu-g++ + + which arm-linux-gnueabihf-gcc + which arm-linux-gnueabihf-g++ - name: Install dependencies on macos if: startsWith(matrix.config.os, 'macos') @@ -151,7 +157,7 @@ jobs: case "win": return ["x64" /*, "arm64" */ ]; // disabled arm64 for now as compilation doesn't work case "linux": - return ["x64", "arm64", "armv7l", "ppc64le"]; + return ["x64", "arm64", "armv7l"]; case "mac": return ["x64", "arm64"]; } diff --git a/llama/addon.cpp b/llama/addon.cpp index b1c3a050..6a192799 100644 --- a/llama/addon.cpp +++ b/llama/addon.cpp @@ -10,21 +10,11 @@ class LLAMAModel : public Napi::ObjectWrap { public: - llama_context_params params; + llama_model_params model_params; llama_model* model; - float temperature; - int threads; - int32_t top_k; - float top_p; LLAMAModel(const Napi::CallbackInfo& info) : Napi::ObjectWrap(info) { - params = llama_context_default_params(); - params.seed = -1; - params.n_ctx = 4096; - temperature = 0.0f; - threads = 6; - top_k = 40; - top_p = 0.95f; + model_params = llama_model_default_params(); // Get the model path std::string modelPath = info[0].As().Utf8Value(); @@ -32,69 +22,25 @@ class LLAMAModel : public Napi::ObjectWrap { if (info.Length() > 1 && info[1].IsObject()) { Napi::Object options = info[1].As(); - if (options.Has("seed")) { - params.seed = options.Get("seed").As().Int32Value(); - } - - if (options.Has("contextSize")) { - params.n_ctx = options.Get("contextSize").As().Int32Value(); - } - - if (options.Has("batchSize")) { - params.n_batch = options.Get("batchSize").As().Int32Value(); - } - if (options.Has("gpuLayers")) { - params.n_gpu_layers = options.Get("gpuLayers").As().Int32Value(); - } - - if (options.Has("lowVram")) { - params.low_vram = options.Get("lowVram").As().Value(); - } - - if (options.Has("f16Kv")) { - params.f16_kv = options.Get("f16Kv").As().Value(); - } - - if (options.Has("logitsAll")) { - params.logits_all = options.Get("logitsAll").As().Value(); + model_params.n_gpu_layers = options.Get("gpuLayers").As().Int32Value(); } if (options.Has("vocabOnly")) { - params.vocab_only = options.Get("vocabOnly").As().Value(); + model_params.vocab_only = options.Get("vocabOnly").As().Value(); } if (options.Has("useMmap")) { - params.use_mmap = options.Get("useMmap").As().Value(); + model_params.use_mmap = options.Get("useMmap").As().Value(); } if (options.Has("useMlock")) { - params.use_mlock = options.Get("useMlock").As().Value(); - } - - if (options.Has("embedding")) { - params.embedding = options.Get("embedding").As().Value(); - } - - if (options.Has("threads")) { - threads = options.Get("threads").As().Int32Value(); - } - - if (options.Has("temperature")) { - temperature = options.Get("temperature").As().FloatValue(); - } - - if (options.Has("topK")) { - top_k = options.Get("topK").As().Int32Value(); - } - - if (options.Has("topP")) { - top_p = options.Get("topP").As().FloatValue(); + 
model_params.use_mlock = options.Get("useMlock").As<Napi::Boolean>().Value(); } } llama_backend_init(false); - model = llama_load_model_from_file(modelPath.c_str(), params); + model = llama_load_model_from_file(modelPath.c_str(), model_params); if (model == NULL) { Napi::Error::New(info.Env(), "Failed to load model").ThrowAsJavaScriptException(); @@ -114,7 +60,6 @@ class LLAMAModel : public Napi::ObjectWrap<LLAMAModel> { class LLAMAGrammar : public Napi::ObjectWrap<LLAMAGrammar> { public: grammar_parser::parse_state parsed_grammar; - llama_grammar *grammar = nullptr; LLAMAGrammar(const Napi::CallbackInfo& info) : Napi::ObjectWrap<LLAMAGrammar>(info) { // Get the model path @@ -139,13 +84,31 @@ class LLAMAGrammar : public Napi::ObjectWrap<LLAMAGrammar> { if (should_print_grammar) { grammar_parser::print_grammar(stderr, parsed_grammar); } + } - std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules()); + static void init(Napi::Object exports) { + exports.Set("LLAMAGrammar", DefineClass(exports.Env(), "LLAMAGrammar", {})); + } +}; + +class LLAMAGrammarEvaluationState : public Napi::ObjectWrap<LLAMAGrammarEvaluationState> { + public: + LLAMAGrammar* grammarDef; + llama_grammar *grammar = nullptr; + + LLAMAGrammarEvaluationState(const Napi::CallbackInfo& info) : Napi::ObjectWrap<LLAMAGrammarEvaluationState>(info) { + grammarDef = Napi::ObjectWrap<LLAMAGrammar>::Unwrap(info[0].As<Napi::Object>()); + grammarDef->Ref(); + + std::vector<const llama_grammar_element *> grammar_rules(grammarDef->parsed_grammar.c_rules()); grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + grammar_rules.data(), grammar_rules.size(), grammarDef->parsed_grammar.symbol_ids.at("root") + ); } - ~LLAMAGrammar() { + ~LLAMAGrammarEvaluationState() { + grammarDef->Unref(); + if (grammar != nullptr) { llama_grammar_free(grammar); grammar = nullptr; @@ -153,42 +116,67 @@ class LLAMAGrammar : public Napi::ObjectWrap<LLAMAGrammar> { } static void init(Napi::Object exports) { - exports.Set("LLAMAGrammar", DefineClass(exports.Env(), "LLAMAGrammar", {})); + exports.Set("LLAMAGrammarEvaluationState", DefineClass(exports.Env(), "LLAMAGrammarEvaluationState", {})); } }; class LLAMAContext : public Napi::ObjectWrap<LLAMAContext> { public: LLAMAModel* model; + llama_context_params context_params; llama_context* ctx; - LLAMAGrammar* grammar; - bool use_grammar = false; + int n_cur = 0; LLAMAContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<LLAMAContext>(info) { model = Napi::ObjectWrap<LLAMAModel>::Unwrap(info[0].As<Napi::Object>()); model->Ref(); - ctx = llama_new_context_with_model(model->model, model->params); - Napi::MemoryManagement::AdjustExternalMemory(Env(), llama_get_state_size(ctx)); + + context_params = llama_context_default_params(); + context_params.seed = -1; + context_params.n_ctx = 4096; + context_params.n_threads = 6; + context_params.n_threads_batch = context_params.n_threads_batch == -1 ? 
context_params.n_threads : context_params.n_threads_batch; if (info.Length() > 1 && info[1].IsObject()) { - Napi::Object options = info[1].As<Napi::Object>(); + Napi::Object options = info[1].As<Napi::Object>(); - if (options.Has("grammar")) { - grammar = Napi::ObjectWrap<LLAMAGrammar>::Unwrap(options.Get("grammar").As<Napi::Object>()); - grammar->Ref(); - use_grammar = true; - } + if (options.Has("seed")) { + context_params.seed = options.Get("seed").As<Napi::Number>().Int32Value(); + } + + if (options.Has("contextSize")) { + context_params.n_ctx = options.Get("contextSize").As<Napi::Number>().Int32Value(); + } + + if (options.Has("batchSize")) { + context_params.n_batch = options.Get("batchSize").As<Napi::Number>().Int32Value(); + } + + if (options.Has("f16Kv")) { + context_params.f16_kv = options.Get("f16Kv").As<Napi::Boolean>().Value(); + } + + if (options.Has("logitsAll")) { + context_params.logits_all = options.Get("logitsAll").As<Napi::Boolean>().Value(); + } + + if (options.Has("embedding")) { + context_params.embedding = options.Get("embedding").As<Napi::Boolean>().Value(); + } + + if (options.Has("threads")) { + context_params.n_threads = options.Get("threads").As<Napi::Number>().Int32Value(); + context_params.n_threads_batch = context_params.n_threads_batch == -1 ? context_params.n_threads : context_params.n_threads_batch; + } } + + ctx = llama_new_context_with_model(model->model, context_params); + Napi::MemoryManagement::AdjustExternalMemory(Env(), llama_get_state_size(ctx)); } ~LLAMAContext() { Napi::MemoryManagement::AdjustExternalMemory(Env(), -(int64_t)llama_get_state_size(ctx)); llama_free(ctx); model->Unref(); - - if (use_grammar) { - grammar->Unref(); - use_grammar = false; - } } Napi::Value Encode(const Napi::CallbackInfo& info) { std::string text = info[0].As<Napi::String>().Utf8Value(); @@ -265,34 +253,124 @@ class LLAMAContext : public Napi::ObjectWrap<LLAMAContext> { class LLAMAContextEvalWorker : Napi::AsyncWorker, Napi::Promise::Deferred { LLAMAContext* ctx; + LLAMAGrammarEvaluationState* grammar_evaluation_state; + bool use_grammar = false; std::vector<llama_token> tokens; llama_token result; + float temperature; + int32_t top_k; + float top_p; + float repeat_penalty = 1.10f; // 1.0 = disabled + float repeat_penalty_presence_penalty = 0.00f; // 0.0 = disabled + float repeat_penalty_frequency_penalty = 0.00f; // 0.0 = disabled + std::vector<llama_token> repeat_penalty_tokens; + bool use_repeat_penalty = false; public: LLAMAContextEvalWorker(const Napi::CallbackInfo& info, LLAMAContext* ctx) : Napi::AsyncWorker(info.Env(), "LLAMAContextEvalWorker"), ctx(ctx), Napi::Promise::Deferred(info.Env()) { ctx->Ref(); Napi::Uint32Array tokens = info[0].As<Napi::Uint32Array>(); + + temperature = 0.0f; + top_k = 40; + top_p = 0.95f; + + if (info.Length() > 1 && info[1].IsObject()) { + Napi::Object options = info[1].As<Napi::Object>(); + + if (options.Has("temperature")) { + temperature = options.Get("temperature").As<Napi::Number>().FloatValue(); + } + + if (options.Has("topK")) { + top_k = options.Get("topK").As<Napi::Number>().Int32Value(); + } + + if (options.Has("topP")) { + top_p = options.Get("topP").As<Napi::Number>().FloatValue(); + } + + if (options.Has("repeatPenalty")) { + repeat_penalty = options.Get("repeatPenalty").As<Napi::Number>().FloatValue(); + } + + if (options.Has("repeatPenaltyTokens")) { + Napi::Uint32Array repeat_penalty_tokens_uint32_array = options.Get("repeatPenaltyTokens").As<Napi::Uint32Array>(); + + repeat_penalty_tokens.reserve(repeat_penalty_tokens_uint32_array.ElementLength()); + for (size_t i = 0; i < repeat_penalty_tokens_uint32_array.ElementLength(); i++) { + repeat_penalty_tokens.push_back(static_cast<llama_token>(repeat_penalty_tokens_uint32_array[i])); + } + + use_repeat_penalty = true; + } + + if (options.Has("repeatPenaltyPresencePenalty")) { + repeat_penalty_presence_penalty = 
options.Get("repeatPenaltyPresencePenalty").As().FloatValue(); + } + + if (options.Has("repeatPenaltyFrequencyPenalty")) { + repeat_penalty_frequency_penalty = options.Get("repeatPenaltyFrequencyPenalty").As().FloatValue(); + } + + if (options.Has("grammarEvaluationState")) { + grammar_evaluation_state = Napi::ObjectWrap::Unwrap(options.Get("grammarEvaluationState").As()); + grammar_evaluation_state->Ref(); + use_grammar = true; + } + } + this->tokens.reserve(tokens.ElementLength()); for (size_t i = 0; i < tokens.ElementLength(); i++) { this->tokens.push_back(static_cast(tokens[i])); } } - ~LLAMAContextEvalWorker() { ctx->Unref(); } + ~LLAMAContextEvalWorker() { + ctx->Unref(); + + if (use_grammar) { + grammar_evaluation_state->Unref(); + use_grammar = false; + } + } using Napi::AsyncWorker::Queue; using Napi::Promise::Deferred::Promise; protected: void Execute() { - // Perform the evaluation using llama_eval. - int r = llama_eval(ctx->ctx, tokens.data(), int(tokens.size()), llama_get_kv_cache_token_count(ctx->ctx), (ctx->model)->threads); + llama_batch batch = llama_batch_init(tokens.size(), 0); + + batch.n_tokens = tokens.size(); + + for (int32_t i = 0; i < batch.n_tokens; i++) { + batch.token[i] = tokens[i]; + batch.pos[i] = ctx->n_cur; + batch.seq_id[i] = 0; + batch.logits[i] = false; + + ctx->n_cur++; + } + + batch.logits[batch.n_tokens - 1] = true; + + // Perform the evaluation using llama_decode. + int r = llama_decode(ctx->ctx, batch); + + llama_batch_free(batch); + if (r != 0) { - SetError("Eval has failed"); + if (r == 1) { + SetError("could not find a KV slot for the batch (try reducing the size of the batch or increase the context)"); + } else { + SetError("Eval has failed"); + } + return; } llama_token new_token_id = 0; // Select the best prediction. - auto logits = llama_get_logits(ctx->ctx); - auto n_vocab = llama_n_vocab(ctx->ctx); + auto logits = llama_get_logits_ith(ctx->ctx, batch.n_tokens - 1); + auto n_vocab = llama_n_vocab(ctx->model->model); std::vector candidates; candidates.reserve(n_vocab); @@ -303,48 +381,43 @@ class LLAMAContextEvalWorker : Napi::AsyncWorker, Napi::Promise::Deferred { llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - float originalEosLogit = 0; auto eos_token = llama_token_eos(ctx->ctx); - for (auto& candidate : candidates) { - if (candidate.id == eos_token) { - originalEosLogit = candidate.logit; - break; - } - } - - if (ctx->use_grammar) { - llama_sample_grammar(ctx->ctx, &candidates_p, (ctx->grammar)->grammar); + if (use_repeat_penalty && !repeat_penalty_tokens.empty()) { + llama_sample_repetition_penalty( + ctx->ctx, &candidates_p, repeat_penalty_tokens.data(), repeat_penalty_tokens.size(), repeat_penalty + ); + llama_sample_frequency_and_presence_penalties( + ctx->ctx, &candidates_p, repeat_penalty_tokens.data(), repeat_penalty_tokens.size(), + repeat_penalty_frequency_penalty, repeat_penalty_presence_penalty + ); } - for (auto& candidate : candidates) { - if (candidate.id == eos_token) { - candidate.logit = originalEosLogit; - break; - } + if (use_grammar && (grammar_evaluation_state)->grammar != nullptr) { + llama_sample_grammar(ctx->ctx, &candidates_p, (grammar_evaluation_state)->grammar); } - if ((ctx->model)->temperature <= 0) { + if (temperature <= 0) { new_token_id = llama_sample_token_greedy(ctx->ctx , &candidates_p); } else { - const int32_t top_k = (ctx->model)->top_k <= 0 ? llama_n_vocab(ctx->ctx) : (ctx->model)->top_k; + const int32_t resolved_top_k = top_k <= 0 ? 
llama_n_vocab(ctx->model->model) : std::min(top_k, llama_n_vocab(ctx->model->model)); const int32_t n_probs = 0; // Number of probabilities to keep - 0 = disabled const float tfs_z = 1.00f; // Tail free sampling - 1.0 = disabled const float typical_p = 1.00f; // Typical probability - 1.0 = disabled - const float top_p = (ctx->model)->top_p; // Top p sampling - 1.0 = disabled + const float resolved_top_p = top_p; // Top p sampling - 1.0 = disabled // Temperature sampling size_t min_keep = std::max(1, n_probs); - llama_sample_top_k(ctx->ctx, &candidates_p, top_k, min_keep); + llama_sample_top_k(ctx->ctx, &candidates_p, resolved_top_k, min_keep); llama_sample_tail_free(ctx->ctx, &candidates_p, tfs_z, min_keep); llama_sample_typical(ctx->ctx, &candidates_p, typical_p, min_keep); - llama_sample_top_p(ctx->ctx, &candidates_p, top_p, min_keep); - llama_sample_temperature(ctx->ctx, &candidates_p, (ctx->model)->temperature);; + llama_sample_top_p(ctx->ctx, &candidates_p, resolved_top_p, min_keep); + llama_sample_temperature(ctx->ctx, &candidates_p, temperature); new_token_id = llama_sample_token(ctx->ctx, &candidates_p); } - if (new_token_id != eos_token && ctx->use_grammar) { - llama_grammar_accept_token(ctx->ctx, (ctx->grammar)->grammar, new_token_id); + if (new_token_id != eos_token && use_grammar && (grammar_evaluation_state)->grammar != nullptr) { + llama_grammar_accept_token(ctx->ctx, (grammar_evaluation_state)->grammar, new_token_id); } result = new_token_id; @@ -372,6 +445,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) { }); LLAMAModel::init(exports); LLAMAGrammar::init(exports); + LLAMAGrammarEvaluationState::init(exports); LLAMAContext::init(exports); return exports; } diff --git a/llama/toolchains/darwin.host-x64.target-arm64.cmake b/llama/toolchains/darwin.host-x64.target-arm64.cmake new file mode 100644 index 00000000..f6385a1c --- /dev/null +++ b/llama/toolchains/darwin.host-x64.target-arm64.cmake @@ -0,0 +1,8 @@ +set(CMAKE_SYSTEM_NAME Darwin) # macOS +set(CMAKE_SYSTEM_PROCESSOR arm64) + +set(CMAKE_C_COMPILER clang) +set(CMAKE_CXX_COMPILER clang++) + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -arch arm64") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -arch arm64") diff --git a/llama/toolchains/linux.host-arm64.target-x64.cmake b/llama/toolchains/linux.host-arm64.target-x64.cmake new file mode 100644 index 00000000..d92a8607 --- /dev/null +++ b/llama/toolchains/linux.host-arm64.target-x64.cmake @@ -0,0 +1,5 @@ +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR x86_64) + +set(CMAKE_C_COMPILER x86_64-linux-gnu-gcc) +set(CMAKE_CXX_COMPILER x86_64-linux-gnu-g++) diff --git a/llama/toolchains/linux.host-x64.target-arm64.cmake b/llama/toolchains/linux.host-x64.target-arm64.cmake new file mode 100644 index 00000000..948164d5 --- /dev/null +++ b/llama/toolchains/linux.host-x64.target-arm64.cmake @@ -0,0 +1,5 @@ +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR aarch64) + +set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc) +set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++) diff --git a/llama/toolchains/linux.host-x64.target-arm71.cmake b/llama/toolchains/linux.host-x64.target-arm71.cmake new file mode 100644 index 00000000..db2566cb --- /dev/null +++ b/llama/toolchains/linux.host-x64.target-arm71.cmake @@ -0,0 +1,5 @@ +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR arm) + +set(CMAKE_C_COMPILER arm-linux-gnueabihf-gcc) +set(CMAKE_CXX_COMPILER arm-linux-gnueabihf-g++) diff --git a/src/config.ts b/src/config.ts index 32484b55..a9e1311b 100644 --- a/src/config.ts +++ 
b/src/config.ts @@ -12,6 +12,7 @@ const env = envVar.from(process.env); export const llamaDirectory = path.join(__dirname, "..", "llama"); +export const llamaToolchainsDirectory = path.join(llamaDirectory, "toolchains"); export const llamaBinsDirectory = path.join(__dirname, "..", "llamaBins"); export const llamaBinsGrammarsDirectory = path.join(__dirname, "..", "llama", "grammars"); export const llamaCppDirectory = path.join(llamaDirectory, "llama.cpp"); diff --git a/src/index.ts b/src/index.ts index 733315f1..e8d4308e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,7 +1,8 @@ import {LlamaModel, type LlamaModelOptions} from "./llamaEvaluator/LlamaModel.js"; import {LlamaGrammar, type LlamaGrammarOptions} from "./llamaEvaluator/LlamaGrammar.js"; -import {LlamaContext, type LlamaContextOptions} from "./llamaEvaluator/LlamaContext.js"; -import {LlamaChatSession, type LlamaChatSessionOptions} from "./llamaEvaluator/LlamaChatSession.js"; +import {LlamaGrammarEvaluationState, LlamaGrammarEvaluationStateOptions} from "./llamaEvaluator/LlamaGrammarEvaluationState.js"; +import {LlamaContext, type LlamaContextOptions, type LlamaContextRepeatPenalty} from "./llamaEvaluator/LlamaContext.js"; +import {LlamaChatSession, type LlamaChatSessionOptions, type LlamaChatSessionRepeatPenalty} from "./llamaEvaluator/LlamaChatSession.js"; import {AbortError} from "./AbortError.js"; import {ChatPromptWrapper} from "./ChatPromptWrapper.js"; import {EmptyChatPromptWrapper} from "./chatWrappers/EmptyChatPromptWrapper.js"; @@ -19,10 +20,14 @@ export { type LlamaModelOptions, LlamaGrammar, type LlamaGrammarOptions, + LlamaGrammarEvaluationState, + type LlamaGrammarEvaluationStateOptions, LlamaContext, type LlamaContextOptions, + type LlamaContextRepeatPenalty, LlamaChatSession, type LlamaChatSessionOptions, + type LlamaChatSessionRepeatPenalty, type ConversationInteraction, AbortError, ChatPromptWrapper, diff --git a/src/llamaEvaluator/LlamaBins.ts b/src/llamaEvaluator/LlamaBins.ts index 1985343c..f3343e12 100644 --- a/src/llamaEvaluator/LlamaBins.ts +++ b/src/llamaEvaluator/LlamaBins.ts @@ -1,6 +1,6 @@ -import {loadBin, type LLAMAModel, type LLAMAContext, type LLAMAGrammar} from "../utils/getBin.js"; +import {loadBin, type LLAMAModel, type LLAMAContext, type LLAMAGrammar, type LLAMAGrammarEvaluationState} from "../utils/getBin.js"; export const llamaCppNode = await loadBin(); -const {LLAMAModel, LLAMAContext, LLAMAGrammar} = llamaCppNode; +const {LLAMAModel, LLAMAContext, LLAMAGrammar, LLAMAGrammarEvaluationState} = llamaCppNode; -export {LLAMAModel, LLAMAContext, LLAMAGrammar}; +export {LLAMAModel, LLAMAContext, LLAMAGrammar, LLAMAGrammarEvaluationState}; diff --git a/src/llamaEvaluator/LlamaChatSession.ts b/src/llamaEvaluator/LlamaChatSession.ts index d47b2d70..41254928 100644 --- a/src/llamaEvaluator/LlamaChatSession.ts +++ b/src/llamaEvaluator/LlamaChatSession.ts @@ -6,8 +6,11 @@ import {GeneralChatPromptWrapper} from "../chatWrappers/GeneralChatPromptWrapper import {getChatWrapperByBos} from "../chatWrappers/createChatWrapperByBos.js"; import {ConversationInteraction, Token} from "../types.js"; import {generateContextTextFromConversationHistory} from "../chatWrappers/generateContextTextFromConversationHistory.js"; +import {removeNullFields} from "../utils/removeNullFields.js"; import {LlamaModel} from "./LlamaModel.js"; import {LlamaContext} from "./LlamaContext.js"; +import {LlamaGrammar} from "./LlamaGrammar.js"; +import {LlamaGrammarEvaluationState} from "./LlamaGrammarEvaluationState.js"; const 
UNKNOWN_UNICODE_CHAR = "\ufffd"; @@ -15,13 +18,53 @@ const UNKNOWN_UNICODE_CHAR = "\ufffd"; export type LlamaChatSessionOptions = { context: LlamaContext, printLLamaSystemInfo?: boolean, + + /** GeneralChatPromptWrapper is ued by default */ promptWrapper?: ChatPromptWrapper | "auto", + systemPrompt?: string, /** Conversation history to load into the context to continue an existing conversation */ conversationHistory?: readonly ConversationInteraction[] }; +export type LlamaChatSessionRepeatPenalty = { + /** + * Number of recent tokens generated by the model to apply penalties to repetition of. + * Defaults to `64`. + */ + lastTokens?: number, + + punishTokensFilter?: (tokens: Token[]) => Token[], + + /** + * Penalize new line tokens. + * Enabled by default. + */ + penalizeNewLine?: boolean, + + /** + * The relative amount to lower the probability of the tokens in `punishTokens` by + * Defaults to `1.1`. + * Set to `1` to disable. + */ + penalty?: number, + + /** + * For n time a token is in the `punishTokens` array, lower its probability by `n * frequencyPenalty` + * Disabled by default (`0`). + * Set to a value between `0` and `1` to enable. + */ + frequencyPenalty?: number, + + /** + * Lower the probability of all the tokens in the `punishTokens` array by `presencePenalty` + * Disabled by default (`0`). + * Set to a value between `0` and `1` to enable. + */ + presencePenalty?: number +}; + export class LlamaChatSession { private readonly _systemPrompt: string; private readonly _printLLamaSystemInfo: boolean; @@ -33,6 +76,9 @@ export class LlamaChatSession { private _conversationHistoryToLoad: readonly ConversationInteraction[] | null = null; private readonly _ctx: LlamaContext; + /** + * @param {LlamaChatSessionOptions} options + */ public constructor({ context, printLLamaSystemInfo = false, @@ -78,9 +124,68 @@ export class LlamaChatSession { }); } + /** + * @param {string} prompt + * @param {object} options + * @returns {Promise} + */ public async prompt(prompt: string, { - onToken, signal, maxTokens - }: { onToken?(tokens: Token[]): void, signal?: AbortSignal, maxTokens?: number } = {}) { + onToken, + signal, + maxTokens, + temperature, + topK, + topP, + grammar = this.context._chatGrammar, + trimWhitespaceSuffix = false, + repeatPenalty + }: { + onToken?: (tokens: Token[]) => void, + signal?: AbortSignal, + maxTokens?: number, + + /** + * Temperature is a hyperparameter that controls the randomness of the generated text. + * It affects the probability distribution of the model's output tokens. + * A higher temperature (e.g., 1.5) makes the output more random and creative, + * while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. + * The suggested temperature is 0.8, which provides a balance between randomness and determinism. + * At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run. + * + * Set to `0` to disable. + * Disabled by default (set to `0`). + */ + temperature?: number, + + /** + * Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation. + * An integer number between `1` and the size of the vocabulary. + * Set to `0` to disable (which uses the full vocabulary). + * + * Only relevant when `temperature` is set to a value greater than 0. 
+ */ + topK?: number, + + /** + * Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P, + * and samples the next token only from this set. + * A float number between `0` and `1`. + * Set to `1` to disable. + * + * Only relevant when `temperature` is set to a value greater than `0`. + */ + topP?: number, + + grammar?: LlamaGrammar, + + /** + * Trim whitespace from the end of the generated text + * Disabled by default. + */ + trimWhitespaceSuffix?: boolean, + + repeatPenalty?: false | LlamaChatSessionRepeatPenalty + } = {}) { if (!this.initialized) await this.init(); @@ -127,7 +232,10 @@ export class LlamaChatSession { this._lastStopStringSuffix = null; const {text, stopString, stopStringSuffix} = - await this._evalTokens(this._ctx.encode(promptText), {onToken, signal, maxTokens}); + await this._evalTokens(this._ctx.encode(promptText), { + onToken, signal, maxTokens, temperature, topK, topP, grammar, trimWhitespaceSuffix, + repeatPenalty: repeatPenalty == false ? {lastTokens: 0} : repeatPenalty + }); this._lastStopString = stopString; this._lastStopStringSuffix = stopStringSuffix; @@ -136,14 +244,73 @@ export class LlamaChatSession { } private async _evalTokens(tokens: Uint32Array, { - onToken, signal, maxTokens - }: { onToken?(tokens: Token[]): void, signal?: AbortSignal, maxTokens?: number } = {}) { - const stopStrings = this._promptWrapper.getStopStrings(); + onToken, + signal, + maxTokens, + temperature, + topK, + topP, + grammar = this.context._chatGrammar, + trimWhitespaceSuffix = false, + repeatPenalty: { + lastTokens: repeatPenaltyLastTokens = 64, + punishTokensFilter, + penalizeNewLine, + penalty, + frequencyPenalty, + presencePenalty + } = {} + }: { + onToken?: (tokens: Token[]) => void, + signal?: AbortSignal, + maxTokens?: number, + temperature?: number, + topK?: number, + topP?: number, + grammar?: LlamaGrammar, + trimWhitespaceSuffix?: boolean, + repeatPenalty?: LlamaChatSessionRepeatPenalty + } = {}) { + let stopStrings = this._promptWrapper.getStopStrings(); + + if (grammar != null) + stopStrings = stopStrings.concat(grammar.stopStrings); + const stopStringIndexes: number[] = Array(stopStrings.length).fill(0); const skippedChunksQueue: Token[] = []; const res: Token[] = []; + const grammarEvaluationState = grammar != null + ? new LlamaGrammarEvaluationState({grammar}) + : undefined; + const repeatPenaltyEnabled = repeatPenaltyLastTokens > 0; - for await (const chunk of this._ctx.evaluate(tokens)) { + const getPenaltyTokens = () => { + let punishTokens = res.slice(-repeatPenaltyLastTokens); + + if (punishTokensFilter != null) + punishTokens = punishTokensFilter(punishTokens); + + if (!penalizeNewLine) { + const nlToken = this.context.getNlToken(); + + if (nlToken != null) + punishTokens = punishTokens.filter(token => token !== nlToken); + } + + return Uint32Array.from(punishTokens); + }; + + const evaluationIterator = this._ctx.evaluate(tokens, removeNullFields({ + temperature, topK, topP, grammarEvaluationState, + repeatPenalty: !repeatPenaltyEnabled ? undefined : { + punishTokens: getPenaltyTokens, + penalty, + frequencyPenalty, + presencePenalty + } + })); + + for await (const chunk of evaluationIterator) { if (signal?.aborted) throw new AbortError(); @@ -158,7 +325,10 @@ export class LlamaChatSession { ? 
this._ctx.decode(Uint32Array.from(skippedChunksQueue)) : ""; - const [queuedTextBeforeStopString] = skippedChunksText.split(stopString); + let [queuedTextBeforeStopString] = skippedChunksText.split(stopString); + + if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) + queuedTextBeforeStopString = queuedTextBeforeStopString.trimEnd(); if (queuedTextBeforeStopString.length > 0) { const beforeStopStringTokens: Token[] = Array.from(this._ctx.encode(queuedTextBeforeStopString)); @@ -176,7 +346,9 @@ export class LlamaChatSession { } // if the token is unknown, it means it's not complete character - if (tokenStr === UNKNOWN_UNICODE_CHAR || skipTokenEvent) { + if (tokenStr === UNKNOWN_UNICODE_CHAR || skipTokenEvent || ( + (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) && tokenStr.trim() === "" + )) { skippedChunksQueue.push(chunk); continue; } @@ -194,8 +366,13 @@ export class LlamaChatSession { break; } + let resText = this._ctx.decode(Uint32Array.from(res)); + + if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) + resText = resText.trimEnd(); + return { - text: this._ctx.decode(Uint32Array.from(res)), + text: resText, stopString: null, stopStringSuffix: null }; diff --git a/src/llamaEvaluator/LlamaContext.ts b/src/llamaEvaluator/LlamaContext.ts index ef2678e9..b7d85b82 100644 --- a/src/llamaEvaluator/LlamaContext.ts +++ b/src/llamaEvaluator/LlamaContext.ts @@ -2,26 +2,107 @@ import {removeNullFields} from "../utils/removeNullFields.js"; import {Token} from "../types.js"; import {LLAMAContext} from "./LlamaBins.js"; import {LlamaModel} from "./LlamaModel.js"; +import {LlamaGrammarEvaluationState} from "./LlamaGrammarEvaluationState.js"; import {LlamaGrammar} from "./LlamaGrammar.js"; export type LlamaContextOptions = { model: LlamaModel, + prependBos?: boolean, + + /** + * @deprecated use the `grammar` option on `LlamaChatSession`'s `prompt` function + * or the `grammarEvaluationState` option on `LlamaContext`'s `evaluate` function instead + * @hidden + */ grammar?: LlamaGrammar, - prependBos?: boolean + + /** If null, a random seed will be used */ + seed?: number | null, + + /** text context size */ + contextSize?: number, + + /** prompt processing batch size */ + batchSize?: number, + + /** use fp16 for KV cache */ + f16Kv?: boolean, + + /** the llama_eval() call computes all logits, not just the last one */ + logitsAll?: boolean, + + /** embedding mode only */ + embedding?: boolean + + /** number of threads to use to evaluate tokens */ + threads?: number, +}; + +export type LlamaContextRepeatPenalty = { + /** Tokens to lower the predication probability of to be the next predicted token */ + punishTokens: Uint32Array | (() => Uint32Array), + + /** + * The relative amount to lower the probability of the tokens in `punishTokens` by + * Defaults to `1.1`. + * Set to `1` to disable. + */ + penalty?: number, + + /** + * For n time a token is in the `punishTokens` array, lower its probability by `n * frequencyPenalty` + * Disabled by default (`0`). + * Set to a value between `0` and `1` to enable. + */ + frequencyPenalty?: number, + + /** + * Lower the probability of all the tokens in the `punishTokens` array by `presencePenalty` + * Disabled by default (`0`). + * Set to a value between `0` and `1` to enable. 
+ */ + presencePenalty?: number }; export class LlamaContext { + private readonly _model: LlamaModel; private readonly _ctx: LLAMAContext; private readonly _prependBos: boolean; private _prependTokens: Token[]; - public constructor({model, grammar, prependBos = true}: LlamaContextOptions) { + /** @internal */ + public readonly _chatGrammar?: LlamaGrammar; + + + /** + * @param {LlamaContextOptions} options + */ + public constructor({ + model, + prependBos = true, + grammar, + seed = model._contextOptions.seed, + contextSize = model._contextOptions.contextSize, + batchSize = model._contextOptions.batchSize, + f16Kv = model._contextOptions.f16Kv, + logitsAll = model._contextOptions.logitsAll, + embedding = model._contextOptions.embedding, + threads = model._contextOptions.threads + }: LlamaContextOptions) { + this._model = model; this._ctx = new LLAMAContext(model._model, removeNullFields({ - grammar: grammar?._grammar + seed: seed != null ? Math.max(-1, seed) : undefined, + contextSize, + batchSize, + f16Kv, + logitsAll, + embedding, + threads })); this._prependBos = prependBos; this._prependTokens = []; + this._chatGrammar = grammar; if (prependBos) { this._prependTokens.unshift(this._ctx.tokenBos()); @@ -125,7 +206,21 @@ export class LlamaContext { return this._ctx.getContextSize(); } - public async *evaluate(tokens: Uint32Array): AsyncGenerator { + /** + * @param {Uint32Array} tokens + * @param {object} options + * @returns {AsyncGenerator} + */ + public async *evaluate(tokens: Uint32Array, { + temperature = this._model._evaluationOptions.temperature, + topK = this._model._evaluationOptions.topK, + topP = this._model._evaluationOptions.topP, + grammarEvaluationState, + repeatPenalty + }: { + temperature?: number, topK?: number, topP?: number, grammarEvaluationState?: LlamaGrammarEvaluationState, + repeatPenalty?: LlamaContextRepeatPenalty + } = {}): AsyncGenerator { let evalTokens = tokens; if (this._prependTokens.length > 0) { @@ -135,10 +230,24 @@ export class LlamaContext { this._prependTokens = []; } + if (evalTokens.length === 0) + return; + // eslint-disable-next-line no-constant-condition while (true) { // Evaluate to get the next token. - const nextToken: Token = await this._ctx.eval(evalTokens); + const nextToken: Token = await this._ctx.eval(evalTokens, removeNullFields({ + temperature, + topK, + topP, + repeatPenalty: repeatPenalty?.penalty, + repeatPenaltyTokens: repeatPenalty?.punishTokens instanceof Function + ? repeatPenalty.punishTokens() + : repeatPenalty?.punishTokens, + repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty, + repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty, + grammarEvaluationState: grammarEvaluationState?._state + })); // the assistant finished answering if (nextToken === this._ctx.tokenEos()) diff --git a/src/llamaEvaluator/LlamaGrammar.ts b/src/llamaEvaluator/LlamaGrammar.ts index 523f53ec..33100054 100644 --- a/src/llamaEvaluator/LlamaGrammar.ts +++ b/src/llamaEvaluator/LlamaGrammar.ts @@ -10,23 +10,48 @@ export type LlamaGrammarOptions = { /** print the grammar to stdout */ printGrammar?: boolean + + /** Consider any of these texts as EOS for the generated out. Only supported by `LlamaChatSession` */ + stopStrings?: string[], + + /** Trim whitespace from the end of the generated text. 
Only supported by `LlamaChatSession` */ + trimWhitespaceSuffix?: boolean }; export class LlamaGrammar { /** @internal */ public readonly _grammar: LLAMAGrammar; + private readonly _stopStrings: readonly string[]; + private readonly _trimWhitespaceSuffix: boolean; /** - * GBNF files are supported. - * More info here: https://github.com/ggerganov/llama.cpp/blob/f5fe98d11bdf9e7797bcfb05c0c3601ffc4b9d26/grammars/README.md + * > GBNF files are supported. + * > More info here: [github:ggerganov/llama.cpp:grammars/README.md]( + * > https://github.com/ggerganov/llama.cpp/blob/f5fe98d11bdf9e7797bcfb05c0c3601ffc4b9d26/grammars/README.md) * @param {object} options * @param {string} options.grammar - GBNF grammar + * @param {string[]} [options.stopStrings] - Consider any of these texts as EOS for the generated out. + * Only supported by `LlamaChatSession` + * @param {boolean} [options.trimWhitespaceSuffix] - Trim whitespace from the end of the generated text. + * Only supported by `LlamaChatSession` * @param {boolean} [options.printGrammar] - print the grammar to stdout */ - public constructor({grammar, printGrammar = false}: LlamaGrammarOptions) { + public constructor({ + grammar, stopStrings = [], trimWhitespaceSuffix = false, printGrammar = false + }: LlamaGrammarOptions) { this._grammar = new LLAMAGrammar(grammar, { printGrammar }); + this._stopStrings = stopStrings ?? []; + this._trimWhitespaceSuffix = trimWhitespaceSuffix; + } + + public get stopStrings() { + return this._stopStrings; + } + + public get trimWhitespaceSuffix() { + return this._trimWhitespaceSuffix; } public static async getFor(type: "json" | "list" | "arithmetic" | "japanese" | "chess") { @@ -36,7 +61,11 @@ export class LlamaGrammar { if (await fs.pathExists(grammarFile)) { const grammar = await fs.readFile(grammarFile, "utf8"); - return new LlamaGrammar({grammar}); + return new LlamaGrammar({ + grammar, + stopStrings: ["\n".repeat(10)], // this is a workaround for the model not stopping to generate text, + trimWhitespaceSuffix: true + }); } throw new Error(`Grammar file for type "${type}" was not found in "${grammarsFolder}"`); diff --git a/src/llamaEvaluator/LlamaGrammarEvaluationState.ts b/src/llamaEvaluator/LlamaGrammarEvaluationState.ts new file mode 100644 index 00000000..42a90c2e --- /dev/null +++ b/src/llamaEvaluator/LlamaGrammarEvaluationState.ts @@ -0,0 +1,23 @@ +import {LLAMAGrammarEvaluationState} from "./LlamaBins.js"; +import {LlamaGrammar} from "./LlamaGrammar.js"; + + +export type LlamaGrammarEvaluationStateOptions = { + grammar: LlamaGrammar, +}; + +export class LlamaGrammarEvaluationState { + /** @internal */ + public readonly _state: LLAMAGrammarEvaluationState; + + /** + * Grammar evaluation state is used to track the model response to determine the next allowed characters for the model to generate. + * Create a new grammar evaluation state for every response you generate with the model. + * This is only needed when using the `LlamaContext` class directly, as `LlamaChatSession` already handles this for you. 
+ * @param {object} options + * @param {LlamaGrammar} options.grammar + */ + public constructor({grammar}: LlamaGrammarEvaluationStateOptions) { + this._state = new LLAMAGrammarEvaluationState(grammar._grammar); + } +} diff --git a/src/llamaEvaluator/LlamaModel.ts b/src/llamaEvaluator/LlamaModel.ts index e592be27..d7b2e963 100644 --- a/src/llamaEvaluator/LlamaModel.ts +++ b/src/llamaEvaluator/LlamaModel.ts @@ -8,22 +8,35 @@ export type LlamaModelOptions = { /** path to the model on the filesystem */ modelPath: string, - /** If null, a random seed will be used */ + /** + * If null, a random seed will be used + * @deprecated use the `seed` option on `LlamaContext` instead + * @hidden + * */ seed?: number | null, - /** text context size */ + /** + * text context size + * @deprecated use the `contextSize` option on `LlamaContext` instead + * @hidden + * */ contextSize?: number, - /** prompt processing batch size */ + /** + * prompt processing batch size + * @deprecated use the `batchSize` option on `LlamaContext` instead + * @hidden + * */ batchSize?: number, /** number of layers to store in VRAM */ gpuLayers?: number, - /** if true, reduce VRAM usage at the cost of performance */ - lowVram?: boolean, - - /** number of threads to use to evaluate tokens */ + /** + * number of threads to use to evaluate tokens + * @deprecated use the `threads` option on `LlamaContext` instead + * @hidden + * */ threads?: number, /** @@ -35,6 +48,8 @@ export type LlamaModelOptions = { * At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run. * * Set to `0` to disable. + * @deprecated use the `temperature` option on `LlamaChatSession`'s `prompt` function or `LlamaContext`'s `evaluate` function instead + * @hidden */ temperature?: number, @@ -44,6 +59,8 @@ export type LlamaModelOptions = { * Set to `0` to disable (which uses the full vocabulary). * * Only relevant when `temperature` is set to a value greater than 0. + * @deprecated use the `topK` option on `LlamaChatSession`'s `prompt` function or `LlamaContext`'s `evaluate` function instead + * @hidden * */ topK?: number, @@ -54,13 +71,23 @@ export type LlamaModelOptions = { * Set to `1` to disable. * * Only relevant when `temperature` is set to a value greater than `0`. 
- * */ + * @deprecated use the `topP` option on `LlamaChatSession`'s `prompt` function or `LlamaContext`'s `evaluate` function instead + * @hidden + */ topP?: number, - /** use fp16 for KV cache */ + /** + * use fp16 for KV cache + * @deprecated use the `f16Kv` option on `LlamaContext` instead + * @hidden + */ f16Kv?: boolean, - /** the llama_eval() call computes all logits, not just the last one */ + /** + * the llama_eval() call computes all logits, not just the last one + * @deprecated use the `logitsAll` option on `LlamaContext` instead + * @hidden + */ logitsAll?: boolean, /** only load the vocabulary, no weights */ @@ -72,7 +99,11 @@ export type LlamaModelOptions = { /** force system to keep model in RAM */ useMlock?: boolean, - /** embedding mode only */ + /** + * embedding mode only + * @deprecated use the `embedding` option on `LlamaContext` instead + * @hidden + */ embedding?: boolean }; @@ -80,16 +111,34 @@ export class LlamaModel { /** @internal */ public readonly _model: LLAMAModel; + /** @internal */ + public readonly _contextOptions: { + seed: LlamaModelOptions["seed"], + contextSize: LlamaModelOptions["contextSize"], + batchSize: LlamaModelOptions["batchSize"], + f16Kv: LlamaModelOptions["f16Kv"], + logitsAll: LlamaModelOptions["logitsAll"], + embedding: LlamaModelOptions["embedding"], + threads: LlamaModelOptions["threads"] + }; + + /** @internal */ + public readonly _evaluationOptions: { + temperature: LlamaModelOptions["temperature"], + topK: LlamaModelOptions["topK"], + topP: LlamaModelOptions["topP"] + }; + /** - * options source: - * https://github.com/ggerganov/llama.cpp/blob/b5ffb2849d23afe73647f68eec7b68187af09be6/llama.h#L102 (struct llama_context_params) + * > options source: + * > [github:ggerganov/llama.cpp/llama.h]( + * > https://github.com/ggerganov/llama.cpp/blob/b5ffb2849d23afe73647f68eec7b68187af09be6/llama.h#L102) (`struct llama_context_params`) * @param {object} options * @param {string} options.modelPath - path to the model on the filesystem * @param {number | null} [options.seed] - If null, a random seed will be used * @param {number} [options.contextSize] - text context size * @param {number} [options.batchSize] - prompt processing batch size * @param {number} [options.gpuLayers] - number of layers to store in VRAM - * @param {boolean} [options.lowVram] - if true, reduce VRAM usage at the cost of performance * @param {number} [options.threads] - number of threads to use to evaluate tokens * @param {number} [options.temperature] - Temperature is a hyperparameter that controls the randomness of the generated text. * It affects the probability distribution of the model's output tokens. @@ -120,25 +169,30 @@ export class LlamaModel { */ public constructor({ modelPath, seed = null, contextSize = 1024 * 4, batchSize, gpuLayers, - lowVram, threads = 6, temperature = 0, topK = 40, topP = 0.95, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding + threads = 6, temperature = 0, topK = 40, topP = 0.95, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding }: LlamaModelOptions) { this._model = new LLAMAModel(path.resolve(process.cwd(), modelPath), removeNullFields({ - seed: seed != null ? 
Math.max(-1, seed) : undefined, - contextSize, - batchSize, gpuLayers, - lowVram, - threads, - temperature, - topK, - topP, - f16Kv, - logitsAll, vocabOnly, useMmap, - useMlock, - embedding + useMlock })); + + this._contextOptions = { + seed, + contextSize, + batchSize, + f16Kv, + logitsAll, + embedding, + threads + }; + + this._evaluationOptions = { + temperature, + topK, + topP + }; } public static get systemInfo() { diff --git a/src/utils/compileLLamaCpp.ts b/src/utils/compileLLamaCpp.ts index 402ae64a..5064c726 100644 --- a/src/utils/compileLLamaCpp.ts +++ b/src/utils/compileLLamaCpp.ts @@ -2,7 +2,7 @@ import path from "path"; import {fileURLToPath} from "url"; import process from "process"; import fs from "fs-extra"; -import {customCmakeOptionsEnvVarPrefix, llamaCppDirectory, llamaDirectory} from "../config.js"; +import {customCmakeOptionsEnvVarPrefix, llamaCppDirectory, llamaDirectory, llamaToolchainsDirectory} from "../config.js"; import {clearLlamaBuild} from "./clearLlamaBuild.js"; import {setUsedBinFlag} from "./usedBinFlag.js"; import {spawnCommand} from "./spawnCommand.js"; @@ -22,6 +22,7 @@ export async function compileLlamaCpp({ } const cmakePathArgs = await getCmakePathArgs(); + const toolchainFile = await getToolchainFileForArch(arch); const runtimeVersion = nodeTarget.startsWith("v") ? nodeTarget.slice("v".length) : nodeTarget; const cmakeCustomOptions = []; @@ -41,6 +42,9 @@ export async function compileLlamaCpp({ if (process.env.LLAMA_HIPBLAS === "1") cmakeCustomOptions.push("LLAMA_HIPBLAS=1"); if (process.env.LLAMA_CLBLAST === "1") cmakeCustomOptions.push("LLAMA_CLBLAST=1"); + if (toolchainFile != null) + cmakeCustomOptions.push("CMAKE_TOOLCHAIN_FILE=" + toolchainFile); + for (const key in process.env) { if (key.startsWith(customCmakeOptionsEnvVarPrefix)) { const option = key.slice(customCmakeOptionsEnvVarPrefix.length); @@ -61,7 +65,7 @@ export async function compileLlamaCpp({ ); const binFilesDirPath = path.join(llamaDirectory, "build", "llama.cpp", "bin"); - const compiledResultDirPath = await getCompiledResultDir(); + const compiledResultDirPath = await getCompiledResultDir(true); if (await fs.pathExists(binFilesDirPath)) { const files = await fs.readdir(binFilesDirPath); @@ -89,7 +93,11 @@ export async function compileLlamaCpp({ } export async function getCompiledLlamaCppBinaryPath() { - const compiledResultDirPath = await getCompiledResultDir(); + const compiledResultDirPath = await getCompiledResultDir(false); + + if (compiledResultDirPath == null) + return null; + const modulePath = path.join(compiledResultDirPath, "llama-addon.node"); if (await fs.pathExists(modulePath)) @@ -98,14 +106,19 @@ export async function getCompiledLlamaCppBinaryPath() { return null; } -async function getCompiledResultDir() { +async function getCompiledResultDir(failIfNotFound?: false): Promise; +async function getCompiledResultDir(failIfNotFound: true): Promise; +async function getCompiledResultDir(failIfNotFound: boolean = false) { if (await fs.pathExists(path.join(llamaDirectory, "build", "Release"))) { return path.join(llamaDirectory, "build", "Release"); } else if (await fs.pathExists(path.join(llamaDirectory, "build", "Debug"))) { return path.join(llamaDirectory, "build", "Debug"); } - throw new Error("Could not find Release or Debug directory"); + if (failIfNotFound) + throw new Error("Could not find Release or Debug directory"); + + return null; } async function getCmakePathArgs() { @@ -119,3 +132,20 @@ async function getCmakePathArgs() { return ["--cmake-path", cmakePath]; } + 
+async function getToolchainFileForArch(targetArch: string) { + if (process.arch === targetArch) + return null; + + const platform = process.platform; + const hostArch = process.arch; + + const toolchainFilename = `${platform}.host-${hostArch}.target-${targetArch}.cmake`; + + const filePath = path.join(llamaToolchainsDirectory, toolchainFilename); + + if (await fs.pathExists(filePath)) + return filePath; + + return null; +} diff --git a/src/utils/getBin.ts b/src/utils/getBin.ts index 24e60ab3..3c86fca6 100644 --- a/src/utils/getBin.ts +++ b/src/utils/getBin.ts @@ -99,35 +99,40 @@ export type LlamaCppNodeModule = { LLAMAModel: LLAMAModel, LLAMAContext: LLAMAContext, LLAMAGrammar: LLAMAGrammar, + LLAMAGrammarEvaluationState: LLAMAGrammarEvaluationState, systemInfo(): string }; export type LLAMAModel = { new (modelPath: string, params: { + gpuLayers?: number, + vocabOnly?: boolean, + useMmap?: boolean, + useMlock?: boolean + }): LLAMAModel +}; + +export type LLAMAContext = { + new (model: LLAMAModel, params: { seed?: number, contextSize?: number, batchSize?: number, - gpuCores?: number, - lowVram?: boolean, f16Kv?: boolean, logitsAll?: boolean, - vocabOnly?: boolean, - useMmap?: boolean, - useMlock?: boolean, embedding?: boolean, threads?: number, - temperature?: number, - topK?: number, - topP?: number - }): LLAMAModel -}; - -export type LLAMAContext = { - new (model: LLAMAModel, params?: { - grammar?: LLAMAGrammar }): LLAMAContext, encode(text: string): Uint32Array, - eval(tokens: Uint32Array): Promise, + eval(tokens: Uint32Array, options?: { + temperature?: number, + topK?: number, + topP?: number, + repeatPenalty?: number, + repeatPenaltyTokens?: Uint32Array, + repeatPenaltyPresencePenalty?: number, // alpha_presence + repeatPenaltyFrequencyPenalty?: number, // alpha_frequency + grammarEvaluationState?: LLAMAGrammarEvaluationState + }): Promise, decode(tokens: Uint32Array): string, tokenBos(): number, tokenEos(): number, @@ -141,3 +146,7 @@ export type LLAMAGrammar = { printGrammar?: boolean, }): LLAMAGrammar }; + +export type LLAMAGrammarEvaluationState = { + new (grammar: LLAMAGrammar): LLAMAGrammarEvaluationState +};
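
A minimal usage sketch of the new per-prompt options this diff adds to `LlamaChatSession.prompt()` (`temperature`, `topK`, `topP`, `grammar`, `trimWhitespaceSuffix`, `repeatPenalty`). The package name `node-llama-cpp`, the model path and the concrete option values are assumptions for illustration; only the option names and shapes come from the diff:

```ts
import {LlamaModel, LlamaContext, LlamaChatSession, LlamaGrammar} from "node-llama-cpp";

const model = new LlamaModel({modelPath: "./models/example-model.gguf"}); // placeholder path
const context = new LlamaContext({model, contextSize: 4096, threads: 6});
const session = new LlamaChatSession({context});

const grammar = await LlamaGrammar.getFor("json"); // bundled GBNF grammar

const answer = await session.prompt("Describe a cat as a JSON object", {
    temperature: 0.8,
    topK: 40,
    topP: 0.95,
    grammar,                    // constrain the output to the grammar
    trimWhitespaceSuffix: true, // drop trailing whitespace from the result
    repeatPenalty: {
        lastTokens: 64,         // penalize repetition of the last 64 generated tokens
        penalty: 1.1,
        frequencyPenalty: 0.2,
        presencePenalty: 0.2
    }
});

console.log(answer);
```

Passing `repeatPenalty: false` disables the penalty entirely, which matches the `repeatPenalty == false ? {lastTokens: 0} : repeatPenalty` handling inside `prompt()` above.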
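
When driving `LlamaContext.evaluate()` directly instead of `LlamaChatSession`, the docs added in this diff say a fresh `LlamaGrammarEvaluationState` should be created per generated response. A sketch under the same assumptions as the previous example; the public `encode`/`decode` helpers are assumed to behave as `LlamaChatSession` uses them, and the prompt text, token limit and penalty values are illustrative:

```ts
import {LlamaModel, LlamaContext, LlamaGrammar, LlamaGrammarEvaluationState} from "node-llama-cpp";

const model = new LlamaModel({modelPath: "./models/example-model.gguf"}); // placeholder path
const context = new LlamaContext({model});
const grammar = await LlamaGrammar.getFor("json");

// one evaluation state per generated response
const grammarEvaluationState = new LlamaGrammarEvaluationState({grammar});

const generated: number[] = [];

for await (const token of context.evaluate(context.encode("A cat as JSON: "), {
    temperature: 0.8,
    grammarEvaluationState,
    repeatPenalty: {
        punishTokens: () => Uint32Array.from(generated.slice(-64)), // re-evaluated on every step
        penalty: 1.1
    }
})) {
    generated.push(token);

    if (generated.length >= 128) // illustrative hard limit
        break;
}

console.log(context.decode(Uint32Array.from(generated)));
```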
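
Several `LlamaModel` options are deprecated here and move to `LlamaContext` (or to the `prompt()`/`evaluate()` call), while still being honoured as defaults for contexts created from that model. A migration sketch using the same imports and placeholder values as the first example:

```ts
// Before (now deprecated, but still used as defaults by contexts created from this model):
const legacyModel = new LlamaModel({
    modelPath: "./models/example-model.gguf", // placeholder path
    contextSize: 4096, batchSize: 512, threads: 6,
    temperature: 0.8, topK: 40, topP: 0.95
});

// After this change:
const model = new LlamaModel({modelPath: "./models/example-model.gguf", gpuLayers: 32});
const context = new LlamaContext({model, contextSize: 4096, batchSize: 512, threads: 6});
const session = new LlamaChatSession({context});
const answer = await session.prompt("Hello", {temperature: 0.8, topK: 40, topP: 0.95});
```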
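
For the cross-compilation support, the new `getToolchainFileForArch()` only applies when the requested arch differs from the host arch. A trace of the lookup for one example host/target pair; the concrete platform and arch values below are examples, not part of the diff:

```ts
// Example: building the linux-arm64 binary on a linux-x64 host.
const platform = "linux";   // process.platform on the build machine
const hostArch = "x64";     // process.arch on the build machine
const targetArch = "arm64"; // arch requested for this build

const toolchainFilename = `${platform}.host-${hostArch}.target-${targetArch}.cmake`;
// -> "linux.host-x64.target-arm64.cmake"
// If this file exists under llama/toolchains, compileLlamaCpp() appends
// "CMAKE_TOOLCHAIN_FILE=<that file>" to the custom CMake options;
// if it does not exist, or host and target arch match, no toolchain file is used.
console.log(toolchainFilename);
```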