diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c4652d9f..1bea4d7d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -99,10 +99,16 @@ jobs: choco install ninja cmake - name: Install dependencies on ubuntu - if: startsWith(matrix.config.name, 'Ubuntu Latest GCC') + if: startsWith(matrix.config.name, 'Ubuntu GCC') run: | sudo apt-get update - sudo apt-get install ninja-build cmake libtbb-dev + sudo apt-get install ninja-build cmake libtbb-dev g++-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf + + which aarch64-linux-gnu-gcc + which aarch64-linux-gnu-g++ + + which arm-linux-gnueabihf-gcc + which arm-linux-gnueabihf-g++ - name: Install dependencies on macos if: startsWith(matrix.config.os, 'macos') @@ -151,7 +157,7 @@ jobs: case "win": return ["x64" /*, "arm64" */ ]; // disabled arm64 for now as compilation doesn't work case "linux": - return ["x64", "arm64", "armv7l", "ppc64le"]; + return ["x64", "arm64", "armv7l"]; case "mac": return ["x64", "arm64"]; } diff --git a/llama/addon.cpp b/llama/addon.cpp index b1c3a050..6a192799 100644 --- a/llama/addon.cpp +++ b/llama/addon.cpp @@ -10,21 +10,11 @@ class LLAMAModel : public Napi::ObjectWrap { public: - llama_context_params params; + llama_model_params model_params; llama_model* model; - float temperature; - int threads; - int32_t top_k; - float top_p; LLAMAModel(const Napi::CallbackInfo& info) : Napi::ObjectWrap(info) { - params = llama_context_default_params(); - params.seed = -1; - params.n_ctx = 4096; - temperature = 0.0f; - threads = 6; - top_k = 40; - top_p = 0.95f; + model_params = llama_model_default_params(); // Get the model path std::string modelPath = info[0].As().Utf8Value(); @@ -32,69 +22,25 @@ class LLAMAModel : public Napi::ObjectWrap { if (info.Length() > 1 && info[1].IsObject()) { Napi::Object options = info[1].As(); - if (options.Has("seed")) { - params.seed = options.Get("seed").As().Int32Value(); - } - - if (options.Has("contextSize")) { - params.n_ctx = options.Get("contextSize").As().Int32Value(); - } - - if (options.Has("batchSize")) { - params.n_batch = options.Get("batchSize").As().Int32Value(); - } - if (options.Has("gpuLayers")) { - params.n_gpu_layers = options.Get("gpuLayers").As().Int32Value(); - } - - if (options.Has("lowVram")) { - params.low_vram = options.Get("lowVram").As().Value(); - } - - if (options.Has("f16Kv")) { - params.f16_kv = options.Get("f16Kv").As().Value(); - } - - if (options.Has("logitsAll")) { - params.logits_all = options.Get("logitsAll").As().Value(); + model_params.n_gpu_layers = options.Get("gpuLayers").As().Int32Value(); } if (options.Has("vocabOnly")) { - params.vocab_only = options.Get("vocabOnly").As().Value(); + model_params.vocab_only = options.Get("vocabOnly").As().Value(); } if (options.Has("useMmap")) { - params.use_mmap = options.Get("useMmap").As().Value(); + model_params.use_mmap = options.Get("useMmap").As().Value(); } if (options.Has("useMlock")) { - params.use_mlock = options.Get("useMlock").As().Value(); - } - - if (options.Has("embedding")) { - params.embedding = options.Get("embedding").As().Value(); - } - - if (options.Has("threads")) { - threads = options.Get("threads").As().Int32Value(); - } - - if (options.Has("temperature")) { - temperature = options.Get("temperature").As().FloatValue(); - } - - if (options.Has("topK")) { - top_k = options.Get("topK").As().Int32Value(); - } - - if (options.Has("topP")) { - top_p = options.Get("topP").As().FloatValue(); + 
model_params.use_mlock = options.Get("useMlock").As<Napi::Boolean>().Value(); } } llama_backend_init(false); - model = llama_load_model_from_file(modelPath.c_str(), params); + model = llama_load_model_from_file(modelPath.c_str(), model_params); if (model == NULL) { Napi::Error::New(info.Env(), "Failed to load model").ThrowAsJavaScriptException(); @@ -114,7 +60,6 @@ class LLAMAModel : public Napi::ObjectWrap<LLAMAModel> { class LLAMAGrammar : public Napi::ObjectWrap<LLAMAGrammar> { public: grammar_parser::parse_state parsed_grammar; - llama_grammar *grammar = nullptr; LLAMAGrammar(const Napi::CallbackInfo& info) : Napi::ObjectWrap<LLAMAGrammar>(info) { // Get the model path @@ -139,13 +84,31 @@ class LLAMAGrammar : public Napi::ObjectWrap<LLAMAGrammar> { if (should_print_grammar) { grammar_parser::print_grammar(stderr, parsed_grammar); } + } - std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules()); + static void init(Napi::Object exports) { + exports.Set("LLAMAGrammar", DefineClass(exports.Env(), "LLAMAGrammar", {})); + } +}; + +class LLAMAGrammarEvaluationState : public Napi::ObjectWrap<LLAMAGrammarEvaluationState> { + public: + LLAMAGrammar* grammarDef; + llama_grammar *grammar = nullptr; + + LLAMAGrammarEvaluationState(const Napi::CallbackInfo& info) : Napi::ObjectWrap<LLAMAGrammarEvaluationState>(info) { + grammarDef = Napi::ObjectWrap<LLAMAGrammar>::Unwrap(info[0].As<Napi::Object>()); + grammarDef->Ref(); + + std::vector<const llama_grammar_element *> grammar_rules(grammarDef->parsed_grammar.c_rules()); grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + grammar_rules.data(), grammar_rules.size(), grammarDef->parsed_grammar.symbol_ids.at("root") + ); } - ~LLAMAGrammar() { + ~LLAMAGrammarEvaluationState() { + grammarDef->Unref(); + if (grammar != nullptr) { llama_grammar_free(grammar); grammar = nullptr; @@ -153,42 +116,67 @@ class LLAMAGrammar : public Napi::ObjectWrap<LLAMAGrammar> { } static void init(Napi::Object exports) { - exports.Set("LLAMAGrammar", DefineClass(exports.Env(), "LLAMAGrammar", {})); + exports.Set("LLAMAGrammarEvaluationState", DefineClass(exports.Env(), "LLAMAGrammarEvaluationState", {})); } }; class LLAMAContext : public Napi::ObjectWrap<LLAMAContext> { public: LLAMAModel* model; + llama_context_params context_params; llama_context* ctx; - LLAMAGrammar* grammar; - bool use_grammar = false; + int n_cur = 0; LLAMAContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<LLAMAContext>(info) { model = Napi::ObjectWrap<LLAMAModel>::Unwrap(info[0].As<Napi::Object>()); model->Ref(); - ctx = llama_new_context_with_model(model->model, model->params); - Napi::MemoryManagement::AdjustExternalMemory(Env(), llama_get_state_size(ctx)); + + context_params = llama_context_default_params(); + context_params.seed = -1; + context_params.n_ctx = 4096; + context_params.n_threads = 6; + context_params.n_threads_batch = context_params.n_threads_batch == -1 ? 
context_params.n_threads : context_params.n_threads_batch; if (info.Length() > 1 && info[1].IsObject()) { - Napi::Object options = info[1].As<Napi::Object>(); + Napi::Object options = info[1].As<Napi::Object>(); - if (options.Has("grammar")) { - grammar = Napi::ObjectWrap<LLAMAGrammar>::Unwrap(options.Get("grammar").As<Napi::Object>()); - grammar->Ref(); - use_grammar = true; - } + if (options.Has("seed")) { + context_params.seed = options.Get("seed").As<Napi::Number>().Int32Value(); + } + + if (options.Has("contextSize")) { + context_params.n_ctx = options.Get("contextSize").As<Napi::Number>().Int32Value(); + } + + if (options.Has("batchSize")) { + context_params.n_batch = options.Get("batchSize").As<Napi::Number>().Int32Value(); + } + + if (options.Has("f16Kv")) { + context_params.f16_kv = options.Get("f16Kv").As<Napi::Boolean>().Value(); + } + + if (options.Has("logitsAll")) { + context_params.logits_all = options.Get("logitsAll").As<Napi::Boolean>().Value(); + } + + if (options.Has("embedding")) { + context_params.embedding = options.Get("embedding").As<Napi::Boolean>().Value(); + } + + if (options.Has("threads")) { + context_params.n_threads = options.Get("threads").As<Napi::Number>().Int32Value(); + context_params.n_threads_batch = context_params.n_threads_batch == -1 ? context_params.n_threads : context_params.n_threads_batch; + } } + + ctx = llama_new_context_with_model(model->model, context_params); + Napi::MemoryManagement::AdjustExternalMemory(Env(), llama_get_state_size(ctx)); } ~LLAMAContext() { Napi::MemoryManagement::AdjustExternalMemory(Env(), -(int64_t)llama_get_state_size(ctx)); llama_free(ctx); model->Unref(); - - if (use_grammar) { - grammar->Unref(); - use_grammar = false; - } } Napi::Value Encode(const Napi::CallbackInfo& info) { std::string text = info[0].As<Napi::String>().Utf8Value(); @@ -265,34 +253,124 @@ class LLAMAContext : public Napi::ObjectWrap<LLAMAContext> { class LLAMAContextEvalWorker : Napi::AsyncWorker, Napi::Promise::Deferred { LLAMAContext* ctx; + LLAMAGrammarEvaluationState* grammar_evaluation_state; + bool use_grammar = false; std::vector<llama_token> tokens; llama_token result; + float temperature; + int32_t top_k; + float top_p; + float repeat_penalty = 1.10f; // 1.0 = disabled + float repeat_penalty_presence_penalty = 0.00f; // 0.0 = disabled + float repeat_penalty_frequency_penalty = 0.00f; // 0.0 = disabled + std::vector<llama_token> repeat_penalty_tokens; + bool use_repeat_penalty = false; public: LLAMAContextEvalWorker(const Napi::CallbackInfo& info, LLAMAContext* ctx) : Napi::AsyncWorker(info.Env(), "LLAMAContextEvalWorker"), ctx(ctx), Napi::Promise::Deferred(info.Env()) { ctx->Ref(); Napi::Uint32Array tokens = info[0].As<Napi::Uint32Array>(); + + temperature = 0.0f; + top_k = 40; + top_p = 0.95f; + + if (info.Length() > 1 && info[1].IsObject()) { + Napi::Object options = info[1].As<Napi::Object>(); + + if (options.Has("temperature")) { + temperature = options.Get("temperature").As<Napi::Number>().FloatValue(); + } + + if (options.Has("topK")) { + top_k = options.Get("topK").As<Napi::Number>().Int32Value(); + } + + if (options.Has("topP")) { + top_p = options.Get("topP").As<Napi::Number>().FloatValue(); + } + + if (options.Has("repeatPenalty")) { + repeat_penalty = options.Get("repeatPenalty").As<Napi::Number>().FloatValue(); + } + + if (options.Has("repeatPenaltyTokens")) { + Napi::Uint32Array repeat_penalty_tokens_uint32_array = options.Get("repeatPenaltyTokens").As<Napi::Uint32Array>(); + + repeat_penalty_tokens.reserve(repeat_penalty_tokens_uint32_array.ElementLength()); + for (size_t i = 0; i < repeat_penalty_tokens_uint32_array.ElementLength(); i++) { + repeat_penalty_tokens.push_back(static_cast<llama_token>(repeat_penalty_tokens_uint32_array[i])); + } + + use_repeat_penalty = true; + } + + if (options.Has("repeatPenaltyPresencePenalty")) { + repeat_penalty_presence_penalty = 
options.Get("repeatPenaltyPresencePenalty").As().FloatValue(); + } + + if (options.Has("repeatPenaltyFrequencyPenalty")) { + repeat_penalty_frequency_penalty = options.Get("repeatPenaltyFrequencyPenalty").As().FloatValue(); + } + + if (options.Has("grammarEvaluationState")) { + grammar_evaluation_state = Napi::ObjectWrap::Unwrap(options.Get("grammarEvaluationState").As()); + grammar_evaluation_state->Ref(); + use_grammar = true; + } + } + this->tokens.reserve(tokens.ElementLength()); for (size_t i = 0; i < tokens.ElementLength(); i++) { this->tokens.push_back(static_cast(tokens[i])); } } - ~LLAMAContextEvalWorker() { ctx->Unref(); } + ~LLAMAContextEvalWorker() { + ctx->Unref(); + + if (use_grammar) { + grammar_evaluation_state->Unref(); + use_grammar = false; + } + } using Napi::AsyncWorker::Queue; using Napi::Promise::Deferred::Promise; protected: void Execute() { - // Perform the evaluation using llama_eval. - int r = llama_eval(ctx->ctx, tokens.data(), int(tokens.size()), llama_get_kv_cache_token_count(ctx->ctx), (ctx->model)->threads); + llama_batch batch = llama_batch_init(tokens.size(), 0); + + batch.n_tokens = tokens.size(); + + for (int32_t i = 0; i < batch.n_tokens; i++) { + batch.token[i] = tokens[i]; + batch.pos[i] = ctx->n_cur; + batch.seq_id[i] = 0; + batch.logits[i] = false; + + ctx->n_cur++; + } + + batch.logits[batch.n_tokens - 1] = true; + + // Perform the evaluation using llama_decode. + int r = llama_decode(ctx->ctx, batch); + + llama_batch_free(batch); + if (r != 0) { - SetError("Eval has failed"); + if (r == 1) { + SetError("could not find a KV slot for the batch (try reducing the size of the batch or increase the context)"); + } else { + SetError("Eval has failed"); + } + return; } llama_token new_token_id = 0; // Select the best prediction. - auto logits = llama_get_logits(ctx->ctx); - auto n_vocab = llama_n_vocab(ctx->ctx); + auto logits = llama_get_logits_ith(ctx->ctx, batch.n_tokens - 1); + auto n_vocab = llama_n_vocab(ctx->model->model); std::vector candidates; candidates.reserve(n_vocab); @@ -303,48 +381,43 @@ class LLAMAContextEvalWorker : Napi::AsyncWorker, Napi::Promise::Deferred { llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - float originalEosLogit = 0; auto eos_token = llama_token_eos(ctx->ctx); - for (auto& candidate : candidates) { - if (candidate.id == eos_token) { - originalEosLogit = candidate.logit; - break; - } - } - - if (ctx->use_grammar) { - llama_sample_grammar(ctx->ctx, &candidates_p, (ctx->grammar)->grammar); + if (use_repeat_penalty && !repeat_penalty_tokens.empty()) { + llama_sample_repetition_penalty( + ctx->ctx, &candidates_p, repeat_penalty_tokens.data(), repeat_penalty_tokens.size(), repeat_penalty + ); + llama_sample_frequency_and_presence_penalties( + ctx->ctx, &candidates_p, repeat_penalty_tokens.data(), repeat_penalty_tokens.size(), + repeat_penalty_frequency_penalty, repeat_penalty_presence_penalty + ); } - for (auto& candidate : candidates) { - if (candidate.id == eos_token) { - candidate.logit = originalEosLogit; - break; - } + if (use_grammar && (grammar_evaluation_state)->grammar != nullptr) { + llama_sample_grammar(ctx->ctx, &candidates_p, (grammar_evaluation_state)->grammar); } - if ((ctx->model)->temperature <= 0) { + if (temperature <= 0) { new_token_id = llama_sample_token_greedy(ctx->ctx , &candidates_p); } else { - const int32_t top_k = (ctx->model)->top_k <= 0 ? llama_n_vocab(ctx->ctx) : (ctx->model)->top_k; + const int32_t resolved_top_k = top_k <= 0 ? 
llama_n_vocab(ctx->model->model) : std::min(top_k, llama_n_vocab(ctx->model->model)); const int32_t n_probs = 0; // Number of probabilities to keep - 0 = disabled const float tfs_z = 1.00f; // Tail free sampling - 1.0 = disabled const float typical_p = 1.00f; // Typical probability - 1.0 = disabled - const float top_p = (ctx->model)->top_p; // Top p sampling - 1.0 = disabled + const float resolved_top_p = top_p; // Top p sampling - 1.0 = disabled // Temperature sampling size_t min_keep = std::max(1, n_probs); - llama_sample_top_k(ctx->ctx, &candidates_p, top_k, min_keep); + llama_sample_top_k(ctx->ctx, &candidates_p, resolved_top_k, min_keep); llama_sample_tail_free(ctx->ctx, &candidates_p, tfs_z, min_keep); llama_sample_typical(ctx->ctx, &candidates_p, typical_p, min_keep); - llama_sample_top_p(ctx->ctx, &candidates_p, top_p, min_keep); - llama_sample_temperature(ctx->ctx, &candidates_p, (ctx->model)->temperature);; + llama_sample_top_p(ctx->ctx, &candidates_p, resolved_top_p, min_keep); + llama_sample_temperature(ctx->ctx, &candidates_p, temperature); new_token_id = llama_sample_token(ctx->ctx, &candidates_p); } - if (new_token_id != eos_token && ctx->use_grammar) { - llama_grammar_accept_token(ctx->ctx, (ctx->grammar)->grammar, new_token_id); + if (new_token_id != eos_token && use_grammar && (grammar_evaluation_state)->grammar != nullptr) { + llama_grammar_accept_token(ctx->ctx, (grammar_evaluation_state)->grammar, new_token_id); } result = new_token_id; @@ -372,6 +445,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) { }); LLAMAModel::init(exports); LLAMAGrammar::init(exports); + LLAMAGrammarEvaluationState::init(exports); LLAMAContext::init(exports); return exports; } diff --git a/llama/toolchains/darwin.host-x64.target-arm64.cmake b/llama/toolchains/darwin.host-x64.target-arm64.cmake new file mode 100644 index 00000000..f6385a1c --- /dev/null +++ b/llama/toolchains/darwin.host-x64.target-arm64.cmake @@ -0,0 +1,8 @@ +set(CMAKE_SYSTEM_NAME Darwin) # macOS +set(CMAKE_SYSTEM_PROCESSOR arm64) + +set(CMAKE_C_COMPILER clang) +set(CMAKE_CXX_COMPILER clang++) + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -arch arm64") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -arch arm64") diff --git a/llama/toolchains/linux.host-arm64.target-x64.cmake b/llama/toolchains/linux.host-arm64.target-x64.cmake new file mode 100644 index 00000000..d92a8607 --- /dev/null +++ b/llama/toolchains/linux.host-arm64.target-x64.cmake @@ -0,0 +1,5 @@ +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR x86_64) + +set(CMAKE_C_COMPILER x86_64-linux-gnu-gcc) +set(CMAKE_CXX_COMPILER x86_64-linux-gnu-g++) diff --git a/llama/toolchains/linux.host-x64.target-arm64.cmake b/llama/toolchains/linux.host-x64.target-arm64.cmake new file mode 100644 index 00000000..948164d5 --- /dev/null +++ b/llama/toolchains/linux.host-x64.target-arm64.cmake @@ -0,0 +1,5 @@ +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR aarch64) + +set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc) +set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++) diff --git a/llama/toolchains/linux.host-x64.target-arm71.cmake b/llama/toolchains/linux.host-x64.target-arm71.cmake new file mode 100644 index 00000000..db2566cb --- /dev/null +++ b/llama/toolchains/linux.host-x64.target-arm71.cmake @@ -0,0 +1,5 @@ +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR arm) + +set(CMAKE_C_COMPILER arm-linux-gnueabihf-gcc) +set(CMAKE_CXX_COMPILER arm-linux-gnueabihf-g++) diff --git a/src/config.ts b/src/config.ts index 32484b55..a9e1311b 100644 --- a/src/config.ts +++ 
b/src/config.ts @@ -12,6 +12,7 @@ const env = envVar.from(process.env); export const llamaDirectory = path.join(__dirname, "..", "llama"); +export const llamaToolchainsDirectory = path.join(llamaDirectory, "toolchains"); export const llamaBinsDirectory = path.join(__dirname, "..", "llamaBins"); export const llamaBinsGrammarsDirectory = path.join(__dirname, "..", "llama", "grammars"); export const llamaCppDirectory = path.join(llamaDirectory, "llama.cpp"); diff --git a/src/index.ts b/src/index.ts index 733315f1..e8d4308e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,7 +1,8 @@ import {LlamaModel, type LlamaModelOptions} from "./llamaEvaluator/LlamaModel.js"; import {LlamaGrammar, type LlamaGrammarOptions} from "./llamaEvaluator/LlamaGrammar.js"; -import {LlamaContext, type LlamaContextOptions} from "./llamaEvaluator/LlamaContext.js"; -import {LlamaChatSession, type LlamaChatSessionOptions} from "./llamaEvaluator/LlamaChatSession.js"; +import {LlamaGrammarEvaluationState, LlamaGrammarEvaluationStateOptions} from "./llamaEvaluator/LlamaGrammarEvaluationState.js"; +import {LlamaContext, type LlamaContextOptions, type LlamaContextRepeatPenalty} from "./llamaEvaluator/LlamaContext.js"; +import {LlamaChatSession, type LlamaChatSessionOptions, type LlamaChatSessionRepeatPenalty} from "./llamaEvaluator/LlamaChatSession.js"; import {AbortError} from "./AbortError.js"; import {ChatPromptWrapper} from "./ChatPromptWrapper.js"; import {EmptyChatPromptWrapper} from "./chatWrappers/EmptyChatPromptWrapper.js"; @@ -19,10 +20,14 @@ export { type LlamaModelOptions, LlamaGrammar, type LlamaGrammarOptions, + LlamaGrammarEvaluationState, + type LlamaGrammarEvaluationStateOptions, LlamaContext, type LlamaContextOptions, + type LlamaContextRepeatPenalty, LlamaChatSession, type LlamaChatSessionOptions, + type LlamaChatSessionRepeatPenalty, type ConversationInteraction, AbortError, ChatPromptWrapper, diff --git a/src/llamaEvaluator/LlamaBins.ts b/src/llamaEvaluator/LlamaBins.ts index 1985343c..f3343e12 100644 --- a/src/llamaEvaluator/LlamaBins.ts +++ b/src/llamaEvaluator/LlamaBins.ts @@ -1,6 +1,6 @@ -import {loadBin, type LLAMAModel, type LLAMAContext, type LLAMAGrammar} from "../utils/getBin.js"; +import {loadBin, type LLAMAModel, type LLAMAContext, type LLAMAGrammar, type LLAMAGrammarEvaluationState} from "../utils/getBin.js"; export const llamaCppNode = await loadBin(); -const {LLAMAModel, LLAMAContext, LLAMAGrammar} = llamaCppNode; +const {LLAMAModel, LLAMAContext, LLAMAGrammar, LLAMAGrammarEvaluationState} = llamaCppNode; -export {LLAMAModel, LLAMAContext, LLAMAGrammar}; +export {LLAMAModel, LLAMAContext, LLAMAGrammar, LLAMAGrammarEvaluationState}; diff --git a/src/llamaEvaluator/LlamaChatSession.ts b/src/llamaEvaluator/LlamaChatSession.ts index d47b2d70..41254928 100644 --- a/src/llamaEvaluator/LlamaChatSession.ts +++ b/src/llamaEvaluator/LlamaChatSession.ts @@ -6,8 +6,11 @@ import {GeneralChatPromptWrapper} from "../chatWrappers/GeneralChatPromptWrapper import {getChatWrapperByBos} from "../chatWrappers/createChatWrapperByBos.js"; import {ConversationInteraction, Token} from "../types.js"; import {generateContextTextFromConversationHistory} from "../chatWrappers/generateContextTextFromConversationHistory.js"; +import {removeNullFields} from "../utils/removeNullFields.js"; import {LlamaModel} from "./LlamaModel.js"; import {LlamaContext} from "./LlamaContext.js"; +import {LlamaGrammar} from "./LlamaGrammar.js"; +import {LlamaGrammarEvaluationState} from "./LlamaGrammarEvaluationState.js"; const 
UNKNOWN_UNICODE_CHAR = "\ufffd"; @@ -15,13 +18,53 @@ const UNKNOWN_UNICODE_CHAR = "\ufffd"; export type LlamaChatSessionOptions = { context: LlamaContext, printLLamaSystemInfo?: boolean, + + /** GeneralChatPromptWrapper is ued by default */ promptWrapper?: ChatPromptWrapper | "auto", + systemPrompt?: string, /** Conversation history to load into the context to continue an existing conversation */ conversationHistory?: readonly ConversationInteraction[] }; +export type LlamaChatSessionRepeatPenalty = { + /** + * Number of recent tokens generated by the model to apply penalties to repetition of. + * Defaults to `64`. + */ + lastTokens?: number, + + punishTokensFilter?: (tokens: Token[]) => Token[], + + /** + * Penalize new line tokens. + * Enabled by default. + */ + penalizeNewLine?: boolean, + + /** + * The relative amount to lower the probability of the tokens in `punishTokens` by + * Defaults to `1.1`. + * Set to `1` to disable. + */ + penalty?: number, + + /** + * For n time a token is in the `punishTokens` array, lower its probability by `n * frequencyPenalty` + * Disabled by default (`0`). + * Set to a value between `0` and `1` to enable. + */ + frequencyPenalty?: number, + + /** + * Lower the probability of all the tokens in the `punishTokens` array by `presencePenalty` + * Disabled by default (`0`). + * Set to a value between `0` and `1` to enable. + */ + presencePenalty?: number +}; + export class LlamaChatSession { private readonly _systemPrompt: string; private readonly _printLLamaSystemInfo: boolean; @@ -33,6 +76,9 @@ export class LlamaChatSession { private _conversationHistoryToLoad: readonly ConversationInteraction[] | null = null; private readonly _ctx: LlamaContext; + /** + * @param {LlamaChatSessionOptions} options + */ public constructor({ context, printLLamaSystemInfo = false, @@ -78,9 +124,68 @@ export class LlamaChatSession { }); } + /** + * @param {string} prompt + * @param {object} options + * @returns {Promise} + */ public async prompt(prompt: string, { - onToken, signal, maxTokens - }: { onToken?(tokens: Token[]): void, signal?: AbortSignal, maxTokens?: number } = {}) { + onToken, + signal, + maxTokens, + temperature, + topK, + topP, + grammar = this.context._chatGrammar, + trimWhitespaceSuffix = false, + repeatPenalty + }: { + onToken?: (tokens: Token[]) => void, + signal?: AbortSignal, + maxTokens?: number, + + /** + * Temperature is a hyperparameter that controls the randomness of the generated text. + * It affects the probability distribution of the model's output tokens. + * A higher temperature (e.g., 1.5) makes the output more random and creative, + * while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. + * The suggested temperature is 0.8, which provides a balance between randomness and determinism. + * At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run. + * + * Set to `0` to disable. + * Disabled by default (set to `0`). + */ + temperature?: number, + + /** + * Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation. + * An integer number between `1` and the size of the vocabulary. + * Set to `0` to disable (which uses the full vocabulary). + * + * Only relevant when `temperature` is set to a value greater than 0. 
+ */ + topK?: number, + + /** + * Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P, + * and samples the next token only from this set. + * A float number between `0` and `1`. + * Set to `1` to disable. + * + * Only relevant when `temperature` is set to a value greater than `0`. + */ + topP?: number, + + grammar?: LlamaGrammar, + + /** + * Trim whitespace from the end of the generated text + * Disabled by default. + */ + trimWhitespaceSuffix?: boolean, + + repeatPenalty?: false | LlamaChatSessionRepeatPenalty + } = {}) { if (!this.initialized) await this.init(); @@ -127,7 +232,10 @@ export class LlamaChatSession { this._lastStopStringSuffix = null; const {text, stopString, stopStringSuffix} = - await this._evalTokens(this._ctx.encode(promptText), {onToken, signal, maxTokens}); + await this._evalTokens(this._ctx.encode(promptText), { + onToken, signal, maxTokens, temperature, topK, topP, grammar, trimWhitespaceSuffix, + repeatPenalty: repeatPenalty == false ? {lastTokens: 0} : repeatPenalty + }); this._lastStopString = stopString; this._lastStopStringSuffix = stopStringSuffix; @@ -136,14 +244,73 @@ export class LlamaChatSession { } private async _evalTokens(tokens: Uint32Array, { - onToken, signal, maxTokens - }: { onToken?(tokens: Token[]): void, signal?: AbortSignal, maxTokens?: number } = {}) { - const stopStrings = this._promptWrapper.getStopStrings(); + onToken, + signal, + maxTokens, + temperature, + topK, + topP, + grammar = this.context._chatGrammar, + trimWhitespaceSuffix = false, + repeatPenalty: { + lastTokens: repeatPenaltyLastTokens = 64, + punishTokensFilter, + penalizeNewLine, + penalty, + frequencyPenalty, + presencePenalty + } = {} + }: { + onToken?: (tokens: Token[]) => void, + signal?: AbortSignal, + maxTokens?: number, + temperature?: number, + topK?: number, + topP?: number, + grammar?: LlamaGrammar, + trimWhitespaceSuffix?: boolean, + repeatPenalty?: LlamaChatSessionRepeatPenalty + } = {}) { + let stopStrings = this._promptWrapper.getStopStrings(); + + if (grammar != null) + stopStrings = stopStrings.concat(grammar.stopStrings); + const stopStringIndexes: number[] = Array(stopStrings.length).fill(0); const skippedChunksQueue: Token[] = []; const res: Token[] = []; + const grammarEvaluationState = grammar != null + ? new LlamaGrammarEvaluationState({grammar}) + : undefined; + const repeatPenaltyEnabled = repeatPenaltyLastTokens > 0; - for await (const chunk of this._ctx.evaluate(tokens)) { + const getPenaltyTokens = () => { + let punishTokens = res.slice(-repeatPenaltyLastTokens); + + if (punishTokensFilter != null) + punishTokens = punishTokensFilter(punishTokens); + + if (!penalizeNewLine) { + const nlToken = this.context.getNlToken(); + + if (nlToken != null) + punishTokens = punishTokens.filter(token => token !== nlToken); + } + + return Uint32Array.from(punishTokens); + }; + + const evaluationIterator = this._ctx.evaluate(tokens, removeNullFields({ + temperature, topK, topP, grammarEvaluationState, + repeatPenalty: !repeatPenaltyEnabled ? undefined : { + punishTokens: getPenaltyTokens, + penalty, + frequencyPenalty, + presencePenalty + } + })); + + for await (const chunk of evaluationIterator) { if (signal?.aborted) throw new AbortError(); @@ -158,7 +325,10 @@ export class LlamaChatSession { ? 
this._ctx.decode(Uint32Array.from(skippedChunksQueue)) : ""; - const [queuedTextBeforeStopString] = skippedChunksText.split(stopString); + let [queuedTextBeforeStopString] = skippedChunksText.split(stopString); + + if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) + queuedTextBeforeStopString = queuedTextBeforeStopString.trimEnd(); if (queuedTextBeforeStopString.length > 0) { const beforeStopStringTokens: Token[] = Array.from(this._ctx.encode(queuedTextBeforeStopString)); @@ -176,7 +346,9 @@ export class LlamaChatSession { } // if the token is unknown, it means it's not complete character - if (tokenStr === UNKNOWN_UNICODE_CHAR || skipTokenEvent) { + if (tokenStr === UNKNOWN_UNICODE_CHAR || skipTokenEvent || ( + (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) && tokenStr.trim() === "" + )) { skippedChunksQueue.push(chunk); continue; } @@ -194,8 +366,13 @@ export class LlamaChatSession { break; } + let resText = this._ctx.decode(Uint32Array.from(res)); + + if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) + resText = resText.trimEnd(); + return { - text: this._ctx.decode(Uint32Array.from(res)), + text: resText, stopString: null, stopStringSuffix: null }; diff --git a/src/llamaEvaluator/LlamaContext.ts b/src/llamaEvaluator/LlamaContext.ts index ef2678e9..b7d85b82 100644 --- a/src/llamaEvaluator/LlamaContext.ts +++ b/src/llamaEvaluator/LlamaContext.ts @@ -2,26 +2,107 @@ import {removeNullFields} from "../utils/removeNullFields.js"; import {Token} from "../types.js"; import {LLAMAContext} from "./LlamaBins.js"; import {LlamaModel} from "./LlamaModel.js"; +import {LlamaGrammarEvaluationState} from "./LlamaGrammarEvaluationState.js"; import {LlamaGrammar} from "./LlamaGrammar.js"; export type LlamaContextOptions = { model: LlamaModel, + prependBos?: boolean, + + /** + * @deprecated use the `grammar` option on `LlamaChatSession`'s `prompt` function + * or the `grammarEvaluationState` option on `LlamaContext`'s `evaluate` function instead + * @hidden + */ grammar?: LlamaGrammar, - prependBos?: boolean + + /** If null, a random seed will be used */ + seed?: number | null, + + /** text context size */ + contextSize?: number, + + /** prompt processing batch size */ + batchSize?: number, + + /** use fp16 for KV cache */ + f16Kv?: boolean, + + /** the llama_eval() call computes all logits, not just the last one */ + logitsAll?: boolean, + + /** embedding mode only */ + embedding?: boolean + + /** number of threads to use to evaluate tokens */ + threads?: number, +}; + +export type LlamaContextRepeatPenalty = { + /** Tokens to lower the predication probability of to be the next predicted token */ + punishTokens: Uint32Array | (() => Uint32Array), + + /** + * The relative amount to lower the probability of the tokens in `punishTokens` by + * Defaults to `1.1`. + * Set to `1` to disable. + */ + penalty?: number, + + /** + * For n time a token is in the `punishTokens` array, lower its probability by `n * frequencyPenalty` + * Disabled by default (`0`). + * Set to a value between `0` and `1` to enable. + */ + frequencyPenalty?: number, + + /** + * Lower the probability of all the tokens in the `punishTokens` array by `presencePenalty` + * Disabled by default (`0`). + * Set to a value between `0` and `1` to enable. 
+ */ + presencePenalty?: number }; export class LlamaContext { + private readonly _model: LlamaModel; private readonly _ctx: LLAMAContext; private readonly _prependBos: boolean; private _prependTokens: Token[]; - public constructor({model, grammar, prependBos = true}: LlamaContextOptions) { + /** @internal */ + public readonly _chatGrammar?: LlamaGrammar; + + + /** + * @param {LlamaContextOptions} options + */ + public constructor({ + model, + prependBos = true, + grammar, + seed = model._contextOptions.seed, + contextSize = model._contextOptions.contextSize, + batchSize = model._contextOptions.batchSize, + f16Kv = model._contextOptions.f16Kv, + logitsAll = model._contextOptions.logitsAll, + embedding = model._contextOptions.embedding, + threads = model._contextOptions.threads + }: LlamaContextOptions) { + this._model = model; this._ctx = new LLAMAContext(model._model, removeNullFields({ - grammar: grammar?._grammar + seed: seed != null ? Math.max(-1, seed) : undefined, + contextSize, + batchSize, + f16Kv, + logitsAll, + embedding, + threads })); this._prependBos = prependBos; this._prependTokens = []; + this._chatGrammar = grammar; if (prependBos) { this._prependTokens.unshift(this._ctx.tokenBos()); @@ -125,7 +206,21 @@ export class LlamaContext { return this._ctx.getContextSize(); } - public async *evaluate(tokens: Uint32Array): AsyncGenerator { + /** + * @param {Uint32Array} tokens + * @param {object} options + * @returns {AsyncGenerator} + */ + public async *evaluate(tokens: Uint32Array, { + temperature = this._model._evaluationOptions.temperature, + topK = this._model._evaluationOptions.topK, + topP = this._model._evaluationOptions.topP, + grammarEvaluationState, + repeatPenalty + }: { + temperature?: number, topK?: number, topP?: number, grammarEvaluationState?: LlamaGrammarEvaluationState, + repeatPenalty?: LlamaContextRepeatPenalty + } = {}): AsyncGenerator { let evalTokens = tokens; if (this._prependTokens.length > 0) { @@ -135,10 +230,24 @@ export class LlamaContext { this._prependTokens = []; } + if (evalTokens.length === 0) + return; + // eslint-disable-next-line no-constant-condition while (true) { // Evaluate to get the next token. - const nextToken: Token = await this._ctx.eval(evalTokens); + const nextToken: Token = await this._ctx.eval(evalTokens, removeNullFields({ + temperature, + topK, + topP, + repeatPenalty: repeatPenalty?.penalty, + repeatPenaltyTokens: repeatPenalty?.punishTokens instanceof Function + ? repeatPenalty.punishTokens() + : repeatPenalty?.punishTokens, + repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty, + repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty, + grammarEvaluationState: grammarEvaluationState?._state + })); // the assistant finished answering if (nextToken === this._ctx.tokenEos()) diff --git a/src/llamaEvaluator/LlamaGrammar.ts b/src/llamaEvaluator/LlamaGrammar.ts index 523f53ec..33100054 100644 --- a/src/llamaEvaluator/LlamaGrammar.ts +++ b/src/llamaEvaluator/LlamaGrammar.ts @@ -10,23 +10,48 @@ export type LlamaGrammarOptions = { /** print the grammar to stdout */ printGrammar?: boolean + + /** Consider any of these texts as EOS for the generated out. Only supported by `LlamaChatSession` */ + stopStrings?: string[], + + /** Trim whitespace from the end of the generated text. 
Only supported by `LlamaChatSession` */ + trimWhitespaceSuffix?: boolean }; export class LlamaGrammar { /** @internal */ public readonly _grammar: LLAMAGrammar; + private readonly _stopStrings: readonly string[]; + private readonly _trimWhitespaceSuffix: boolean; /** - * GBNF files are supported. - * More info here: https://github.com/ggerganov/llama.cpp/blob/f5fe98d11bdf9e7797bcfb05c0c3601ffc4b9d26/grammars/README.md + * > GBNF files are supported. + * > More info here: [github:ggerganov/llama.cpp:grammars/README.md]( + * > https://github.com/ggerganov/llama.cpp/blob/f5fe98d11bdf9e7797bcfb05c0c3601ffc4b9d26/grammars/README.md) * @param {object} options * @param {string} options.grammar - GBNF grammar + * @param {string[]} [options.stopStrings] - Consider any of these texts as EOS for the generated out. + * Only supported by `LlamaChatSession` + * @param {boolean} [options.trimWhitespaceSuffix] - Trim whitespace from the end of the generated text. + * Only supported by `LlamaChatSession` * @param {boolean} [options.printGrammar] - print the grammar to stdout */ - public constructor({grammar, printGrammar = false}: LlamaGrammarOptions) { + public constructor({ + grammar, stopStrings = [], trimWhitespaceSuffix = false, printGrammar = false + }: LlamaGrammarOptions) { this._grammar = new LLAMAGrammar(grammar, { printGrammar }); + this._stopStrings = stopStrings ?? []; + this._trimWhitespaceSuffix = trimWhitespaceSuffix; + } + + public get stopStrings() { + return this._stopStrings; + } + + public get trimWhitespaceSuffix() { + return this._trimWhitespaceSuffix; } public static async getFor(type: "json" | "list" | "arithmetic" | "japanese" | "chess") { @@ -36,7 +61,11 @@ export class LlamaGrammar { if (await fs.pathExists(grammarFile)) { const grammar = await fs.readFile(grammarFile, "utf8"); - return new LlamaGrammar({grammar}); + return new LlamaGrammar({ + grammar, + stopStrings: ["\n".repeat(10)], // this is a workaround for the model not stopping to generate text, + trimWhitespaceSuffix: true + }); } throw new Error(`Grammar file for type "${type}" was not found in "${grammarsFolder}"`); diff --git a/src/llamaEvaluator/LlamaGrammarEvaluationState.ts b/src/llamaEvaluator/LlamaGrammarEvaluationState.ts new file mode 100644 index 00000000..42a90c2e --- /dev/null +++ b/src/llamaEvaluator/LlamaGrammarEvaluationState.ts @@ -0,0 +1,23 @@ +import {LLAMAGrammarEvaluationState} from "./LlamaBins.js"; +import {LlamaGrammar} from "./LlamaGrammar.js"; + + +export type LlamaGrammarEvaluationStateOptions = { + grammar: LlamaGrammar, +}; + +export class LlamaGrammarEvaluationState { + /** @internal */ + public readonly _state: LLAMAGrammarEvaluationState; + + /** + * Grammar evaluation state is used to track the model response to determine the next allowed characters for the model to generate. + * Create a new grammar evaluation state for every response you generate with the model. + * This is only needed when using the `LlamaContext` class directly, as `LlamaChatSession` already handles this for you. 
+ * @param {object} options + * @param {LlamaGrammar} options.grammar + */ + public constructor({grammar}: LlamaGrammarEvaluationStateOptions) { + this._state = new LLAMAGrammarEvaluationState(grammar._grammar); + } +} diff --git a/src/llamaEvaluator/LlamaModel.ts b/src/llamaEvaluator/LlamaModel.ts index e592be27..d7b2e963 100644 --- a/src/llamaEvaluator/LlamaModel.ts +++ b/src/llamaEvaluator/LlamaModel.ts @@ -8,22 +8,35 @@ export type LlamaModelOptions = { /** path to the model on the filesystem */ modelPath: string, - /** If null, a random seed will be used */ + /** + * If null, a random seed will be used + * @deprecated use the `seed` option on `LlamaContext` instead + * @hidden + * */ seed?: number | null, - /** text context size */ + /** + * text context size + * @deprecated use the `contextSize` option on `LlamaContext` instead + * @hidden + * */ contextSize?: number, - /** prompt processing batch size */ + /** + * prompt processing batch size + * @deprecated use the `batchSize` option on `LlamaContext` instead + * @hidden + * */ batchSize?: number, /** number of layers to store in VRAM */ gpuLayers?: number, - /** if true, reduce VRAM usage at the cost of performance */ - lowVram?: boolean, - - /** number of threads to use to evaluate tokens */ + /** + * number of threads to use to evaluate tokens + * @deprecated use the `threads` option on `LlamaContext` instead + * @hidden + * */ threads?: number, /** @@ -35,6 +48,8 @@ export type LlamaModelOptions = { * At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run. * * Set to `0` to disable. + * @deprecated use the `temperature` option on `LlamaChatSession`'s `prompt` function or `LlamaContext`'s `evaluate` function instead + * @hidden */ temperature?: number, @@ -44,6 +59,8 @@ export type LlamaModelOptions = { * Set to `0` to disable (which uses the full vocabulary). * * Only relevant when `temperature` is set to a value greater than 0. + * @deprecated use the `topK` option on `LlamaChatSession`'s `prompt` function or `LlamaContext`'s `evaluate` function instead + * @hidden * */ topK?: number, @@ -54,13 +71,23 @@ export type LlamaModelOptions = { * Set to `1` to disable. * * Only relevant when `temperature` is set to a value greater than `0`. 
- * */ + * @deprecated use the `topP` option on `LlamaChatSession`'s `prompt` function or `LlamaContext`'s `evaluate` function instead + * @hidden + */ topP?: number, - /** use fp16 for KV cache */ + /** + * use fp16 for KV cache + * @deprecated use the `f16Kv` option on `LlamaContext` instead + * @hidden + */ f16Kv?: boolean, - /** the llama_eval() call computes all logits, not just the last one */ + /** + * the llama_eval() call computes all logits, not just the last one + * @deprecated use the `logitsAll` option on `LlamaContext` instead + * @hidden + */ logitsAll?: boolean, /** only load the vocabulary, no weights */ @@ -72,7 +99,11 @@ export type LlamaModelOptions = { /** force system to keep model in RAM */ useMlock?: boolean, - /** embedding mode only */ + /** + * embedding mode only + * @deprecated use the `embedding` option on `LlamaContext` instead + * @hidden + */ embedding?: boolean }; @@ -80,16 +111,34 @@ export class LlamaModel { /** @internal */ public readonly _model: LLAMAModel; + /** @internal */ + public readonly _contextOptions: { + seed: LlamaModelOptions["seed"], + contextSize: LlamaModelOptions["contextSize"], + batchSize: LlamaModelOptions["batchSize"], + f16Kv: LlamaModelOptions["f16Kv"], + logitsAll: LlamaModelOptions["logitsAll"], + embedding: LlamaModelOptions["embedding"], + threads: LlamaModelOptions["threads"] + }; + + /** @internal */ + public readonly _evaluationOptions: { + temperature: LlamaModelOptions["temperature"], + topK: LlamaModelOptions["topK"], + topP: LlamaModelOptions["topP"] + }; + /** - * options source: - * https://github.com/ggerganov/llama.cpp/blob/b5ffb2849d23afe73647f68eec7b68187af09be6/llama.h#L102 (struct llama_context_params) + * > options source: + * > [github:ggerganov/llama.cpp/llama.h]( + * > https://github.com/ggerganov/llama.cpp/blob/b5ffb2849d23afe73647f68eec7b68187af09be6/llama.h#L102) (`struct llama_context_params`) * @param {object} options * @param {string} options.modelPath - path to the model on the filesystem * @param {number | null} [options.seed] - If null, a random seed will be used * @param {number} [options.contextSize] - text context size * @param {number} [options.batchSize] - prompt processing batch size * @param {number} [options.gpuLayers] - number of layers to store in VRAM - * @param {boolean} [options.lowVram] - if true, reduce VRAM usage at the cost of performance * @param {number} [options.threads] - number of threads to use to evaluate tokens * @param {number} [options.temperature] - Temperature is a hyperparameter that controls the randomness of the generated text. * It affects the probability distribution of the model's output tokens. @@ -120,25 +169,30 @@ export class LlamaModel { */ public constructor({ modelPath, seed = null, contextSize = 1024 * 4, batchSize, gpuLayers, - lowVram, threads = 6, temperature = 0, topK = 40, topP = 0.95, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding + threads = 6, temperature = 0, topK = 40, topP = 0.95, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding }: LlamaModelOptions) { this._model = new LLAMAModel(path.resolve(process.cwd(), modelPath), removeNullFields({ - seed: seed != null ? 
Math.max(-1, seed) : undefined, - contextSize, - batchSize, gpuLayers, - lowVram, - threads, - temperature, - topK, - topP, - f16Kv, - logitsAll, vocabOnly, useMmap, - useMlock, - embedding + useMlock })); + + this._contextOptions = { + seed, + contextSize, + batchSize, + f16Kv, + logitsAll, + embedding, + threads + }; + + this._evaluationOptions = { + temperature, + topK, + topP + }; } public static get systemInfo() { diff --git a/src/utils/compileLLamaCpp.ts b/src/utils/compileLLamaCpp.ts index 402ae64a..5064c726 100644 --- a/src/utils/compileLLamaCpp.ts +++ b/src/utils/compileLLamaCpp.ts @@ -2,7 +2,7 @@ import path from "path"; import {fileURLToPath} from "url"; import process from "process"; import fs from "fs-extra"; -import {customCmakeOptionsEnvVarPrefix, llamaCppDirectory, llamaDirectory} from "../config.js"; +import {customCmakeOptionsEnvVarPrefix, llamaCppDirectory, llamaDirectory, llamaToolchainsDirectory} from "../config.js"; import {clearLlamaBuild} from "./clearLlamaBuild.js"; import {setUsedBinFlag} from "./usedBinFlag.js"; import {spawnCommand} from "./spawnCommand.js"; @@ -22,6 +22,7 @@ export async function compileLlamaCpp({ } const cmakePathArgs = await getCmakePathArgs(); + const toolchainFile = await getToolchainFileForArch(arch); const runtimeVersion = nodeTarget.startsWith("v") ? nodeTarget.slice("v".length) : nodeTarget; const cmakeCustomOptions = []; @@ -41,6 +42,9 @@ export async function compileLlamaCpp({ if (process.env.LLAMA_HIPBLAS === "1") cmakeCustomOptions.push("LLAMA_HIPBLAS=1"); if (process.env.LLAMA_CLBLAST === "1") cmakeCustomOptions.push("LLAMA_CLBLAST=1"); + if (toolchainFile != null) + cmakeCustomOptions.push("CMAKE_TOOLCHAIN_FILE=" + toolchainFile); + for (const key in process.env) { if (key.startsWith(customCmakeOptionsEnvVarPrefix)) { const option = key.slice(customCmakeOptionsEnvVarPrefix.length); @@ -61,7 +65,7 @@ export async function compileLlamaCpp({ ); const binFilesDirPath = path.join(llamaDirectory, "build", "llama.cpp", "bin"); - const compiledResultDirPath = await getCompiledResultDir(); + const compiledResultDirPath = await getCompiledResultDir(true); if (await fs.pathExists(binFilesDirPath)) { const files = await fs.readdir(binFilesDirPath); @@ -89,7 +93,11 @@ export async function compileLlamaCpp({ } export async function getCompiledLlamaCppBinaryPath() { - const compiledResultDirPath = await getCompiledResultDir(); + const compiledResultDirPath = await getCompiledResultDir(false); + + if (compiledResultDirPath == null) + return null; + const modulePath = path.join(compiledResultDirPath, "llama-addon.node"); if (await fs.pathExists(modulePath)) @@ -98,14 +106,19 @@ export async function getCompiledLlamaCppBinaryPath() { return null; } -async function getCompiledResultDir() { +async function getCompiledResultDir(failIfNotFound?: false): Promise; +async function getCompiledResultDir(failIfNotFound: true): Promise; +async function getCompiledResultDir(failIfNotFound: boolean = false) { if (await fs.pathExists(path.join(llamaDirectory, "build", "Release"))) { return path.join(llamaDirectory, "build", "Release"); } else if (await fs.pathExists(path.join(llamaDirectory, "build", "Debug"))) { return path.join(llamaDirectory, "build", "Debug"); } - throw new Error("Could not find Release or Debug directory"); + if (failIfNotFound) + throw new Error("Could not find Release or Debug directory"); + + return null; } async function getCmakePathArgs() { @@ -119,3 +132,20 @@ async function getCmakePathArgs() { return ["--cmake-path", cmakePath]; } + 
+async function getToolchainFileForArch(targetArch: string) { + if (process.arch === targetArch) + return null; + + const platform = process.platform; + const hostArch = process.arch; + + const toolchainFilename = `${platform}.host-${hostArch}.target-${targetArch}.cmake`; + + const filePath = path.join(llamaToolchainsDirectory, toolchainFilename); + + if (await fs.pathExists(filePath)) + return filePath; + + return null; +} diff --git a/src/utils/getBin.ts b/src/utils/getBin.ts index 24e60ab3..3c86fca6 100644 --- a/src/utils/getBin.ts +++ b/src/utils/getBin.ts @@ -99,35 +99,40 @@ export type LlamaCppNodeModule = { LLAMAModel: LLAMAModel, LLAMAContext: LLAMAContext, LLAMAGrammar: LLAMAGrammar, + LLAMAGrammarEvaluationState: LLAMAGrammarEvaluationState, systemInfo(): string }; export type LLAMAModel = { new (modelPath: string, params: { + gpuLayers?: number, + vocabOnly?: boolean, + useMmap?: boolean, + useMlock?: boolean + }): LLAMAModel +}; + +export type LLAMAContext = { + new (model: LLAMAModel, params: { seed?: number, contextSize?: number, batchSize?: number, - gpuCores?: number, - lowVram?: boolean, f16Kv?: boolean, logitsAll?: boolean, - vocabOnly?: boolean, - useMmap?: boolean, - useMlock?: boolean, embedding?: boolean, threads?: number, - temperature?: number, - topK?: number, - topP?: number - }): LLAMAModel -}; - -export type LLAMAContext = { - new (model: LLAMAModel, params?: { - grammar?: LLAMAGrammar }): LLAMAContext, encode(text: string): Uint32Array, - eval(tokens: Uint32Array): Promise, + eval(tokens: Uint32Array, options?: { + temperature?: number, + topK?: number, + topP?: number, + repeatPenalty?: number, + repeatPenaltyTokens?: Uint32Array, + repeatPenaltyPresencePenalty?: number, // alpha_presence + repeatPenaltyFrequencyPenalty?: number, // alpha_frequency + grammarEvaluationState?: LLAMAGrammarEvaluationState + }): Promise, decode(tokens: Uint32Array): string, tokenBos(): number, tokenEos(): number, @@ -141,3 +146,7 @@ export type LLAMAGrammar = { printGrammar?: boolean, }): LLAMAGrammar }; + +export type LLAMAGrammarEvaluationState = { + new (grammar: LLAMAGrammar): LLAMAGrammarEvaluationState +};
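
A minimal usage sketch of the new per-prompt options this diff adds to `LlamaChatSession.prompt()` (`temperature`, `topK`, `topP`, `grammar`, `trimWhitespaceSuffix`, `repeatPenalty`). The package name `node-llama-cpp`, the model path and the concrete option values are assumptions for illustration; only the option names and shapes come from the diff:

```ts
import {LlamaModel, LlamaContext, LlamaChatSession, LlamaGrammar} from "node-llama-cpp";

const model = new LlamaModel({modelPath: "./models/example-model.gguf"}); // placeholder path
const context = new LlamaContext({model, contextSize: 4096, threads: 6});
const session = new LlamaChatSession({context});

const grammar = await LlamaGrammar.getFor("json"); // bundled GBNF grammar

const answer = await session.prompt("Describe a cat as a JSON object", {
    temperature: 0.8,
    topK: 40,
    topP: 0.95,
    grammar,                    // constrain the output to the grammar
    trimWhitespaceSuffix: true, // drop trailing whitespace from the result
    repeatPenalty: {
        lastTokens: 64,         // penalize repetition of the last 64 generated tokens
        penalty: 1.1,
        frequencyPenalty: 0.2,
        presencePenalty: 0.2
    }
});

console.log(answer);
```

Passing `repeatPenalty: false` disables the penalty entirely, which matches the `repeatPenalty == false ? {lastTokens: 0} : repeatPenalty` handling inside `prompt()` above.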
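
When driving `LlamaContext.evaluate()` directly instead of `LlamaChatSession`, the docs added in this diff say a fresh `LlamaGrammarEvaluationState` should be created per generated response. A sketch under the same assumptions as the previous example; the public `encode`/`decode` helpers are assumed to behave as `LlamaChatSession` uses them, and the prompt text, token limit and penalty values are illustrative:

```ts
import {LlamaModel, LlamaContext, LlamaGrammar, LlamaGrammarEvaluationState} from "node-llama-cpp";

const model = new LlamaModel({modelPath: "./models/example-model.gguf"}); // placeholder path
const context = new LlamaContext({model});
const grammar = await LlamaGrammar.getFor("json");

// one evaluation state per generated response
const grammarEvaluationState = new LlamaGrammarEvaluationState({grammar});

const generated: number[] = [];

for await (const token of context.evaluate(context.encode("A cat as JSON: "), {
    temperature: 0.8,
    grammarEvaluationState,
    repeatPenalty: {
        punishTokens: () => Uint32Array.from(generated.slice(-64)), // re-evaluated on every step
        penalty: 1.1
    }
})) {
    generated.push(token);

    if (generated.length >= 128) // illustrative hard limit
        break;
}

console.log(context.decode(Uint32Array.from(generated)));
```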
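
Several `LlamaModel` options are deprecated here and move to `LlamaContext` (or to the `prompt()`/`evaluate()` call), while still being honoured as defaults for contexts created from that model. A migration sketch using the same imports and placeholder values as the first example:

```ts
// Before (now deprecated, but still used as defaults by contexts created from this model):
const legacyModel = new LlamaModel({
    modelPath: "./models/example-model.gguf", // placeholder path
    contextSize: 4096, batchSize: 512, threads: 6,
    temperature: 0.8, topK: 40, topP: 0.95
});

// After this change:
const model = new LlamaModel({modelPath: "./models/example-model.gguf", gpuLayers: 32});
const context = new LlamaContext({model, contextSize: 4096, batchSize: 512, threads: 6});
const session = new LlamaChatSession({context});
const answer = await session.prompt("Hello", {temperature: 0.8, topK: 40, topP: 0.95});
```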
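
For the cross-compilation support, the new `getToolchainFileForArch()` only applies when the requested arch differs from the host arch. A trace of the lookup for one example host/target pair; the concrete platform and arch values below are examples, not part of the diff:

```ts
// Example: building the linux-arm64 binary on a linux-x64 host.
const platform = "linux";   // process.platform on the build machine
const hostArch = "x64";     // process.arch on the build machine
const targetArch = "arm64"; // arch requested for this build

const toolchainFilename = `${platform}.host-${hostArch}.target-${targetArch}.cmake`;
// -> "linux.host-x64.target-arm64.cmake"
// If this file exists under llama/toolchains, compileLlamaCpp() appends
// "CMAKE_TOOLCHAIN_FILE=<that file>" to the custom CMake options;
// if it does not exist, or host and target arch match, no toolchain file is used.
console.log(toolchainFilename);
```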