diff --git a/LLama.Unittest/TemplateTests.cs b/LLama.Unittest/TemplateTests.cs
index 13f33198e..32d6fa129 100644
--- a/LLama.Unittest/TemplateTests.cs
+++ b/LLama.Unittest/TemplateTests.cs
@@ -1,15 +1,20 @@
 using System.Text;
 using LLama.Common;
+using LLama.Extensions;
+using LLama.Native;
+using Xunit.Abstractions;
 
 namespace LLama.Unittest;
 
 public sealed class TemplateTests
     : IDisposable
 {
+    private readonly ITestOutputHelper _output;
     private readonly LLamaWeights _model;
 
-    public TemplateTests()
+    public TemplateTests(ITestOutputHelper output)
     {
+        _output = output;
         var @params = new ModelParams(Constants.GenerativeModelPath)
         {
             ContextSize = 1,
@@ -260,6 +265,37 @@ public void EndOfTurnToken_ReturnsExpected()
 
     [Fact]
     public void EndOfSpeechToken_ReturnsExpected()
     {
+        _output.WriteLine($"EOS: {_model.Tokens.EOS}");
+        _output.WriteLine($"EOT: {_model.Tokens.EOT}");
+        _output.WriteLine($"BOS: {_model.Tokens.BOS}");
+
+        var eosStr = ConvertTokenToString(_model.Tokens.EOS!.Value);
+        _output.WriteLine(eosStr ?? "null");
+
         Assert.Equal("</s>", _model.Tokens.EndOfSpeechToken);
     }
+
+    private string? ConvertTokenToString(LLamaToken token)
+    {
+        _output.WriteLine($"ConvertTokenToString: {token}");
+
+        const int buffSize = 32;
+        Span<byte> buff = stackalloc byte[buffSize];
+        var tokenLength = _model.NativeHandle.TokenToSpan(token, buff, 0, true);
+
+        _output.WriteLine($"tokenLength = {tokenLength}");
+        if (tokenLength <= 0)
+            return null;
+
+        // if the original buffer wasn't large enough, create a new one on the
+        // heap (a stackalloc result can't be reassigned from a nested scope)
+        _output.WriteLine($"tokenLength = {tokenLength}, buffSize = {buffSize}");
+        if (tokenLength > buffSize)
+        {
+            buff = new byte[(int)tokenLength];
+            _ = _model.NativeHandle.TokenToSpan(token, buff, 0, true);
+        }
+
+        var slice = buff.Slice(0, (int)tokenLength);
+        return Encoding.UTF8.GetStringFromSpan(slice);
+    }
 }
diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs
index f2b92b245..3d75a0cbc 100644
--- a/LLama.Web/Common/ModelOptions.cs
+++ b/LLama.Web/Common/ModelOptions.cs
@@ -118,5 +118,8 @@ public class ModelOptions
         /// <inheritdoc />
         public LLamaPoolingType PoolingType { get; set; }
+
+        /// <inheritdoc />
+        public LLamaAttentionType AttentionType { get; set; } = LLamaAttentionType.Unspecified;
     }
 }
\ No newline at end of file
diff --git a/LLama/Abstractions/IContextParams.cs b/LLama/Abstractions/IContextParams.cs
index 8aa7d52b7..a3399f925 100644
--- a/LLama/Abstractions/IContextParams.cs
+++ b/LLama/Abstractions/IContextParams.cs
@@ -123,4 +123,9 @@ public interface IContextParams
     /// How to pool (sum) embedding results by sequence id (ignored if no pooling layer)
     /// </summary>
     LLamaPoolingType PoolingType { get; }
+
+    /// <summary>
+    /// Attention type to use for embeddings
+    /// </summary>
+    LLamaAttentionType AttentionType { get; }
 }
\ No newline at end of file
diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs
index 28b1ef4e0..b93959a49 100644
--- a/LLama/Common/ModelParams.cs
+++ b/LLama/Common/ModelParams.cs
@@ -109,6 +109,9 @@ public record ModelParams
     /// <inheritdoc />
     public LLamaPoolingType PoolingType { get; set; } = LLamaPoolingType.Unspecified;
 
+    /// <inheritdoc />
+    public LLamaAttentionType AttentionType { get; set; } = LLamaAttentionType.Unspecified;
+
     /// <inheritdoc />
     public bool VocabOnly { get; set; }
diff --git a/LLama/Extensions/IContextParamsExtensions.cs b/LLama/Extensions/IContextParamsExtensions.cs
index 29ca30bd4..466a5a83c 100644
--- a/LLama/Extensions/IContextParamsExtensions.cs
+++ b/LLama/Extensions/IContextParamsExtensions.cs
@@ -52,6 +52,7 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
             result.offload_kqv = !@params.NoKqvOffload;
             result.flash_attention = @params.FlashAttention;
             result.llama_pooling_type = @params.PoolingType;
+            result.attention_type = @params.AttentionType;
 
             result.n_threads = Threads(@params.Threads);
             result.n_threads_batch = Threads(@params.BatchThreads);
diff --git a/LLama/LLamaQuantizer.cs b/LLama/LLamaQuantizer.cs
index a14158878..2ddbe2273 100644
--- a/LLama/LLamaQuantizer.cs
+++ b/LLama/LLamaQuantizer.cs
@@ -1,4 +1,4 @@
-using LLama.Native;
+using LLama.Native;
 using System;
 using System.Collections.Generic;
 
@@ -66,49 +66,49 @@ private static bool ValidateFtype(LLamaFtype ftype)
 
             switch (ftype)
             {
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_F16:
-                case LLamaFtype.LLAMA_FTYPE_ALL_F32:
+                case LLamaFtype.MOSTLY_Q4_0:
+                case LLamaFtype.MOSTLY_Q4_1:
+                case LLamaFtype.MOSTLY_Q5_0:
+                case LLamaFtype.MOSTLY_Q5_1:
+                case LLamaFtype.MOSTLY_Q8_0:
+                case LLamaFtype.MOSTLY_F16:
+                case LLamaFtype.ALL_F32:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q2_K_S:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q2_K:
+                case LLamaFtype.MOSTLY_Q2_K_S:
+                case LLamaFtype.MOSTLY_Q2_K:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_K_XS:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_S:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_M:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_L:
+                case LLamaFtype.MOSTLY_IQ3_K_XS:
+                case LLamaFtype.MOSTLY_Q3_K_S:
+                case LLamaFtype.MOSTLY_Q3_K_M:
+                case LLamaFtype.MOSTLY_Q3_K_L:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_S:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_M:
+                case LLamaFtype.MOSTLY_Q4_K_S:
+                case LLamaFtype.MOSTLY_Q4_K_M:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_S:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_M:
+                case LLamaFtype.MOSTLY_Q5_K_S:
+                case LLamaFtype.MOSTLY_Q5_K_M:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q6_K:
+                case LLamaFtype.MOSTLY_Q6_K:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_XXS:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_XS:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_S:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_M:
+                case LLamaFtype.MOSTLY_IQ2_XXS:
+                case LLamaFtype.MOSTLY_IQ2_XS:
+                case LLamaFtype.MOSTLY_IQ2_S:
+                case LLamaFtype.MOSTLY_IQ2_M:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_XXS:
+                case LLamaFtype.MOSTLY_IQ3_XXS:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ1_S:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ1_M:
+                case LLamaFtype.MOSTLY_IQ1_S:
+                case LLamaFtype.MOSTLY_IQ1_M:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ4_NL:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ4_XS:
+                case LLamaFtype.MOSTLY_IQ4_NL:
+                case LLamaFtype.MOSTLY_IQ4_XS:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_S:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_M:
+                case LLamaFtype.MOSTLY_IQ3_S:
+                case LLamaFtype.MOSTLY_IQ3_M:
                     return true;
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
-                case LLamaFtype.LLAMA_FTYPE_GUESSED:
+                case LLamaFtype.MOSTLY_Q4_1_SOME_F16:
+                case LLamaFtype.GUESSED:
                 default:
                     return false;
             }
diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj
index a65ddcabc..1562e8434 100644
--- a/LLama/LLamaSharp.csproj
+++ b/LLama/LLamaSharp.csproj
@@ -53,7 +53,7 @@
-    1c5eba6f8e62
+    368645698ab648e390dc
diff --git a/LLama/Native/LLamaAttentionType.cs b/LLama/Native/LLamaAttentionType.cs
new file mode 100644
index 000000000..543f89b47
--- /dev/null
+++ b/LLama/Native/LLamaAttentionType.cs
@@ -0,0 +1,8 @@
+namespace LLama.Native;
+
+public enum LLamaAttentionType
+{
+    Unspecified = -1,
+    Causal = 0,
+    NonCausal = 1,
+}
\ No newline at end of file
diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs
index 0135da95d..e1d1417c4 100644
--- a/LLama/Native/LLamaContextParams.cs
+++ b/LLama/Native/LLamaContextParams.cs
@@ -65,6 +65,11 @@ public struct LLamaContextParams
         /// whether to pool (sum) embedding results by sequence id
         /// </summary>
         public LLamaPoolingType llama_pooling_type;
+
+        /// <summary>
+        /// Attention type to use for embeddings
+        /// </summary>
+        public LLamaAttentionType attention_type;
 
         /// <summary>
         /// RoPE base frequency, 0 = from model
diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs
index bc4b5c4cb..15216b71b 100644
--- a/LLama/Native/LLamaFtype.cs
+++ b/LLama/Native/LLamaFtype.cs
@@ -1,4 +1,4 @@
-namespace LLama.Native
+namespace LLama.Native
 {
     /// <summary>
     /// Supported model file types
@@ -9,176 +9,191 @@ public enum LLamaFtype
         /// <summary>
        /// All f32
        /// </summary>
        /// <remarks>Benchmark@7B: 26GB</remarks>
-        LLAMA_FTYPE_ALL_F32 = 0,
+        ALL_F32 = 0,
 
        /// <summary>
        /// Mostly f16
        /// </summary>
        /// <remarks>Benchmark@7B: 13GB</remarks>
-        LLAMA_FTYPE_MOSTLY_F16 = 1,
+        MOSTLY_F16 = 1,
 
        /// <summary>
        /// Mostly 8 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 6.7GB, +0.0004ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,
+        MOSTLY_Q8_0 = 7,
 
        /// <summary>
        /// Mostly 4 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 3.50GB, +0.2499 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,
+        MOSTLY_Q4_0 = 2,
 
        /// <summary>
        /// Mostly 4 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 3.90GB, +0.1846 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,
+        MOSTLY_Q4_1 = 3,
 
        /// <summary>
        /// Mostly 4 bit, tok_embeddings.weight and output.weight are f16
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,
+        MOSTLY_Q4_1_SOME_F16 = 4,
 
        /// <summary>
        /// Mostly 5 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 4.30GB @ 7B tokens, +0.0796 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,
+        MOSTLY_Q5_0 = 8,
 
        /// <summary>
        /// Mostly 5 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 4.70GB, +0.0415 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,
+        MOSTLY_Q5_1 = 9,
 
        /// <summary>
        /// K-Quant 2 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 2.67GB @ 7B parameters, +0.8698 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q2_K = 10,
+        MOSTLY_Q2_K = 10,
 
        /// <summary>
        /// K-Quant 3 bit (Small)
        /// </summary>
        /// <remarks>Benchmark@7B: 2.75GB, +0.5505 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,
+        MOSTLY_Q3_K_S = 11,
 
        /// <summary>
        /// K-Quant 3 bit (Medium)
        /// </summary>
        /// <remarks>Benchmark@7B: 3.06GB, +0.2437 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,
+        MOSTLY_Q3_K_M = 12,
 
        /// <summary>
        /// K-Quant 3 bit (Large)
        /// </summary>
        /// <remarks>Benchmark@7B: 3.35GB, +0.1803 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,
+        MOSTLY_Q3_K_L = 13,
 
        /// <summary>
        /// K-Quant 4 bit (Small)
        /// </summary>
        /// <remarks>Benchmark@7B: 3.56GB, +0.1149 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,
+        MOSTLY_Q4_K_S = 14,
 
        /// <summary>
        /// K-Quant 4 bit (Medium)
        /// </summary>
        /// <remarks>Benchmark@7B: 3.80GB, +0.0535 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,
+        MOSTLY_Q4_K_M = 15,
 
        /// <summary>
        /// K-Quant 5 bit (Small)
        /// </summary>
        /// <remarks>Benchmark@7B: 4.33GB, +0.0353 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,
+        MOSTLY_Q5_K_S = 16,
 
        /// <summary>
        /// K-Quant 5 bit (Medium)
        /// </summary>
        /// <remarks>Benchmark@7B: 4.45GB, +0.0142 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,
+        MOSTLY_Q5_K_M = 17,
 
        /// <summary>
        /// K-Quant 6 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q6_K = 18,
+        MOSTLY_Q6_K = 18,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19,
+        MOSTLY_IQ2_XXS = 19,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ2_XS = 20,
+        MOSTLY_IQ2_XS = 20,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_Q2_K_S = 21,
+        MOSTLY_Q2_K_S = 21,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ3_K_XS = 22,
+        MOSTLY_IQ3_K_XS = 22,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23,
+        MOSTLY_IQ3_XXS = 23,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ1_S = 24,
+        MOSTLY_IQ1_S = 24,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ4_NL = 25,
+        MOSTLY_IQ4_NL = 25,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ3_S = 26,
+        MOSTLY_IQ3_S = 26,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ3_M = 27,
+        MOSTLY_IQ3_M = 27,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ2_S = 28,
+        MOSTLY_IQ2_S = 28,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ2_M = 29,
+        MOSTLY_IQ2_M = 29,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ4_XS = 30,
+        MOSTLY_IQ4_XS = 30,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ1_M = 31,
+        MOSTLY_IQ1_M = 31,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_BF16 = 32,
+        MOSTLY_BF16 = 32,
+
+        /// <summary>
+        /// except 1d tensors
+        /// </summary>
+        MOSTLY_Q4_0_4_4 = 33,
+
+        /// <summary>
+        /// except 1d tensors
+        /// </summary>
+        MOSTLY_Q4_0_4_8 = 34,
+
+        /// <summary>
+        /// except 1d tensors
+        /// </summary>
+        MOSTLY_Q4_0_8_8 = 35,
 
        /// <summary>
        /// File type was not specified
        /// </summary>
-        LLAMA_FTYPE_GUESSED = 1024
+        GUESSED = 1024
     }
 }
diff --git a/LLama/Native/LLamaVocabPreType.cs b/LLama/Native/LLamaVocabPreType.cs
index ef7166962..6a3c89f53 100644
--- a/LLama/Native/LLamaVocabPreType.cs
+++ b/LLama/Native/LLamaVocabPreType.cs
@@ -23,5 +23,8 @@ internal enum LLamaVocabPreType
     DBRX = 13,
     SMAUG = 14,
     PORO = 15,
-    VIKING = 16,
+    CHATGLM3 = 16,
+    CHATGLM4 = 17,
+    VIKING = 18,
+    JAIS = 19,
 }
\ No newline at end of file
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index 735888cee..8314249ec 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -223,9 +223,10 @@ public static unsafe int llama_chat_apply_template(SafeLlamaModelHandle? model,
         /// <param name="model"></param>
         /// <param name="llamaToken"></param>
         /// <param name="buffer">buffer to write string into</param>
+        /// <param name="lstrip">User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')</param>
         /// <param name="special">If true, special tokens are rendered in the output</param>
         /// <returns>The length written, or if the buffer is too small a negative that indicates the length required</returns>
-        public static int llama_token_to_piece(SafeLlamaModelHandle model, LLamaToken llamaToken, Span<byte> buffer, bool special)
+        public static int llama_token_to_piece(SafeLlamaModelHandle model, LLamaToken llamaToken, Span<byte> buffer, int lstrip, bool special)
         {
             // Handle invalid tokens
             if ((int)llamaToken < 0)
@@ -235,12 +236,12 @@ public static int llama_token_to_piece(SafeLlamaModelHandle model, LLamaToken ll
             {
                 fixed (byte* bufferPtr = buffer)
                 {
-                    return llama_token_to_piece_native(model, llamaToken, bufferPtr, buffer.Length, special);
+                    return llama_token_to_piece_native(model, llamaToken, bufferPtr, buffer.Length, lstrip, special);
                 }
             }
 
             [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_token_to_piece")]
-            static extern unsafe int llama_token_to_piece_native(SafeLlamaModelHandle model, LLamaToken llamaToken, byte* buffer, int length, [MarshalAs(UnmanagedType.U1)] bool special);
+            static extern unsafe int llama_token_to_piece_native(SafeLlamaModelHandle model, LLamaToken llamaToken, byte* buffer, int length, int lstrip, [MarshalAs(UnmanagedType.U1)] bool special);
         }
 
         /// <summary>
@@ -249,9 +250,9 @@ public static int llama_token_to_piece(SafeLlamaModelHandle model, LLamaToken ll
         /// <param name="model"></param>
         /// <param name="text"></param>
         /// <param name="text_len"></param>
-        /// <param name="tokens"></param>
+        /// <param name="tokens">The tokens pointer must be large enough to hold the resulting tokens.</param>
         /// <param name="n_max_tokens"></param>
-        /// <param name="add_special"></param>
+        /// <param name="add_special">Allow to add BOS and EOS tokens if model is configured to do so.</param>
         /// <param name="parse_special">Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space.</param>
         /// <returns>Returns the number of tokens on success, no more than n_max_tokens.
         /// Returns a negative number on failure - the number of tokens that would have been returned</returns>
@@ -259,6 +260,20 @@ public static int llama_token_to_piece(SafeLlamaModelHandle model, LLamaToken ll
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern unsafe int llama_tokenize(SafeLlamaModelHandle model, byte* text, int text_len, LLamaToken* tokens, int n_max_tokens, [MarshalAs(UnmanagedType.U1)] bool add_special, [MarshalAs(UnmanagedType.U1)] bool parse_special);
 
+        /// <summary>
+        /// Convert the provided tokens into text (inverse of llama_tokenize()).
+        /// </summary>
+        /// <param name="model"></param>
+        /// <param name="tokens"></param>
+        /// <param name="nTokens"></param>
+        /// <param name="textOut">The char pointer must be large enough to hold the resulting text.</param>
+        /// <param name="textLengthMax"></param>
+        /// <param name="removeSpecial">Allow to remove BOS and EOS tokens if model is configured to do so.</param>
+        /// <param name="unparseSpecial">If true, special tokens are rendered in the output.</param>
+        /// <returns>Returns the number of chars/bytes on success, no more than textLengthMax. Returns a negative number on failure - the number of chars/bytes that would have been returned.</returns>
+        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern unsafe int llama_detokenize(SafeLlamaModelHandle model, LLamaToken* tokens, int nTokens, byte* textOut, int textLengthMax, bool removeSpecial, bool unparseSpecial);
+
         /// <summary>
         /// Register a callback to receive llama log messages
         /// </summary>
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
index 9d5ca4ffd..40cb8f2cb 100644
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -169,6 +169,15 @@ static SafeLLamaContextHandle()
         [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
         private static extern int llama_decode(SafeLLamaContextHandle ctx, LLamaNativeBatch batch);
 
+        /// <summary>
+        /// Processes a batch of tokens with the encoder part of the encoder-decoder model. Stores the encoder output
+        /// internally for later use by the decoder cross-attention layers.
+        /// </summary>
+        /// <param name="ctx"></param>
+        /// <param name="batch"></param>
+        /// <returns>0 = success<br />&lt; 0 = error</returns>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        private static extern int llama_encode(SafeLLamaContextHandle ctx, LLamaNativeBatch batch);
+
         /// <summary>
         /// Set the number of threads used for decoding
         /// </summary>
diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
index ee5a6c552..4970a4fdf 100644
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -59,7 +59,12 @@ public sealed class SafeLlamaModelHandle
         /// <summary>
         /// Get the number of layers in this model
         /// </summary>
-        public int LayerCount => llama_n_embd(this);
+        public int LayerCount => llama_n_layers(this);
+
+        /// <summary>
+        /// Returns true if the model contains an encoder that requires llama_encode() call
+        /// </summary>
+        public bool HasEncoder => llama_model_has_encoder(this);
 
         /// <summary>
         /// Get a description of this model
@@ -387,6 +392,14 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k
         [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
         private static extern int llama_token_eot(SafeLlamaModelHandle model);
 
+        /// <summary>
+        /// For encoder-decoder models, this function returns id of the token that must be provided
+        /// to the decoder to start generating output sequence. For other models, it returns -1.
+        /// </summary>
+        /// <returns></returns>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        private static extern int llama_model_decoder_start_token(SafeLlamaModelHandle model);
+
         /// <summary>
         /// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
         /// </summary>
@@ -409,6 +422,18 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k
         [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
         private static extern LLamaTokenAttr llama_token_get_attr(SafeLlamaModelHandle model, LLamaToken token);
 
+
+        //[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        //private static extern GGMLTensor llama_get_model_tensor(SafeLlamaModelHandle model, string name);
+
+        /// <summary>
+        /// Returns true if the model contains an encoder that requires llama_encode() call
+        /// </summary>
+        /// <param name="model"></param>
+        /// <returns></returns>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        [return: MarshalAs(UnmanagedType.U1)]
+        private static extern bool llama_model_has_encoder(SafeLlamaModelHandle model);
         #endregion
 
         #region LoRA
@@ -450,11 +475,12 @@ public void ApplyLoraFromFile(string lora, float scale, string? modelBase = null
         /// </summary>
         /// <param name="token">Token to decode</param>
         /// <param name="dest">A span to attempt to write into. If this is too small nothing will be written</param>
+        /// <param name="lstrip">User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')</param>
         /// <param name="special">If true, special characters will be converted to text. If false they will be invisible.</param>
         /// <returns>The size of this token. **nothing will be written** if this is larger than `dest`</returns>
-        public uint TokenToSpan(LLamaToken token, Span<byte> dest, bool special = false)
+        public uint TokenToSpan(LLamaToken token, Span<byte> dest, int lstrip = 0, bool special = false)
         {
-            var length = NativeApi.llama_token_to_piece(this, token, dest, special);
+            var length = NativeApi.llama_token_to_piece(this, token, dest, lstrip, special);
             return (uint)Math.Abs(length);
         }
@@ -729,6 +755,12 @@ internal ModelTokens(SafeLlamaModelHandle model)
             /// </summary>
             public LLamaToken? EOT => Normalize(llama_token_eot(_model));
 
+            /// <summary>
+            /// For encoder-decoder models, this function returns id of the token that must be provided
+            /// to the decoder to start generating output sequence.
+            /// </summary>
+            public LLamaToken? DecoderStartToken => Normalize(llama_model_decoder_start_token(_model));
+
             /// <summary>
             /// Returns the string representation of this model's end_of_text token
             /// </summary>
diff --git a/LLama/StreamingTokenDecoder.cs b/LLama/StreamingTokenDecoder.cs
index 60de20769..aef643f43 100644
--- a/LLama/StreamingTokenDecoder.cs
+++ b/LLama/StreamingTokenDecoder.cs
@@ -115,7 +115,7 @@ public void Add(LLamaToken token)
         static Span<byte> TokenToBytes(ref byte[] bytes, LLamaToken token, SafeLlamaModelHandle model, bool special)
         {
             // Try to get bytes
-            var l = model.TokenToSpan(token, bytes, special);
+            var l = model.TokenToSpan(token, bytes, 0, special);
 
             // Check if the length was larger than the buffer. If so expand the buffer and try again
             if (l > bytes.Length)
diff --git a/llama.cpp b/llama.cpp
index 1c5eba6f8..368645698 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit 1c5eba6f8e628fb0a98afb27d8aaeb3b0e136451
+Subproject commit 368645698ab648e390dcd7c00a2bf60efa654f57
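
Reviewer note (not part of the patch): a minimal sketch of how the new AttentionType option is expected to flow from ModelParams into the native context. The model path "model.gguf" and the Embeddings/NonCausal settings are placeholder assumptions, not taken from this diff:

    using LLama;
    using LLama.Common;
    using LLama.Native;

    // Hypothetical usage; "model.gguf" is a placeholder path.
    var @params = new ModelParams("model.gguf")
    {
        Embeddings = true,
        // New option from this patch; ToLlamaContextParams copies it into the
        // native llama_context_params.attention_type field.
        AttentionType = LLamaAttentionType.NonCausal,
    };

    using var weights = LLamaWeights.LoadFromFile(@params);
    using var context = weights.CreateContext(@params);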
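
A second hedged sketch covering the new encoder-decoder surface (HasEncoder, DecoderStartToken). GetDecoderPrimingToken is a hypothetical helper name, and the encode/decode loop itself is out of scope here since llama_encode is private in this patch:

    using LLama;
    using LLama.Native;

    // Hypothetical helper; assumes an already-loaded LLamaWeights instance.
    static LLamaToken? GetDecoderPrimingToken(LLamaWeights weights)
    {
        // Both members below are introduced by this patch.
        if (!weights.NativeHandle.HasEncoder)
            return null;

        // For T5-style models, the decoder must be primed with this token
        // after the encoder has processed the input batch.
        return weights.Tokens.DecoderStartToken;
    }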