diff --git a/LLama.Unittest/TemplateTests.cs b/LLama.Unittest/TemplateTests.cs
index 13f33198e..32d6fa129 100644
--- a/LLama.Unittest/TemplateTests.cs
+++ b/LLama.Unittest/TemplateTests.cs
@@ -1,15 +1,20 @@
 using System.Text;
 using LLama.Common;
+using LLama.Extensions;
+using LLama.Native;
+using Xunit.Abstractions;
 
 namespace LLama.Unittest;
 
 public sealed class TemplateTests
     : IDisposable
 {
+    private readonly ITestOutputHelper _output;
     private readonly LLamaWeights _model;
 
-    public TemplateTests()
+    public TemplateTests(ITestOutputHelper output)
     {
+        _output = output;
         var @params = new ModelParams(Constants.GenerativeModelPath)
         {
             ContextSize = 1,
@@ -260,6 +265,37 @@ public void EndOfTurnToken_ReturnsExpected()
 
     [Fact]
     public void EndOfSpeechToken_ReturnsExpected()
     {
+        _output.WriteLine($"EOS: {_model.Tokens.EOS}");
+        _output.WriteLine($"EOT: {_model.Tokens.EOT}");
+        _output.WriteLine($"BOS: {_model.Tokens.BOS}");
+
+        var eosStr = ConvertTokenToString(_model.Tokens.EOS!.Value);
+        _output.WriteLine(eosStr ?? "null");
+
         Assert.Equal("</s>", _model.Tokens.EndOfSpeechToken);
     }
+
+    private string? ConvertTokenToString(LLamaToken token)
+    {
+        _output.WriteLine($"ConvertTokenToString: {token}");
+
+        const int buffSize = 32;
+        Span<byte> buff = stackalloc byte[buffSize];
+        var tokenLength = _model.NativeHandle.TokenToSpan(token, buff, 0, true);
+
+        _output.WriteLine($"tokenLength = {tokenLength}");
+        if (tokenLength <= 0)
+            return null;
+
+        // if the original buffer wasn't large enough, create a new one on the
+        // heap (a stackalloc result can't be reassigned from a nested scope)
+        _output.WriteLine($"tokenLength = {tokenLength}, buffSize = {buffSize}");
+        if (tokenLength > buffSize)
+        {
+            buff = new byte[(int)tokenLength];
+            _ = _model.NativeHandle.TokenToSpan(token, buff, 0, true);
+        }
+
+        var slice = buff.Slice(0, (int)tokenLength);
+        return Encoding.UTF8.GetStringFromSpan(slice);
+    }
 }
diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs
index f2b92b245..3d75a0cbc 100644
--- a/LLama.Web/Common/ModelOptions.cs
+++ b/LLama.Web/Common/ModelOptions.cs
@@ -118,5 +118,8 @@ public class ModelOptions
         /// <inheritdoc />
         public LLamaPoolingType PoolingType { get; set; }
+
+        /// <inheritdoc />
+        public LLamaAttentionType AttentionType { get; set; } = LLamaAttentionType.Unspecified;
     }
 }
\ No newline at end of file
diff --git a/LLama/Abstractions/IContextParams.cs b/LLama/Abstractions/IContextParams.cs
index 8aa7d52b7..a3399f925 100644
--- a/LLama/Abstractions/IContextParams.cs
+++ b/LLama/Abstractions/IContextParams.cs
@@ -123,4 +123,9 @@ public interface IContextParams
     /// How to pool (sum) embedding results by sequence id (ignored if no pooling layer)
     /// </summary>
     LLamaPoolingType PoolingType { get; }
+
+    /// <summary>
+    /// Attention type to use for embeddings
+    /// </summary>
+    LLamaAttentionType AttentionType { get; }
 }
\ No newline at end of file
diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs
index 28b1ef4e0..b93959a49 100644
--- a/LLama/Common/ModelParams.cs
+++ b/LLama/Common/ModelParams.cs
@@ -109,6 +109,9 @@ public record ModelParams
     /// <inheritdoc />
     public LLamaPoolingType PoolingType { get; set; } = LLamaPoolingType.Unspecified;
 
+    /// <inheritdoc />
+    public LLamaAttentionType AttentionType { get; set; } = LLamaAttentionType.Unspecified;
+
     /// <inheritdoc />
     public bool VocabOnly { get; set; }
diff --git a/LLama/Extensions/IContextParamsExtensions.cs b/LLama/Extensions/IContextParamsExtensions.cs
index 29ca30bd4..466a5a83c 100644
--- a/LLama/Extensions/IContextParamsExtensions.cs
+++ b/LLama/Extensions/IContextParamsExtensions.cs
@@ -52,6 +52,7 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
             result.offload_kqv = !@params.NoKqvOffload;
             result.flash_attention = @params.FlashAttention;
             result.llama_pooling_type = @params.PoolingType;
+            result.attention_type = @params.AttentionType;
 
             result.n_threads = Threads(@params.Threads);
             result.n_threads_batch = Threads(@params.BatchThreads);
diff --git a/LLama/LLamaQuantizer.cs b/LLama/LLamaQuantizer.cs
index a14158878..2ddbe2273 100644
--- a/LLama/LLamaQuantizer.cs
+++ b/LLama/LLamaQuantizer.cs
@@ -1,4 +1,4 @@
-using LLama.Native;
+using LLama.Native;
 using System;
 using System.Collections.Generic;
 
@@ -66,49 +66,49 @@ private static bool ValidateFtype(LLamaFtype ftype)
 
             switch (ftype)
             {
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_F16:
-                case LLamaFtype.LLAMA_FTYPE_ALL_F32:
+                case LLamaFtype.MOSTLY_Q4_0:
+                case LLamaFtype.MOSTLY_Q4_1:
+                case LLamaFtype.MOSTLY_Q5_0:
+                case LLamaFtype.MOSTLY_Q5_1:
+                case LLamaFtype.MOSTLY_Q8_0:
+                case LLamaFtype.MOSTLY_F16:
+                case LLamaFtype.ALL_F32:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q2_K_S:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q2_K:
+                case LLamaFtype.MOSTLY_Q2_K_S:
+                case LLamaFtype.MOSTLY_Q2_K:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_K_XS:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_S:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_M:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_L:
+                case LLamaFtype.MOSTLY_IQ3_K_XS:
+                case LLamaFtype.MOSTLY_Q3_K_S:
+                case LLamaFtype.MOSTLY_Q3_K_M:
+                case LLamaFtype.MOSTLY_Q3_K_L:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_S:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_M:
+                case LLamaFtype.MOSTLY_Q4_K_S:
+                case LLamaFtype.MOSTLY_Q4_K_M:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_S:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_M:
+                case LLamaFtype.MOSTLY_Q5_K_S:
+                case LLamaFtype.MOSTLY_Q5_K_M:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q6_K:
+                case LLamaFtype.MOSTLY_Q6_K:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_XXS:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_XS:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_S:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_M:
+                case LLamaFtype.MOSTLY_IQ2_XXS:
+                case LLamaFtype.MOSTLY_IQ2_XS:
+                case LLamaFtype.MOSTLY_IQ2_S:
+                case LLamaFtype.MOSTLY_IQ2_M:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_XXS:
+                case LLamaFtype.MOSTLY_IQ3_XXS:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ1_S:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ1_M:
+                case LLamaFtype.MOSTLY_IQ1_S:
+                case LLamaFtype.MOSTLY_IQ1_M:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ4_NL:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ4_XS:
+                case LLamaFtype.MOSTLY_IQ4_NL:
+                case LLamaFtype.MOSTLY_IQ4_XS:
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_S:
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_M:
+                case LLamaFtype.MOSTLY_IQ3_S:
+                case LLamaFtype.MOSTLY_IQ3_M:
                     return true;
 
-                case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
-                case LLamaFtype.LLAMA_FTYPE_GUESSED:
+                case LLamaFtype.MOSTLY_Q4_1_SOME_F16:
+                case LLamaFtype.GUESSED:
                 default:
                     return false;
             }
diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj
index a65ddcabc..1562e8434 100644
--- a/LLama/LLamaSharp.csproj
+++ b/LLama/LLamaSharp.csproj
@@ -53,7 +53,7 @@
-    1c5eba6f8e62
+    368645698ab648e390dc
diff --git a/LLama/Native/LLamaAttentionType.cs b/LLama/Native/LLamaAttentionType.cs
new file mode 100644
index 000000000..543f89b47
--- /dev/null
+++ b/LLama/Native/LLamaAttentionType.cs
@@ -0,0 +1,8 @@
+namespace LLama.Native;
+
+public enum LLamaAttentionType
+{
+    Unspecified = -1,
+    Causal = 0,
+    NonCausal = 1,
+}
\ No newline at end of file
diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs
index 0135da95d..e1d1417c4 100644
--- a/LLama/Native/LLamaContextParams.cs
+++ b/LLama/Native/LLamaContextParams.cs
@@ -65,6 +65,11 @@ public struct LLamaContextParams
         /// whether to pool (sum) embedding results by sequence id
         /// </summary>
         public LLamaPoolingType llama_pooling_type;
+
+        /// <summary>
+        /// Attention type to use for embeddings
+        /// </summary>
+        public LLamaAttentionType attention_type;
 
         /// <summary>
         /// RoPE base frequency, 0 = from model
diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs
index bc4b5c4cb..15216b71b 100644
--- a/LLama/Native/LLamaFtype.cs
+++ b/LLama/Native/LLamaFtype.cs
@@ -1,4 +1,4 @@
-namespace LLama.Native
+namespace LLama.Native
 {
     /// <summary>
     /// Supported model file types
@@ -9,176 +9,191 @@ public enum LLamaFtype
         /// <summary>
        /// All f32
        /// </summary>
        /// <remarks>Benchmark@7B: 26GB</remarks>
-        LLAMA_FTYPE_ALL_F32 = 0,
+        ALL_F32 = 0,
 
        /// <summary>
        /// Mostly f16
        /// </summary>
        /// <remarks>Benchmark@7B: 13GB</remarks>
-        LLAMA_FTYPE_MOSTLY_F16 = 1,
+        MOSTLY_F16 = 1,
 
        /// <summary>
        /// Mostly 8 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 6.7GB, +0.0004ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,
+        MOSTLY_Q8_0 = 7,
 
        /// <summary>
        /// Mostly 4 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 3.50GB, +0.2499 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,
+        MOSTLY_Q4_0 = 2,
 
        /// <summary>
        /// Mostly 4 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 3.90GB, +0.1846 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,
+        MOSTLY_Q4_1 = 3,
 
        /// <summary>
        /// Mostly 4 bit, tok_embeddings.weight and output.weight are f16
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,
+        MOSTLY_Q4_1_SOME_F16 = 4,
 
        /// <summary>
        /// Mostly 5 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 4.30GB @ 7B tokens, +0.0796 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,
+        MOSTLY_Q5_0 = 8,
 
        /// <summary>
        /// Mostly 5 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 4.70GB, +0.0415 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,
+        MOSTLY_Q5_1 = 9,
 
        /// <summary>
        /// K-Quant 2 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 2.67GB @ 7B parameters, +0.8698 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q2_K = 10,
+        MOSTLY_Q2_K = 10,
 
        /// <summary>
        /// K-Quant 3 bit (Small)
        /// </summary>
        /// <remarks>Benchmark@7B: 2.75GB, +0.5505 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,
+        MOSTLY_Q3_K_S = 11,
 
        /// <summary>
        /// K-Quant 3 bit (Medium)
        /// </summary>
        /// <remarks>Benchmark@7B: 3.06GB, +0.2437 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,
+        MOSTLY_Q3_K_M = 12,
 
        /// <summary>
        /// K-Quant 3 bit (Large)
        /// </summary>
        /// <remarks>Benchmark@7B: 3.35GB, +0.1803 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,
+        MOSTLY_Q3_K_L = 13,
 
        /// <summary>
        /// K-Quant 4 bit (Small)
        /// </summary>
        /// <remarks>Benchmark@7B: 3.56GB, +0.1149 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,
+        MOSTLY_Q4_K_S = 14,
 
        /// <summary>
        /// K-Quant 4 bit (Medium)
        /// </summary>
        /// <remarks>Benchmark@7B: 3.80GB, +0.0535 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,
+        MOSTLY_Q4_K_M = 15,
 
        /// <summary>
        /// K-Quant 5 bit (Small)
        /// </summary>
        /// <remarks>Benchmark@7B: 4.33GB, +0.0353 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,
+        MOSTLY_Q5_K_S = 16,
 
        /// <summary>
        /// K-Quant 5 bit (Medium)
        /// </summary>
        /// <remarks>Benchmark@7B: 4.45GB, +0.0142 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,
+        MOSTLY_Q5_K_M = 17,
 
        /// <summary>
        /// K-Quant 6 bit
        /// </summary>
        /// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks>
-        LLAMA_FTYPE_MOSTLY_Q6_K = 18,
+        MOSTLY_Q6_K = 18,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19,
+        MOSTLY_IQ2_XXS = 19,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ2_XS = 20,
+        MOSTLY_IQ2_XS = 20,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_Q2_K_S = 21,
+        MOSTLY_Q2_K_S = 21,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ3_K_XS = 22,
+        MOSTLY_IQ3_K_XS = 22,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23,
+        MOSTLY_IQ3_XXS = 23,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ1_S = 24,
+        MOSTLY_IQ1_S = 24,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ4_NL = 25,
+        MOSTLY_IQ4_NL = 25,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ3_S = 26,
+        MOSTLY_IQ3_S = 26,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ3_M = 27,
+        MOSTLY_IQ3_M = 27,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ2_S = 28,
+        MOSTLY_IQ2_S = 28,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ2_M = 29,
+        MOSTLY_IQ2_M = 29,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ4_XS = 30,
+        MOSTLY_IQ4_XS = 30,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_IQ1_M = 31,
+        MOSTLY_IQ1_M = 31,
 
        /// <summary>
        /// except 1d tensors
        /// </summary>
-        LLAMA_FTYPE_MOSTLY_BF16 = 32,
+        MOSTLY_BF16 = 32,
+
+        /// <summary>
+        /// except 1d tensors
+        /// </summary>
+        MOSTLY_Q4_0_4_4 = 33,
+
+        /// <summary>
+        /// except 1d tensors
+        /// </summary>
+        MOSTLY_Q4_0_4_8 = 34,
+
+        /// <summary>
+        /// except 1d tensors
+        /// </summary>
+        MOSTLY_Q4_0_8_8 = 35,
 
        /// <summary>
        /// File type was not specified
        /// </summary>
-        LLAMA_FTYPE_GUESSED = 1024
+        GUESSED = 1024
     }
 }
diff --git a/LLama/Native/LLamaVocabPreType.cs b/LLama/Native/LLamaVocabPreType.cs
index ef7166962..6a3c89f53 100644
--- a/LLama/Native/LLamaVocabPreType.cs
+++ b/LLama/Native/LLamaVocabPreType.cs
@@ -23,5 +23,8 @@ internal enum LLamaVocabPreType
     DBRX = 13,
     SMAUG = 14,
     PORO = 15,
-    VIKING = 16,
+    CHATGLM3 = 16,
+    CHATGLM4 = 17,
+    VIKING = 18,
+    JAIS = 19,
 }
\ No newline at end of file
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index 735888cee..8314249ec 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -223,9 +223,10 @@ public static unsafe int llama_chat_apply_template(SafeLlamaModelHandle? model,
         /// <param name="model"></param>
         /// <param name="llamaToken"></param>
         /// <param name="buffer">buffer to write string into</param>
+        /// <param name="lstrip">User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')</param>
         /// <param name="special">If true, special tokens are rendered in the output</param>
         /// <returns>The length written, or if the buffer is too small a negative that indicates the length required</returns>
-        public static int llama_token_to_piece(SafeLlamaModelHandle model, LLamaToken llamaToken, Span<byte> buffer, bool special)
+        public static int llama_token_to_piece(SafeLlamaModelHandle model, LLamaToken llamaToken, Span<byte> buffer, int lstrip, bool special)
         {
             // Handle invalid tokens
             if ((int)llamaToken < 0)
@@ -235,12 +236,12 @@ public static int llama_token_to_piece(SafeLlamaModelHandle model, LLamaToken ll
             {
                 fixed (byte* bufferPtr = buffer)
                 {
-                    return llama_token_to_piece_native(model, llamaToken, bufferPtr, buffer.Length, special);
+                    return llama_token_to_piece_native(model, llamaToken, bufferPtr, buffer.Length, lstrip, special);
                 }
             }
 
             [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_token_to_piece")]
-            static extern unsafe int llama_token_to_piece_native(SafeLlamaModelHandle model, LLamaToken llamaToken, byte* buffer, int length, [MarshalAs(UnmanagedType.U1)] bool special);
+            static extern unsafe int llama_token_to_piece_native(SafeLlamaModelHandle model, LLamaToken llamaToken, byte* buffer, int length, int lstrip, [MarshalAs(UnmanagedType.U1)] bool special);
         }
 
         /// <summary>
@@ -249,9 +250,9 @@ public static int llama_token_to_piece(SafeLlamaModelHandle model, LLamaToken ll
         /// <param name="model"></param>
         /// <param name="text"></param>
         /// <param name="text_len"></param>
-        /// <param name="tokens"></param>
+        /// <param name="tokens">The tokens pointer must be large enough to hold the resulting tokens.</param>
         /// <param name="n_max_tokens"></param>
-        /// <param name="add_special"></param>
+        /// <param name="add_special">Allow to add BOS and EOS tokens if model is configured to do so.</param>
         /// <param name="parse_special">Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space.</param>
         /// <returns>Returns the number of tokens on success, no more than n_max_tokens.
         /// Returns a negative number on failure - the number of tokens that would have been returned</returns>
@@ -259,6 +260,20 @@ public static int llama_token_to_piece(SafeLlamaModelHandle model, LLamaToken ll
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern unsafe int llama_tokenize(SafeLlamaModelHandle model, byte* text, int text_len, LLamaToken* tokens, int n_max_tokens, [MarshalAs(UnmanagedType.U1)] bool add_special, [MarshalAs(UnmanagedType.U1)] bool parse_special);
 
+        /// <summary>
+        /// Convert the provided tokens into text (inverse of llama_tokenize()).
+        /// </summary>
+        /// <param name="model"></param>
+        /// <param name="tokens"></param>
+        /// <param name="nTokens"></param>
+        /// <param name="textOut">The char pointer must be large enough to hold the resulting text.</param>
+        /// <param name="textLengthMax"></param>
+        /// <param name="removeSpecial">Allow to remove BOS and EOS tokens if model is configured to do so.</param>
+        /// <param name="unparseSpecial">If true, special tokens are rendered in the output.</param>
+        /// <returns>Returns the number of chars/bytes on success, no more than textLengthMax. Returns a negative number on failure - the number of chars/bytes that would have been returned.</returns>
+        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern unsafe int llama_detokenize(SafeLlamaModelHandle model, LLamaToken* tokens, int nTokens, byte* textOut, int textLengthMax, bool removeSpecial, bool unparseSpecial);
+
         /// <summary>
         /// Register a callback to receive llama log messages
         /// </summary>
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
index 9d5ca4ffd..40cb8f2cb 100644
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -169,6 +169,15 @@ static SafeLLamaContextHandle()
         [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
         private static extern int llama_decode(SafeLLamaContextHandle ctx, LLamaNativeBatch batch);
 
+        /// <summary>
+        /// Processes a batch of tokens with the encoder part of the encoder-decoder model. Stores the encoder output
+        /// internally for later use by the decoder cross-attention layers.
+        /// </summary>
+        /// <param name="ctx"></param>
+        /// <param name="batch"></param>
+        /// <returns>0 = success<br />&lt; 0 = error</returns>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        private static extern int llama_encode(SafeLLamaContextHandle ctx, LLamaNativeBatch batch);
+
         /// <summary>
         /// Set the number of threads used for decoding
         /// </summary>
diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
index ee5a6c552..4970a4fdf 100644
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -59,7 +59,12 @@ public sealed class SafeLlamaModelHandle
         /// <summary>
         /// Get the number of layers in this model
         /// </summary>
-        public int LayerCount => llama_n_embd(this);
+        public int LayerCount => llama_n_layers(this);
+
+        /// <summary>
+        /// Returns true if the model contains an encoder that requires llama_encode() call
+        /// </summary>
+        public bool HasEncoder => llama_model_has_encoder(this);
 
         /// <summary>
         /// Get a description of this model
@@ -387,6 +392,14 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k
         [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
         private static extern int llama_token_eot(SafeLlamaModelHandle model);
 
+        /// <summary>
+        /// For encoder-decoder models, this function returns id of the token that must be provided
+        /// to the decoder to start generating output sequence. For other models, it returns -1.
+        /// </summary>
+        /// <returns></returns>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        private static extern int llama_model_decoder_start_token(SafeLlamaModelHandle model);
+
         /// <summary>
         /// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
         /// </summary>
@@ -409,6 +422,18 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k
         [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
         private static extern LLamaTokenAttr llama_token_get_attr(SafeLlamaModelHandle model, LLamaToken token);
 
+
+        //[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        //private static extern GGMLTensor llama_get_model_tensor(SafeLlamaModelHandle model, string name);
+
+        /// <summary>
+        /// Returns true if the model contains an encoder that requires llama_encode() call
+        /// </summary>
+        /// <param name="model"></param>
+        /// <returns></returns>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        [return: MarshalAs(UnmanagedType.U1)]
+        private static extern bool llama_model_has_encoder(SafeLlamaModelHandle model);
         #endregion
 
         #region LoRA
@@ -450,11 +475,12 @@ public void ApplyLoraFromFile(string lora, float scale, string? modelBase = null
         /// </summary>
         /// <param name="token">Token to decode</param>
         /// <param name="dest">A span to attempt to write into. If this is too small nothing will be written</param>
+        /// <param name="lstrip">User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')</param>
         /// <param name="special">If true, special characters will be converted to text. If false they will be invisible.</param>
         /// <returns>The size of this token. **nothing will be written** if this is larger than `dest`</returns>
-        public uint TokenToSpan(LLamaToken token, Span<byte> dest, bool special = false)
+        public uint TokenToSpan(LLamaToken token, Span<byte> dest, int lstrip = 0, bool special = false)
         {
-            var length = NativeApi.llama_token_to_piece(this, token, dest, special);
+            var length = NativeApi.llama_token_to_piece(this, token, dest, lstrip, special);
             return (uint)Math.Abs(length);
         }
@@ -729,6 +755,12 @@ internal ModelTokens(SafeLlamaModelHandle model)
             /// </summary>
             public LLamaToken? EOT => Normalize(llama_token_eot(_model));
 
+            /// <summary>
+            /// For encoder-decoder models, this function returns id of the token that must be provided
+            /// to the decoder to start generating output sequence.
+            /// </summary>
+            public LLamaToken? DecoderStartToken => Normalize(llama_model_decoder_start_token(_model));
+
             /// <summary>
             /// Returns the string representation of this model's end_of_text token
             /// </summary>
diff --git a/LLama/StreamingTokenDecoder.cs b/LLama/StreamingTokenDecoder.cs
index 60de20769..aef643f43 100644
--- a/LLama/StreamingTokenDecoder.cs
+++ b/LLama/StreamingTokenDecoder.cs
@@ -115,7 +115,7 @@ public void Add(LLamaToken token)
         static Span<byte> TokenToBytes(ref byte[] bytes, LLamaToken token, SafeLlamaModelHandle model, bool special)
         {
             // Try to get bytes
-            var l = model.TokenToSpan(token, bytes, special);
+            var l = model.TokenToSpan(token, bytes, 0, special);
 
             // Check if the length was larger than the buffer. If so expand the buffer and try again
             if (l > bytes.Length)
diff --git a/llama.cpp b/llama.cpp
index 1c5eba6f8..368645698 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit 1c5eba6f8e628fb0a98afb27d8aaeb3b0e136451
+Subproject commit 368645698ab648e390dcd7c00a2bf60efa654f57
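
Reviewer note (not part of the patch): a minimal sketch of how the new AttentionType option is expected to flow from ModelParams into the native context. The model path "model.gguf" and the Embeddings/NonCausal settings are placeholder assumptions, not taken from this diff:

    using LLama;
    using LLama.Common;
    using LLama.Native;

    // Hypothetical usage; "model.gguf" is a placeholder path.
    var @params = new ModelParams("model.gguf")
    {
        Embeddings = true,
        // New option from this patch; ToLlamaContextParams copies it into the
        // native llama_context_params.attention_type field.
        AttentionType = LLamaAttentionType.NonCausal,
    };

    using var weights = LLamaWeights.LoadFromFile(@params);
    using var context = weights.CreateContext(@params);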
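
A second hedged sketch covering the new encoder-decoder surface (HasEncoder, DecoderStartToken). GetDecoderPrimingToken is a hypothetical helper name, and the encode/decode loop itself is out of scope here since llama_encode is private in this patch:

    using LLama;
    using LLama.Native;

    // Hypothetical helper; assumes an already-loaded LLamaWeights instance.
    static LLamaToken? GetDecoderPrimingToken(LLamaWeights weights)
    {
        // Both members below are introduced by this patch.
        if (!weights.NativeHandle.HasEncoder)
            return null;

        // For T5-style models, the decoder must be primed with this token
        // after the encoder has processed the input batch.
        return weights.Tokens.DecoderStartToken;
    }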