
Commit 8359583

- Removed the object wrappers and the configurable pipeline; they can be better written in code.
- Added `BaseSamplingPipeline`, which provides a base implementation of `ISamplingPipeline`.
- Added `DefaultSamplingPipeline`, which mimics normal llama.cpp sampling.
1 parent 3afc007 commit 8359583

22 files changed: +309 -844 lines

LLama.Unittest/GrammarParserTest.cs

Lines changed: 1 addition & 2 deletions
@@ -1,5 +1,4 @@
-using System.Text;
-using LLama.Exceptions;
+using LLama.Exceptions;
 using LLama.Native;
 using LLama.Grammars;

LLama.Unittest/StatelessExecutorTest.cs

Lines changed: 2 additions & 33 deletions
@@ -1,9 +1,6 @@
 using System.Diagnostics;
 using LLama.Common;
 using LLama.Sampling;
-using LLama.Sampling.Logits;
-using LLama.Sampling.Selection;
-using LLama.Sampling.Tokens;
 using Xunit.Abstractions;

 namespace LLama.Unittest
@@ -35,40 +32,12 @@ public void Dispose()
     public async Task Stateless()
     {
         // Create a custom pipeline that mimics the default pipeline
-        var pipeline = new ConfigurableSamplingPipeline()
-        {
-            ProtectedLogits =
-            {
-                _weights.NewlineToken,
-                _weights.BeginningOfSentenceToken,
-                _weights.EndOfSentenceToken
-            },
-            LogitProcessors =
-            {
-                new LogitBias
-                {
-                    Biases =
-                    {
-                        { _weights.NewlineToken, 1000 }, // This is an insane bias, but because newline is a protected logit it will do nothing!
-                        { 42, 0f },
-                    }
-                }
-            },
-            TokenDataProcessors =
-            {
-                new TailFreeSampling { Z = 1 },
-                new LocallyTypicalSampling { P = 1 },
-                new TopPSampling { P = 0.95f },
-                new MinPSampling { P = 0.05f },
-                new TemperatureSampling { Temperature = 0.8f },
-            },
-            Selector = new StandardSelection(),
-        };
+        var pipeline = new DefaultSamplingPipeline();

         var executor = new StatelessExecutor(_weights, _params);

         const string question = "Question. what is a cat?\nAnswer: ";
-        var @params = new InferenceParams { MaxTokens = 32, AntiPrompts = new[] { "." }, SamplingPipeline = pipeline};
+        var @params = new InferenceParams { MaxTokens = 32, AntiPrompts = new[] { "." }, SamplingPipeline = pipeline };

         var timer = new Stopwatch();
         timer.Start();
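For comparison, the deleted ConfigurableSamplingPipeline block above maps onto plain property assignments on the new class. A minimal sketch (property names are taken from DefaultSamplingPipeline below; the values mirror the old test setup):

    var pipeline = new DefaultSamplingPipeline
    {
        TailFreeZ = 1f,
        TypicalP = 1f,
        TopP = 0.95f,
        MinP = 0.05f,
        Temperature = 0.8f,
    };

    // Equivalent of the old LogitBias processor entry for token 42
    pipeline.LogitBias.Add(42, 0f);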

LLama/Native/LLamaTokenDataArray.cs

Lines changed: 28 additions & 1 deletion
@@ -46,14 +46,41 @@ public static LLamaTokenDataArray Create(ReadOnlySpan<float> logits)
         return new LLamaTokenDataArray(candidates);
     }

+    /// <summary>
+    /// Overwrite the logit values for all given tokens
+    /// </summary>
+    /// <param name="values">tuples of token and logit value to overwrite</param>
+    public void OverwriteLogits(ReadOnlySpan<(llama_token token, float logit)> values)
+    {
+        if (values.Length == 0)
+            return;
+
+        var dataSpan = data.Span;
+        foreach (var (token, value) in values)
+        {
+            for (var i = 0; i < data.Length; i++)
+            {
+                if (dataSpan[i].id == token)
+                {
+                    dataSpan[i].logit = value;
+                    break;
+                }
+            }
+        }
+        sorted = false;
+    }
+
     #region sampling
     /// <summary>
     /// Apply grammar rules to candidate tokens
     /// </summary>
     /// <param name="ctx"></param>
     /// <param name="grammar"></param>
-    public void ApplyGrammar(SafeLLamaContextHandle ctx, SafeLLamaGrammarHandle grammar)
+    public void ApplyGrammar(SafeLLamaContextHandle ctx, SafeLLamaGrammarHandle? grammar)
     {
+        if (grammar == null)
+            return;
+
         using (LLamaTokenDataArrayNative.Create(this, out var st))
         {
             NativeApi.llama_sample_grammar(ctx, ref st, grammar);
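Note: OverwriteLogits does a linear scan of the candidate buffer for each tuple, so it is intended for small sets such as the handful of protected logits, and it clears the sorted flag so later samplers re-sort. A minimal usage sketch (assuming llama_token is the integer token alias used in this file, and that newlineToken and savedLogit were captured by the caller earlier):

    var candidates = LLamaTokenDataArray.Create(logits);

    // Put a saved logit value back after penalties have run
    candidates.OverwriteLogits(new[] { (newlineToken, savedLogit) });

    // ApplyGrammar now tolerates a null grammar as a no-op
    candidates.ApplyGrammar(ctx, null);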
LLama/Sampling/BaseSamplingPipeline.cs

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
+using System;
+using System.Buffers;
+using System.Collections.Generic;
+using LLama.Native;
+
+namespace LLama.Sampling;
+
+/// <summary>
+/// Base class for implementing custom sampling pipelines. This provides a helpful framework for implementing `ISamplingPipeline`.
+/// </summary>
+public abstract class BaseSamplingPipeline
+    : ISamplingPipeline
+{
+    private int _savedLogitsCount;
+    private (int index, float logit)[]? _savedLogits;
+
+    /// <inheritdoc/>
+    public int Sample(SafeLLamaContextHandle ctx, Span<float> logits, ReadOnlySpan<int> lastTokens)
+    {
+        var protectedLogits = GetProtectedTokens(ctx);
+        _savedLogitsCount = protectedLogits.Count;
+        _savedLogits = ArrayPool<(int, float)>.Shared.Rent(_savedLogitsCount);
+        try
+        {
+            // Save the values of protected logits
+            for (var i = 0; i < protectedLogits.Count; i++)
+            {
+                var index = protectedLogits[i];
+                var value = logits[index];
+                _savedLogits[i] = (index, value);
+            }
+
+            // Process raw logits
+            ProcessLogits(ctx, logits, lastTokens);
+
+            // Automatically restore saved logit values after processing
+            RestoreProtectedTokens(logits);
+
+            // Convert logits into token candidates
+            var candidates = LLamaTokenDataArray.Create(logits);
+
+            // Process token data array
+            ProcessTokenDataArray(ctx, candidates, lastTokens);
+
+            // Choose the final value
+            return ChooseToken(ctx, candidates);
+        }
+        finally
+        {
+            ArrayPool<(int, float)>.Shared.Return(_savedLogits);
+            _savedLogits = null;
+            _savedLogitsCount = 0;
+        }
+    }
+
+    #region protected tokens
+    /// <summary>
+    /// Get all of the "protected" tokens that cannot be changed by ProcessLogits
+    /// </summary>
+    /// <returns></returns>
+    protected abstract IReadOnlyList<int> GetProtectedTokens(SafeLLamaContextHandle ctx);
+
+    /// <summary>
+    /// Restore the value of the "protected" tokens which were saved before the call to ProcessLogits
+    /// </summary>
+    /// <param name="logits"></param>
+    protected void RestoreProtectedTokens(Span<float> logits)
+    {
+        if (_savedLogits == null)
+            return;
+
+        // The array may be bigger than necessary, get a span of the valid bit
+        var saved = _savedLogits.AsSpan(0, _savedLogitsCount);
+
+        // Restore the values of protected logits
+        for (var i = 0; i < saved.Length; i++)
+            logits[saved[i].index] = saved[i].logit;
+    }
+
+    /// <summary>
+    /// Restore the value of the "protected" tokens which were saved before the call to ProcessLogits
+    /// </summary>
+    /// <param name="candidates"></param>
+    protected void RestoreProtectedTokens(LLamaTokenDataArray candidates)
+    {
+        if (_savedLogits == null || _savedLogits.Length == 0)
+            return;
+
+        candidates.OverwriteLogits(_savedLogits.AsSpan(0, _savedLogitsCount));
+    }
+    #endregion
+
+    /// <summary>
+    /// Process the raw logit values
+    /// </summary>
+    /// <param name="ctx">The context being sampled from</param>
+    /// <param name="logits">The logits produced by the model</param>
+    /// <param name="lastTokens">A list of tokens recently returned by the model</param>
+    protected abstract void ProcessLogits(SafeLLamaContextHandle ctx, Span<float> logits, ReadOnlySpan<int> lastTokens);
+
+    /// <summary>
+    /// Process the LLamaTokenDataArray and select a single token
+    /// </summary>
+    /// <param name="ctx">The context being sampled from</param>
+    /// <param name="candidates">The LLamaTokenDataArray data produced by the model</param>
+    /// <param name="lastTokens">A list of tokens recently returned by the model</param>
+    /// <returns></returns>
+    protected abstract int ProcessTokenDataArray(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, ReadOnlySpan<int> lastTokens);
+
+    /// <summary>
+    /// Choose the final token from the candidates
+    /// </summary>
+    /// <param name="ctx"></param>
+    /// <param name="candidates"></param>
+    /// <returns></returns>
+    protected abstract int ChooseToken(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates);
+
+    /// <inheritdoc/>
+    public virtual void Reset()
+    {
+    }
+
+    /// <inheritdoc/>
+    public virtual void Dispose()
+    {
+        GC.SuppressFinalize(this);
+    }
+}
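To illustrate the contract, here is a hypothetical minimal subclass (not part of this commit): it protects no logits, leaves the raw logits untouched, and samples with temperature only, mirroring the structure DefaultSamplingPipeline uses below.

    public sealed class TemperatureSamplingPipeline
        : BaseSamplingPipeline
    {
        public float Temperature { get; set; } = 0.8f;

        // Nothing is modified in ProcessLogits, so nothing needs protecting
        protected override IReadOnlyList<int> GetProtectedTokens(SafeLLamaContextHandle ctx)
            => Array.Empty<int>();

        // Leave the raw logits untouched
        protected override void ProcessLogits(SafeLLamaContextHandle ctx, Span<float> logits, ReadOnlySpan<int> lastTokens)
        {
        }

        // Apply temperature to the candidates and sample, as DefaultSamplingPipeline does
        protected override int ProcessTokenDataArray(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, ReadOnlySpan<int> lastTokens)
        {
            candidates.Temperature(ctx, Temperature);
            return candidates.SampleToken(ctx);
        }

        // Sample() takes its final token from ChooseToken
        protected override int ChooseToken(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates)
            => candidates.SampleToken(ctx);
    }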
LLama/Sampling/DefaultSamplingPipeline.cs

Lines changed: 149 additions & 0 deletions
@@ -0,0 +1,149 @@
+using System;
+using System.Collections.Generic;
+using LLama.Extensions;
+using LLama.Native;
+
+namespace LLama.Sampling;
+
+/// <summary>
+/// An implementation of ISamplingPipeline which mimics the default llama.cpp sampling
+/// </summary>
+public sealed class DefaultSamplingPipeline
+    : BaseSamplingPipeline
+{
+    /// <summary>
+    /// Bias values to add to certain logits
+    /// </summary>
+    public Dictionary<int, float> LogitBias { get; } = new();
+
+    /// <summary>
+    /// Grammar to constrain valid tokens
+    /// </summary>
+    public SafeLLamaGrammarHandle? Grammar { get; set; }
+
+    /// <summary>
+    /// Repetition penalty, as described in https://arxiv.org/abs/1909.05858
+    /// </summary>
+    public float RepeatPenalty { get; set; } = 1.1f;
+
+    /// <summary>
+    /// Frequency penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create<br />
+    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text
+    /// so far, decreasing the model's likelihood to repeat the same line verbatim.
+    /// </summary>
+    public float AlphaFrequency
+    {
+        get => _alphaFreq;
+        set
+        {
+            if (value < -2)
+                throw new ArgumentOutOfRangeException(nameof(value), "AlphaFrequency must be greater than -2");
+            if (value > 2)
+                throw new ArgumentOutOfRangeException(nameof(value), "AlphaFrequency must be less than 2");
+            _alphaFreq = value;
+        }
+    }
+    private float _alphaFreq = 0.1f;
+
+    /// <summary>
+    /// Presence penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create<br />
+    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the
+    /// text so far, increasing the model's likelihood to talk about new topics.
+    /// </summary>
+    public float AlphaPresence
+    {
+        get => _alphaPresence;
+        set
+        {
+            if (value < -2)
+                throw new ArgumentOutOfRangeException(nameof(value), "AlphaPresence must be greater than -2");
+            if (value > 2)
+                throw new ArgumentOutOfRangeException(nameof(value), "AlphaPresence must be less than 2");
+            _alphaPresence = value;
+        }
+    }
+    private float _alphaPresence = 0.1f;
+
+    /// <summary>
+    /// Temperature to apply (higher temperature is more "creative")
+    /// </summary>
+    public float Temperature { get; set; } = 0.75f;
+
+    /// <summary>
+    /// Number of tokens to keep in TopK sampling
+    /// </summary>
+    public int TopK { get; set; }
+
+    /// <summary>
+    /// Z value for tail free sampling
+    /// </summary>
+    public float TailFreeZ { get; set; }
+
+    /// <summary>
+    /// P value for locally typical sampling
+    /// </summary>
+    public float TypicalP { get; set; }
+
+    /// <summary>
+    /// P value for TopP sampling
+    /// </summary>
+    public float TopP { get; set; } = 1f;
+
+    /// <summary>
+    /// P value for MinP sampling
+    /// </summary>
+    public float MinP { get; set; }
+
+    /// <summary>
+    /// Whether the newline token may be modified by logit bias and repeat penalty. When false (the default) its logit is protected and restored after penalties are applied.
+    /// </summary>
+    public bool PenalizeNewline { get; set; } = false;
+
+    private readonly int[] _newlineToken = new int[1];
+
+    /// <inheritdoc />
+    protected override IReadOnlyList<int> GetProtectedTokens(SafeLLamaContextHandle ctx)
+    {
+        if (PenalizeNewline)
+            return Array.Empty<int>();
+
+        _newlineToken[0] = NativeApi.llama_token_nl(ctx.ModelHandle);
+        return _newlineToken;
+    }
+
+    /// <inheritdoc />
+    protected override void ProcessLogits(SafeLLamaContextHandle ctx, Span<float> logits, ReadOnlySpan<int> lastTokens)
+    {
+        foreach (var (key, value) in LogitBias)
+            logits[key] += value;
+    }
+
+    /// <inheritdoc />
+    protected override int ProcessTokenDataArray(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, ReadOnlySpan<int> lastTokens)
+    {
+        // Apply penalties to candidates
+        candidates.RepetitionPenalty(ctx, lastTokens, RepeatPenalty, AlphaFrequency, AlphaPresence);
+
+        // Restore protected tokens, so they are not affected by repetition penalties
+        RestoreProtectedTokens(candidates);
+
+        // Apply the normal llama.cpp pipeline
+        candidates.ApplyGrammar(ctx, Grammar);
+        candidates.TopK(ctx, TopK);
+        candidates.TailFree(ctx, TailFreeZ);
+        candidates.LocallyTypical(ctx, TypicalP);
+        candidates.TopP(ctx, TopP);
+        candidates.MinP(ctx, MinP);
+        candidates.Temperature(ctx, Temperature);
+        var id = candidates.SampleToken(ctx);
+
+        Grammar?.AcceptToken(ctx, id);
+        return id;
+    }
+
+    /// <inheritdoc />
+    protected override int ChooseToken(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates)
+    {
+        return candidates.SampleToken(ctx);
+    }
+}
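A sketch of wiring the new pipeline into inference (values illustrative; grammar stands for a previously constructed SafeLLamaGrammarHandle, or may be left null):

    var pipeline = new DefaultSamplingPipeline
    {
        RepeatPenalty = 1.2f,
        Temperature = 0.6f,
        Grammar = grammar,       // optional: ApplyGrammar skips a null grammar
        PenalizeNewline = false, // default: the newline logit is saved and restored around penalties
    };

    var @params = new InferenceParams { MaxTokens = 64, SamplingPipeline = pipeline };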
