2 changes: 1 addition & 1 deletion LLama.Unittest/LLamaContextTests.cs
@@ -28,7 +28,7 @@ public void Dispose()
[Fact]
public void CheckProperties()
{
Assert.Equal(768, _context.ContextSize);
Assert.Equal(768u, _context.ContextSize);
Assert.Equal(4096, _context.EmbeddingSize);
Assert.Equal(32000, _context.VocabCount);
}
3 changes: 3 additions & 0 deletions LLama.Web/Common/ModelOptions.cs
@@ -23,6 +23,9 @@ public class ModelOptions
/// <inheritdoc />
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;

21 changes: 20 additions & 1 deletion LLama/Abstractions/IModelParams.cs
@@ -16,10 +16,29 @@ namespace LLama.Abstractions
public interface IModelParams
{
/// <summary>
/// the GPU that is used for scratch and small tensors
/// main_gpu interpretation depends on split_mode:
/// <list type="bullet">
/// <item>
/// <term>None</term>
/// <description>The GPU that is used for the entire model.</description>
/// </item>
/// <item>
/// <term>Row</term>
/// <description>The GPU that is used for small tensors and intermediate results.</description>
/// </item>
/// <item>
/// <term>Layer</term>
/// <description>Ignored.</description>
/// </item>
/// </list>
/// </summary>
int MainGpu { get; set; }

/// <summary>
/// How to split the model across multiple GPUs
/// </summary>
GPUSplitMode SplitMode { get; set; }

/// <summary>
/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
/// </summary>
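The split-mode semantics documented above are easiest to see at the `ModelParams` level. Below is a minimal sketch (the model path is a placeholder, and it assumes the usual `ModelParams` / `LLamaWeights.LoadFromFile` loading path) of configuring `MainGpu` together with `SplitMode`:

```csharp
using LLama;
using LLama.Common;
using LLama.Native;

// Whole model on a single device: MainGpu selects that device.
var singleGpu = new ModelParams("model.gguf")   // placeholder path
{
    SplitMode = GPUSplitMode.None,
    MainGpu = 0,
    GpuLayerCount = 32,
};

// Rows split across devices: MainGpu holds small tensors and intermediate results.
var multiGpu = new ModelParams("model.gguf")
{
    SplitMode = GPUSplitMode.Row,
    MainGpu = 1,
    GpuLayerCount = 32,
};

using var weights = LLamaWeights.LoadFromFile(multiGpu);
```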
3 changes: 3 additions & 0 deletions LLama/Common/ModelParams.cs
@@ -18,6 +18,9 @@ public record ModelParams
/// <inheritdoc />
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;

3 changes: 3 additions & 0 deletions LLama/Extensions/IContextParamsExtensions.cs
@@ -36,6 +36,9 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
result.yarn_orig_ctx = @params.YarnOriginalContext ?? 0;
result.rope_scaling_type = @params.YarnScalingType ?? RopeScalingType.LLAMA_ROPE_SCALING_UNSPECIFIED;

result.cb_eval = IntPtr.Zero;
result.cb_eval_user_data = IntPtr.Zero;

result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
result.type_v = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
result.offload_kqv = [email protected];
1 change: 1 addition & 0 deletions LLama/Extensions/IModelParamsExtensions.cs
@@ -32,6 +32,7 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam

result = NativeApi.llama_model_default_params();
result.main_gpu = @params.MainGpu;
result.split_mode = @params.SplitMode;
result.n_gpu_layers = @params.GpuLayerCount;
result.use_mlock = @params.UseMemoryLock;
result.use_mmap = @params.UseMemorymap;
4 changes: 2 additions & 2 deletions LLama/LLamaContext.cs
@@ -33,7 +33,7 @@ public sealed class LLamaContext
/// <summary>
/// Total number of tokens in the context
/// </summary>
public int ContextSize => NativeHandle.ContextSize;
public uint ContextSize => NativeHandle.ContextSize;

/// <summary>
/// Dimension of embedding vectors
@@ -323,7 +323,7 @@ public LLamaTokenDataArray ApplyPenalty(int logits_i, IEnumerable<LLamaToken> la
var candidates_p = LLamaTokenDataArray.Create(logits);

// Extract most recently returned tokens
var last_n_repeat = Math.Min(ContextSize, repeatLastTokensCount);
var last_n_repeat = Math.Min((int)ContextSize, repeatLastTokensCount);
var last_n_array = lastTokens.TakeLast(last_n_repeat).ToArray();

// Apply penalties to candidates
4 changes: 2 additions & 2 deletions LLama/LLamaExecutorBase.cs
@@ -83,7 +83,7 @@ protected StatefulExecutorBase(LLamaContext context, ILogger? logger = null)
_pastTokensCount = 0;
_consumedTokensCount = 0;
_n_session_consumed = 0;
_last_n_tokens = new FixedSizeQueue<LLamaToken>(Context.ContextSize);
_last_n_tokens = new FixedSizeQueue<LLamaToken>((int)Context.ContextSize);
_decoder = new StreamingTokenDecoder(context);
}

@@ -170,7 +170,7 @@ protected virtual void HandleRunOutOfContext(int tokensToKeep)
_pastTokensCount = Math.Max(1, tokensToKeep);

// insert n_left/2 tokens at the start of embed from last_n_tokens
_embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip(Context.ContextSize - n_left / 2 - _embeds.Count));
_embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip((int)Context.ContextSize - n_left / 2 - _embeds.Count));

// stop saving session if we run out of context
_pathSession = string.Empty;
2 changes: 1 addition & 1 deletion LLama/LLamaInstructExecutor.cs
@@ -200,7 +200,7 @@ protected override Task InferInternal(IInferenceParams inferenceParams, InferSta

if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput)
{
var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? Context.ContextSize : inferenceParams.RepeatLastTokensCount;
var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? (int)Context.ContextSize : inferenceParams.RepeatLastTokensCount;

// optionally save the session on first sample (for faster prompt loading next time)
if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession)
2 changes: 1 addition & 1 deletion LLama/LLamaInteractExecutor.cs
@@ -179,7 +179,7 @@ protected override async Task InferInternal(IInferenceParams inferenceParams, In

if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput)
{
var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? Context.ContextSize : inferenceParams.RepeatLastTokensCount;
var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? (int)Context.ContextSize : inferenceParams.RepeatLastTokensCount;

// optionally save the session on first sample (for faster prompt loading next time)
if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession)
23 changes: 23 additions & 0 deletions LLama/Native/GPUSplitMode.cs
@@ -0,0 +1,23 @@
namespace LLama.Native;

/// <summary>
/// How to split the model across multiple GPUs
/// </summary>
/// <remarks>llama_split_mode</remarks>
public enum GPUSplitMode
{
/// <summary>
/// Single GPU
/// </summary>
None = 0,

/// <summary>
/// Split layers and KV across GPUs
/// </summary>
Layer = 1,

/// <summary>
/// Split rows across GPUs
/// </summary>
Row = 2,
}
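At the native level this enum maps directly onto `llama_model_params.split_mode`. A rough sketch, starting from llama.cpp's defaults via `NativeApi.llama_model_default_params` (the values here are illustrative, not recommendations):

```csharp
using LLama.Native;

// Start from llama.cpp's defaults, then opt into splitting layers across GPUs.
var mparams = NativeApi.llama_model_default_params();
mparams.split_mode = GPUSplitMode.Layer;   // main_gpu is ignored in this mode
mparams.n_gpu_layers = 99;                 // offload as many layers as possible
```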
26 changes: 19 additions & 7 deletions LLama/Native/LLamaContextParams.cs
@@ -8,7 +8,9 @@ namespace LLama.Native
/// </summary>
/// <param name="progress"></param>
/// <param name="ctx"></param>
public delegate void LlamaProgressCallback(float progress, IntPtr ctx);
/// <returns>Return true to continue loading, return false to abort</returns>
/// <remarks>llama_progress_callback</remarks>
public delegate bool LlamaProgressCallback(float progress, IntPtr ctx);

/// <summary>
/// A C# representation of the llama.cpp `llama_context_params` struct
@@ -50,33 +52,43 @@ public struct LLamaContextParams
/// <summary>
/// RoPE base frequency, 0 = from model
/// </summary>
public float rope_freq_base;
/// <summary>
/// RoPE frequency scaling factor, 0 = from model
/// </summary>
public float rope_freq_scale;
/// <summary>
/// YaRN extrapolation mix factor, negative = from model
/// </summary>
public float yarn_ext_factor;
/// <summary>
/// YaRN magnitude scaling factor
/// </summary>
public float yarn_attn_factor;
/// <summary>
/// YaRN low correction dim
/// </summary>
public float yarn_beta_fast;
/// <summary>
/// YaRN high correction dim
/// </summary>
public float yarn_beta_slow;

/// <summary>
/// YaRN original context size
/// </summary>
public uint yarn_orig_ctx;

/// <summary>
/// ggml_backend_sched_eval_callback
/// </summary>
public IntPtr cb_eval;

/// <summary>
/// User data passed into cb_eval
/// </summary>
public IntPtr cb_eval_user_data;

/// <summary>
/// data type for K cache
/// </summary>
20 changes: 20 additions & 0 deletions LLama/Native/LLamaFtype.cs
@@ -106,6 +106,26 @@ public enum LLamaFtype
/// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks>
LLAMA_FTYPE_MOSTLY_Q6_K = 18,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_IQ2_XXS,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_IQ2_XS,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_Q2_K_S,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_Q3_K_XS,

/// <summary>
/// File type was not specified
/// </summary>
7 changes: 6 additions & 1 deletion LLama/Native/LLamaModelParams.cs
@@ -14,6 +14,11 @@ public unsafe struct LLamaModelParams
/// </summary>
public int n_gpu_layers;

/// <summary>
/// how to split the model across multiple GPUs
/// </summary>
public GPUSplitMode split_mode;

/// <summary>
/// the GPU that is used for scratch and small tensors
/// </summary>
@@ -25,7 +30,7 @@ public unsafe struct LLamaModelParams
public float* tensor_split;

/// <summary>
/// called with a progress value between 0 and 1, pass NULL to disable
/// called with a progress value between 0 and 1, pass NULL to disable.
/// </summary>
public LlamaProgressCallback progress_callback;

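Since `LlamaProgressCallback` now returns a `bool`, a callback assigned to `progress_callback` can abort a long model load. A minimal sketch, assuming the delegate is kept alive for the duration of the native call:

```csharp
using System;
using LLama.Native;

var mparams = NativeApi.llama_model_default_params();
var cancelled = false;

// Keep a reference to the delegate for as long as the native call can run,
// otherwise the GC may collect it while llama.cpp still holds the pointer.
LlamaProgressCallback onProgress = (progress, ctx) =>
{
    Console.WriteLine($"Loading: {progress * 100:F0}%");
    return !cancelled;   // returning false aborts the load
};
mparams.progress_callback = onProgress;
```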
6 changes: 6 additions & 0 deletions LLama/Native/LLamaModelQuantizeParams.cs
@@ -6,6 +6,7 @@ namespace LLama.Native
/// <summary>
/// Quantizer parameters used in the native API
/// </summary>
/// <remarks>llama_model_quantize_params</remarks>
[StructLayout(LayoutKind.Sequential)]
public struct LLamaModelQuantizeParams
{
@@ -58,5 +59,10 @@ public bool pure
set => _pure = Convert.ToSByte(value);
}
private sbyte _pure;

/// <summary>
/// pointer to importance matrix data
/// </summary>
public IntPtr imatrix;
}
}
2 changes: 1 addition & 1 deletion LLama/Native/NativeApi.Quantize.cs
@@ -13,6 +13,6 @@ public static partial class NativeApi
/// <remarks>not great API - very likely to change</remarks>
/// <returns>Returns 0 on success</returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern unsafe int llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
public static extern unsafe uint llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
}
}
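For reference, a hedged sketch of driving the quantizer through this entry point. The `nthread` and `ftype` field names are assumed to mirror llama.cpp's `llama_model_quantize_params` (only `pure` and the new `imatrix` field appear in this diff), and the file names are placeholders:

```csharp
using System;
using LLama.Native;

unsafe
{
    var qparams = new LLamaModelQuantizeParams
    {
        nthread = Environment.ProcessorCount,        // assumed field, mirrors llama.cpp
        ftype = LLamaFtype.LLAMA_FTYPE_MOSTLY_Q6_K,  // assumed field, mirrors llama.cpp
        imatrix = IntPtr.Zero,                       // no importance matrix (IQ2_* types generally want one)
    };

    var failed = NativeApi.llama_model_quantize("model-f16.gguf", "model-q6_k.gguf", &qparams);
    if (failed != 0)
        throw new Exception("quantization failed");
}
```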
17 changes: 14 additions & 3 deletions LLama/Native/NativeApi.Sampling.cs
@@ -27,11 +27,11 @@ public static extern unsafe void llama_sample_repetition_penalties(SafeLLamaCont
/// Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
/// </summary>
/// <param name="ctx"></param>
/// <param name="candidates">A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.</param>
/// <param name="guidance_ctx">A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.</param>
/// <param name="logits">logits Logits extracted from the original generation context.</param>
/// <param name="logits_guidance">Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.</param>
/// <param name="scale">Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.</param>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_sample_classifier_free_guidance(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, SafeLLamaContextHandle guidance_ctx, float scale);
public static unsafe extern void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, float* logits, float* logits_guidance, float scale);

/// <summary>
/// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
Expand Down Expand Up @@ -92,6 +92,17 @@ public static extern unsafe void llama_sample_repetition_penalties(SafeLLamaCont
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float p, ulong min_keep);

/// <summary>
/// Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
/// </summary>
/// <param name="ctx"></param>
/// <param name="candidates"></param>
/// <param name="min_temp"></param>
/// <param name="max_temp"></param>
/// <param name="exponent_val"></param>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_sample_entropy(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float min_temp, float max_temp, float exponent_val);

/// <summary>
/// Modify logits by temperature
/// </summary>
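A small sketch of the new dynamic-temperature sampler; it assumes a `LLamaTokenDataArrayNative` has already been built from the current logits (for example via `LLamaTokenDataArray.Create` plus pinning), and the temperature values are illustrative only:

```csharp
using LLama.Native;

// Scale temperature between min_temp and max_temp based on the entropy of the
// candidate distribution (https://arxiv.org/abs/2309.02772), then sample as usual.
static void ApplyDynamicTemperature(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates)
{
    NativeApi.llama_sample_entropy(ctx, ref candidates, min_temp: 0.3f, max_temp: 1.8f, exponent_val: 1.0f);
}
```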
19 changes: 18 additions & 1 deletion LLama/Native/NativeApi.cs
@@ -163,7 +163,10 @@ public static void llama_empty_call()
/// <param name="ctx"></param>
/// <returns></returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern int llama_n_ctx(SafeLLamaContextHandle ctx);
public static extern uint llama_n_ctx(SafeLLamaContextHandle ctx);

/// <summary>
/// Get the batch size (n_batch) this context was created with
/// </summary>
/// <param name="ctx"></param>
/// <returns></returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern uint llama_n_batch(SafeLLamaContextHandle ctx);

/// <summary>
/// Token logits obtained from the last call to llama_eval()
Expand Down Expand Up @@ -380,6 +383,20 @@ public static int llama_token_to_piece(SafeLlamaModelHandle model, LLamaToken ll
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_kv_cache_seq_shift(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, LLamaPos delta);

/// <summary>
/// Integer division of the positions by factor of `d > 1`
/// If the KV cache is RoPEd, the KV data is updated accordingly
/// p0 &lt; 0 : [0, p1]
/// p1 &lt; 0 : [p0, inf)
/// </summary>
/// <param name="ctx"></param>
/// <param name="seq"></param>
/// <param name="p0"></param>
/// <param name="p1"></param>
/// <param name="d"></param>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_kv_cache_seq_div(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int d);

/// <summary>
/// Allocates a batch of tokens on the heap
/// Each token can be assigned up to n_seq_max sequence ids
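As an illustration (not part of this PR), the two KV-cache operations can be combined in the style of llama.cpp's self-extend / grouped-attention examples: shift a window of positions, then divide the positions by a factor. This sketch assumes `LLamaPos` converts implicitly from `int`, as it is used elsewhere in the codebase:

```csharp
using LLama.Native;

static void CompressCache(SafeLLamaContextHandle ctx, LLamaSeqId seq, int start, int end, int factor)
{
    // Shift the window [start, end) so that it begins at position 0...
    NativeApi.llama_kv_cache_seq_shift(ctx, seq, start, end, -start);

    // ...then integer-divide the positions in [0, end - start) by `factor`,
    // updating RoPE'd KV data accordingly.
    NativeApi.llama_kv_cache_seq_div(ctx, seq, 0, end - start, factor);
}
```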
2 changes: 1 addition & 1 deletion LLama/Native/SafeLLamaContextHandle.cs
@@ -21,7 +21,7 @@ public sealed class SafeLLamaContextHandle
/// <summary>
/// Total number of tokens in the context
/// </summary>
public int ContextSize => NativeApi.llama_n_ctx(this);
public uint ContextSize => NativeApi.llama_n_ctx(this);

/// <summary>
/// Dimension of embedding vectors
Binary file modified LLama/runtimes/deps/avx/libllama.dll
Binary file modified LLama/runtimes/deps/avx/libllama.so
Binary file modified LLama/runtimes/deps/avx2/libllama.dll
Binary file modified LLama/runtimes/deps/avx2/libllama.so
Binary file modified LLama/runtimes/deps/avx512/libllama.dll
Binary file modified LLama/runtimes/deps/avx512/libllama.so
Binary file modified LLama/runtimes/deps/cu11.7.1/libllama.dll
Binary file modified LLama/runtimes/deps/cu11.7.1/libllama.so
Binary file modified LLama/runtimes/deps/cu12.1.0/libllama.dll
Binary file modified LLama/runtimes/deps/cu12.1.0/libllama.so
Binary file modified LLama/runtimes/deps/libllama.dll
Binary file modified LLama/runtimes/deps/libllama.so