
Commit 94273ff

Merge branch 'master' into feature/executor-cancellation-support
2 parents: a7abbae + c1783d4

23 files changed: +268 -88 lines

.github/workflows/compile.yml

Lines changed: 3 additions & 1 deletion
@@ -465,6 +465,8 @@ jobs:
           defines: '-DCMAKE_OSX_ARCHITECTURES=x86_64 -DGGML_METAL=OFF -DGGML_AVX=ON -DGGML_AVX2=ON'
         - build: 'x64-rosetta2'
           defines: '-DCMAKE_OSX_ARCHITECTURES=x86_64 -DGGML_METAL=OFF -DGGML_AVX=OFF -DGGML_AVX2=OFF'
+    env:
+      MACOS_RPATH_DEFINE: "-DCMAKE_INSTALL_RPATH='@loader_path' -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON"
     runs-on: macos-latest
     steps:
       - uses: actions/checkout@v4
@@ -481,7 +483,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. ${{ env.COMMON_DEFINE }} ${{ matrix.defines }}
+          cmake .. ${{ env.COMMON_DEFINE }} ${{ env.MACOS_RPATH_DEFINE }} ${{ matrix.defines }}
           cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
           ls -R
       - name: Upload ggml
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+using System.Linq;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Engines;
+using BenchmarkDotNet.Jobs;
+using LLama.Common;
+
+namespace LLama.Benchmark.Collections;
+
+[SimpleJob(RunStrategy.Throughput, RuntimeMoniker.Net80)]
+[MemoryDiagnoser]
+[BenchmarkCategory("Collections", "FixedSizeQueue")]
+public class FixedSizeQueueBenchmark
+{
+    [Params(32, 512, 4096)]
+    public int Capacity { get; set; }
+
+    private int[] _values = Array.Empty<int>();
+
+    [GlobalSetup]
+    public void Setup()
+    {
+        _values = Enumerable.Range(0, Capacity * 4).ToArray();
+    }
+
+    [Benchmark]
+    public int EnqueueWrap()
+    {
+        var queue = new FixedSizeQueue<int>(Capacity);
+        foreach (var value in _values)
+            queue.Enqueue(value);
+        return queue.Count;
+    }
+
+    [Benchmark]
+    public int IterateTailSum()
+    {
+        var queue = new FixedSizeQueue<int>(Capacity);
+        foreach (var value in _values)
+            queue.Enqueue(value);
+
+        var sum = 0;
+        foreach (var value in queue)
+            sum += value;
+        return sum;
+    }
+}
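
For orientation: the two benchmarks above exercise FixedSizeQueue&lt;T&gt; under wrap-around (enqueuing 4x Capacity items) and full enumeration. The queue implementation itself is not part of this commit, so the sketch below is only an assumed, minimal ring-buffer equivalent of the behaviour being measured; the type and member names are illustrative, not the library's.

using System;
using System.Collections;
using System.Collections.Generic;

// Illustrative ring buffer: once full, Enqueue overwrites the oldest element instead of growing.
// This mirrors what EnqueueWrap measures, but it is not the actual FixedSizeQueue<T>.
public sealed class RingQueueSketch<T> : IEnumerable<T>
{
    private readonly T[] _items;
    private int _head;   // index of the oldest element
    private int _count;

    public RingQueueSketch(int capacity)
    {
        if (capacity <= 0)
            throw new ArgumentOutOfRangeException(nameof(capacity));
        _items = new T[capacity];
    }

    public int Count => _count;

    public void Enqueue(T item)
    {
        if (_count < _items.Length)
        {
            // Space left: append after the newest element.
            _items[(_head + _count) % _items.Length] = item;
            _count++;
        }
        else
        {
            // Full: overwrite the oldest element and advance the head.
            _items[_head] = item;
            _head = (_head + 1) % _items.Length;
        }
    }

    // Enumerates oldest-to-newest, wrapping around the backing array, as IterateTailSum does
    // against the real queue.
    public IEnumerator<T> GetEnumerator()
    {
        for (var i = 0; i < _count; i++)
            yield return _items[(_head + i) % _items.Length];
    }

    IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
}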

LLama.Benchmark/LLama.Benchmark.csproj

Lines changed: 2 additions & 2 deletions
@@ -10,8 +10,8 @@
   </PropertyGroup>
 
   <ItemGroup>
-    <PackageReference Include="BenchmarkDotNet" Version="0.15.2" />
-    <PackageReference Include="BenchmarkDotNet.Diagnostics.Windows" Version="0.15.2" />
+    <PackageReference Include="BenchmarkDotNet" Version="0.15.4" />
+    <PackageReference Include="BenchmarkDotNet.Diagnostics.Windows" Version="0.15.4" />
   </ItemGroup>
 
   <ItemGroup>

LLama.Examples/LLama.Examples.csproj

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
   <ItemGroup>
     <PackageReference Include="Microsoft.Extensions.Logging.Console" Version="9.0.9" />
     <PackageReference Include="Microsoft.KernelMemory.Core" Version="0.98.250508.3" />
-    <PackageReference Include="Microsoft.SemanticKernel" Version="1.64.0" />
+    <PackageReference Include="Microsoft.SemanticKernel" Version="1.65.0" />
     <PackageReference Include="Microsoft.SemanticKernel.Plugins.Memory" Version="1.44.0-alpha" />
     <PackageReference Include="NAudio" Version="2.2.1" />
     <PackageReference Include="SixLabors.ImageSharp" Version="3.1.11" />

LLama.Web/LLama.Web.csproj

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
   </ItemGroup>
 
   <ItemGroup>
-    <PackageReference Include="Microsoft.AspNetCore.Mvc.Razor.RuntimeCompilation" Version="8.0.19" />
+    <PackageReference Include="Microsoft.AspNetCore.Mvc.Razor.RuntimeCompilation" Version="8.0.20" />
     <PackageReference Include="System.Linq.Async" Version="6.0.3" />
   </ItemGroup>
 

LLama.WebAPI/LLama.WebAPI.csproj

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@
 
   <ItemGroup>
     <PackageReference Include="Microsoft.VisualStudio.Validation" Version="17.8.8" />
-    <PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="8.0.19" />
+    <PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="8.0.20" />
     <PackageReference Include="Swashbuckle.AspNetCore" Version="7.3.1" />
   </ItemGroup>
 

LLama/Abstractions/IInferenceParams.cs

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ public interface IInferenceParams
     public int TokensKeep { get; set; }
 
     /// <summary>
-    /// how many new tokens to predict (n_predict), set to -1 to inifinitely generate response
+    /// how many new tokens to predict (n_predict), set to -1 to infinitely generate response
     /// until it complete.
     /// </summary>
     public int MaxTokens { get; set; }

LLama/AntipromptProcessor.cs

Lines changed: 8 additions & 6 deletions
@@ -11,7 +11,7 @@ public sealed class AntipromptProcessor
     private int _longestAntiprompt;
     private readonly List<string> _antiprompts = new();
 
-    private string? _string;
+    private string _buffer = string.Empty;
 
 
     /// <summary>
@@ -46,6 +46,8 @@ public void SetAntiprompts(IEnumerable<string> antiprompts)
         _longestAntiprompt = 0;
        foreach (var antiprompt in _antiprompts)
            _longestAntiprompt = Math.Max(_longestAntiprompt, antiprompt.Length);
+
+        _buffer = string.Empty;
     }
 
     /// <summary>
@@ -55,21 +57,21 @@ public void SetAntiprompts(IEnumerable<string> antiprompts)
     /// <returns>true if the text buffer ends with any antiprompt</returns>
     public bool Add(string text)
     {
-        _string += text;
+        _buffer += text;
 
         // When the string gets very long (4x antiprompt length) trim it down (to 2x antiprompt length).
         // This trimming leaves a lot of extra characters because two sequences can be considered "equal" in unicode
         // even with different numbers of characters. Hopefully there are enough characters here to handle all those weird circumstances!
         var maxLength = Math.Max(32, _longestAntiprompt * 4);
         var trimLength = Math.Max(16, _longestAntiprompt * 2);
-        if (_string.Length > maxLength)
-            _string = _string.Substring(_string.Length - trimLength);
+        if (_buffer.Length > maxLength)
+            _buffer = _buffer.Substring(_buffer.Length - trimLength);
 
         foreach (var antiprompt in _antiprompts)
-            if (_string.EndsWith(antiprompt, StringComparison.CurrentCulture))
+            if (_buffer.EndsWith(antiprompt, StringComparison.CurrentCulture))
                 return true;
 
         return false;
     }
 }
-}
+}
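
The trimming rule in Add keeps the rolling text buffer bounded: it may grow to max(32, 4 x longest antiprompt) characters and is then cut back to the last max(16, 2 x longest antiprompt) characters, which is always at least as long as any single antiprompt. A standalone copy of that arithmetic, for illustration only (helper name is hypothetical):

using System;

internal static class AntipromptBufferSketch
{
    // Same windowing rule as AntipromptProcessor.Add, extracted for clarity.
    public static string AppendAndTrim(string buffer, string text, int longestAntiprompt)
    {
        buffer += text;

        // Grow to 4x the longest antiprompt, then keep only the last 2x worth of characters.
        var maxLength = Math.Max(32, longestAntiprompt * 4);
        var trimLength = Math.Max(16, longestAntiprompt * 2);
        if (buffer.Length > maxLength)
            buffer = buffer.Substring(buffer.Length - trimLength);

        return buffer;
    }
}

With an 8-character antiprompt, for example, the buffer tops out at 32 characters and is trimmed back to its last 16, so an antiprompt ending at the tail of the text still fits inside the kept window.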

LLama/Batched/BatchedExecutor.cs

Lines changed: 76 additions & 16 deletions
@@ -1,7 +1,6 @@
 using System;
 using System.Collections.Generic;
 using System.Diagnostics;
-using System.Linq;
 using System.Threading;
 using System.Threading.Tasks;
 using LLama.Abstractions;
@@ -16,7 +15,12 @@ public sealed class BatchedExecutor
     : IDisposable
 {
     private int _nextSequenceId;
-    private readonly List<IBatch> _batchQueue = [ ];
+    private readonly List<IBatch> _batchQueue = [];
+    private int _batchQueueHead;
+    private int _batchedTokenCount;
+    private bool _batchedTokenCountDirty = true;
+    // Skip compacting the queue until this many processed batches accumulate at the front.
+    private const int CleanupThreshold = 16;
 
     /// <summary>
     /// Set to 1 using interlocked exchange while inference is running
@@ -42,12 +46,27 @@ public sealed class BatchedExecutor
     /// <summary>
     /// Get the number of tokens in the batch, waiting for <see cref="Infer"/> to be called
     /// </summary>
-    public int BatchedTokenCount => _batchQueue.Sum(a => a.ItemCount);
+    public int BatchedTokenCount
+    {
+        get
+        {
+            if (_batchedTokenCountDirty)
+            {
+                var total = 0;
+                for (var i = _batchQueueHead; i < _batchQueue.Count; i++)
+                    total += _batchQueue[i].ItemCount;
+                _batchedTokenCount = total;
+                _batchedTokenCountDirty = false;
+            }
+
+            return _batchedTokenCount;
+        }
+    }
 
     /// <summary>
     /// Number of batches in the queue, waiting for <see cref="Infer"/> to be called
     /// </summary>
-    public int BatchQueueCount => _batchQueue.Count;
+    public int BatchQueueCount => _batchQueue.Count - _batchQueueHead;
 
     /// <summary>
     /// Check if this executor has been disposed.
@@ -147,12 +166,13 @@ public async Task<DecodeResult> Infer(CancellationToken cancellation = default)
         // again after the issue has been fixed (e.g. some KV cache space has been freed) to retry this operation.
         if (status != DecodeResult.Ok)
         {
-            _batchQueue.Insert(0, next);
+            RequeueFront(next);
             return status;
         }
 
         // Everything was ok, advance the epoch
         Epoch++;
+        CleanupQueue();
 
         return status;
     }
@@ -166,13 +186,45 @@ public async Task<DecodeResult> Infer(CancellationToken cancellation = default)
 
         IBatch? GetNextBatch()
         {
-            if (_batchQueue.Count == 0)
+            if (_batchQueueHead >= _batchQueue.Count)
+            {
+                _batchQueue.Clear();
+                _batchQueueHead = 0;
                 return null;
-
-            var nextBatch = _batchQueue[0];
-            _batchQueue.RemoveAt(0);
+            }
+
+            var nextBatch = _batchQueue[_batchQueueHead];
+            _batchQueueHead++;
+            _batchedTokenCountDirty = true;
             return nextBatch;
         }
+
+        void RequeueFront(IBatch batch)
+        {
+            Debug.Assert(_batchQueueHead > 0, "Cannot requeue batch when queue head is at zero.");
+            _batchQueue[--_batchQueueHead] = batch;
+            _batchedTokenCountDirty = true;
+        }
+
+        // Remove batches that have already been consumed so the head index does not grow without bound.
+        void CleanupQueue()
+        {
+            if (_batchQueueHead == 0)
+                return;
+
+            if (_batchQueueHead >= _batchQueue.Count)
+            {
+                _batchQueue.Clear();
+                _batchQueueHead = 0;
+                return;
+            }
+
+            if (_batchQueueHead > CleanupThreshold && _batchQueueHead > _batchQueue.Count / 2)
+            {
+                _batchQueue.RemoveRange(0, _batchQueueHead);
+                _batchQueueHead = 0;
+            }
+        }
     }
 
     /// <inheritdoc />
@@ -202,7 +254,7 @@ internal LLamaSeqId GetNextSequenceId()
            throw new ArgumentOutOfRangeException(nameof(minCapacity), $"Request batch capacity must be less than or equal to BatchSize ({Context.BatchSize})");
 
        // Find a batch with space for at least minCapacity tokens
-        for (var i = 0; i < _batchQueue.Count; i++)
+        for (var i = _batchQueueHead; i < _batchQueue.Count; i++)
        {
            var item = _batchQueue[i];
            if (item is not TokenBatch { Batch: var batch })
@@ -213,13 +265,17 @@ internal LLamaSeqId GetNextSequenceId()
                continue;
 
            if (batch.TokenCount < Context.BatchSize)
-                return (batch, Epoch + (uint)(i + 1) * 2);
+            {
+                _batchedTokenCountDirty = true;
+                return (batch, Epoch + (uint)(i - _batchQueueHead + 1) * 2);
+            }
        }
 
        // Add a new batch to the end of the queue
        var end = new LLamaBatch();
        _batchQueue.Add(new TokenBatch(end));
-        return (end, Epoch + (uint)_batchQueue.Count * 2);
+        _batchedTokenCountDirty = true;
+        return (end, Epoch + (uint)(_batchQueue.Count - _batchQueueHead) * 2);
     }
 
     /// <summary>
@@ -234,7 +290,7 @@ internal LLamaSeqId GetNextSequenceId()
            throw new ArgumentOutOfRangeException(nameof(minCapacity), $"Request batch capacity must be less than or equal to BatchSize ({Context.BatchSize})");
 
        // Find a batch with space for at least minCapacity embeddings
-        for (var i = 0; i < _batchQueue.Count; i++)
+        for (var i = _batchQueueHead; i < _batchQueue.Count; i++)
        {
            var item = _batchQueue[i];
            if (item is not EmbeddingBatch { Batch: var batch })
@@ -245,13 +301,17 @@ internal LLamaSeqId GetNextSequenceId()
                continue;
 
            if (batch.EmbeddingsCount < Context.BatchSize)
-                return (batch, Epoch + (uint)(i + 1) * 2);
+            {
+                _batchedTokenCountDirty = true;
+                return (batch, Epoch + (uint)(i - _batchQueueHead + 1) * 2);
+            }
        }
 
        // Add a new batch to the end of the queue
        var end = new LLamaBatchEmbeddings(Context.EmbeddingSize);
        _batchQueue.Add(new EmbeddingBatch(end));
-        return (end, Epoch + (uint)_batchQueue.Count * 2);
+        _batchedTokenCountDirty = true;
+        return (end, Epoch + (uint)(_batchQueue.Count - _batchQueueHead) * 2);
     }
 
     #region batches
@@ -286,4 +346,4 @@ public Task<DecodeResult> DecodeAsync(LLamaContext ctx, CancellationToken token)
         }
     }
     #endregion
-}
+}
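
The queue change above replaces List&lt;T&gt;.Insert(0, ...) and RemoveAt(0), both of which shift every element, with a head index that simply advances past consumed batches, plus a CleanupQueue pass that compacts the consumed prefix only once it exceeds CleanupThreshold and makes up more than half the list. BatchedTokenCount is likewise cached behind a dirty flag instead of re-summing the queue on every read. Below is a generic sketch of the same head-index idea, detached from the LLamaSharp batch types; the type and member names are illustrative.

using System;
using System.Collections.Generic;

// Illustrative head-index queue: dequeue advances an index instead of shifting the list,
// and the consumed prefix is only removed once it has grown large.
public sealed class HeadIndexQueueSketch<T>
{
    private readonly List<T> _items = new();
    private int _head;
    private const int CleanupThreshold = 16;

    public int Count => _items.Count - _head;

    public void Enqueue(T item) => _items.Add(item);

    public bool TryDequeue(out T item)
    {
        if (_head >= _items.Count)
        {
            // Fully drained: reset so the backing list does not keep dead entries alive.
            _items.Clear();
            _head = 0;
            item = default!;
            return false;
        }

        item = _items[_head++];
        return true;
    }

    // Put an item back at the front, e.g. when processing it failed and it must be retried.
    public void RequeueFront(T item)
    {
        if (_head == 0)
            throw new InvalidOperationException("Nothing has been dequeued yet.");
        _items[--_head] = item;
    }

    // Compact the consumed prefix once it is both large and a majority of the list.
    public void Compact()
    {
        if (_head == 0)
            return;

        if (_head >= _items.Count)
        {
            _items.Clear();
            _head = 0;
        }
        else if (_head > CleanupThreshold && _head > _items.Count / 2)
        {
            _items.RemoveRange(0, _head);
            _head = 0;
        }
    }
}

Dequeue and requeue become O(1); RemoveRange pays the shifting cost only occasionally, which is the same trade-off the executor makes in CleanupQueue.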

LLama/Batched/Conversation.cs

Lines changed: 1 addition & 1 deletion
@@ -410,7 +410,7 @@ public void Remove(LLamaPos start, LLamaPos end)
     }
 
     /// <summary>
-    /// Removes <see cref="count"/> tokens starting from <see cref="start"/>
+    /// Removes <paramref name="count"/> tokens starting from <paramref name="start"/>
     /// </summary>
     /// <param name="start">Start position (inclusive)</param>
     /// <param name="count">Number of tokens</param>
