47 changes: 47 additions & 0 deletions README.md
# Sentence Transformers

Welcome to the Sentence Transformers repository! This repository contains a C# project developed by Curiosity for sentence encoding tasks.
It provides developers with utilities and implementations for working with sentence encoding models in natural language processing (NLP) applications.
Whether you're building a chatbot, a search engine, or a sentiment analysis tool, it offers everything you need to efficiently encode sentences, calculate similarities, and benchmark model performance. **Below is an overview of each project and its contents:**

## 1. SentenceTransformers

This folder provides fundamental interfaces and utility classes for sentence encoding and chunking. Here's what each file contains:

- **ISentenceEncoder.cs**: Defines interfaces and data structures related to sentence encoding and chunking.
- **ResourceLoader.cs**: Provides utility methods for loading resources from assemblies.

## 2. SentenceTransformers.ArcticXs

The ArcticXs folder focuses on implementing a sentence encoder using the ArcticXs model. It includes the following files:

- **ArcticTokenizer.cs**: Provides tokenization functionality using the ArcticXs model.
- **DenseTensorHelpers.cs**: Defines helper methods for working with dense tensors.
- **SentenceEncoder.cs**: Implements the SentenceEncoder class responsible for encoding sentences using the ArcticXs model.

## 3. SentenceTransformers.MiniLM

The MiniLM folder implements a sentence encoder based on the MiniLM model. It comprises the following files:

- **DenseTensorHelpers.cs**: Helper methods for working with dense tensors.
- **MiniLMTokenizer.cs**: Tokenizer class for the MiniLM model.
- **SentenceEncoder.cs**: Implementation of a MiniLM-based sentence encoder.

## 4. SentenceTransformers.Test

This folder contains utility files for testing and benchmarking different sentence encoders. Here's what the file includes:

- **Program.cs**: Contains the program entry point, with methods for running tests and benchmarks across the sentence encoders, covering both simple and advanced cases such as QA testing and performance profiling.

Each file in the project contains detailed comments to help you better understand its functionality.

---

### Why Sentence Encoding is Important

Sentence encoding is vital because it transforms complex text data into meaningful numerical representations that machines can understand and manipulate. This process is essential for:

- **Natural Language Processing (NLP):** Enhancing tasks like machine translation, sentiment analysis, and chatbot functionality.
- **Search and Information Retrieval:** Improving search engines by enabling semantic search capabilities.
- **Recommendation Systems:** Enhancing recommendations by understanding and leveraging user reviews and feedback.
- **Data Clustering and Classification:** Grouping similar documents and classifying text data effectively.
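The "calculate similarities" step mentioned above is typically done with cosine similarity between embedding vectors. The following is a minimal standalone sketch (not a file from this repository; the class and method names are illustrative only):

```csharp
using System;

public static class CosineSimilarityDemo
{
    // Cosine similarity between two equal-length embedding vectors:
    // dot(a, b) / (|a| * |b|), ranging from -1 to 1.
    public static float CosineSimilarity(float[] a, float[] b)
    {
        float dot = 0f, normA = 0f, normB = 0f;
        for (int i = 0; i < a.Length; i++)
        {
            dot   += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        return dot / (MathF.Sqrt(normA) * MathF.Sqrt(normB));
    }

    public static void Main()
    {
        var v1 = new float[] { 1f, 0f, 1f };
        var v2 = new float[] { 1f, 1f, 0f };
        Console.WriteLine(CosineSimilarity(v1, v2)); // 0.5
    }
}
```

Because the encoders in this repository return L2-normalized vectors, the denominator is 1 and cosine similarity reduces to a plain dot product.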
24 changes: 18 additions & 6 deletions SentenceTransformers.ArcticXs/src/ArcticTokenizer.cs
```csharp
/*
 * This file defines the ArcticTokenizer class, which serves as a tokenizer for the ArcticXs model.
 * The ArcticTokenizer class inherits from the UncasedTokenizer class provided by the BERTTokenizers.Base namespace.
 * It initializes the tokenizer using a vocabulary file named "vocab.txt" loaded from the assembly of the SentenceEncoder type.
 * This tokenizer is designed to tokenize text data according to the specifications of the ArcticXs model.
 */
using BERTTokenizers.Base;
using SentenceTransformers;

namespace SentenceTransformers.ArcticXs
{
    // Defines a tokenizer for the ArcticXs model, inheriting from UncasedTokenizer
    public class ArcticTokenizer : UncasedTokenizer
    {
        // Loads the vocab.txt file embedded in the assembly of the SentenceEncoder
        // type and passes it to the base class constructor
        public ArcticTokenizer() : base(ResourceLoader.OpenResource(typeof(SentenceEncoder).Assembly, "vocab.txt"))
        {
        }
    }
}
```
72 changes: 45 additions & 27 deletions SentenceTransformers.ArcticXs/src/DenseTensorHelpers.cs
```csharp
/*
 * This file defines the DenseTensorHelpers class, which provides helper methods for working with dense tensors.
 * The class contains a static method Normalize, which normalizes the input dense tensor along a specified axis.
 * The Normalize method computes the normalization denominators for each sentence and returns a jagged array of normalized vectors.
 * These normalized vectors can be used for various natural language processing tasks, such as text encoding or similarity calculation.
 */

using Microsoft.ML.OnnxRuntime.Tensors;

namespace SentenceTransformers.ArcticXs
{
    // Provides helper methods for working with dense tensors.
    public static class DenseTensorHelpers
    {
        // Normalizes the input dense tensor, i.e. computes sum(abs(x)^2)^(1/2)
        // per sentence and divides each element by it.
        // Returns a jagged array of normalized vectors.
        public static float[][] Normalize(DenseTensor<float> inputTensor, float epsilon = 1e-12f)
        {
            const int tokenIndex = 0; // Index of the token used for encoding

            // Get the dimensions of the input dense tensor
            var sentenceCount = inputTensor.Dimensions[0]; // Number of sentences
            var hiddenStates  = inputTensor.Dimensions[2]; // Number of hidden states

            // Compute the L2 norm of each sentence vector, clamped to epsilon
            var norms = new float[sentenceCount];
            for (int s = 0; s < sentenceCount; s++)
            {
                for (int i = 0; i < hiddenStates; i++)
                {
                    norms[s] += inputTensor[s, tokenIndex, i] * inputTensor[s, tokenIndex, i];
                }
                norms[s] = MathF.Max(MathF.Sqrt(norms[s]), epsilon);
            }

            // Array to store the output normalized vectors
            var normalizedVectors = new float[sentenceCount][];

            // Normalize each element of the input tensor by the inverse norm
            for (int s = 0; s < sentenceCount; s++)
            {
                var invNorm = 1 / norms[s];

                var normalizedVector = new float[hiddenStates];
                normalizedVectors[s] = normalizedVector;

                for (int i = 0; i < hiddenStates; i++)
                {
                    normalizedVector[i] = inputTensor[s, tokenIndex, i] * invNorm;
                }
            }

            return normalizedVectors;
        }
    }
}
```
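To make the L2 normalization concrete, the same math on a plain array looks like this (a standalone sketch with an illustrative class name, not part of the repository, which avoids the `DenseTensor` dependency):

```csharp
using System;

public static class NormalizeDemo
{
    // Returns an L2-normalized copy of a vector, guarding against a zero norm
    // with a small epsilon, mirroring DenseTensorHelpers.Normalize.
    public static float[] Normalize(float[] vector, float epsilon = 1e-12f)
    {
        float norm = 0f;
        foreach (var x in vector) norm += x * x;
        norm = MathF.Max(MathF.Sqrt(norm), epsilon);

        var result = new float[vector.Length];
        for (int i = 0; i < vector.Length; i++)
            result[i] = vector[i] / norm;
        return result;
    }

    public static void Main()
    {
        // [3, 4] has L2 norm 5, so the normalized vector is [0.6, 0.8]
        var normalized = Normalize(new float[] { 3f, 4f });
        Console.WriteLine($"{normalized[0]}, {normalized[1]}"); // 0.6, 0.8
    }
}
```

After normalization every embedding has unit length, which is what lets similarity comparisons reduce to a dot product.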
124 changes: 70 additions & 54 deletions SentenceTransformers.ArcticXs/src/SentenceEncoder.cs
```csharp
/*
 * This file defines the SentenceEncoder class, which is responsible for encoding sentences using the ArcticXs model.
 * It utilizes BERT tokenization to convert sentences into tokens and then performs inference using an ONNX model to generate sentence embeddings.
 * The class implements the IDisposable interface to release resources and provides a method to encode an array of sentences into vectors.
 */

using BERTTokenizers;
using BERTTokenizers.Base;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using System;
using System.Linq;
using System.Threading;
using System.Collections.Generic;
using SentenceTransformers;
using static SentenceTransformers.ArcticXs.DenseTensorHelpers;

namespace SentenceTransformers.ArcticXs
{
    // Represents an encoder for sentences using the ArcticXs model.
    public sealed class SentenceEncoder : IDisposable, ISentenceEncoder
    {
        private readonly SessionOptions _sessionOptions;
        private readonly InferenceSession _inferenceSession;
        private readonly TokenizerBase _tokenizer;
        private readonly string[] _outputNames;

        public SentenceEncoder(SessionOptions sessionOptions = null)
        {
            _sessionOptions = sessionOptions ?? new SessionOptions();
            _inferenceSession = new InferenceSession(ResourceLoader.GetResource(typeof(SentenceEncoder).Assembly, "model.onnx"), _sessionOptions);
            _tokenizer = new ArcticTokenizer(); // Initialize tokenizer
            _outputNames = _inferenceSession.OutputMetadata.Keys.ToArray(); // Get output names from the session
        }

        // Releases the ONNX session and its options
        public void Dispose()
        {
            _sessionOptions.Dispose();
            _inferenceSession.Dispose();
        }

        // Encodes an array of sentences into normalized embedding vectors
        public float[][] Encode(string[] sentences, CancellationToken cancellationToken = default)
        {
            var numSentences = sentences.Length;

            // Tokenize sentences
            var tokenizedSentences = _tokenizer.Encode(sentences);
            var tokenCount = tokenizedSentences.First().InputIds.Length;

            // Flatten token IDs, attention masks, and token type IDs
            var flattenInputIds = new long[tokenizedSentences.Sum(s => s.InputIds.Length)];
            var flattenAttentionMask = new long[tokenizedSentences.Sum(s => s.AttentionMask.Length)];
            var flattenTokenTypeIds = new long[tokenizedSentences.Sum(s => s.TokenTypeIds.Length)];

            var flattenInputIdsSpan = flattenInputIds.AsSpan();
            var flattenAttentionMaskSpan = flattenAttentionMask.AsSpan();
            var flattenTokenTypeIdsSpan = flattenTokenTypeIds.AsSpan();

            foreach (var (inputIds, tokenTypeIds, attentionMask) in tokenizedSentences)
            {
                inputIds.AsSpan().CopyTo(flattenInputIdsSpan);
                flattenInputIdsSpan = flattenInputIdsSpan.Slice(inputIds.Length);

                attentionMask.AsSpan().CopyTo(flattenAttentionMaskSpan);
                flattenAttentionMaskSpan = flattenAttentionMaskSpan.Slice(attentionMask.Length);

                tokenTypeIds.AsSpan().CopyTo(flattenTokenTypeIdsSpan);
                flattenTokenTypeIdsSpan = flattenTokenTypeIdsSpan.Slice(tokenTypeIds.Length);
            }

            // Create NamedOnnxValue inputs for the model
            var dimensions = new[] { numSentences, tokenCount };

            var inputTensors = new NamedOnnxValue[3]
            {
                NamedOnnxValue.CreateFromTensor("input_ids", new DenseTensor<long>(flattenInputIds, dimensions)),
                NamedOnnxValue.CreateFromTensor("attention_mask", new DenseTensor<long>(flattenAttentionMask, dimensions)),
                NamedOnnxValue.CreateFromTensor("token_type_ids", new DenseTensor<long>(flattenTokenTypeIds, dimensions))
            };

            // Run inference, letting cancellation terminate the run
            using var runOptions = new RunOptions();
            using var registration = cancellationToken.Register(() => runOptions.Terminate = true);

            using var output = _inferenceSession.Run(inputTensors, _outputNames, runOptions);

            var outputTensor = (DenseTensor<float>)output.First().Value;

            cancellationToken.ThrowIfCancellationRequested();

            // Normalize the output tensor
            return Normalize(outputTensor);
        }
    }
}
```
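A caller might use this encoder as follows (a hedged sketch, not a file from this repository; it assumes the `model.onnx` and `vocab.txt` resources are embedded in the assembly as the constructor expects, so it cannot run standalone):

```csharp
using System;
using SentenceTransformers.ArcticXs;

class EncoderUsageDemo
{
    static void Main()
    {
        // The encoder owns an ONNX session, so dispose it when done
        using var encoder = new SentenceEncoder();

        float[][] embeddings = encoder.Encode(new[]
        {
            "Sentence encoding maps text to vectors.",
            "Vectors enable semantic search."
        });

        // One unit-length embedding per input sentence
        Console.WriteLine(embeddings.Length);
    }
}
```

Because `Encode` returns normalized vectors, downstream similarity scoring needs only a dot product per sentence pair.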