
Commit 62a3dd8
Refactor and improve code formatting across multiple files
Applied consistent formatting using //@formatter:off/on directives to enhance readability. Improved class documentation with detailed JavaDoc comments for methods and constructors, clarifying their purpose and parameters. Adjusted code style for multiline constructs and added missing comments where necessary.
1 parent dabbdfb commit 62a3dd8

25 files changed: +316 −120 lines

src/main/java/com/example/aot/AOT.java

Lines changed: 5 additions & 10 deletions

@@ -32,8 +32,8 @@ public final class AOT {
 
     static LlamaModelLoader modelLoader;
 
-
-    record PartialModel(String modelFileName, Llama model, long tensorDataOffset, Map<String, GGUF.GGUFTensorInfo> tensorInfos) {}
+    record PartialModel(String modelFileName, Llama model, long tensorDataOffset, Map<String, GGUF.GGUFTensorInfo> tensorInfos) {
+    }
 
     private static final PartialModel PRELOADED_GGUF = preLoadGGUF(System.getProperty("llama.PreloadGGUF"));
 
@@ -49,12 +49,8 @@ private static PartialModel preLoadGGUF(String modelPath) {
             GGUF gguf = GGUF.loadModel(path);
             try (FileChannel fileChannel = FileChannel.open(path, StandardOpenOption.READ)) {
                 modelLoader = new LlamaModelLoader(fileChannel, gguf, Options.DEFAULT_MAX_TOKENS, false);
-                return new PartialModel(
-                        path.getFileName().toString(),
-                        modelLoader.loadModel(), // TODO: needs proper handling for AOT
-                        gguf.getTensorDataOffset(),
-                        gguf.getTensorInfos()
-                );
+                return new PartialModel(path.getFileName().toString(), modelLoader.loadModel(), // TODO: needs proper handling for AOT
+                        gguf.getTensorDataOffset(), gguf.getTensorInfos());
             }
         } catch (IOException e) {
             throw new RuntimeException(e);
@@ -78,8 +74,7 @@ public static Model tryUsePreLoaded(Path modelPath, int contextLength) throws IOException {
             return null;
         }
         Llama baseModel = preLoaded.model();
-        try (var timer = Timer.log("Load tensors from pre-loaded model");
-                var fileChannel = FileChannel.open(modelPath, StandardOpenOption.READ)) {
+        try (var timer = Timer.log("Load tensors from pre-loaded model"); var fileChannel = FileChannel.open(modelPath, StandardOpenOption.READ)) {
            // Load only the tensors (mmap slices).
            Map<String, GGMLTensorEntry> tensorEntries = GGUF.loadTensors(fileChannel, preLoaded.tensorDataOffset(), preLoaded.tensorInfos());
            Weights weights = modelLoader.loadWeights(tensorEntries, baseModel.configuration());
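
For context on the hunks above: AOT preloading is gated on the llama.PreloadGGUF system property, which preLoadGGUF reads once while initializing the static PRELOADED_GGUF field, and tryUsePreLoaded returns null when the requested file does not match the preloaded one. A minimal sketch of opting in from plain Java; the launcher class and model path are hypothetical, and in a real ahead-of-time build the property would normally be set at build time rather than in main:

// Hypothetical launcher: the property must be set before the AOT class is
// first touched, because PRELOADED_GGUF is initialized in a static field.
public class PreloadDemo {
    public static void main(String[] args) throws Exception {
        System.setProperty("llama.PreloadGGUF", "/models/example.gguf"); // hypothetical path
        // First use of AOT runs preLoadGGUF; a matching path reuses the preloaded model.
        var model = AOT.tryUsePreLoaded(java.nio.file.Path.of("/models/example.gguf"), 512);
    }
}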

src/main/java/com/example/auxiliary/Utf8Mask.java

Lines changed: 2 additions & 0 deletions

@@ -2,9 +2,11 @@
 
 /** mask of a byte-sequence in UTF-8 encoding */
 public record Utf8Mask(int mask, int pattern, int len) {
+    //@formatter:off
     public static final Utf8Mask[] MASKS = {
             new Utf8Mask(0b11100000, 0b11000000, 2),
             new Utf8Mask(0b11110000, 0b11100000, 3),
             new Utf8Mask(0b11111000, 0b11110000, 4)
     };
+    //@formatter:on
 }
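
The directives bracket the MASKS table so the formatter keeps its hand alignment. The three mask/pattern pairs mirror the UTF-8 lead-byte encoding: a byte matching 110xxxxx opens a 2-byte sequence, 1110xxxx a 3-byte one, and 11110xxx a 4-byte one. A hedged sketch of how such a table is typically consulted (this helper is illustrative, not part of the commit):

// Length of the UTF-8 sequence implied by a lead byte, or 1 for ASCII and
// continuation bytes; the & against an int mask discards sign extension.
static int utf8SequenceLength(byte leadByte) {
    for (Utf8Mask m : Utf8Mask.MASKS) {
        if ((leadByte & m.mask()) == m.pattern()) {
            return m.len();
        }
    }
    return 1;
}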

src/main/java/com/example/inference/state/LlamaState.java

Lines changed: 3 additions & 3 deletions

@@ -56,9 +56,9 @@ protected StateFields createStateFields(Configuration config) {
         fields.positionHolder = new IntArray(1);
 
         // Temporary arrays
-        fields.temp = new FloatArray(1 + ((config.dim() + localSize-1) / localSize));
-        fields.tempFFN = new FloatArray(1 + ((config.dim() + localSize-1) / localSize));
-        fields.tempLogits = new FloatArray(1 + ((config.dim() + localSize-1) / localSize));
+        fields.temp = new FloatArray(1 + ((config.dim() + localSize - 1) / localSize));
+        fields.tempFFN = new FloatArray(1 + ((config.dim() + localSize - 1) / localSize));
+        fields.tempLogits = new FloatArray(1 + ((config.dim() + localSize - 1) / localSize));
 
         return fields;
     }
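
Only the spacing changes here, but the expression deserves a gloss: 1 + ((dim + localSize - 1) / localSize) is integer ceiling division plus one extra slot, which reads as one partial result per work-group of localSize threads plus, presumably, a slot for the final reduced value. With hypothetical sizes:

int dim = 4096, localSize = 256;                   // illustrative values only
int n = 1 + ((dim + localSize - 1) / localSize);   // ceil(4096/256) = 16, so n = 17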

src/main/java/com/example/inference/state/Qwen3State.java

Lines changed: 5 additions & 7 deletions

@@ -52,10 +52,8 @@ protected StateFields createStateFields(Configuration configuration) {
         fields.logits = ArrayFloatTensor.allocate(config.vocabularySize());
 
         // Key-value cache with Qwen3 dimensions
-        fields.keyCache = Stream.generate(() -> ArrayFloatTensor.allocate(config.contextLength(), nEmbdGqa))
-                .limit(config.numberOfLayers()).toArray(FloatTensor[]::new);
-        fields.valueCache = Stream.generate(() -> ArrayFloatTensor.allocate(config.contextLength(), nEmbdGqa))
-                .limit(config.numberOfLayers()).toArray(FloatTensor[]::new);
+        fields.keyCache = Stream.generate(() -> ArrayFloatTensor.allocate(config.contextLength(), nEmbdGqa)).limit(config.numberOfLayers()).toArray(FloatTensor[]::new);
+        fields.valueCache = Stream.generate(() -> ArrayFloatTensor.allocate(config.contextLength(), nEmbdGqa)).limit(config.numberOfLayers()).toArray(FloatTensor[]::new);
 
         // TornadoVM wrappers with Qwen3-specific sizes
         fields.wrapX = new FloatArray(config.dim());

@@ -76,9 +74,9 @@ protected StateFields createStateFields(Configuration configuration) {
         fields.positionHolder = new IntArray(1);
 
         // Temporary arrays
-        fields.temp = new FloatArray(1 + ((config.dim() + localSize-1) / localSize));
-        fields.tempFFN = new FloatArray(1 + ((config.dim() + localSize-1) / localSize));
-        fields.tempLogits = new FloatArray(1 + ((config.dim() + localSize-1) / localSize));
+        fields.temp = new FloatArray(1 + ((config.dim() + localSize - 1) / localSize));
+        fields.tempFFN = new FloatArray(1 + ((config.dim() + localSize - 1) / localSize));
+        fields.tempLogits = new FloatArray(1 + ((config.dim() + localSize - 1) / localSize));
 
         return fields;
     }
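
Joining the cache allocations onto single lines leaves the idiom intact: Stream.generate(...).limit(n).toArray(...) builds one independently allocated tensor per layer, since the generator's lambda runs once per element. A self-contained sketch with plain arrays standing in for ArrayFloatTensor and hypothetical dimensions:

// One (contextLength x nEmbdGqa) buffer per layer; each generated element
// is fresh storage, so layers never alias one another.
static float[][][] allocateCache(int layers, int contextLength, int nEmbdGqa) {
    return java.util.stream.Stream.generate(() -> new float[contextLength][nEmbdGqa])
            .limit(layers)
            .toArray(float[][][]::new);
}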

src/main/java/com/example/inference/state/State.java

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@
 /**
  * Base class for State
  */
-public abstract class State{
+public abstract class State {
 
     // current wave of activations
     public final FloatTensor x; // activation at current time stamp (dim,)

src/main/java/com/example/inference/weights/Weights.java

Lines changed: 10 additions & 0 deletions

@@ -2,6 +2,16 @@
 
 import com.example.core.model.GGMLType;
 
+/**
+ * GPULlama3.java uses two distinct weight types:
+ * <ul>
+ * <li><b>StandardWeights:</b> designed for standard Java-based inference on the CPU.</li>
+ * <li><b>TornadoWeights:</b> optimized for GPU-accelerated inference using TornadoVM.</li>
+ * </ul>
+ *
+ * The packages <code>weights.standard</code> and <code>weights.tornado</code> define
+ * base classes and model-specific implementations for weights in their respective formats.
+ */
 public interface Weights {
 
     GGMLType getWeightType();
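
Since the new JavaDoc names the two weight families, a hedged sketch of telling them apart at a call site may help; only StandardWeights and getWeightType() appear in this diff, so the GPU branch below is an assumption about the weights.tornado package:

// Illustrative dispatch only; the project's real inference entry points
// may select the CPU or TornadoVM path differently.
static String describePath(Weights w) {
    if (w instanceof StandardWeights) {
        return "CPU inference, weight type " + w.getWeightType();
    }
    return "TornadoVM GPU inference, weight type " + w.getWeightType();
}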

src/main/java/com/example/inference/weights/standard/LlamaStandardWeights.java

Lines changed: 60 additions & 3 deletions

@@ -3,12 +3,69 @@
 import com.example.core.model.GGMLType;
 import com.example.core.model.tensor.FloatTensor;
 
+/**
+ * A model-specific implementation of {@link StandardWeights} for the Llama model.
+ * This class encapsulates the weights required for performing inference
+ * using the Llama model in the standard CPU-based format.
+ *
+ * <p><b>Note:</b> This weight format is also used for the Mistral model.</p>
+ */
 public class LlamaStandardWeights extends StandardWeights {
 
-    public LlamaStandardWeights(FloatTensor token_embedding_table, FloatTensor[] rms_att_weight, FloatTensor[] wq, FloatTensor[] wk, FloatTensor[] wv, FloatTensor[] wo, FloatTensor[] rms_ffn_weight,
-            FloatTensor[] w1, FloatTensor[] w2, FloatTensor[] w3, FloatTensor rms_final_weight, FloatTensor freq_cis_real, FloatTensor freq_cis_imag, FloatTensor wcls, GGMLType weightType) {
-        super(token_embedding_table, rms_att_weight, wq, wk, wv, wo, rms_ffn_weight, w1, w2, w3, rms_final_weight, freq_cis_real, freq_cis_imag, wcls, weightType);
+    // @formatter:off
+    /**
+     * Constructor for LlamaStandardWeights.
+     *
+     * @param token_embedding_table The token embedding table tensor.
+     * @param rms_att_weight Array of RMS attention weight tensors.
+     * @param wq Array of query weight tensors.
+     * @param wk Array of key weight tensors.
+     * @param wv Array of value weight tensors.
+     * @param wo Array of output weight tensors.
+     * @param rms_ffn_weight Array of RMS feed-forward network weights.
+     * @param w1 Array of first feed-forward layer weights.
+     * @param w2 Array of second feed-forward layer weights.
+     * @param w3 Array of third feed-forward layer weights.
+     * @param rms_final_weight Final RMS weight tensor.
+     * @param freq_cis_real Real part of frequency cis tensor.
+     * @param freq_cis_imag Imaginary part of frequency cis tensor.
+     * @param wcls Classifier weight tensor for the output logits.
+     * @param weightType The GGML weight type.
+     */
+    public LlamaStandardWeights(
+            FloatTensor token_embedding_table,
+            FloatTensor[] rms_att_weight,
+            FloatTensor[] wq,
+            FloatTensor[] wk,
+            FloatTensor[] wv,
+            FloatTensor[] wo,
+            FloatTensor[] rms_ffn_weight,
+            FloatTensor[] w1,
+            FloatTensor[] w2,
+            FloatTensor[] w3,
+            FloatTensor rms_final_weight,
+            FloatTensor freq_cis_real,
+            FloatTensor freq_cis_imag,
+            FloatTensor wcls,
+            GGMLType weightType) {
+        // call to StandardWeights constructor
+        super(token_embedding_table,
+                rms_att_weight,
+                wq,
+                wk,
+                wv,
+                wo,
+                rms_ffn_weight,
+                w1,
+                w2,
+                w3,
+                rms_final_weight,
+                freq_cis_real,
+                freq_cis_imag,
+                wcls,
+                weightType);
     }
+    // @formatter:on
 
     @Override
     public GGMLType getWeightType() {

src/main/java/com/example/inference/weights/standard/Qwen3StandardWeights.java

Lines changed: 61 additions & 6 deletions

@@ -3,20 +3,75 @@
 import com.example.core.model.GGMLType;
 import com.example.core.model.tensor.FloatTensor;
 
+/**
+ * A model-specific implementation of {@link StandardWeights} for the Qwen-3 model.
+ * This class defines the weights required for performing inference
+ * using the Qwen-3 model in the standard CPU-based format.
+ */
 public class Qwen3StandardWeights extends StandardWeights {
     public final FloatTensor[] attnKNorm, attnQNorm;
 
-    public Qwen3StandardWeights(FloatTensor token_embedding_table, FloatTensor[] rms_att_weight,
-            FloatTensor[] wq, FloatTensor[] wk, FloatTensor[] wv, FloatTensor[] wo,
-            FloatTensor[] attnKNorm, FloatTensor[] attnQNorm,
+    // @formatter:off
+    /**
+     * Constructor for {@code Qwen3StandardWeights}.
+     *
+     * @param token_embedding_table The token embedding table, used to map tokens to embeddings.
+     * @param rms_att_weight The array of Root Mean Square (RMS) attention weights.
+     * @param wq The array of query weight tensors for attention layers.
+     * @param wk The array of key weight tensors for attention layers.
+     * @param wv The array of value weight tensors for attention layers.
+     * @param wo The array of output weight tensors for attention layers.
+     * @param attnKNorm The array of normalization tensors for attention keys.
+     * @param attnQNorm The array of normalization tensors for attention queries.
+     * @param rms_ffn_weight The array of RMS weights for feed-forward neural network layers.
+     * @param w1 The array of first weight tensors for feed-forward layers.
+     * @param w2 The array of second weight tensors for feed-forward layers.
+     * @param w3 The array of third weight tensors for feed-forward layers.
+     * @param rms_final_weight The RMS weight used for final output normalization.
+     * @param freq_cis_real The real part of the frequency position encodings.
+     * @param freq_cis_imag The imaginary part of the frequency position encodings.
+     * @param wcls The weight tensor for the classification head.
+     * @param weightType The type of the weights, defined as {@link GGMLType}.
+     */
+    public Qwen3StandardWeights(
+            FloatTensor token_embedding_table,
+            FloatTensor[] rms_att_weight,
+            FloatTensor[] wq,
+            FloatTensor[] wk,
+            FloatTensor[] wv,
+            FloatTensor[] wo,
+            FloatTensor[] attnKNorm,
+            FloatTensor[] attnQNorm,
             FloatTensor[] rms_ffn_weight,
-            FloatTensor[] w1, FloatTensor[] w2, FloatTensor[] w3,
-            FloatTensor rms_final_weight, FloatTensor freq_cis_real, FloatTensor freq_cis_imag, FloatTensor wcls, GGMLType weightType) {
+            FloatTensor[] w1,
+            FloatTensor[] w2,
+            FloatTensor[] w3,
+            FloatTensor rms_final_weight,
+            FloatTensor freq_cis_real,
+            FloatTensor freq_cis_imag,
+            FloatTensor wcls,
+            GGMLType weightType) {
         // call to StandardWeights constructor
-        super(token_embedding_table, rms_att_weight, wq, wk, wv, wo, rms_ffn_weight, w1, w2, w3, rms_final_weight, freq_cis_real, freq_cis_imag, wcls, weightType);
+        super(token_embedding_table,
+                rms_att_weight,
+                wq,
+                wk,
+                wv,
+                wo,
+                rms_ffn_weight,
+                w1,
+                w2,
+                w3,
+                rms_final_weight,
+                freq_cis_real,
+                freq_cis_imag,
+                wcls,
+                weightType);
+        // init Qwen3-specific fields
         this.attnKNorm = attnKNorm;
         this.attnQNorm = attnQNorm;
     }
+    // @formatter:on
 
     @Override
     public GGMLType getWeightType() {

src/main/java/com/example/inference/weights/standard/StandardWeights.java

Lines changed: 21 additions & 31 deletions

@@ -4,6 +4,11 @@
 import com.example.core.model.tensor.FloatTensor;
 import com.example.inference.weights.Weights;
 
+/**
+ * Base class that represents the standard weight format used for Java-based CPU inference.
+ * This abstract class provides the foundation for defining model-specific
+ * weights in the StandardWeights format.
+ */
 public abstract class StandardWeights implements Weights {
     // token embedding table
     public final FloatTensor token_embedding_table; // (vocab_size, dim)

@@ -14,8 +19,6 @@ public abstract class StandardWeights implements Weights {
     public final FloatTensor[] wk; // (layer, n_kv_heads, head_size)
     public final FloatTensor[] wv; // (layer, n_kv_heads * head_size)
     public final FloatTensor[] wo; // (layer, n_heads * head_size, dim)
-    //public final FloatTensor[] attnKNorm; // qwen3
-    //public final FloatTensor[] attnQNorm; // qwen3
     public final FloatTensor[] rms_ffn_weight; // (layer, dim)
 
     // weights for ffn

@@ -33,41 +36,27 @@ public abstract class StandardWeights implements Weights {
     // (optional) classifier weights for the logits, on the last layer
     protected final GGMLType weightType;
 
+    //@formatter:off
     /**
      * Constructor for standard (non-TornadoVM) mode
      *
-     * @param token_embedding_table
-     *            Token embeddings matrix
-     * @param rms_att_weight
-     *            RMSNorm weights for attention layers
-     * @param wq
-     *            Query weight matrices
-     * @param wk
-     *            Key weight matrices
-     * @param wv
-     *            Value weight matrices
-     * @param wo
-     *            Output projection matrices
-     * @param rms_ffn_weight
-     *            RMSNorm weights for FFN layers
-     * @param w1
-     *            First FFN weight matrices
-     * @param w2
-     *            Second FFN weight matrices
-     * @param w3
-     *            Third FFN weight matrices (gate)
-     * @param rms_final_weight
-     *            Final layer normalization weights
-     * @param freq_cis_real
-     *            RoPE cosine components
-     * @param freq_cis_imag
-     *            RoPE sine components
-     * @param wcls
-     *            Classifier weights for output logits
+     * @param token_embedding_table Token embeddings matrix
+     * @param rms_att_weight RMSNorm weights for attention layers
+     * @param wq Query weight matrices
+     * @param wk Key weight matrices
+     * @param wv Value weight matrices
+     * @param wo Output projection matrices
+     * @param rms_ffn_weight RMSNorm weights for FFN layers
+     * @param w1 First FFN weight matrices
+     * @param w2 Second FFN weight matrices
+     * @param w3 Third FFN weight matrices (gate)
+     * @param rms_final_weight Final layer normalization weights
+     * @param freq_cis_real RoPE cosine components
+     * @param freq_cis_imag RoPE sine components
+     * @param wcls Classifier weights for output logits
      */
     protected StandardWeights(FloatTensor token_embedding_table, FloatTensor[] rms_att_weight,
             FloatTensor[] wq, FloatTensor[] wk, FloatTensor[] wv, FloatTensor[] wo,
-            //FloatTensor[] attnKNorm, FloatTensor[] attnQNorm,
             FloatTensor[] rms_ffn_weight,
             FloatTensor[] w1, FloatTensor[] w2, FloatTensor[] w3,
             FloatTensor rms_final_weight,

@@ -92,4 +81,5 @@ protected StandardWeights(FloatTensor token_embedding_table, FloatTensor[] rms_att_weight,
         this.freq_cis_imag = freq_cis_imag;
         this.weightType = weightType;
     }
+    //@formatter:on
 }
