Cleanup Qwen3Kernels

orionpapadakis · orionpapadakis · commit 6224ae0d5ac9 · 2025-07-30T16:04:17.000+03:00
diff --git a/src/main/java/com/example/tornadovm/Qwen3Kernels.java b/src/main/java/com/example/tornadovm/Qwen3Kernels.java
@@ -9,24 +9,32 @@
 
 public class Qwen3Kernels {
 
-    //public static void dbgCopy(FloatArray destKeyCache, FloatArray srcKey, FloatArray destValueCache, FloatArray srcValue, IntArray positioNlayer, int kvDim, int layer, int contextLength) {
-    public static void dbgCopy(FloatArray srcBuffer, FloatArray dstBuffer, IntArray positioNlayer, int layer) {
-        //int position = positioNlayer.get(0);
-        //if (position == 1) {
+    /**
+     * Explicit copy-out kernel, useful for debugging.
+     * It snapshots the values of an array into a temporary buffer at a point of interest in the task graph.
+     * At the end of the task graph, the temporary buffer is copied out so the array values at that point can be inspected.
+     * @param srcBuffer the array we want to inspect.
+     * @param dstBuffer the tmp buffer.
+     */
+    public static void dbgCopy(FloatArray srcBuffer, FloatArray dstBuffer) {
             for (@Parallel int i = 0; i < srcBuffer.getSize(); i++) {
                 dstBuffer.set(i, srcBuffer.get(i));
             }
-        //}
     }
 
-    public static void rmsnormReductionWithOffset(
+    /**
+     * RmsNorm with parallel offset:
+     * The following three kernels together compute rmsnorm over offset ranges in parallel, for the qCur and Kcur rmsnorm calculations.
+     *
+     * Step 1: Reduction.
+     * This kernel squares each input element and reduces the squared values within each work group.
+     */
+    public static void rmsnormReductionWithParallelOffset(
             KernelContext context,
             FloatArray output,
             FloatArray x,
             int localMemSize) {
 
-        // global size: 0 - (config.numberOfHeads() * nEmbdHead)
-        // local size : 0 - nEmbdHead
         int gid = context.globalIdx;
         int lid = context.localIdx;
         int groupId = context.groupIdx;
@@ -36,13 +44,8 @@ public static void rmsnormReductionWithOffset(
         float[] localX = context.allocateFloatLocalArray(localMemSize);
 
         // Load input value and compute square
-        //int globalReadIndex = gid + offset;
-        //if (gid < size && globalReadIndex < x.getSize()) {
-            localX[lid] = x.get(gid);
-            localX[lid] = localX[lid] * localX[lid];
-        //} else {
-        //    localX[lid] = 0.0f;
-        //}
+        localX[lid] = x.get(gid);
+        localX[lid] = localX[lid] * localX[lid];
 
         // Perform parallel reduction within the work group
         for (int stride = (groupSize / 2); stride > 0; stride /= 2) {
@@ -59,7 +62,11 @@ public static void rmsnormReductionWithOffset(
         }
     }
 
-    // Second kernel - Combines partial sums and computes final normalization
+    /**
+     * RmsNorm with parallel offset:
+     *
+     * Step 2: Turns each reduction output (one per index below {@code offsetIndex}) into its final normalization factor.
+     */
     public static void rmsnormFinalNormalizationWithParallelOffset(
             KernelContext context,
             FloatArray output, // size should be related to offsetIndex
@@ -72,12 +79,7 @@ public static void rmsnormFinalNormalizationWithParallelOffset(
         // Only the index threads need to perform this calculation
         if (gid < offsetIndex) {
             // Combine partial sums from all workgroups
-            float ss = 0.0f;
-            //for (int i = 1; i < output.getSize(); i++) {  // Fixed bounds to avoid out of bounds
-//            for (int i = 1; i < output.getSize(); i++) {  // Fixed bounds to avoid out of bounds
-//                ss += output.get(i);
-//            }
-            ss = output.get(gid);
+            float ss = output.get(gid);
 
             ss /= size;
             ss += ermsNorm;
@@ -87,36 +89,28 @@ public static void rmsnormFinalNormalizationWithParallelOffset(
         }
     }
 
+    /**
+     * RmsNorm with parallel offset:
+     *
+     * Step 3: Scales each element of {@code out} in place by its work group's normalization factor and the matching weight.
+     */
     public static void rmsnormMapIndexInPlaceWithParallelOffset(
             KernelContext context,
-            FloatArray out,         // Q
+            FloatArray out,
             FloatArray weights,
             int size,
-            FloatArray ss           // tempQcur1
-    ) {
+            FloatArray ss) {
 
-        int gid = context.globalIdx; // 0 - size
-        //int index = offset + gid;
+        int gid = context.globalIdx;
         int groupId = context.groupIdx;
 
         float finalss = ss.get(groupId);
-        //out.set(index, weights.get(index % size) * (finalss * x.get(index)));
-        //out.set(index, weights.get(index) * (finalss * x.get(index)));
-        //if (index < offset + size) {
+
         if (gid < out.getSize()) { // TODO: check if redundant
             float a = weights.get(gid % size);
             float b = finalss * out.get(gid);
             out.set(gid, a * b);
         }
-
-        //old gid, index:
-        //        int gid = context.globalIdx; // 0 - size
-        //        int index = offset + gid;
-        //        context.globalBarrier();
-        //        // reset ss
-        //        if (gid < ss.getSize()) {
-        //            ss.set(gid, 0.0f);
-        //        }
     }
 
     /**
@@ -162,92 +156,12 @@ public static void rmsnormWithParallelOffset(
         }
     }
 
-    public static void reductionOneBlockWithLayerWithOffset(
-            KernelContext context,
-            FloatArray output,
-            FloatArray x,
-            int offset,
-            int size,
-            float ermsNorm,
-            int localMemSize) {
-
-        int gid = context.globalIdx; // 0 - nEmbHead = 128
-        int lid = context.localIdx;  // 0 - state.localsize [
-        int groupId = context.groupIdx;
-        int groupSize = context.localGroupSizeX;
-
-        // Allocate local memory with the provided size
-        float[] localX = context.allocateFloatLocalArray(localMemSize);
-
-        // Load input value and compute square
-        int globalReadIndex = gid + offset;
-        if (gid < size && globalReadIndex < x.getSize()) {
-            localX[lid] = x.get(globalReadIndex);
-            localX[lid] = localX[lid] * localX[lid];
-        } else {
-            localX[lid] = 0.0f;
-        }
-
-        // Perform parallel reduction within the work group
-        for (int stride = (groupSize / 2); stride > 0; stride /= 2) {
-            context.localBarrier();
-            if (lid < stride) {
-                localX[lid] += localX[lid + stride];
-            }
-        }
-
-        // Each workgroup stores its partial sum in a different location
-        if (lid == 0) {
-            // Store the partial sum from each workgroup
-            output.set(groupId + 1, localX[0]);
-        }
-
-//        // Only the first thread in the first workgroup computes the final normalization factor
-//        if (gid == 0) {
-//            // Combine partial sums from all workgroups
-//            float ss = 0.0f;
-//            for (int i = 1; i <= (size / localMemSize); i++) {  // Assuming 8 workgroups
-//                ss += output.get(i);
-//            }
-//
-//            ss /= size;
-//            ss += ermsNorm;
-//            ss = 1.0f / TornadoMath.sqrt(ss);
-//            output.set(0, ss);  // Store the final scale factor
-//        }
-    }
-
-    /**
-     * Normalize and scale (in-place) of rmsnorm operation.
-     */
-    public static void mapIndexInPlace(KernelContext context, FloatArray out, /*FloatArray x,*/ FloatArray weights, int offset, int size, FloatArray ss) {
-        int gid = context.globalIdx; // 0 - size
-        int index = offset + gid;
-
-        float finalss = ss.get(0);
-        //out.set(index, weights.get(index % size) * (finalss * x.get(index)));
-        //out.set(index, weights.get(index) * (finalss * x.get(index)));
-        //if (index < offset + size) {
-        if (index < out.getSize()) { // TODO: check if redundant
-            float a = weights.get(index % size);
-            float b = finalss * out.get(index);
-            out.set(index, a * b);
-        }
-
-        context.globalBarrier();
-        // reset ss
-        if (gid < ss.getSize()) {
-            ss.set(gid, 0.0f);
-        }
-    }
-
     public static void ropeRotation(KernelContext context,
             IntArray position,
             FloatArray q,
             FloatArray k,
             int numberOfKeyValueHeads,
             int nEmbdHead) {
-        //System.out.println("ropeRotationSplit");
         int h = context.globalIdx;
         int ic = context.globalIdy;
 
@@ -256,7 +170,6 @@ public static void ropeRotation(KernelContext context,
         int nComplEmbdHead = nEmbdHead / 2;
 
         // Compute RoPE frequencies for Qwen3
-        //float freq = 1.0f / TornadoMath.pow(10000.0f, (2.0f * ic) / (float) nEmbdHead);
         float theta = 1000000.0f;
         int i = ic * 2; // match i in precompute (see RoPE.precomputeFreqsCis)
         float freq = 1.0f / TornadoMath.pow(theta, (float)i / (float)nEmbdHead);
@@ -290,13 +203,11 @@ public static void processHeadsParallel(
             int nEmbdHeadV, /* = config.numberOfHeadsValue(), replace headSize in lines: 266, 268, 273 */
             int nEmbdGqa, /* kvDim */
             int gqa, /* kvMul */
-            int seqLen,
             IntArray positionHolder,
             FloatArray wrapAtt,
             int layer, int contextLength) {
 
         int pos = positionHolder.get(0);
-        //int loff = layer * contextLength * kvDim;
         int loff = layer * contextLength * nEmbdGqa;
 
         // Parallelize computation across attention heads
@@ -332,22 +243,16 @@ private static void processHeadTornado(
 
         // Base index for this head's attention weights
         int headOffset = h * (pos + 1);
-        //int headOffset = h * contextLength;
 
         // STEP 1: Calculate attention scores for all timesteps
         for (int t = 0; t <= pos; t++) {
-            //int kvHeadIdx = h / kvMul;
             int kvHeadIdx = h / gqa;
-            //int keyOffset = (int) (loff + t * kvDim + kvHeadIdx * headSize);
             int keyOffset = (int) (loff + t * nEmbdGqa + kvHeadIdx * nEmbdHeadK); // line 255
 
             float score = 0.0f;
-            //for (int i = 0; i < headSize; i++) {
             for (int i = 0; i < nEmbdHeadK; i++) {
-                //score += allQ.get(h * headSize + i) * key_cache.get(keyOffset + i);
                 score += allQ.get(h * nEmbdHeadK + i) * key_cache.get(keyOffset + i); // line 255
             }
-            //score = score / TornadoMath.sqrt(headSize);
             score = score / TornadoMath.sqrt(nEmbdHead); // line 257
 
             // Store in attention buffer
@@ -380,28 +285,24 @@ private static void processHeadTornado(
         }
 
         // STEP 5: Compute weighted sum of values for each dimension
-        //for (int i = 0; i < headSize; i++) {
         for (int i = 0; i < nEmbdHeadV; i++) {
             float weightedSum = 0.0f;
             for (int t = 0; t <= pos; t++) {
-                //int kvHeadIdx = h / kvMul;
                 int kvHeadIdx = h / gqa;
-                //int valueOffset = (int) (loff + t * kvDim + kvHeadIdx * headSize);
                 int valueOffset = (int) (loff + t * nEmbdGqa + kvHeadIdx * nEmbdHeadV); //line 273
                 weightedSum += wrapAtt.get(headOffset + t) * value_cache.get(valueOffset + i);
             }
-            //allXb.set(h * headSize + i, weightedSum);
             allXb.set(h * nEmbdHeadV + i, weightedSum); // offset from line 266
         }
     }
 
     public static void matrixVectorGenericWithResidual(
             KernelContext context,
-            FloatArray v,           // vector = [2048]
-            FloatArray out,         // out    = [1024]
-            HalfFloatArray m,       // matrix = [2048, 1024]
-            int dim1,               // dim1   = 2048, vectorSize
-            int dim0,               // dim0   = 1024, outputSize
+            FloatArray v,
+            FloatArray out,
+            HalfFloatArray m,
+            int dim1,
+            int dim0,
             int localWorkGroupSize) {
 
         // One row per workgroup (not per thread)
@@ -431,8 +332,8 @@ public static float matrixVectorRowMajorOptimized(
             int dim1,
             int dim0
     ) {
-        int rowId = context.groupIdx; // 0-dim
-        int localId = context.localIdx; // 0-32
+        int rowId = context.groupIdx;
+        int localId = context.localIdx;
 
         // Allocate local memory for reduction
         float[] localSum = context.allocateFloatLocalArray(localSize);
@@ -444,48 +345,6 @@ public static float matrixVectorRowMajorOptimized(
         for (int j = localId; j < dim1; j += localSize) {
             int matrixIdx = rowOffset + j;
             partialSum += m.get(matrixIdx).getFloat32() * v.get(j);
-            //partialSum += w.get(rowOffset + j).getFloat32() * x.get(j);
-        }
-
-        // Store partial sum in local memory
-        localSum[localId] = partialSum;
-        context.localBarrier();
-
-        // Parallel reduction within workgroup
-        for (int stride = localSize / 2; stride > 0; stride >>= 1) {
-            if (localId < stride) {
-                localSum[localId] += localSum[localId + stride];
-            }
-            context.localBarrier();
-        }
-
-        return localSum[0];
-    }
-
-    public static float matrixVectorRowMajorOptimized2(
-            KernelContext context,
-            int localSize,
-            FloatArray v,           // input vector [2048]
-            HalfFloatArray m,       // matrix [2048, 1024]
-            int vectorSize,         // 2048
-            int outputSize,
-            int rowId               // which output row we're computing (0-1023)
-    ) {
-        int localId = context.localIdx; // 0 to localSize-1
-
-        // Allocate local memory for reduction
-        float[] localSum = context.allocateFloatLocalArray(localSize);
-
-        // For matrix [2048, 1024], if we want row 'rowId' of the OUTPUT,
-        // we need to compute dot product of INPUT vector with COLUMN 'rowId' of the matrix
-        // Matrix element [i][j] is at index i * outputSize + j
-        // We want column 'rowId', so elements are at: 0*outputSize + rowId, 1*outputSize + rowId, etc.
-
-        // Each thread calculates partial dot product
-        float partialSum = 0.0f;
-        for (int i = localId; i < vectorSize; i += localSize) {
-            int matrixIdx = i * outputSize + rowId;  // Column-wise access for row rowId
-            partialSum += m.get(matrixIdx).getFloat32() * v.get(i);
         }
 
         // Store partial sum in local memory