Optimize Qcur rmsnorm

orionpapadakis · orionpapadakis · commit 40733d018248 · 2025-07-29T16:06:27.000+03:00
diff --git a/src/main/java/com/example/inference/state/Qwen3State.java b/src/main/java/com/example/inference/state/Qwen3State.java
@@ -28,12 +28,13 @@ public final class Qwen3State extends State {
     public Qwen3State(Configuration config, int batchsize) {
         super(config, batchsize);
         // Initialize Qwen3-specific field
+        Qwen3Configuration qwen3config = (Qwen3Configuration) config;
+        int nEmbdHead = qwen3config.numberOfHeads();
         this.kq = ArrayFloatTensor.allocate(config.numberOfHeads(), 32, 15);
-        this.tempQcur = new FloatArray(1 + ((config.dim() + localSize-1) / localSize));
+        this.tempQcur = new FloatArray(nEmbdHead);
         this.tempKcur = new FloatArray(1 + ((config.dim() + localSize-1) / localSize));
 
         // dbg buffers
-        Qwen3Configuration qwen3config = (Qwen3Configuration) config;
         int nHeadKv = qwen3config.numberOfKeyValueHeads();
         int nEmbdHeadK = qwen3config.numberOfHeadsKey();
         int nEmbdKGqa = nEmbdHeadK * nHeadKv;
diff --git a/src/main/java/com/example/tornadovm/Qwen3Kernels.java b/src/main/java/com/example/tornadovm/Qwen3Kernels.java
@@ -19,6 +19,106 @@ public static void dbgCopy(FloatArray srcBuffer, FloatArray dstBuffer, IntArray
         //}
     }
 
+    public static void rmsnormReductionWithOffset(
+            KernelContext context,
+            FloatArray output,       // receives one sum of squares per work-group, written at index groupIdx
+            FloatArray x,            // input values, read at index globalIdx
+            int localMemSize) {      // length of the local scratch array; it is indexed by localIdx
+        // First RMSNorm stage: every work-group reduces its elements of x to a single sum of squares.
+        // The grid scheduler launches this task with global size = config.numberOfHeads() * nEmbdHead
+        // and local size = nEmbdHead.
+        int gid = context.globalIdx;
+        int lid = context.localIdx;
+        int groupId = context.groupIdx;
+        int groupSize = context.localGroupSizeX;
+
+        // Local scratch array (context.allocateFloatLocalArray), indexed by lid
+        float[] localX = context.allocateFloatLocalArray(localMemSize);
+        // Load one element per work-item and square it. The bounds guard below is commented
+        // out, so this assumes the launch covers exactly the valid range of x - TODO confirm.
+        //int globalReadIndex = gid + offset;
+        //if (gid < size && globalReadIndex < x.getSize()) {
+            localX[lid] = x.get(gid);
+            localX[lid] = localX[lid] * localX[lid];
+        //} else {
+        //    localX[lid] = 0.0f;
+        //}
+
+        // Pairwise reduction over the scratch array: the active range halves each pass, leaving the total in localX[0]
+        for (int stride = (groupSize / 2); stride > 0; stride /= 2) { // NOTE(review): covers every lane only if groupSize is a power of two
+            context.localBarrier(); // wait for the previous pass (or the initial loads) before reading neighbours
+            if (lid < stride) {
+                localX[lid] += localX[lid + stride];
+            }
+        }
+
+        // Work-item 0 of each group publishes the group's total
+        if (lid == 0) {
+            // output[groupId] = sum of x[i]^2 over the elements handled by this work-group
+            output.set(groupId, localX[0]);
+        }
+    }
+
+    // Second kernel - turns each entry of output from a sum of squares into the scale 1 / sqrt(sum / size + ermsNorm), in place
+    public static void rmsnormFinalNormalizationWithParallelOffset(
+            KernelContext context,
+            FloatArray output, // in/out; needs at least offsetIndex entries, each overwritten in place
+            int offsetIndex,   // number of entries to convert; the planner passes config.numberOfHeads()
+            int size,          // divisor turning the sum into a mean; the planner passes nEmbdHead
+            float ermsNorm) {  // added to the mean before the square root; the planner passes config.rmsNormEps()
+
+        int gid = context.globalIdx;
+
+        // Work-items beyond the first offsetIndex have nothing to do
+        if (gid < offsetIndex) {
+            // Each work-item converts exactly one entry; nothing is combined across work-groups here,
+            float ss = 0.0f;
+            // so the disabled loop below is not needed and the 0.0f above is overwritten right away:
+//            for (int i = 1; i < output.getSize(); i++) {  // Fixed bounds to avoid out of bounds
+//                ss += output.get(i);
+//            }
+            ss = output.get(gid);
+
+            ss /= size;        // sum -> mean
+            ss += ermsNorm;    // keeps the square root away from zero
+            ss = 1.0f / TornadoMath.sqrt(ss);
+            // Overwrite the sum in place with its scale factor
+            output.set(gid, ss);  // Store the final scale factor
+        }
+    }
+
+    public static void rmsnormMapIndexInPlaceWithParallelOffset(
+            KernelContext context,
+            FloatArray out,         // in/out: scaled in place; the planner passes state.wrapQ
+            FloatArray weights,     // gains, read at index gid % size
+            int size,               // period of the weights index; the planner passes nEmbdHead
+            FloatArray ss           // scale factors, read at index groupIdx; the planner passes state.tempQcur
+    ) {
+
+        int gid = context.globalIdx; // global index into out (spans all work-groups, not just 0..size)
+        // No offset is applied: gid indexes out directly.
+        int groupId = context.groupIdx; // selects this work-group's scale factor in ss
+
+        float finalss = ss.get(groupId);
+        // Computes out[gid] = weights[gid % size] * (finalss * out[gid]).
+        // NOTE(review): ss.get(groupId) above runs before the bounds check, so ss needs
+        // one entry per launched work-group - TODO confirm against the grid scheduler.
+        if (gid < out.getSize()) { // TODO: check if redundant
+            float a = weights.get(gid % size);
+            float b = finalss * out.get(gid);
+            out.set(gid, a * b);
+        }
+
+        // Previous body, kept for reference - it indexed with an offset and reset ss afterwards:
+        //        int gid = context.globalIdx; // 0 - size
+        //        int index = offset + gid;
+        //        context.globalBarrier();
+        //        // reset ss
+        //        if (gid < ss.getSize()) {
+        //            ss.set(gid, 0.0f);
+        //        }
+    }
+
     public static void reductionOneBlockWithLayerWithOffset(
             KernelContext context,
             FloatArray output,
diff --git a/src/main/java/com/example/tornadovm/Qwen3TornadoVMLayerPlanner.java b/src/main/java/com/example/tornadovm/Qwen3TornadoVMLayerPlanner.java
@@ -75,7 +75,6 @@ public Tuple2<List<ImmutableTaskGraph>, GridScheduler> setupTornadoForwardPlanLa
         state.tempFFN.init(0.0f);
         state.tempLogits.init(0.0f);
         state.wrapLogits.init(0.0f);
-        state.tempQcur.init(0.0f);
         state.tempKcur.init(0.0f);
 
 //        state.dbgQ.init(0.0f);
@@ -185,28 +184,28 @@ public Tuple2<List<ImmutableTaskGraph>, GridScheduler> setupTornadoForwardPlanLa
 //            unifiedLayer.transferToHost(DataTransferMode.EVERY_EXECUTION, state.wrapV);
 
             // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-            for (int i = 0; i < config.numberOfHeads(); i++) {
-                //rmsnorm(state.q, state.q, weights.attnQNorm[curLayer], i * nEmbdHead, nEmbdHead, config.rmsNormEps());
-                int offset = i * nEmbdHead;
-                unifiedLayer.task("reductionsOneBlock" + "_Qcur_" + i ,
-                                Qwen3Kernels::reductionOneBlockWithLayerWithOffset,
-                                context,
-                                state.tempQcur,         // output
-                                state.wrapQ,            // input
-                                offset, nEmbdHead,
-                                config.rmsNormEps(), state.localSize)
-                        .task("reductionFinalNormalization" + "_Qcur_" + i ,
-                                    TransformerComputeKernelsLayered::reductionFinalNormalization, context,
-                                    state.tempQcur,     // output
-                                    nEmbdHead,
-                                    config.rmsNormEps())
-                            .task("mapContext" + "_Qcur_" + i,
-                                    Qwen3Kernels::mapIndexInPlace, context,
-                                    state.wrapQ,        // output
-                                    weights.rms_att_QNormLayered[layerIndex],
-                                    offset, nEmbdHead,
-                                    state.tempQcur);
-            }
+            //rmsnorm(state.q, state.q, weights.attnQNorm[curLayer], i * nEmbdHead, nEmbdHead, config.rmsNormEps());
+            unifiedLayer
+                    .task("reductionsOneBlock_Qcur",
+                            Qwen3Kernels::rmsnormReductionWithOffset,
+                            context,
+                            state.tempQcur,         // output
+                            state.wrapQ,            // input
+                            state.localSize) // currently 128, should be variable of global nEmbHead
+                    .task("reductionFinalNormalization_Qcur",
+                            Qwen3Kernels::rmsnormFinalNormalizationWithParallelOffset,
+                            context,
+                            state.tempQcur,     // output
+                            config.numberOfHeads(),
+                            nEmbdHead,
+                            config.rmsNormEps())
+                    .task("mapContext_Qcur",
+                            Qwen3Kernels::rmsnormMapIndexInPlaceWithParallelOffset,
+                            context,
+                            state.wrapQ,        // output
+                            weights.rms_att_QNormLayered[layerIndex],
+                            nEmbdHead,
+                            state.tempQcur);
 
 //            unifiedLayer.task("dbg_copy_out_wrapQ",
 //                    Qwen3Kernels::dbgCopy,
@@ -446,6 +445,15 @@ private GridScheduler setupQwen3GridSchedulersLayeredNonNvidia() {
         curWorker.setGlobalWork(nEmbdHead, 1, 1);  // Set global work size to total dimension
         curWorker.setLocalWork(128, 1, 1);         // Set local work size to 256 (standard efficient size)
 
+        // config.numberOfHeads() = 16
+        // nEmbdHead = 128
+        // total = 2048
+        WorkerGrid qCurWorker = new WorkerGrid1D(config.numberOfHeads() * nEmbdHead);
+        qCurWorker.setLocalWork(nEmbdHead, 1, 1);
+
+        WorkerGrid qCurWorker2 = new WorkerGrid1D(config.numberOfHeads());
+        qCurWorker2.setLocalWork(1, 1, 1);
+
         int h = config.numberOfHeads();
         int ic = nEmbdHead / 2;
         WorkerGrid ropeWorker = new WorkerGrid2D(h, ic);
@@ -485,24 +493,10 @@ private GridScheduler setupQwen3GridSchedulersLayeredNonNvidia() {
             gridScheduler.addWorkerGrid("layer_" + i + ".kmatmul", matmulKVRowMajorWorker);
             gridScheduler.addWorkerGrid("layer_" + i + ".vmatmul", matmulKVRowMajorWorker);
 
-//            //int size = nEmbdHead;
-//            for (int j = 0; j < config.numberOfHeads(); j++) {
-////                int offset = j * nEmbdHead;
-////                WorkerGrid qRmsReductionWorker = new WorkerGrid1D(size);
-////                qRmsReductionWorker.setLocalWork(state.localSize, 1, 1);
-//                gridScheduler.addWorkerGrid("layer_" + i + ".reductionsOneBlock" + "_Qcur_" + j, curWorker);
-//                //gridScheduler.addWorkerGrid("layer_" + i + ".reductionFinalNormalization" + "_Qcur_" + j, curWorker);
-//                gridScheduler.addWorkerGrid("layer_" + i + ".mapContext" + "_Qcur_" + j, curWorker);
-//            }
-            // Create separate WorkerGrid for each head
-            for (int j = 0; j < config.numberOfHeads(); j++) {
-                WorkerGrid headWorker = new WorkerGrid1D(nEmbdHead);  // nEmbdHead = 128
-                headWorker.setGlobalWork(nEmbdHead, 1, 1);  // Set global work size to total dimension
-                headWorker.setLocalWork(128, 1, 1);
-
-                gridScheduler.addWorkerGrid("layer_" + i + ".reductionsOneBlock" + "_Qcur_" + j, headWorker);
-                gridScheduler.addWorkerGrid("layer_" + i + ".mapContext" + "_Qcur_" + j, headWorker);
-            }
+            gridScheduler.addWorkerGrid("layer_" + i + ".reductionsOneBlock_Qcur", qCurWorker);
+            gridScheduler.addWorkerGrid("layer_" + i + ".reductionFinalNormalization_Qcur", qCurWorker2);
+            gridScheduler.addWorkerGrid("layer_" + i + ".mapContext_Qcur", qCurWorker);
+
             for (int j = 0; j < config.numberOfKeyValueHeads(); j++) {
                 gridScheduler.addWorkerGrid("layer_" + i + ".reductionsOneBlock" + "_Kcur_" + j, curWorker);
                 //gridScheduler.addWorkerGrid("layer_" + i + ".reductionFinalNormalization" + "_Kcur_" + j, curWorker);