From 8ff7a96003f531b6651780b4132eee1b14481153 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas <akarshan@menlo.ai>
Date: Sun, 18 May 2025 11:37:32 +0530
Subject: [PATCH 01/10] SYCL: Add non contiguous input support to norm kernel

---
 ggml/src/ggml-sycl/ggml-sycl.cpp |  1 +
 ggml/src/ggml-sycl/norm.cpp      | 86 ++++++++++++++++----------------
 2 files changed, 44 insertions(+), 43 deletions(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 271f54e5773d9..eae76158ac23e 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4192,6 +4192,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             return (op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
 #endif
         case GGML_OP_NORM:
+            return true;
         case GGML_OP_RMS_NORM:
         case GGML_OP_L2_NORM:
         case GGML_OP_GROUP_NORM:
diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp
index 4e9f438b46ba6..7d73da605fce4 100644
--- a/ggml/src/ggml-sycl/norm.cpp
+++ b/ggml/src/ggml-sycl/norm.cpp
@@ -1,50 +1,47 @@
 #include "norm.hpp"
+#include "ggml-sycl/presets.hpp"
 
-static void norm_f32(const float* x, float* dst, const int ncols, const float eps,
-    const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-        item_ct1.get_local_id(1);
-    const int tid = item_ct1.get_local_id(2);
+static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
+        const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
 
-    const int nthreads = item_ct1.get_local_range(2);
-    const int nwarps = nthreads / WARP_SIZE;
-    sycl::float2 mean_var = sycl::float2(0.f, 0.f);
+    const int nrows = item_ct1.get_group_range(2);
+    const int nchannels = item_ct1.get_group_range(1);
+    int sample  = item_ct1.get_group(0);
+    int channel = item_ct1.get_group(1);
+    int row     = item_ct1.get_group(2);
 
+    int tid = item_ct1.get_local_id(2);
+
+    x += sample  * stride_sample + channel * stride_channel + row     * stride_row;
+    dst += ((sample * nchannels + channel) * nrows + row) * ncols;
+
+    sycl::float2 mean_var{0.f, 0.f};
     for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row * ncols + col];
+        const float xi = x[col];
         mean_var.x() += xi;
         mean_var.y() += xi * xi;
     }
 
     // sum up partial sums
     mean_var = warp_reduce_sum(mean_var, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+    if  (block_size > WARP_SIZE) {
+        int warp_id = tid / WARP_SIZE;
+        int lane_id = tid % WARP_SIZE;
         if (lane_id == 0) {
             s_sum[warp_id] = mean_var;
         }
-        /*
-        DPCT1118:0: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
         item_ct1.barrier(sycl::access::fence_space::local_space);
-        mean_var = 0.f;
-        size_t nreduce = nwarps / WARP_SIZE;
-        for (size_t i = 0; i < nreduce; i += 1)
-        {
-            mean_var += s_sum[lane_id + i * WARP_SIZE];
-        }
+
+        mean_var = s_sum[lane_id];
         mean_var = warp_reduce_sum(mean_var, item_ct1);
     }
 
     const float mean = mean_var.x() / ncols;
-    const float var = mean_var.y() / ncols - mean * mean;
+    const float var  = mean_var.y() / ncols - mean * mean;
     const float inv_std = sycl::rsqrt(var + eps);
 
     for (int col = tid; col < ncols; col += block_size) {
-        dst[row * ncols + col] = (x[row * ncols + col] - mean) * inv_std;
+        dst[col] = (x[col] - mean) * inv_std;
     }
 }
 
@@ -224,20 +221,20 @@ static void l2_norm_f32(const float* x, float* dst, const int ncols, const float
     }
 }
 
-static void norm_f32_sycl(const float* x, float* dst, const int ncols,
-    const int nrows, const float eps,
-    queue_ptr stream, int device) {
+static void norm_f32_sycl(const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
+        const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
+        const float eps, queue_ptr stream, int device) {
+
+    const sycl::range<3> global_dims(nsamples, nchannels, nrows);
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE); // Equivalent to CUDA's (WARP_SIZE, 1, 1)
         stream->submit([&](sycl::handler& cgh) {
             cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                    block_dims),
+                sycl::nd_range<3>(global_dims * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
                 [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                    norm_f32(x, dst, ncols, eps, item_ct1,
-                        nullptr, WARP_SIZE);
+                    norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
                 });
             });
     }
@@ -251,16 +248,13 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
         stream->submit([&](sycl::handler& cgh) {
-            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
-                sycl::range<1>(work_group_size / WARP_SIZE), cgh);
+            auto s_sum_acc_ct1 = sycl::local_accessor<sycl::float2, 1>(sycl::range<1>(work_group_size / WARP_SIZE), cgh);
 
             cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                    block_dims),
+                sycl::nd_range<3>(global_dims * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
                 [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                    norm_f32(x, dst, ncols, eps, item_ct1,
-                        get_pointer(s_sum_acc_ct1), work_group_size);
+                    norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
                 });
             });
     }
@@ -398,12 +392,12 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
 }
 
 void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
+    const ggml_tensor * src0 = dst->src[0];
 
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
-    const int64_t ne00 = dst->src[0]->ne[0];
-    const int64_t nrows = ggml_nrows(dst->src[0]);
+    GGML_TENSOR_UNARY_OP_LOCALS
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
@@ -411,8 +405,14 @@ void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
 
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
-
-    norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device);
+    GGML_ASSERT(eps >= 0.0f);
+    const size_t ts0 = ggml_type_size(src0->type);
+    GGML_ASSERT(nb00 == ts0);
+    const int64_t s01 = nb01 / ts0;
+    const int64_t s02 = nb02 / ts0;
+    const int64_t s03 = nb03 / ts0;
+
+    norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
 }
 
 void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {

From 5107933df78963c2a3e72a436a8146978b26024c Mon Sep 17 00:00:00 2001
From: Akarshan Biswas <akarshan@menlo.ai>
Date: Sun, 18 May 2025 12:05:35 +0530
Subject: [PATCH 02/10] refactor and add RMS_NORM non contiguous input support

ggml-ci
---
 ggml/src/ggml-sycl/ggml-sycl.cpp |  2 +-
 ggml/src/ggml-sycl/norm.cpp      | 77 ++++++++++++++++----------------
 2 files changed, 40 insertions(+), 39 deletions(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index eae76158ac23e..3bf95f88cb3b8 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4192,8 +4192,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             return (op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
 #endif
         case GGML_OP_NORM:
-            return true;
         case GGML_OP_RMS_NORM:
+            return true;
         case GGML_OP_L2_NORM:
         case GGML_OP_GROUP_NORM:
             return ggml_is_contiguous(op->src[0]);
diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp
index 7d73da605fce4..59fbd4b880fcb 100644
--- a/ggml/src/ggml-sycl/norm.cpp
+++ b/ggml/src/ggml-sycl/norm.cpp
@@ -1,18 +1,17 @@
 #include "norm.hpp"
-#include "ggml-sycl/presets.hpp"
 
 static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
         const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
 
     const int nrows = item_ct1.get_group_range(2);
     const int nchannels = item_ct1.get_group_range(1);
-    int sample  = item_ct1.get_group(0);
-    int channel = item_ct1.get_group(1);
-    int row     = item_ct1.get_group(2);
+    const int sample  = item_ct1.get_group(0);
+    const int channel = item_ct1.get_group(1);
+    const int row     = item_ct1.get_group(2);
 
-    int tid = item_ct1.get_local_id(2);
+    const int tid = item_ct1.get_local_id(2);
 
-    x += sample  * stride_sample + channel * stride_channel + row     * stride_row;
+    x += sample  * stride_sample + channel * stride_channel + row * stride_row;
     dst += ((sample * nchannels + channel) * nrows + row) * ncols;
 
     sycl::float2 mean_var{0.f, 0.f};
@@ -132,17 +131,25 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
     }
 }
 
-static void rms_norm_f32(const float* x, float* dst, const int ncols, const float eps,
-    const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-        item_ct1.get_local_id(1);
+static void rms_norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
+        const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
+
+    const int nrows = item_ct1.get_group_range(2);
+    const int nchannels = item_ct1.get_group_range(1);
+    const int sample  = item_ct1.get_group(0);
+    const int channel = item_ct1.get_group(1);
+    const int row     = item_ct1.get_group(2);
+
     const int tid = item_ct1.get_local_id(2);
-    const int nthreads = item_ct1.get_local_range(2);
-    const int nwarps = nthreads / WARP_SIZE;
+
+    x   += sample*stride_sample + channel*stride_channel + row*stride_row;
+    dst += ((sample*nchannels + channel)*nrows + row)*ncols;
+
+
     float tmp = 0.0f; // partial sum for thread in warp
 
     for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row * ncols + col];
+        const float xi = x[col];
         tmp += xi * xi;
     }
 
@@ -155,17 +162,9 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa
         if (lane_id == 0) {
             s_sum[warp_id] = tmp;
         }
-        /*
-        DPCT1118:3: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
+
         item_ct1.barrier(sycl::access::fence_space::local_space);
-        size_t nreduce = nwarps / WARP_SIZE;
-        tmp = 0.f;
-        for (size_t i = 0; i < nreduce; i += 1)
-        {
-            tmp += s_sum[lane_id + i * WARP_SIZE];
-        }
+        tmp = s_sum[lane_id];
         tmp = warp_reduce_sum(tmp, item_ct1);
     }
 
@@ -173,7 +172,7 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa
     const float scale = sycl::rsqrt(mean + eps);
 
     for (int col = tid; col < ncols; col += block_size) {
-        dst[row * ncols + col] = scale * x[row * ncols + col];
+        dst[col] = scale * x[col];
     }
 }
 
@@ -307,21 +306,20 @@ static void group_norm_f32_sycl(const float* x, float* dst,
     }
 }
 
-static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
-    const int nrows, const float eps,
-    queue_ptr stream, int device) {
+static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
+        const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, queue_ptr stream, int device) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
+
+    const sycl::range<3> global_dims(nsamples, nchannels, nrows);
     if (ncols < 1024) {
         const sycl::range<3> block_dims(1, 1, WARP_SIZE);
         stream->submit([&](sycl::handler& cgh) {
             cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                    block_dims),
+                sycl::nd_range<3>(global_dims * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
                 [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                    rms_norm_f32(x, dst, ncols, eps, item_ct1,
-                        nullptr, WARP_SIZE);
+                    rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
                 });
             });
     }
@@ -338,12 +336,10 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
             sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
                 cgh);
             cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                    block_dims),
+                sycl::nd_range<3>(global_dims * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
                 [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                    rms_norm_f32(x, dst, ncols, eps, item_ct1,
-                        get_pointer(s_sum_acc_ct1), work_group_size);
+                    rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
                 });
             });
     }
@@ -436,11 +432,10 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
 
 void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
+    const ggml_tensor * src0 = dst->src[0];
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
-    const int64_t ne00 = dst->src[0]->ne[0];
-    const int64_t nrows = ggml_nrows(dst->src[0]);
     dpct::queue_ptr main_stream = ctx.stream();
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
@@ -450,7 +445,13 @@ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
-    rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device);
+    GGML_TENSOR_UNARY_OP_LOCALS
+    const size_t ts0 = ggml_type_size(src0->type);
+    GGML_ASSERT(nb00 == ts0);
+    const int64_t s01 = nb01 / ts0;
+    const int64_t s02 = nb02 / ts0;
+    const int64_t s03 = nb03 / ts0;
+    rms_norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
 }
 
 void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {

From 0a1888cb33f678d6f0d44e862ec0237750f562bc Mon Sep 17 00:00:00 2001
From: Akarshan Biswas <akarshan@menlo.ai>
Date: Sun, 18 May 2025 12:29:01 +0530
Subject: [PATCH 03/10] restore subgroup reduction for multi-subgroup thread
 blocks in norm kernels

---
 ggml/src/ggml-sycl/norm.cpp | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp
index 59fbd4b880fcb..47f241b25ac84 100644
--- a/ggml/src/ggml-sycl/norm.cpp
+++ b/ggml/src/ggml-sycl/norm.cpp
@@ -5,11 +5,13 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t
 
     const int nrows = item_ct1.get_group_range(2);
     const int nchannels = item_ct1.get_group_range(1);
+    const int nthreads = item_ct1.get_local_range(2);
     const int sample  = item_ct1.get_group(0);
     const int channel = item_ct1.get_group(1);
     const int row     = item_ct1.get_group(2);
 
     const int tid = item_ct1.get_local_id(2);
+    const int nwarps = nthreads / WARP_SIZE;
 
     x += sample  * stride_sample + channel * stride_channel + row * stride_row;
     dst += ((sample * nchannels + channel) * nrows + row) * ncols;
@@ -30,8 +32,12 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t
             s_sum[warp_id] = mean_var;
         }
         item_ct1.barrier(sycl::access::fence_space::local_space);
-
-        mean_var = s_sum[lane_id];
+        mean_var = 0.f;
+        size_t nreduce = nwarps / WARP_SIZE;
+        for (size_t i = 0; i < nreduce; i += 1)
+        {
+            mean_var += s_sum[lane_id + i * WARP_SIZE];
+        }
         mean_var = warp_reduce_sum(mean_var, item_ct1);
     }
 
@@ -139,8 +145,10 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6
     const int sample  = item_ct1.get_group(0);
     const int channel = item_ct1.get_group(1);
     const int row     = item_ct1.get_group(2);
+    const int nthreads = item_ct1.get_local_range(2);
 
     const int tid = item_ct1.get_local_id(2);
+    const int nwarps = nthreads / WARP_SIZE;
 
     x   += sample*stride_sample + channel*stride_channel + row*stride_row;
     dst += ((sample*nchannels + channel)*nrows + row)*ncols;
@@ -164,7 +172,12 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6
         }
 
         item_ct1.barrier(sycl::access::fence_space::local_space);
-        tmp = s_sum[lane_id];
+        size_t nreduce = nwarps / WARP_SIZE;
+        tmp = 0.f;
+        for (size_t i = 0; i < nreduce; i += 1)
+        {
+            tmp += s_sum[lane_id + i * WARP_SIZE];
+        }
         tmp = warp_reduce_sum(tmp, item_ct1);
     }
 

From 23cc97967b8c6b8b1a16ba3dd7c1432c980eaf65 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas <akarshan@menlo.ai>
Date: Sun, 18 May 2025 18:22:03 +0530
Subject: [PATCH 04/10] Swap grid dims of nsamples and nrows

ggml-ci
---
 ggml/src/ggml-sycl/norm.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp
index 47f241b25ac84..e15305e1a5e95 100644
--- a/ggml/src/ggml-sycl/norm.cpp
+++ b/ggml/src/ggml-sycl/norm.cpp
@@ -3,12 +3,12 @@
 static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
         const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
 
-    const int nrows = item_ct1.get_group_range(2);
+    const int nrows = item_ct1.get_group_range(0);
     const int nchannels = item_ct1.get_group_range(1);
     const int nthreads = item_ct1.get_local_range(2);
-    const int sample  = item_ct1.get_group(0);
+    const int sample  = item_ct1.get_group(2);
     const int channel = item_ct1.get_group(1);
-    const int row     = item_ct1.get_group(2);
+    const int row     = item_ct1.get_group(0);
 
     const int tid = item_ct1.get_local_id(2);
     const int nwarps = nthreads / WARP_SIZE;
@@ -140,11 +140,11 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
 static void rms_norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
         const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
 
-    const int nrows = item_ct1.get_group_range(2);
+    const int nrows = item_ct1.get_group_range(0);
     const int nchannels = item_ct1.get_group_range(1);
-    const int sample  = item_ct1.get_group(0);
+    const int sample  = item_ct1.get_group(2);
     const int channel = item_ct1.get_group(1);
-    const int row     = item_ct1.get_group(2);
+    const int row     = item_ct1.get_group(0);
     const int nthreads = item_ct1.get_local_range(2);
 
     const int tid = item_ct1.get_local_id(2);
@@ -237,10 +237,10 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
         const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
         const float eps, queue_ptr stream, int device) {
 
-    const sycl::range<3> global_dims(nsamples, nchannels, nrows);
+    const sycl::range<3> global_dims(nrows, nchannels, nsamples);
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE); // Equivalent to CUDA's (WARP_SIZE, 1, 1)
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
         stream->submit([&](sycl::handler& cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(global_dims * block_dims, block_dims),
@@ -324,7 +324,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
 
-    const sycl::range<3> global_dims(nsamples, nchannels, nrows);
+    const sycl::range<3> global_dims(nrows, nchannels, nsamples);
     if (ncols < 1024) {
         const sycl::range<3> block_dims(1, 1, WARP_SIZE);
         stream->submit([&](sycl::handler& cgh) {

From 95bd077b280448e9c9c7b54a26b4bf9ebec32fbf Mon Sep 17 00:00:00 2001
From: Akarshan Biswas <akarshan@menlo.ai>
Date: Sun, 18 May 2025 18:43:12 +0530
Subject: [PATCH 05/10] Revert "Swap grid dims of nsamples and nrows"

This reverts commit 43be2d657fec7f7fba54e2cd154106bc0fc45adf.
---
 ggml/src/ggml-sycl/norm.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp
index e15305e1a5e95..47f241b25ac84 100644
--- a/ggml/src/ggml-sycl/norm.cpp
+++ b/ggml/src/ggml-sycl/norm.cpp
@@ -3,12 +3,12 @@
 static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
         const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
 
-    const int nrows = item_ct1.get_group_range(0);
+    const int nrows = item_ct1.get_group_range(2);
     const int nchannels = item_ct1.get_group_range(1);
     const int nthreads = item_ct1.get_local_range(2);
-    const int sample  = item_ct1.get_group(2);
+    const int sample  = item_ct1.get_group(0);
     const int channel = item_ct1.get_group(1);
-    const int row     = item_ct1.get_group(0);
+    const int row     = item_ct1.get_group(2);
 
     const int tid = item_ct1.get_local_id(2);
     const int nwarps = nthreads / WARP_SIZE;
@@ -140,11 +140,11 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
 static void rms_norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
         const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
 
-    const int nrows = item_ct1.get_group_range(0);
+    const int nrows = item_ct1.get_group_range(2);
     const int nchannels = item_ct1.get_group_range(1);
-    const int sample  = item_ct1.get_group(2);
+    const int sample  = item_ct1.get_group(0);
     const int channel = item_ct1.get_group(1);
-    const int row     = item_ct1.get_group(0);
+    const int row     = item_ct1.get_group(2);
     const int nthreads = item_ct1.get_local_range(2);
 
     const int tid = item_ct1.get_local_id(2);
@@ -237,10 +237,10 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
         const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
         const float eps, queue_ptr stream, int device) {
 
-    const sycl::range<3> global_dims(nrows, nchannels, nsamples);
+    const sycl::range<3> global_dims(nsamples, nchannels, nrows);
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE); // Equivalent to CUDA's (WARP_SIZE, 1, 1)
         stream->submit([&](sycl::handler& cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(global_dims * block_dims, block_dims),
@@ -324,7 +324,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
 
-    const sycl::range<3> global_dims(nrows, nchannels, nsamples);
+    const sycl::range<3> global_dims(nsamples, nchannels, nrows);
     if (ncols < 1024) {
         const sycl::range<3> block_dims(1, 1, WARP_SIZE);
         stream->submit([&](sycl::handler& cgh) {

From 64e2dffe9d3bbb0e6a79833f9e7c40108b51966e Mon Sep 17 00:00:00 2001
From: Akarshan Biswas <akarshan@menlo.ai>
Date: Mon, 19 May 2025 11:24:34 +0530
Subject: [PATCH 06/10] restore not required changes ggml-ci

---
 ggml/src/ggml-sycl/norm.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp
index 47f241b25ac84..26bf48d03ad46 100644
--- a/ggml/src/ggml-sycl/norm.cpp
+++ b/ggml/src/ggml-sycl/norm.cpp
@@ -16,7 +16,8 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t
     x += sample  * stride_sample + channel * stride_channel + row * stride_row;
     dst += ((sample * nchannels + channel) * nrows + row) * ncols;
 
-    sycl::float2 mean_var{0.f, 0.f};
+    sycl::float2 mean_var = sycl::float2(0.f, 0.f);
+
     for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[col];
         mean_var.x() += xi;
@@ -42,7 +43,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t
     }
 
     const float mean = mean_var.x() / ncols;
-    const float var  = mean_var.y() / ncols - mean * mean;
+    const float var = mean_var.y() / ncols - mean * mean;
     const float inv_std = sycl::rsqrt(var + eps);
 
     for (int col = tid; col < ncols; col += block_size) {
@@ -240,7 +241,7 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
     const sycl::range<3> global_dims(nsamples, nchannels, nrows);
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE); // Equivalent to CUDA's (WARP_SIZE, 1, 1)
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
         stream->submit([&](sycl::handler& cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(global_dims * block_dims, block_dims),
@@ -260,8 +261,8 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
         stream->submit([&](sycl::handler& cgh) {
-            auto s_sum_acc_ct1 = sycl::local_accessor<sycl::float2, 1>(sycl::range<1>(work_group_size / WARP_SIZE), cgh);
-
+            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
+                            sycl::range<1>(work_group_size / WARP_SIZE), cgh);
             cgh.parallel_for(
                 sycl::nd_range<3>(global_dims * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)

From 6dab4bf03b8b9e9d88141f0cb022275d9d2c1c03 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas <akarshan@menlo.ai>
Date: Thu, 22 May 2025 18:42:59 +0530
Subject: [PATCH 07/10] address review comments: change it to more like SYCL

---
 ggml/src/ggml-sycl/norm.cpp | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp
index 26bf48d03ad46..a68525c0019c7 100644
--- a/ggml/src/ggml-sycl/norm.cpp
+++ b/ggml/src/ggml-sycl/norm.cpp
@@ -27,17 +27,18 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t
     // sum up partial sums
     mean_var = warp_reduce_sum(mean_var, item_ct1);
     if  (block_size > WARP_SIZE) {
-        int warp_id = tid / WARP_SIZE;
-        int lane_id = tid % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = mean_var;
+        const auto sub_group = item_ct1.get_sub_group();
+        const auto sg_id = sub_group.get_group_linear_id();
+        const auto wi_in_sg = sub_group.get_local_linear_id();
+        if (wi_in_sg == 0) {
+            s_sum[sg_id] = mean_var;
         }
         item_ct1.barrier(sycl::access::fence_space::local_space);
         mean_var = 0.f;
-        size_t nreduce = nwarps / WARP_SIZE;
+        const size_t nreduce = (nwarps + WARP_SIZE - 1) / WARP_SIZE;
         for (size_t i = 0; i < nreduce; i += 1)
         {
-            mean_var += s_sum[lane_id + i * WARP_SIZE];
+            mean_var += s_sum[wi_in_sg + i * WARP_SIZE];
         }
         mean_var = warp_reduce_sum(mean_var, item_ct1);
     }
@@ -165,19 +166,19 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6
     // sum up partial sums
     tmp = warp_reduce_sum(tmp, item_ct1);
     if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
+        const auto sub_group = item_ct1.get_sub_group();
+        const auto sg_id = sub_group.get_group_linear_id();
+        const auto wi_in_sg = sub_group.get_local_linear_id();
+        if (wi_in_sg == 0) {
+            s_sum[sg_id] = tmp;
         }
 
         item_ct1.barrier(sycl::access::fence_space::local_space);
-        size_t nreduce = nwarps / WARP_SIZE;
+        const size_t nreduce = (nwarps + WARP_SIZE - 1) / WARP_SIZE;
         tmp = 0.f;
         for (size_t i = 0; i < nreduce; i += 1)
         {
-            tmp += s_sum[lane_id + i * WARP_SIZE];
+            tmp += s_sum[wi_in_sg + i * WARP_SIZE];
         }
         tmp = warp_reduce_sum(tmp, item_ct1);
     }

From 7f65c00a16309d110ab9b11fdd363be28932b4bc Mon Sep 17 00:00:00 2001
From: Akarshan Biswas <akarshan@menlo.ai>
Date: Thu, 22 May 2025 19:44:38 +0530
Subject: [PATCH 08/10] Use a common function to calculate offset

---
 ggml/src/ggml-sycl/common.hpp | 18 ++++++++++++++++++
 ggml/src/ggml-sycl/norm.cpp   | 20 ++++++++++++++++----
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 60909dde7d087..170ec484d25eb 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -13,6 +13,7 @@
 #ifndef GGML_SYCL_COMMON_HPP
 #define GGML_SYCL_COMMON_HPP
 
+#include <cstddef>
 #include <fstream>
 #include <iostream>
 
@@ -471,6 +472,23 @@ static __dpct_inline__ float warp_reduce_max(float x,
     return x;
 }
 
+/* Helper for Computing the linear offset into an 4-dimensional ggml_tensor given
+per-dimension sizes, strides, and indices */
+template<int N>
+static __dpct_inline__ size_t calculate_offset(const std::array<int, N> & dims, const std::array<int, N> & strides, const std::array<int, N> & indices) {
+    size_t offset = 0;
+#pragma unroll
+    for (int i = 0; i < N; i++) {
+        auto index_i = indices[i];
+        // Handle wrap-around for indices that exceed dimensions
+        if (indices[i] >= dims[i]) {
+            index_i = indices[i] % dims[i];
+        }
+        offset += strides[i] * index_i;
+    }
+    return offset;
+}
+
 // Helper for vec loading aligned data
 template <typename Tp, int n>
 inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {
diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp
index a68525c0019c7..13f21e5d41121 100644
--- a/ggml/src/ggml-sycl/norm.cpp
+++ b/ggml/src/ggml-sycl/norm.cpp
@@ -1,10 +1,13 @@
 #include "norm.hpp"
+#include "ggml-sycl/common.hpp"
 
 static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
         const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
 
     const int nrows = item_ct1.get_group_range(2);
     const int nchannels = item_ct1.get_group_range(1);
+    const int nsamples    = item_ct1.get_group_range(0);
+
     const int nthreads = item_ct1.get_local_range(2);
     const int sample  = item_ct1.get_group(0);
     const int channel = item_ct1.get_group(1);
@@ -13,8 +16,11 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t
     const int tid = item_ct1.get_local_id(2);
     const int nwarps = nthreads / WARP_SIZE;
 
-    x += sample  * stride_sample + channel * stride_channel + row * stride_row;
-    dst += ((sample * nchannels + channel) * nrows + row) * ncols;
+    const auto strided_offset = calculate_offset<3>({nsamples, nchannels, nrows}, {stride_sample, stride_channel, stride_row}, {sample, channel, row});
+    const auto packed_offset = calculate_offset<3>({nsamples, nchannels, nrows}, {nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
+
+    x += strided_offset;
+    dst += packed_offset;
 
     sycl::float2 mean_var = sycl::float2(0.f, 0.f);
 
@@ -144,16 +150,22 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6
 
     const int nrows = item_ct1.get_group_range(2);
     const int nchannels = item_ct1.get_group_range(1);
+    const int nsamples = item_ct1.get_group_range(0);
+
     const int sample  = item_ct1.get_group(0);
     const int channel = item_ct1.get_group(1);
     const int row     = item_ct1.get_group(2);
+
     const int nthreads = item_ct1.get_local_range(2);
 
     const int tid = item_ct1.get_local_id(2);
     const int nwarps = nthreads / WARP_SIZE;
 
-    x   += sample*stride_sample + channel*stride_channel + row*stride_row;
-    dst += ((sample*nchannels + channel)*nrows + row)*ncols;
+    const auto strided_offset = calculate_offset<3>({nsamples, nchannels, nrows}, {stride_sample, stride_channel, stride_row}, {sample, channel, row});
+    const auto packed_offset = calculate_offset<3>({nsamples, nchannels, nrows}, {nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
+
+    x   += strided_offset;
+    dst += packed_offset;
 
 
     float tmp = 0.0f; // partial sum for thread in warp

From 22dbcdf17250f9165ab502e1c5edcd11a5c9b427 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas <akarshan@menlo.ai>
Date: Thu, 22 May 2025 20:46:05 +0530
Subject: [PATCH 09/10] remove wrap around logic for handling broadcasts

---
 ggml/src/ggml-sycl/common.hpp |  6 +-----
 ggml/src/ggml-sycl/norm.cpp   | 10 ++++------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 170ec484d25eb..836aab0dd6a79 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -475,15 +475,11 @@ static __dpct_inline__ float warp_reduce_max(float x,
 /* Helper for Computing the linear offset into an 4-dimensional ggml_tensor given
 per-dimension sizes, strides, and indices */
 template<int N>
-static __dpct_inline__ size_t calculate_offset(const std::array<int, N> & dims, const std::array<int, N> & strides, const std::array<int, N> & indices) {
+static __dpct_inline__ size_t calculate_offset(const std::array<int, N> & strides, const std::array<int, N> & indices) {
     size_t offset = 0;
 #pragma unroll
     for (int i = 0; i < N; i++) {
         auto index_i = indices[i];
-        // Handle wrap-around for indices that exceed dimensions
-        if (indices[i] >= dims[i]) {
-            index_i = indices[i] % dims[i];
-        }
         offset += strides[i] * index_i;
     }
     return offset;
diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp
index 13f21e5d41121..88b0aa124842e 100644
--- a/ggml/src/ggml-sycl/norm.cpp
+++ b/ggml/src/ggml-sycl/norm.cpp
@@ -6,7 +6,6 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t
 
     const int nrows = item_ct1.get_group_range(2);
     const int nchannels = item_ct1.get_group_range(1);
-    const int nsamples    = item_ct1.get_group_range(0);
 
     const int nthreads = item_ct1.get_local_range(2);
     const int sample  = item_ct1.get_group(0);
@@ -16,8 +15,8 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t
     const int tid = item_ct1.get_local_id(2);
     const int nwarps = nthreads / WARP_SIZE;
 
-    const auto strided_offset = calculate_offset<3>({nsamples, nchannels, nrows}, {stride_sample, stride_channel, stride_row}, {sample, channel, row});
-    const auto packed_offset = calculate_offset<3>({nsamples, nchannels, nrows}, {nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
+    const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
+    const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
 
     x += strided_offset;
     dst += packed_offset;
@@ -150,7 +149,6 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6
 
     const int nrows = item_ct1.get_group_range(2);
     const int nchannels = item_ct1.get_group_range(1);
-    const int nsamples = item_ct1.get_group_range(0);
 
     const int sample  = item_ct1.get_group(0);
     const int channel = item_ct1.get_group(1);
@@ -161,8 +159,8 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6
     const int tid = item_ct1.get_local_id(2);
     const int nwarps = nthreads / WARP_SIZE;
 
-    const auto strided_offset = calculate_offset<3>({nsamples, nchannels, nrows}, {stride_sample, stride_channel, stride_row}, {sample, channel, row});
-    const auto packed_offset = calculate_offset<3>({nsamples, nchannels, nrows}, {nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
+    const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
+    const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
 
     x   += strided_offset;
     dst += packed_offset;

From db0caa69a4e5446a38837ab925cb0c8e8740cd04 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas <akarshan@menlo.ai>
Date: Sat, 24 May 2025 10:53:30 +0530
Subject: [PATCH 10/10] remove static from calculate_offset fn and use ceil_div

---
 ggml/src/ggml-sycl/common.hpp | 4 ++--
 ggml/src/ggml-sycl/norm.cpp   | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 836aab0dd6a79..54f307d51abd6 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -472,10 +472,10 @@ static __dpct_inline__ float warp_reduce_max(float x,
     return x;
 }
 
-/* Helper for Computing the linear offset into an 4-dimensional ggml_tensor given
+/* Helper for Computing the linear offset of a ggml_tensor given
 per-dimension sizes, strides, and indices */
 template<int N>
-static __dpct_inline__ size_t calculate_offset(const std::array<int, N> & strides, const std::array<int, N> & indices) {
+__dpct_inline__ size_t calculate_offset(const std::array<int, N> & strides, const std::array<int, N> & indices) {
     size_t offset = 0;
 #pragma unroll
     for (int i = 0; i < N; i++) {
diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp
index 88b0aa124842e..4ec1416849c7e 100644
--- a/ggml/src/ggml-sycl/norm.cpp
+++ b/ggml/src/ggml-sycl/norm.cpp
@@ -1,5 +1,6 @@
 #include "norm.hpp"
 #include "ggml-sycl/common.hpp"
+#include "ggml-sycl/presets.hpp"
 
 static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
         const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
@@ -40,7 +41,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t
         }
         item_ct1.barrier(sycl::access::fence_space::local_space);
         mean_var = 0.f;
-        const size_t nreduce = (nwarps + WARP_SIZE - 1) / WARP_SIZE;
+        const size_t nreduce = ceil_div(nwarps, WARP_SIZE);
         for (size_t i = 0; i < nreduce; i += 1)
         {
             mean_var += s_sum[wi_in_sg + i * WARP_SIZE];
@@ -184,7 +185,7 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6
         }
 
         item_ct1.barrier(sycl::access::fence_space::local_space);
-        const size_t nreduce = (nwarps + WARP_SIZE - 1) / WARP_SIZE;
+        const size_t nreduce = ceil_div(nwarps, WARP_SIZE);
         tmp = 0.f;
         for (size_t i = 0; i < nreduce; i += 1)
         {