From 8ff7a96003f531b6651780b4132eee1b14481153 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 18 May 2025 11:37:32 +0530 Subject: [PATCH 01/10] SYCL: Add non contiguous input support to norm kernel --- ggml/src/ggml-sycl/ggml-sycl.cpp | 1 + ggml/src/ggml-sycl/norm.cpp | 86 ++++++++++++++++---------------- 2 files changed, 44 insertions(+), 43 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 271f54e5773d9..eae76158ac23e 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4192,6 +4192,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g return (op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32) && (op->type == op->src[0]->type); #endif case GGML_OP_NORM: + return true; case GGML_OP_RMS_NORM: case GGML_OP_L2_NORM: case GGML_OP_GROUP_NORM: diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 4e9f438b46ba6..7d73da605fce4 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -1,50 +1,47 @@ #include "norm.hpp" +#include "ggml-sycl/presets.hpp" -static void norm_f32(const float* x, float* dst, const int ncols, const float eps, - const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) { - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - const int tid = item_ct1.get_local_id(2); +static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, + const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) { - const int nthreads = item_ct1.get_local_range(2); - const int nwarps = nthreads / WARP_SIZE; - sycl::float2 mean_var = sycl::float2(0.f, 0.f); + const int nrows = item_ct1.get_group_range(2); + const int nchannels = item_ct1.get_group_range(1); + int sample = item_ct1.get_group(0); + int channel = item_ct1.get_group(1); + int row = item_ct1.get_group(2); + int tid = item_ct1.get_local_id(2); + + x += sample * stride_sample + channel * stride_channel + row * stride_row; + dst += ((sample * nchannels + channel) * nrows + row) * ncols; + + sycl::float2 mean_var{0.f, 0.f}; for (int col = tid; col < ncols; col += block_size) { - const float xi = x[row * ncols + col]; + const float xi = x[col]; mean_var.x() += xi; mean_var.y() += xi * xi; } // sum up partial sums mean_var = warp_reduce_sum(mean_var, item_ct1); - if (block_size > WARP_SIZE) { - - int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; - int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; + if (block_size > WARP_SIZE) { + int warp_id = tid / WARP_SIZE; + int lane_id = tid % WARP_SIZE; if (lane_id == 0) { s_sum[warp_id] = mean_var; } - /* - DPCT1118:0: SYCL group functions and algorithms must be encountered in - converged control flow. You may need to adjust the code. - */ item_ct1.barrier(sycl::access::fence_space::local_space); - mean_var = 0.f; - size_t nreduce = nwarps / WARP_SIZE; - for (size_t i = 0; i < nreduce; i += 1) - { - mean_var += s_sum[lane_id + i * WARP_SIZE]; - } + + mean_var = s_sum[lane_id]; mean_var = warp_reduce_sum(mean_var, item_ct1); } const float mean = mean_var.x() / ncols; - const float var = mean_var.y() / ncols - mean * mean; + const float var = mean_var.y() / ncols - mean * mean; const float inv_std = sycl::rsqrt(var + eps); for (int col = tid; col < ncols; col += block_size) { - dst[row * ncols + col] = (x[row * ncols + col] - mean) * inv_std; + dst[col] = (x[col] - mean) * inv_std; } } @@ -224,20 +221,20 @@ static void l2_norm_f32(const float* x, float* dst, const int ncols, const float } } -static void norm_f32_sycl(const float* x, float* dst, const int ncols, - const int nrows, const float eps, - queue_ptr stream, int device) { +static void norm_f32_sycl(const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples, + const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, + const float eps, queue_ptr stream, int device) { + + const sycl::range<3> global_dims(nsamples, nchannels, nrows); GGML_ASSERT(ncols % WARP_SIZE == 0); if (ncols < 1024) { - const sycl::range<3> block_dims(1, 1, WARP_SIZE); + const sycl::range<3> block_dims(1, 1, WARP_SIZE); // Equivalent to CUDA's (WARP_SIZE, 1, 1) stream->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, - block_dims), + sycl::nd_range<3>(global_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - norm_f32(x, dst, ncols, eps, item_ct1, - nullptr, WARP_SIZE); + norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE); }); }); } @@ -251,16 +248,13 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols, info::device::max_work_group_size. Adjust the work-group size if needed. */ stream->submit([&](sycl::handler& cgh) { - sycl::local_accessor s_sum_acc_ct1( - sycl::range<1>(work_group_size / WARP_SIZE), cgh); + auto s_sum_acc_ct1 = sycl::local_accessor(sycl::range<1>(work_group_size / WARP_SIZE), cgh); cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, - block_dims), + sycl::nd_range<3>(global_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - norm_f32(x, dst, ncols, eps, item_ct1, - get_pointer(s_sum_acc_ct1), work_group_size); + norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size); }); }); } @@ -398,12 +392,12 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols, } void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { + const ggml_tensor * src0 = dst->src[0]; GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - const int64_t ne00 = dst->src[0]->ne[0]; - const int64_t nrows = ggml_nrows(dst->src[0]); + GGML_TENSOR_UNARY_OP_LOCALS dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); @@ -411,8 +405,14 @@ void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { float eps; memcpy(&eps, dst->op_params, sizeof(float)); - - norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); + GGML_ASSERT(eps >= 0.0f); + const size_t ts0 = ggml_type_size(src0->type); + GGML_ASSERT(nb00 == ts0); + const int64_t s01 = nb01 / ts0; + const int64_t s02 = nb02 / ts0; + const int64_t s03 = nb03 / ts0; + + norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device); } void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { From 5107933df78963c2a3e72a436a8146978b26024c Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 18 May 2025 12:05:35 +0530 Subject: [PATCH 02/10] refactor and add RMS_NORM non contiguous input support ggml-ci --- ggml/src/ggml-sycl/ggml-sycl.cpp | 2 +- ggml/src/ggml-sycl/norm.cpp | 77 ++++++++++++++++---------------- 2 files changed, 40 insertions(+), 39 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index eae76158ac23e..3bf95f88cb3b8 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4192,8 +4192,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g return (op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32) && (op->type == op->src[0]->type); #endif case GGML_OP_NORM: - return true; case GGML_OP_RMS_NORM: + return true; case GGML_OP_L2_NORM: case GGML_OP_GROUP_NORM: return ggml_is_contiguous(op->src[0]); diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 7d73da605fce4..59fbd4b880fcb 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -1,18 +1,17 @@ #include "norm.hpp" -#include "ggml-sycl/presets.hpp" static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) { const int nrows = item_ct1.get_group_range(2); const int nchannels = item_ct1.get_group_range(1); - int sample = item_ct1.get_group(0); - int channel = item_ct1.get_group(1); - int row = item_ct1.get_group(2); + const int sample = item_ct1.get_group(0); + const int channel = item_ct1.get_group(1); + const int row = item_ct1.get_group(2); - int tid = item_ct1.get_local_id(2); + const int tid = item_ct1.get_local_id(2); - x += sample * stride_sample + channel * stride_channel + row * stride_row; + x += sample * stride_sample + channel * stride_channel + row * stride_row; dst += ((sample * nchannels + channel) * nrows + row) * ncols; sycl::float2 mean_var{0.f, 0.f}; @@ -132,17 +131,25 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con } } -static void rms_norm_f32(const float* x, float* dst, const int ncols, const float eps, - const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) { - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); +static void rms_norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, + const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) { + + const int nrows = item_ct1.get_group_range(2); + const int nchannels = item_ct1.get_group_range(1); + const int sample = item_ct1.get_group(0); + const int channel = item_ct1.get_group(1); + const int row = item_ct1.get_group(2); + const int tid = item_ct1.get_local_id(2); - const int nthreads = item_ct1.get_local_range(2); - const int nwarps = nthreads / WARP_SIZE; + + x += sample*stride_sample + channel*stride_channel + row*stride_row; + dst += ((sample*nchannels + channel)*nrows + row)*ncols; + + float tmp = 0.0f; // partial sum for thread in warp for (int col = tid; col < ncols; col += block_size) { - const float xi = x[row * ncols + col]; + const float xi = x[col]; tmp += xi * xi; } @@ -155,17 +162,9 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa if (lane_id == 0) { s_sum[warp_id] = tmp; } - /* - DPCT1118:3: SYCL group functions and algorithms must be encountered in - converged control flow. You may need to adjust the code. - */ + item_ct1.barrier(sycl::access::fence_space::local_space); - size_t nreduce = nwarps / WARP_SIZE; - tmp = 0.f; - for (size_t i = 0; i < nreduce; i += 1) - { - tmp += s_sum[lane_id + i * WARP_SIZE]; - } + tmp = s_sum[lane_id]; tmp = warp_reduce_sum(tmp, item_ct1); } @@ -173,7 +172,7 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa const float scale = sycl::rsqrt(mean + eps); for (int col = tid; col < ncols; col += block_size) { - dst[row * ncols + col] = scale * x[row * ncols + col]; + dst[col] = scale * x[col]; } } @@ -307,21 +306,20 @@ static void group_norm_f32_sycl(const float* x, float* dst, } } -static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, - const int nrows, const float eps, - queue_ptr stream, int device) { +static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const int nrows, const int nchannels, const int nsamples, + const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, queue_ptr stream, int device) { GGML_ASSERT(ncols % WARP_SIZE == 0); // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE); + + const sycl::range<3> global_dims(nsamples, nchannels, nrows); if (ncols < 1024) { const sycl::range<3> block_dims(1, 1, WARP_SIZE); stream->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, - block_dims), + sycl::nd_range<3>(global_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - rms_norm_f32(x, dst, ncols, eps, item_ct1, - nullptr, WARP_SIZE); + rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE); }); }); } @@ -338,12 +336,10 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE), cgh); cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, - block_dims), + sycl::nd_range<3>(global_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - rms_norm_f32(x, dst, ncols, eps, item_ct1, - get_pointer(s_sum_acc_ct1), work_group_size); + rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size); }); }); } @@ -436,11 +432,10 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - const int64_t ne00 = dst->src[0]->ne[0]; - const int64_t nrows = ggml_nrows(dst->src[0]); dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); @@ -450,7 +445,13 @@ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { float eps; memcpy(&eps, dst->op_params, sizeof(float)); - rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); + GGML_TENSOR_UNARY_OP_LOCALS + const size_t ts0 = ggml_type_size(src0->type); + GGML_ASSERT(nb00 == ts0); + const int64_t s01 = nb01 / ts0; + const int64_t s02 = nb02 / ts0; + const int64_t s03 = nb03 / ts0; + rms_norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device); } void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { From 0a1888cb33f678d6f0d44e862ec0237750f562bc Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 18 May 2025 12:29:01 +0530 Subject: [PATCH 03/10] restore subgroup reduction for multi-subgroup thread blocks in norm kernels --- ggml/src/ggml-sycl/norm.cpp | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 59fbd4b880fcb..47f241b25ac84 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -5,11 +5,13 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t const int nrows = item_ct1.get_group_range(2); const int nchannels = item_ct1.get_group_range(1); + const int nthreads = item_ct1.get_local_range(2); const int sample = item_ct1.get_group(0); const int channel = item_ct1.get_group(1); const int row = item_ct1.get_group(2); const int tid = item_ct1.get_local_id(2); + const int nwarps = nthreads / WARP_SIZE; x += sample * stride_sample + channel * stride_channel + row * stride_row; dst += ((sample * nchannels + channel) * nrows + row) * ncols; @@ -30,8 +32,12 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t s_sum[warp_id] = mean_var; } item_ct1.barrier(sycl::access::fence_space::local_space); - - mean_var = s_sum[lane_id]; + mean_var = 0.f; + size_t nreduce = nwarps / WARP_SIZE; + for (size_t i = 0; i < nreduce; i += 1) + { + mean_var += s_sum[lane_id + i * WARP_SIZE]; + } mean_var = warp_reduce_sum(mean_var, item_ct1); } @@ -139,8 +145,10 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6 const int sample = item_ct1.get_group(0); const int channel = item_ct1.get_group(1); const int row = item_ct1.get_group(2); + const int nthreads = item_ct1.get_local_range(2); const int tid = item_ct1.get_local_id(2); + const int nwarps = nthreads / WARP_SIZE; x += sample*stride_sample + channel*stride_channel + row*stride_row; dst += ((sample*nchannels + channel)*nrows + row)*ncols; @@ -164,7 +172,12 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6 } item_ct1.barrier(sycl::access::fence_space::local_space); - tmp = s_sum[lane_id]; + size_t nreduce = nwarps / WARP_SIZE; + tmp = 0.f; + for (size_t i = 0; i < nreduce; i += 1) + { + tmp += s_sum[lane_id + i * WARP_SIZE]; + } tmp = warp_reduce_sum(tmp, item_ct1); } From 23cc97967b8c6b8b1a16ba3dd7c1432c980eaf65 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 18 May 2025 18:22:03 +0530 Subject: [PATCH 04/10] Swap grid dims of nsamples and nrows ggml-ci --- ggml/src/ggml-sycl/norm.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 47f241b25ac84..e15305e1a5e95 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -3,12 +3,12 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) { - const int nrows = item_ct1.get_group_range(2); + const int nrows = item_ct1.get_group_range(0); const int nchannels = item_ct1.get_group_range(1); const int nthreads = item_ct1.get_local_range(2); - const int sample = item_ct1.get_group(0); + const int sample = item_ct1.get_group(2); const int channel = item_ct1.get_group(1); - const int row = item_ct1.get_group(2); + const int row = item_ct1.get_group(0); const int tid = item_ct1.get_local_id(2); const int nwarps = nthreads / WARP_SIZE; @@ -140,11 +140,11 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con static void rms_norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) { - const int nrows = item_ct1.get_group_range(2); + const int nrows = item_ct1.get_group_range(0); const int nchannels = item_ct1.get_group_range(1); - const int sample = item_ct1.get_group(0); + const int sample = item_ct1.get_group(2); const int channel = item_ct1.get_group(1); - const int row = item_ct1.get_group(2); + const int row = item_ct1.get_group(0); const int nthreads = item_ct1.get_local_range(2); const int tid = item_ct1.get_local_id(2); @@ -237,10 +237,10 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, queue_ptr stream, int device) { - const sycl::range<3> global_dims(nsamples, nchannels, nrows); + const sycl::range<3> global_dims(nrows, nchannels, nsamples); GGML_ASSERT(ncols % WARP_SIZE == 0); if (ncols < 1024) { - const sycl::range<3> block_dims(1, 1, WARP_SIZE); // Equivalent to CUDA's (WARP_SIZE, 1, 1) + const sycl::range<3> block_dims(1, 1, WARP_SIZE); stream->submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl::nd_range<3>(global_dims * block_dims, block_dims), @@ -324,7 +324,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const GGML_ASSERT(ncols % WARP_SIZE == 0); // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE); - const sycl::range<3> global_dims(nsamples, nchannels, nrows); + const sycl::range<3> global_dims(nrows, nchannels, nsamples); if (ncols < 1024) { const sycl::range<3> block_dims(1, 1, WARP_SIZE); stream->submit([&](sycl::handler& cgh) { From 95bd077b280448e9c9c7b54a26b4bf9ebec32fbf Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 18 May 2025 18:43:12 +0530 Subject: [PATCH 05/10] Revert "Swap grid dims of nsamples and nrows" This reverts commit 43be2d657fec7f7fba54e2cd154106bc0fc45adf. --- ggml/src/ggml-sycl/norm.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index e15305e1a5e95..47f241b25ac84 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -3,12 +3,12 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) { - const int nrows = item_ct1.get_group_range(0); + const int nrows = item_ct1.get_group_range(2); const int nchannels = item_ct1.get_group_range(1); const int nthreads = item_ct1.get_local_range(2); - const int sample = item_ct1.get_group(2); + const int sample = item_ct1.get_group(0); const int channel = item_ct1.get_group(1); - const int row = item_ct1.get_group(0); + const int row = item_ct1.get_group(2); const int tid = item_ct1.get_local_id(2); const int nwarps = nthreads / WARP_SIZE; @@ -140,11 +140,11 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con static void rms_norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) { - const int nrows = item_ct1.get_group_range(0); + const int nrows = item_ct1.get_group_range(2); const int nchannels = item_ct1.get_group_range(1); - const int sample = item_ct1.get_group(2); + const int sample = item_ct1.get_group(0); const int channel = item_ct1.get_group(1); - const int row = item_ct1.get_group(0); + const int row = item_ct1.get_group(2); const int nthreads = item_ct1.get_local_range(2); const int tid = item_ct1.get_local_id(2); @@ -237,10 +237,10 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, queue_ptr stream, int device) { - const sycl::range<3> global_dims(nrows, nchannels, nsamples); + const sycl::range<3> global_dims(nsamples, nchannels, nrows); GGML_ASSERT(ncols % WARP_SIZE == 0); if (ncols < 1024) { - const sycl::range<3> block_dims(1, 1, WARP_SIZE); + const sycl::range<3> block_dims(1, 1, WARP_SIZE); // Equivalent to CUDA's (WARP_SIZE, 1, 1) stream->submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl::nd_range<3>(global_dims * block_dims, block_dims), @@ -324,7 +324,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const GGML_ASSERT(ncols % WARP_SIZE == 0); // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE); - const sycl::range<3> global_dims(nrows, nchannels, nsamples); + const sycl::range<3> global_dims(nsamples, nchannels, nrows); if (ncols < 1024) { const sycl::range<3> block_dims(1, 1, WARP_SIZE); stream->submit([&](sycl::handler& cgh) { From 64e2dffe9d3bbb0e6a79833f9e7c40108b51966e Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 19 May 2025 11:24:34 +0530 Subject: [PATCH 06/10] restore not required changes ggml-ci --- ggml/src/ggml-sycl/norm.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 47f241b25ac84..26bf48d03ad46 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -16,7 +16,8 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t x += sample * stride_sample + channel * stride_channel + row * stride_row; dst += ((sample * nchannels + channel) * nrows + row) * ncols; - sycl::float2 mean_var{0.f, 0.f}; + sycl::float2 mean_var = sycl::float2(0.f, 0.f); + for (int col = tid; col < ncols; col += block_size) { const float xi = x[col]; mean_var.x() += xi; @@ -42,7 +43,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t } const float mean = mean_var.x() / ncols; - const float var = mean_var.y() / ncols - mean * mean; + const float var = mean_var.y() / ncols - mean * mean; const float inv_std = sycl::rsqrt(var + eps); for (int col = tid; col < ncols; col += block_size) { @@ -240,7 +241,7 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i const sycl::range<3> global_dims(nsamples, nchannels, nrows); GGML_ASSERT(ncols % WARP_SIZE == 0); if (ncols < 1024) { - const sycl::range<3> block_dims(1, 1, WARP_SIZE); // Equivalent to CUDA's (WARP_SIZE, 1, 1) + const sycl::range<3> block_dims(1, 1, WARP_SIZE); stream->submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl::nd_range<3>(global_dims * block_dims, block_dims), @@ -260,8 +261,8 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i info::device::max_work_group_size. Adjust the work-group size if needed. */ stream->submit([&](sycl::handler& cgh) { - auto s_sum_acc_ct1 = sycl::local_accessor(sycl::range<1>(work_group_size / WARP_SIZE), cgh); - + sycl::local_accessor s_sum_acc_ct1( + sycl::range<1>(work_group_size / WARP_SIZE), cgh); cgh.parallel_for( sycl::nd_range<3>(global_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) From 6dab4bf03b8b9e9d88141f0cb022275d9d2c1c03 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Thu, 22 May 2025 18:42:59 +0530 Subject: [PATCH 07/10] address review comments: change it to more like SYCL --- ggml/src/ggml-sycl/norm.cpp | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 26bf48d03ad46..a68525c0019c7 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -27,17 +27,18 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t // sum up partial sums mean_var = warp_reduce_sum(mean_var, item_ct1); if (block_size > WARP_SIZE) { - int warp_id = tid / WARP_SIZE; - int lane_id = tid % WARP_SIZE; - if (lane_id == 0) { - s_sum[warp_id] = mean_var; + const auto sub_group = item_ct1.get_sub_group(); + const auto sg_id = sub_group.get_group_linear_id(); + const auto wi_in_sg = sub_group.get_local_linear_id(); + if (wi_in_sg == 0) { + s_sum[sg_id] = mean_var; } item_ct1.barrier(sycl::access::fence_space::local_space); mean_var = 0.f; - size_t nreduce = nwarps / WARP_SIZE; + const size_t nreduce = (nwarps + WARP_SIZE - 1) / WARP_SIZE; for (size_t i = 0; i < nreduce; i += 1) { - mean_var += s_sum[lane_id + i * WARP_SIZE]; + mean_var += s_sum[wi_in_sg + i * WARP_SIZE]; } mean_var = warp_reduce_sum(mean_var, item_ct1); } @@ -165,19 +166,19 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6 // sum up partial sums tmp = warp_reduce_sum(tmp, item_ct1); if (block_size > WARP_SIZE) { - - int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; - int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; - if (lane_id == 0) { - s_sum[warp_id] = tmp; + const auto sub_group = item_ct1.get_sub_group(); + const auto sg_id = sub_group.get_group_linear_id(); + const auto wi_in_sg = sub_group.get_local_linear_id(); + if (wi_in_sg == 0) { + s_sum[sg_id] = tmp; } item_ct1.barrier(sycl::access::fence_space::local_space); - size_t nreduce = nwarps / WARP_SIZE; + const size_t nreduce = (nwarps + WARP_SIZE - 1) / WARP_SIZE; tmp = 0.f; for (size_t i = 0; i < nreduce; i += 1) { - tmp += s_sum[lane_id + i * WARP_SIZE]; + tmp += s_sum[wi_in_sg + i * WARP_SIZE]; } tmp = warp_reduce_sum(tmp, item_ct1); } From 7f65c00a16309d110ab9b11fdd363be28932b4bc Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Thu, 22 May 2025 19:44:38 +0530 Subject: [PATCH 08/10] Use a common function to calculate offset --- ggml/src/ggml-sycl/common.hpp | 18 ++++++++++++++++++ ggml/src/ggml-sycl/norm.cpp | 20 ++++++++++++++++---- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 60909dde7d087..170ec484d25eb 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -13,6 +13,7 @@ #ifndef GGML_SYCL_COMMON_HPP #define GGML_SYCL_COMMON_HPP +#include #include #include @@ -471,6 +472,23 @@ static __dpct_inline__ float warp_reduce_max(float x, return x; } +/* Helper for Computing the linear offset into an 4-dimensional ggml_tensor given +per-dimension sizes, strides, and indices */ +template +static __dpct_inline__ size_t calculate_offset(const std::array & dims, const std::array & strides, const std::array & indices) { + size_t offset = 0; +#pragma unroll + for (int i = 0; i < N; i++) { + auto index_i = indices[i]; + // Handle wrap-around for indices that exceed dimensions + if (indices[i] >= dims[i]) { + index_i = indices[i] % dims[i]; + } + offset += strides[i] * index_i; + } + return offset; +} + // Helper for vec loading aligned data template inline sycl::vec vec_aligned_load(const Tp* aligned_ptr) { diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index a68525c0019c7..13f21e5d41121 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -1,10 +1,13 @@ #include "norm.hpp" +#include "ggml-sycl/common.hpp" static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) { const int nrows = item_ct1.get_group_range(2); const int nchannels = item_ct1.get_group_range(1); + const int nsamples = item_ct1.get_group_range(0); + const int nthreads = item_ct1.get_local_range(2); const int sample = item_ct1.get_group(0); const int channel = item_ct1.get_group(1); @@ -13,8 +16,11 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t const int tid = item_ct1.get_local_id(2); const int nwarps = nthreads / WARP_SIZE; - x += sample * stride_sample + channel * stride_channel + row * stride_row; - dst += ((sample * nchannels + channel) * nrows + row) * ncols; + const auto strided_offset = calculate_offset<3>({nsamples, nchannels, nrows}, {stride_sample, stride_channel, stride_row}, {sample, channel, row}); + const auto packed_offset = calculate_offset<3>({nsamples, nchannels, nrows}, {nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row}); + + x += strided_offset; + dst += packed_offset; sycl::float2 mean_var = sycl::float2(0.f, 0.f); @@ -144,16 +150,22 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6 const int nrows = item_ct1.get_group_range(2); const int nchannels = item_ct1.get_group_range(1); + const int nsamples = item_ct1.get_group_range(0); + const int sample = item_ct1.get_group(0); const int channel = item_ct1.get_group(1); const int row = item_ct1.get_group(2); + const int nthreads = item_ct1.get_local_range(2); const int tid = item_ct1.get_local_id(2); const int nwarps = nthreads / WARP_SIZE; - x += sample*stride_sample + channel*stride_channel + row*stride_row; - dst += ((sample*nchannels + channel)*nrows + row)*ncols; + const auto strided_offset = calculate_offset<3>({nsamples, nchannels, nrows}, {stride_sample, stride_channel, stride_row}, {sample, channel, row}); + const auto packed_offset = calculate_offset<3>({nsamples, nchannels, nrows}, {nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row}); + + x += strided_offset; + dst += packed_offset; float tmp = 0.0f; // partial sum for thread in warp From 22dbcdf17250f9165ab502e1c5edcd11a5c9b427 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Thu, 22 May 2025 20:46:05 +0530 Subject: [PATCH 09/10] remove wrap around logic for handling broadcasts --- ggml/src/ggml-sycl/common.hpp | 6 +----- ggml/src/ggml-sycl/norm.cpp | 10 ++++------ 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 170ec484d25eb..836aab0dd6a79 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -475,15 +475,11 @@ static __dpct_inline__ float warp_reduce_max(float x, /* Helper for Computing the linear offset into an 4-dimensional ggml_tensor given per-dimension sizes, strides, and indices */ template -static __dpct_inline__ size_t calculate_offset(const std::array & dims, const std::array & strides, const std::array & indices) { +static __dpct_inline__ size_t calculate_offset(const std::array & strides, const std::array & indices) { size_t offset = 0; #pragma unroll for (int i = 0; i < N; i++) { auto index_i = indices[i]; - // Handle wrap-around for indices that exceed dimensions - if (indices[i] >= dims[i]) { - index_i = indices[i] % dims[i]; - } offset += strides[i] * index_i; } return offset; diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 13f21e5d41121..88b0aa124842e 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -6,7 +6,6 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t const int nrows = item_ct1.get_group_range(2); const int nchannels = item_ct1.get_group_range(1); - const int nsamples = item_ct1.get_group_range(0); const int nthreads = item_ct1.get_local_range(2); const int sample = item_ct1.get_group(0); @@ -16,8 +15,8 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t const int tid = item_ct1.get_local_id(2); const int nwarps = nthreads / WARP_SIZE; - const auto strided_offset = calculate_offset<3>({nsamples, nchannels, nrows}, {stride_sample, stride_channel, stride_row}, {sample, channel, row}); - const auto packed_offset = calculate_offset<3>({nsamples, nchannels, nrows}, {nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row}); + const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row}); + const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row}); x += strided_offset; dst += packed_offset; @@ -150,7 +149,6 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6 const int nrows = item_ct1.get_group_range(2); const int nchannels = item_ct1.get_group_range(1); - const int nsamples = item_ct1.get_group_range(0); const int sample = item_ct1.get_group(0); const int channel = item_ct1.get_group(1); @@ -161,8 +159,8 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6 const int tid = item_ct1.get_local_id(2); const int nwarps = nthreads / WARP_SIZE; - const auto strided_offset = calculate_offset<3>({nsamples, nchannels, nrows}, {stride_sample, stride_channel, stride_row}, {sample, channel, row}); - const auto packed_offset = calculate_offset<3>({nsamples, nchannels, nrows}, {nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row}); + const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row}); + const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row}); x += strided_offset; dst += packed_offset; From db0caa69a4e5446a38837ab925cb0c8e8740cd04 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sat, 24 May 2025 10:53:30 +0530 Subject: [PATCH 10/10] remove static from calculate_offset fn and use ceil_div --- ggml/src/ggml-sycl/common.hpp | 4 ++-- ggml/src/ggml-sycl/norm.cpp | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 836aab0dd6a79..54f307d51abd6 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -472,10 +472,10 @@ static __dpct_inline__ float warp_reduce_max(float x, return x; } -/* Helper for Computing the linear offset into an 4-dimensional ggml_tensor given +/* Helper for Computing the linear offset of a ggml_tensor given per-dimension sizes, strides, and indices */ template -static __dpct_inline__ size_t calculate_offset(const std::array & strides, const std::array & indices) { +__dpct_inline__ size_t calculate_offset(const std::array & strides, const std::array & indices) { size_t offset = 0; #pragma unroll for (int i = 0; i < N; i++) { diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 88b0aa124842e..4ec1416849c7e 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -1,5 +1,6 @@ #include "norm.hpp" #include "ggml-sycl/common.hpp" +#include "ggml-sycl/presets.hpp" static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) { @@ -40,7 +41,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t } item_ct1.barrier(sycl::access::fence_space::local_space); mean_var = 0.f; - const size_t nreduce = (nwarps + WARP_SIZE - 1) / WARP_SIZE; + const size_t nreduce = ceil_div(nwarps, WARP_SIZE); for (size_t i = 0; i < nreduce; i += 1) { mean_var += s_sum[wi_in_sg + i * WARP_SIZE]; @@ -184,7 +185,7 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6 } item_ct1.barrier(sycl::access::fence_space::local_space); - const size_t nreduce = (nwarps + WARP_SIZE - 1) / WARP_SIZE; + const size_t nreduce = ceil_div(nwarps, WARP_SIZE); tmp = 0.f; for (size_t i = 0; i < nreduce; i += 1) {