diff --git a/dpctl/__init__.py b/dpctl/__init__.py index 0447fba6ff..1d9b7209e4 100644 --- a/dpctl/__init__.py +++ b/dpctl/__init__.py @@ -52,6 +52,7 @@ SyclKernelSubmitError, SyclQueue, SyclQueueCreationError, + WorkGroupMemory, ) from ._sycl_queue_manager import get_device_cached_queue from ._sycl_timer import SyclTimer @@ -100,6 +101,7 @@ "SyclKernelInvalidRangeError", "SyclKernelSubmitError", "SyclQueueCreationError", + "WorkGroupMemory", ] __all__ += [ "get_device_cached_queue", diff --git a/dpctl/_backend.pxd b/dpctl/_backend.pxd index 10a556eacd..cf0dba2d7b 100644 --- a/dpctl/_backend.pxd +++ b/dpctl/_backend.pxd @@ -69,7 +69,8 @@ cdef extern from "syclinterface/dpctl_sycl_enum_types.h": _FLOAT 'DPCTL_FLOAT32_T', _DOUBLE 'DPCTL_FLOAT64_T', _VOID_PTR 'DPCTL_VOID_PTR', - _LOCAL_ACCESSOR 'DPCTL_LOCAL_ACCESSOR' + _LOCAL_ACCESSOR 'DPCTL_LOCAL_ACCESSOR', + _WORK_GROUP_MEMORY 'DPCTL_WORK_GROUP_MEMORY' ctypedef enum _queue_property_type 'DPCTLQueuePropertyType': _DEFAULT_PROPERTY 'DPCTL_DEFAULT_PROPERTY' @@ -468,3 +469,17 @@ cdef extern from "syclinterface/dpctl_sycl_usm_interface.h": cdef DPCTLSyclDeviceRef DPCTLUSM_GetPointerDevice( DPCTLSyclUSMRef MRef, DPCTLSyclContextRef CRef) + +cdef extern from "syclinterface/dpctl_sycl_extension_interface.h": + cdef struct RawWorkGroupMemoryTy + ctypedef RawWorkGroupMemoryTy RawWorkGroupMemory + + cdef struct DPCTLOpaqueWorkGroupMemory + ctypedef DPCTLOpaqueWorkGroupMemory *DPCTLSyclWorkGroupMemoryRef; + + cdef DPCTLSyclWorkGroupMemoryRef DPCTLWorkGroupMemory_Create(size_t nbytes); + + cdef void DPCTLWorkGroupMemory_Delete( + DPCTLSyclWorkGroupMemoryRef Ref); + + cdef bint DPCTLWorkGroupMemory_Available(); diff --git a/dpctl/_sycl_queue.pxd b/dpctl/_sycl_queue.pxd index 003201e9a1..4fde4af77a 100644 --- a/dpctl/_sycl_queue.pxd +++ b/dpctl/_sycl_queue.pxd @@ -22,7 +22,12 @@ from libcpp cimport bool as cpp_bool -from ._backend cimport DPCTLSyclDeviceRef, DPCTLSyclQueueRef, _arg_data_type +from ._backend cimport ( + DPCTLSyclDeviceRef, + DPCTLSyclQueueRef, + DPCTLSyclWorkGroupMemoryRef, + _arg_data_type, +) from ._sycl_context cimport SyclContext from ._sycl_device cimport SyclDevice from ._sycl_event cimport SyclEvent @@ -98,3 +103,13 @@ cdef public api class SyclQueue (_SyclQueue) [ cpdef prefetch(self, ptr, size_t count=*) cpdef mem_advise(self, ptr, size_t count, int mem) cpdef SyclEvent submit_barrier(self, dependent_events=*) + +cdef public api class _WorkGroupMemory [ + object Py_WorkGroupMemoryObject, type Py_WorkGroupMemoryType +]: + cdef DPCTLSyclWorkGroupMemoryRef _mem_ref + +cdef public api class WorkGroupMemory(_WorkGroupMemory) [ + object PyWorkGroupMemoryObject, type PyWorkGroupMemoryType +]: + pass diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index 86ef08f584..94527506ef 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -54,6 +54,9 @@ from ._backend cimport ( # noqa: E211 DPCTLSyclContextRef, DPCTLSyclDeviceSelectorRef, DPCTLSyclEventRef, + DPCTLWorkGroupMemory_Available, + DPCTLWorkGroupMemory_Create, + DPCTLWorkGroupMemory_Delete, _arg_data_type, _backend_type, _queue_property_type, @@ -61,6 +64,7 @@ from ._backend cimport ( # noqa: E211 from .memory._memory cimport _Memory import ctypes +import numbers from .enum_types import backend_type @@ -250,6 +254,15 @@ cdef class _kernel_arg_type: _arg_data_type._LOCAL_ACCESSOR ) + @property + def dpctl_work_group_memory(self): + cdef str p_name = "dpctl_work_group_memory" + return kernel_arg_type_attribute( + self._name, + p_name, + _arg_data_type._WORK_GROUP_MEMORY + ) + kernel_arg_type = _kernel_arg_type() @@ -849,6 +862,9 @@ cdef class SyclQueue(_SyclQueue): elif isinstance(arg, _Memory): kargs[idx]= (arg._pointer) kargty[idx] = _arg_data_type._VOID_PTR + elif isinstance(arg, WorkGroupMemory): + kargs[idx] = (arg._ref) + kargty[idx] = _arg_data_type._WORK_GROUP_MEMORY else: ret = -1 return ret @@ -1524,3 +1540,89 @@ cdef api SyclQueue SyclQueue_Make(DPCTLSyclQueueRef QRef): """ cdef DPCTLSyclQueueRef copied_QRef = DPCTLQueue_Copy(QRef) return SyclQueue._create(copied_QRef) + +cdef class _WorkGroupMemory: + def __dealloc__(self): + if(self._mem_ref): + DPCTLWorkGroupMemory_Delete(self._mem_ref) + +cdef class WorkGroupMemory: + """ + WorkGroupMemory(nbytes) + Python class representing the ``work_group_memory`` class from the + Workgroup Memory oneAPI SYCL extension for low-overhead allocation of local + memory shared by the workitems in a workgroup. + + This class is intended be used as kernel argument when launching kernels. + + This is based on a DPC++ SYCL extension and only available in newer + versions. Use ``is_available()`` to check availability in your build. + + There are multiple ways to create a `WorkGroupMemory`. + + - If the constructor is invoked with just a single argument, this argument + is interpreted as the number of bytes to allocated in the shared local + memory. + + - If the constructor is invoked with two arguments, the first argument is + interpreted as the datatype of the local memory, using the numpy type + naming scheme. + The second argument is interpreted as the number of elements to allocate. + The number of bytes to allocate is then computed from the byte size of + the data type and the element count. + + Args: + args: + Variadic argument, see class documentation. + + Raises: + TypeError: In case of incorrect arguments given to constructors, + unexpected types of input arguments. + """ + def __cinit__(self, *args): + cdef size_t nbytes + if not DPCTLWorkGroupMemory_Available(): + raise RuntimeError("Workgroup memory extension not available") + + if not (0 < len(args) < 3): + raise TypeError("WorkGroupMemory constructor takes 1 or 2 " + f"arguments, but {len(args)} were given") + + if len(args) == 1: + if not isinstance(args[0], numbers.Integral): + raise TypeError("WorkGroupMemory single argument constructor" + "expects first argument to be `int`", + f"but got {type(args[0])}") + nbytes = (args[0]) + else: + if not isinstance(args[0], str): + raise TypeError("WorkGroupMemory constructor expects first" + f"argument to be `str`, but got {type(args[0])}") + if not isinstance(args[1], numbers.Integral): + raise TypeError("WorkGroupMemory constructor expects second" + f"argument to be `int`, but got {type(args[1])}") + dtype = (args[0]) + count = (args[1]) + if not dtype[0] in ["i", "u", "f"]: + raise TypeError(f"Unrecognized type value: '{dtype}'") + try: + bit_width = int(dtype[1:]) + except ValueError: + raise TypeError(f"Unrecognized type value: '{dtype}'") + + byte_size = bit_width + nbytes = count * byte_size + + self._mem_ref = DPCTLWorkGroupMemory_Create(nbytes) + + """Check whether the work_group_memory extension is available""" + @staticmethod + def is_available(): + return DPCTLWorkGroupMemory_Available() + + property _ref: + """Returns the address of the C API ``DPCTLWorkGroupMemoryRef`` + pointer as a ``size_t``. + """ + def __get__(self): + return self._mem_ref diff --git a/dpctl/apis/include/dpctl_capi.h b/dpctl/apis/include/dpctl_capi.h index a0a2235fe8..73e70903e6 100644 --- a/dpctl/apis/include/dpctl_capi.h +++ b/dpctl/apis/include/dpctl_capi.h @@ -25,9 +25,11 @@ #pragma once // clang-format off -// Ordering of includes is important here. dpctl_sycl_types defines types -// used by dpctl's Python C-API headers. +// Ordering of includes is important here. dpctl_sycl_types and +// dpctl_sycl_extension_interface define types used by dpctl's Python +// C-API headers. #include "syclinterface/dpctl_sycl_types.h" +#include "syclinterface/dpctl_sycl_extension_interface.h" #ifdef __cplusplus #define CYTHON_EXTERN_C extern "C" #else diff --git a/dpctl/sycl.pxd b/dpctl/sycl.pxd index 12f3f141db..ce0c674512 100644 --- a/dpctl/sycl.pxd +++ b/dpctl/sycl.pxd @@ -42,6 +42,10 @@ cdef extern from "sycl/sycl.hpp" namespace "sycl": "sycl::kernel_bundle": pass +cdef extern from "syclinterface/dpctl_sycl_extension_interface.h": + cdef struct RawWorkGroupMemoryTy + ctypedef RawWorkGroupMemoryTy RawWorkGroupMemory + cdef extern from "syclinterface/dpctl_sycl_type_casters.hpp" \ namespace "dpctl::syclinterface": # queue @@ -67,3 +71,12 @@ cdef extern from "syclinterface/dpctl_sycl_type_casters.hpp" \ "dpctl::syclinterface::wrap" (const event *) cdef event * unwrap_event "dpctl::syclinterface::unwrap" ( dpctl_backend.DPCTLSyclEventRef) + + # work group memory extension + cdef dpctl_backend.DPCTLSyclWorkGroupMemoryRef wrap_work_group_memory \ + "dpctl::syclinterface::wrap" \ + (const RawWorkGroupMemory *) + + cdef RawWorkGroupMemory * unwrap_work_group_memory \ + "dpctl::syclinterface::unwrap" ( + dpctl_backend.DPCTLSyclWorkGroupMemoryRef) diff --git a/dpctl/tests/input_files/work-group-memory-kernel.spv b/dpctl/tests/input_files/work-group-memory-kernel.spv new file mode 100644 index 0000000000..c97e51b0b6 Binary files /dev/null and b/dpctl/tests/input_files/work-group-memory-kernel.spv differ diff --git a/dpctl/tests/test_sycl_kernel_submit.py b/dpctl/tests/test_sycl_kernel_submit.py index f1d8bf552c..9575e228f2 100644 --- a/dpctl/tests/test_sycl_kernel_submit.py +++ b/dpctl/tests/test_sycl_kernel_submit.py @@ -278,3 +278,4 @@ def test_kernel_arg_type(): _check_kernel_arg_type_instance(kernel_arg_type.dpctl_float64) _check_kernel_arg_type_instance(kernel_arg_type.dpctl_void_ptr) _check_kernel_arg_type_instance(kernel_arg_type.dpctl_local_accessor) + _check_kernel_arg_type_instance(kernel_arg_type.dpctl_work_group_memory) diff --git a/dpctl/tests/test_work_group_memory.py b/dpctl/tests/test_work_group_memory.py new file mode 100644 index 0000000000..edf390e2b6 --- /dev/null +++ b/dpctl/tests/test_work_group_memory.py @@ -0,0 +1,90 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Defines unit test cases for the work_group_memory in a SYCL kernel""" + +import os + +import pytest + +import dpctl +import dpctl.tensor + + +def get_spirv_abspath(fn): + curr_dir = os.path.dirname(os.path.abspath(__file__)) + spirv_file = os.path.join(curr_dir, "input_files", fn) + return spirv_file + + +# The kernel in the SPIR-V file used in this test was generated from the +# following SYCL source code: +# #include +# using namespace sycl; +# namespace syclexp = sycl::ext::oneapi::experimental; +# namespace syclext = sycl::ext::oneapi; +# using data_t = int32_t; +# +# extern "C" SYCL_EXTERNAL +# SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) +# void local_mem_kernel(data_t* in, data_t* out, +# syclexp::work_group_memory mem){ +# auto* local_mem = &mem; +# auto item = syclext::this_work_item::get_nd_item<1>(); +# size_t global_id = item.get_global_linear_id(); +# size_t local_id = item.get_local_linear_id(); +# local_mem[local_id] = in[global_id]; +# out[global_id] = local_mem[local_id]; +# } + + +def test_submit_work_group_memory(): + if not dpctl.WorkGroupMemory.is_available(): + pytest.skip("Work group memory extension not supported") + + try: + q = dpctl.SyclQueue("level_zero") + except dpctl.SyclQueueCreationError: + pytest.skip("LevelZero queue could not be created") + spirv_file = get_spirv_abspath("work-group-memory-kernel.spv") + with open(spirv_file, "br") as spv: + spv_bytes = spv.read() + prog = dpctl.program.create_program_from_spirv(q, spv_bytes) + kernel = prog.get_sycl_kernel("__sycl_kernel_local_mem_kernel") + local_size = 16 + global_size = local_size * 8 + + x = dpctl.tensor.ones(global_size, dtype="int32") + y = dpctl.tensor.zeros(global_size, dtype="int32") + x.sycl_queue.wait() + y.sycl_queue.wait() + + try: + q.submit( + kernel, + [ + x.usm_data, + y.usm_data, + dpctl.WorkGroupMemory("i4", local_size), + ], + [global_size], + [local_size], + ) + q.wait() + except dpctl._sycl_queue.SyclKernelSubmitError: + pytest.skip(f"Kernel submission to {q.sycl_device} failed") + + assert dpctl.tensor.all(x == y) diff --git a/dpctl/tests/test_work_group_memory_opencl.py b/dpctl/tests/test_work_group_memory_opencl.py new file mode 100644 index 0000000000..df90f2be01 --- /dev/null +++ b/dpctl/tests/test_work_group_memory_opencl.py @@ -0,0 +1,80 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Defines unit test cases for the work_group_memory in an OpenCL kernel""" + +import numpy as np +import pytest + +import dpctl +import dpctl.tensor + +ocl_kernel_src = """ +__kernel void local_mem_kernel(__global float *input, __global float *output, + __local float *local_data) { + int gid = get_global_id(0); + int lid = get_local_id(0); + + // Load input data into local memory + local_data[lid] = input[gid]; + + // Store the data in the output array + output[gid] = local_data[lid]; +} +""" + + +def test_submit_work_group_memory_opencl(): + if not dpctl.WorkGroupMemory.is_available(): + pytest.skip("Work group memory extension not supported") + + try: + q = dpctl.SyclQueue("opencl") + except dpctl.SyclQueueCreationError: + pytest.skip("OpenCL queue could not be created") + + prog = dpctl.program.create_program_from_source(q, ocl_kernel_src) + kernel = prog.get_sycl_kernel("local_mem_kernel") + local_size = 16 + global_size = local_size * 8 + + x_dev = dpctl.memory.MemoryUSMDevice(global_size * 4, queue=q) + y_dev = dpctl.memory.MemoryUSMDevice(global_size * 4, queue=q) + + x = np.ones(global_size, dtype="float32") + y = np.zeros(global_size, dtype="float32") + q.memcpy(x_dev, x, x_dev.nbytes) + q.memcpy(y_dev, y, y_dev.nbytes) + + try: + q.submit( + kernel, + [ + x_dev, + y_dev, + dpctl.WorkGroupMemory(local_size * x.itemsize), + ], + [global_size], + [local_size], + ) + q.wait() + except dpctl._sycl_queue.SyclKernelSubmitError: + pytest.fail("Foo") + pytest.skip(f"Kernel submission to {q.sycl_device} failed") + + q.memcpy(y, y_dev, y_dev.nbytes) + + assert np.all(x == y) diff --git a/libsyclinterface/include/syclinterface/dpctl_sycl_enum_types.h b/libsyclinterface/include/syclinterface/dpctl_sycl_enum_types.h index 6c7f05e195..e98ded7849 100644 --- a/libsyclinterface/include/syclinterface/dpctl_sycl_enum_types.h +++ b/libsyclinterface/include/syclinterface/dpctl_sycl_enum_types.h @@ -100,6 +100,7 @@ typedef enum DPCTL_FLOAT64_T, DPCTL_VOID_PTR, DPCTL_LOCAL_ACCESSOR, + DPCTL_WORK_GROUP_MEMORY, DPCTL_UNSUPPORTED_KERNEL_ARG } DPCTLKernelArgType; diff --git a/libsyclinterface/include/syclinterface/dpctl_sycl_extension_interface.h b/libsyclinterface/include/syclinterface/dpctl_sycl_extension_interface.h new file mode 100644 index 0000000000..ee4d7d4fbb --- /dev/null +++ b/libsyclinterface/include/syclinterface/dpctl_sycl_extension_interface.h @@ -0,0 +1,56 @@ +//===---- dpctl_sycl_extension_interface.h - C API for SYCL ext -*-C++-*- ===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This header declares a C interface to SYCL language extensions defined by +/// DPC++. +/// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "Support/DllExport.h" +#include "Support/ExternC.h" +#include "Support/MemOwnershipAttrs.h" +#include "dpctl_data_types.h" +#include "dpctl_error_handler_type.h" +#include "dpctl_sycl_enum_types.h" +#include "dpctl_sycl_types.h" + +DPCTL_C_EXTERN_C_BEGIN + +typedef struct RawWorkGroupMemoryTy +{ + size_t nbytes; +} RawWorkGroupMemory; + +typedef struct DPCTLOpaqueSyclWorkGroupMemory *DPCTLSyclWorkGroupMemoryRef; + +DPCTL_API +__dpctl_give DPCTLSyclWorkGroupMemoryRef +DPCTLWorkGroupMemory_Create(size_t nbytes); + +DPCTL_API +void DPCTLWorkGroupMemory_Delete(__dpctl_take DPCTLSyclWorkGroupMemoryRef Ref); + +DPCTL_API +bool DPCTLWorkGroupMemory_Available(); + +DPCTL_C_EXTERN_C_END diff --git a/libsyclinterface/include/syclinterface/dpctl_sycl_type_casters.hpp b/libsyclinterface/include/syclinterface/dpctl_sycl_type_casters.hpp index 3f5e474533..638916f083 100644 --- a/libsyclinterface/include/syclinterface/dpctl_sycl_type_casters.hpp +++ b/libsyclinterface/include/syclinterface/dpctl_sycl_type_casters.hpp @@ -80,6 +80,10 @@ DEFINE_SIMPLE_CONVERSION_FUNCTIONS(std::vector, DEFINE_SIMPLE_CONVERSION_FUNCTIONS(std::vector, DPCTLEventVectorRef) +#include "dpctl_sycl_extension_interface.h" +DEFINE_SIMPLE_CONVERSION_FUNCTIONS(RawWorkGroupMemory, + DPCTLSyclWorkGroupMemoryRef) + #endif } // namespace dpctl::syclinterface diff --git a/libsyclinterface/source/dpctl_sycl_extension_interface.cpp b/libsyclinterface/source/dpctl_sycl_extension_interface.cpp new file mode 100644 index 0000000000..862be8dded --- /dev/null +++ b/libsyclinterface/source/dpctl_sycl_extension_interface.cpp @@ -0,0 +1,64 @@ +//===---- dpctl_sycl_extension_interface.cpp - Implements C API for SYCL ext =// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the data types and functions declared in +/// dpctl_sycl_extension_interface.h. +/// +//===----------------------------------------------------------------------===// + +#include "dpctl_sycl_extension_interface.h" + +#include "dpctl_error_handlers.h" +#include "dpctl_sycl_type_casters.hpp" + +#include + +using namespace dpctl::syclinterface; + +DPCTL_API +__dpctl_give DPCTLSyclWorkGroupMemoryRef +DPCTLWorkGroupMemory_Create(size_t nbytes) +{ + DPCTLSyclWorkGroupMemoryRef wgm = nullptr; + try { + auto WorkGroupMem = new RawWorkGroupMemory{nbytes}; + wgm = wrap(WorkGroupMem); + } catch (std::exception const &e) { + error_handler(e, __FILE__, __func__, __LINE__); + } + return wgm; +} + +DPCTL_API +void DPCTLWorkGroupMemory_Delete(__dpctl_take DPCTLSyclWorkGroupMemoryRef Ref) +{ + delete unwrap(Ref); +} + +DPCTL_API +bool DPCTLWorkGroupMemory_Available() +{ +#ifdef SYCL_EXT_ONEAPI_WORK_GROUP_MEMORY + return true; +#else + return false; +#endif +} diff --git a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp index 7e2a7ac4ee..7fb971d253 100644 --- a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp +++ b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp @@ -42,6 +42,10 @@ #include /* SYCL headers */ #include +#ifdef SYCL_EXT_ONEAPI_WORK_GROUP_MEMORY +#include "dpctl_sycl_extension_interface.h" +#endif + using namespace sycl; #define SET_LOCAL_ACCESSOR_ARG(CGH, NDIM, ARGTY, R, IDX) \ @@ -216,6 +220,18 @@ bool set_kernel_arg(handler &cgh, case DPCTL_LOCAL_ACCESSOR: arg_set = set_local_accessor_arg(cgh, idx, (MDLocalAccessor *)Arg); break; +#ifdef SYCL_EXT_ONEAPI_WORK_GROUP_MEMORY + case DPCTL_WORK_GROUP_MEMORY: + { + auto ref = static_cast(Arg); + RawWorkGroupMemory *raw_mem = unwrap(ref); + size_t num_bytes = raw_mem->nbytes; + sycl::ext::oneapi::experimental::work_group_memory mem{ + num_bytes, cgh}; + cgh.set_arg(idx, mem); + break; + } +#endif default: arg_set = false; break; diff --git a/libsyclinterface/tests/CMakeLists.txt b/libsyclinterface/tests/CMakeLists.txt index 3a714eb13c..36a511bdc6 100644 --- a/libsyclinterface/tests/CMakeLists.txt +++ b/libsyclinterface/tests/CMakeLists.txt @@ -15,6 +15,8 @@ set(spirv-test-files oneD_range_kernel_fp64.spv local_accessor_kernel_inttys_fp32.spv local_accessor_kernel_fp64.spv + work_group_memory_kernel_fp64.spv + work_group_memory_kernel_inttys_fp32.spv ) foreach(tf ${spirv-test-files}) @@ -50,6 +52,7 @@ add_sycl_to_target( ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_manager.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_submit.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_submit_local_accessor_arg.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_submit_work_group_memory_arg.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_interface.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_usm_interface.cpp ) diff --git a/libsyclinterface/tests/test_sycl_queue_submit_work_group_memory_arg.cpp b/libsyclinterface/tests/test_sycl_queue_submit_work_group_memory_arg.cpp new file mode 100644 index 0000000000..658cca428a --- /dev/null +++ b/libsyclinterface/tests/test_sycl_queue_submit_work_group_memory_arg.cpp @@ -0,0 +1,387 @@ +//===-- test_sycl_queue_submit_work_group_memory_arg - Test work group mem ===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains tests for kernel submit using the work_group_memory +/// SYCL extension. +//===----------------------------------------------------------------------===// + +#include "dpctl_sycl_context_interface.h" +#include "dpctl_sycl_device_interface.h" +#include "dpctl_sycl_device_selector_interface.h" +#include "dpctl_sycl_event_interface.h" +#include "dpctl_sycl_kernel_bundle_interface.h" +#include "dpctl_sycl_kernel_interface.h" +#include "dpctl_sycl_queue_interface.h" +#include "dpctl_sycl_type_casters.hpp" +#include "dpctl_sycl_usm_interface.h" + +#include + +#include +#include +#include +#include +#include + +#include +#include + +namespace +{ +constexpr std::size_t SIZE = 320; + +static_assert(SIZE % 10 == 0); + +using namespace dpctl::syclinterface; + +template +void submit_kernel(DPCTLSyclQueueRef QRef, + DPCTLSyclKernelBundleRef KBRef, + std::vector spirvBuffer, + std::size_t spirvFileSize, + DPCTLKernelArgType kernelArgTy, + std::string kernelName) +{ + if (!DPCTLWorkGroupMemory_Available()) { + GTEST_SKIP() + << "Skipping work-group-memory test since the compiler does " + "not support this feature"; + return; + } + + constexpr std::size_t NARGS = 2; + constexpr std::size_t RANGE_NDIMS = 1; + + ASSERT_TRUE(DPCTLKernelBundle_HasKernel(KBRef, kernelName.c_str())); + auto kernel = DPCTLKernelBundle_GetKernel(KBRef, kernelName.c_str()); + + // Create the input args + auto a = DPCTLmalloc_shared(SIZE * sizeof(T), QRef); + ASSERT_TRUE(a != nullptr); + auto a_ptr = static_cast(unwrap(a)); + for (auto i = 0ul; i < SIZE; ++i) { + a_ptr[i] = 0; + } + + // Create kernel args for vector_add + std::size_t lws = SIZE / 10; + std::size_t gRange[] = {SIZE}; + std::size_t lRange[] = {lws}; + + std::uintptr_t wgm_sz = lws * sizeof(T); + auto wgm = DPCTLWorkGroupMemory_Create(wgm_sz); + ASSERT_TRUE(wgm != nullptr); + auto *wgm_raw = unwrap(wgm); + ASSERT_TRUE(wgm_raw != nullptr); + ASSERT_TRUE(wgm_raw->nbytes == wgm_sz); + void *args_1d[NARGS] = {unwrap(a), wgm}; + DPCTLKernelArgType addKernelArgTypes[] = {DPCTL_VOID_PTR, + DPCTL_WORK_GROUP_MEMORY}; + + DPCTLSyclEventRef E1Ref = DPCTLQueue_SubmitNDRange( + kernel, QRef, args_1d, addKernelArgTypes, NARGS, gRange, lRange, + RANGE_NDIMS, nullptr, 0); + ASSERT_TRUE(E1Ref != nullptr); + + DPCTLSyclEventRef DepEv1[] = {E1Ref}; + void *args_2d[NARGS] = {unwrap(a), wgm}; + + DPCTLSyclEventRef E2Ref = + DPCTLQueue_SubmitNDRange(kernel, QRef, args_2d, addKernelArgTypes, + NARGS, gRange, lRange, RANGE_NDIMS, DepEv1, 1); + ASSERT_TRUE(E2Ref != nullptr); + + DPCTLSyclEventRef DepEv2[] = {E1Ref, E2Ref}; + void *args_3d[NARGS] = {unwrap(a), wgm}; + + DPCTLSyclEventRef E3Ref = + DPCTLQueue_SubmitNDRange(kernel, QRef, args_3d, addKernelArgTypes, + NARGS, gRange, lRange, RANGE_NDIMS, DepEv2, 2); + ASSERT_TRUE(E3Ref != nullptr); + + DPCTLEvent_Wait(E3Ref); + + ASSERT_TRUE(a_ptr[0] == T(lws * 2)); + + // clean ups + DPCTLEvent_Delete(E1Ref); + DPCTLEvent_Delete(E2Ref); + DPCTLEvent_Delete(E3Ref); + DPCTLWorkGroupMemory_Delete(wgm); + DPCTLKernel_Delete(kernel); + DPCTLfree_with_queue((DPCTLSyclUSMRef)a, QRef); +} + +} /* end of anonymous namespace */ + +/* +// The work_group_memory_kernel spv files were generated from the SYCL program +// included in this comment. The program can be compiled using +// `icpx -fsycl work_group_memory_kernel.cpp`. After that if the generated +// executable is run with the environment variable `SYCL_DUMP_IMAGES=1`, icpx +// runtime will dump all offload sections of fat binary to the current working +// directory. When tested with DPC++ 2024.0 the kernels are split across two +// separate SPV files. One contains all kernels for integers and FP32 +// data type, and another contains the kernel for FP64. +// +// Note that, `SYCL_DUMP_IMAGES=1` will also generate extra SPV files that +// contain the code for built in functions such as indexing and barriers. To +// figure which SPV file contains the kernels, use `spirv-dis` from the +// spirv-tools package to translate the SPV binary format to a human-readable +// textual format. +#include +#include +#include + +namespace syclexp = sycl::ext::oneapi::experimental; + +template +class SyclKernel_WGM +{ +private: + T N_; + T *a_ = nullptr; + syclexp::work_group_memory wgm_; + +public: + SyclKernel_WGM(T *a, syclexp::work_group_memory wgm) + : a_(a), wgm_(wgm) + { + } + + void operator()(sycl::nd_item<1> it) const + { + int i = it.get_global_id(); + int j = it.get_local_id(); + wgm_[j] = 2; + auto g = it.get_group(); + group_barrier(g); + auto temp = 0; + for (auto idx = 0ul; idx < it.get_local_range(0); ++idx) + temp += wgm_[idx]; + a_[i] = temp * (i + 1); + } +}; + +template +sycl::event +submit_kernel(sycl::queue q, const unsigned long N, T *a) +{ + auto gws = N; + auto lws = (N/10); + + sycl::range<1> gRange{gws}; + sycl::range<1> lRange{lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + sycl::event e = + q.submit([&](auto &h) + { + syclexp::work_group_memory wgm(lws, h); + h.parallel_for( + ndRange, + SyclKernel_WGM(a, wgm)); + }); + + return e; +} + +template +void driver(std::size_t N) +{ + sycl::queue q; + auto *a = sycl::malloc_shared(N, q); + submit_kernel(q, N, a).wait(); + sycl::free(a, q); +} + +int main(int argc, const char **argv) +{ + std::size_t N = 0; + std::cout << "Enter problem size in N:\n"; + std::cin >> N; + std::cout << "Executing with N = " << N << std::endl; + + driver(N); + driver(N); + driver(N); + driver(N); + driver(N); + driver(N); + driver(N); + driver(N); + driver(N); + driver(N); + + return 0; +} +*/ + +struct TestQueueSubmitWithWorkGroupMemory : public ::testing::Test +{ + std::ifstream spirvFile; + std::size_t spirvFileSize_; + std::vector spirvBuffer_; + DPCTLSyclQueueRef QRef = nullptr; + DPCTLSyclKernelBundleRef KBRef = nullptr; + + TestQueueSubmitWithWorkGroupMemory() + { + DPCTLSyclDeviceSelectorRef DSRef = nullptr; + DPCTLSyclDeviceRef DRef = nullptr; + const char *test_spv_fn = "./work_group_memory_kernel_inttys_fp32.spv"; + + spirvFile.open(test_spv_fn, std::ios::binary | std::ios::ate); + spirvFileSize_ = std::filesystem::file_size(test_spv_fn); + spirvBuffer_.reserve(spirvFileSize_); + spirvFile.seekg(0, std::ios::beg); + spirvFile.read(spirvBuffer_.data(), spirvFileSize_); + + DSRef = DPCTLDefaultSelector_Create(); + DRef = DPCTLDevice_CreateFromSelector(DSRef); + QRef = + DPCTLQueue_CreateForDevice(DRef, nullptr, DPCTL_DEFAULT_PROPERTY); + auto CRef = DPCTLQueue_GetContext(QRef); + + KBRef = DPCTLKernelBundle_CreateFromSpirv( + CRef, DRef, spirvBuffer_.data(), spirvFileSize_, nullptr); + DPCTLDevice_Delete(DRef); + DPCTLDeviceSelector_Delete(DSRef); + } + + ~TestQueueSubmitWithWorkGroupMemory() + { + spirvFile.close(); + DPCTLQueue_Delete(QRef); + DPCTLKernelBundle_Delete(KBRef); + } +}; + +struct TestQueueSubmitWithWorkGroupMemoryFP64 : public ::testing::Test +{ + std::ifstream spirvFile; + std::size_t spirvFileSize_; + std::vector spirvBuffer_; + DPCTLSyclDeviceRef DRef = nullptr; + DPCTLSyclQueueRef QRef = nullptr; + DPCTLSyclKernelBundleRef KBRef = nullptr; + + TestQueueSubmitWithWorkGroupMemoryFP64() + { + DPCTLSyclDeviceSelectorRef DSRef = nullptr; + const char *test_spv_fn = "./work_group_memory_kernel_fp64.spv"; + + spirvFile.open(test_spv_fn, std::ios::binary | std::ios::ate); + spirvFileSize_ = std::filesystem::file_size(test_spv_fn); + spirvBuffer_.reserve(spirvFileSize_); + spirvFile.seekg(0, std::ios::beg); + spirvFile.read(spirvBuffer_.data(), spirvFileSize_); + DSRef = DPCTLDefaultSelector_Create(); + DRef = DPCTLDevice_CreateFromSelector(DSRef); + QRef = + DPCTLQueue_CreateForDevice(DRef, nullptr, DPCTL_DEFAULT_PROPERTY); + auto CRef = DPCTLQueue_GetContext(QRef); + + KBRef = DPCTLKernelBundle_CreateFromSpirv( + CRef, DRef, spirvBuffer_.data(), spirvFileSize_, nullptr); + DPCTLDeviceSelector_Delete(DSRef); + } + + ~TestQueueSubmitWithWorkGroupMemoryFP64() + { + spirvFile.close(); + DPCTLDevice_Delete(DRef); + DPCTLQueue_Delete(QRef); + DPCTLKernelBundle_Delete(KBRef); + } +}; + +TEST_F(TestQueueSubmitWithWorkGroupMemory, CheckForInt8) +{ + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_INT8_T, + "_ZTS14SyclKernel_WGMIaE"); +} + +TEST_F(TestQueueSubmitWithWorkGroupMemory, CheckForUInt8) +{ + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_UINT8_T, + "_ZTS14SyclKernel_WGMIhE"); +} + +TEST_F(TestQueueSubmitWithWorkGroupMemory, CheckForInt16) +{ + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_INT16_T, + "_ZTS14SyclKernel_WGMIsE"); +} + +TEST_F(TestQueueSubmitWithWorkGroupMemory, CheckForUInt16) +{ + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_UINT16_T, + "_ZTS14SyclKernel_WGMItE"); +} + +TEST_F(TestQueueSubmitWithWorkGroupMemory, CheckForInt32) +{ + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_INT32_T, + "_ZTS14SyclKernel_WGMIiE"); +} + +TEST_F(TestQueueSubmitWithWorkGroupMemory, CheckForUInt32) +{ + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_UINT32_T, + "_ZTS14SyclKernel_WGMIjE"); +} + +TEST_F(TestQueueSubmitWithWorkGroupMemory, CheckForInt64) +{ + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_INT64_T, + "_ZTS14SyclKernel_WGMIlE"); +} + +TEST_F(TestQueueSubmitWithWorkGroupMemory, CheckForUInt64) +{ + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_UINT64_T, + "_ZTS14SyclKernel_WGMImE"); +} + +TEST_F(TestQueueSubmitWithWorkGroupMemory, CheckForFloat) +{ + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_FLOAT32_T, + "_ZTS14SyclKernel_WGMIfE"); +} + +TEST_F(TestQueueSubmitWithWorkGroupMemoryFP64, CheckForDouble) +{ + if (DPCTLDevice_HasAspect(DRef, DPCTLSyclAspectType::fp64)) { + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_FLOAT64_T, + "_ZTS14SyclKernel_WGMIdE"); + } +} diff --git a/libsyclinterface/tests/work_group_memory_kernel_fp64.spv b/libsyclinterface/tests/work_group_memory_kernel_fp64.spv new file mode 100644 index 0000000000..fe6ce5585b Binary files /dev/null and b/libsyclinterface/tests/work_group_memory_kernel_fp64.spv differ diff --git a/libsyclinterface/tests/work_group_memory_kernel_inttys_fp32.spv b/libsyclinterface/tests/work_group_memory_kernel_inttys_fp32.spv new file mode 100644 index 0000000000..0c10e45e5a Binary files /dev/null and b/libsyclinterface/tests/work_group_memory_kernel_inttys_fp32.spv differ