diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt
index a466d3eef1..795ca09c78 100644
--- a/dpctl/CMakeLists.txt
+++ b/dpctl/CMakeLists.txt
@@ -191,6 +191,7 @@ foreach(_cy_file ${_cython_sources})
     build_dpctl_ext(${_trgt} ${_cy_file} "dpctl")
 endforeach()
 
+# _sycl_queue includes _host_task_util.hpp
 target_include_directories(_sycl_queue PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
 
 add_subdirectory(program)
diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py
index dc647febf7..f6d1ca086b 100644
--- a/dpctl/tests/test_tensor_sum.py
+++ b/dpctl/tests/test_tensor_sum.py
@@ -17,6 +17,7 @@
 import pytest
 
 import dpctl.tensor as dpt
+import dpctl.utils as du
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
 _all_dtypes = [
@@ -187,11 +188,28 @@ def test_axis0_bug():
     assert dpt.all(s == expected)
 
 
+def _any_complex(dtypes):
+    return any(dpt.isdtype(dpt.dtype(dt), "complex floating") for dt in dtypes)
+
+
+def _skip_on_this_device(sycl_dev):
+    device_mask = du.intel_device_info(sycl_dev).get("device_id", 0) & 0xFF00
+    return device_mask in [0x3E00, 0x9B00]
+
+
 @pytest.mark.parametrize("arg_dtype", _all_dtypes[1:])
 def test_prod_arg_dtype_default_output_dtype_matrix(arg_dtype):
     q = get_queue_or_skip()
     skip_if_dtype_not_supported(arg_dtype, q)
 
+    arg_dtype = dpt.dtype(arg_dtype)
+    if _any_complex((arg_dtype,)):
+        if _skip_on_this_device(q.sycl_device):
+            pytest.skip(
+                "Product reductions with complex output are known "
+                "to fail on Gen9 with the 2024.0 compiler"
+            )
+
     m = dpt.ones(100, dtype=arg_dtype)
     r = dpt.prod(m)
 
@@ -242,6 +260,15 @@ def test_prod_arg_out_dtype_matrix(arg_dtype, out_dtype):
     skip_if_dtype_not_supported(arg_dtype, q)
     skip_if_dtype_not_supported(out_dtype, q)
 
+    out_dtype = dpt.dtype(out_dtype)
+    arg_dtype = dpt.dtype(arg_dtype)
+    if _any_complex((arg_dtype, out_dtype)):
+        if _skip_on_this_device(q.sycl_device):
+            pytest.skip(
+                "Product reductions with complex output are known "
+                "to fail on Gen9 with the 2024.0 compiler"
+            )
+
     m = dpt.ones(100, dtype=arg_dtype)
     r = dpt.prod(m, dtype=out_dtype)
 
diff --git a/dpctl/tests/test_utils.py b/dpctl/tests/test_utils.py
index df4a9f503f..1aab7fd7e7 100644
--- a/dpctl/tests/test_utils.py
+++ b/dpctl/tests/test_utils.py
@@ -21,6 +21,7 @@
 
 import dpctl
 import dpctl.utils
+from dpctl.enum_types import backend_type
 
 
 def test_get_execution_queue_input_validation():
@@ -122,3 +123,29 @@ def test_onetrace_enabled():
         with dpctl.utils.onetrace_enabled():
             assert os.getenv(v_name, None) == "1"
         assert os.getenv(v_name, None) == v_v
+
+
+def test_intel_device_info():
+    try:
+        d = dpctl.select_default_device()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("Default device could not be created")
+    descr = dpctl.utils.intel_device_info(d)
+    assert isinstance(descr, dict)
+    assert ("device_id" in descr) or (
+        not d.has_aspect_cpu and not d.backend == backend_type.level_zero
+    )
+    allowed_names = [
+        "device_id",
+        "gpu_slices",
+        "gpu_eu_count",
+        "gpu_eu_simd_width",
+        "gpu_hw_threads_per_eu",
+        "gpu_subslices_per_slice",
+        "gpu_eu_count_per_subslice",
+        "max_mem_bandwidth",
+    ]
+    for descriptor_name in descr.keys():
+        test = descriptor_name in allowed_names
+        err_msg = f"Key '{descriptor_name}' is not recognized"
+        assert test, err_msg
diff --git a/dpctl/utils/CMakeLists.txt b/dpctl/utils/CMakeLists.txt
index 11b0930052..8bc65e3056 100644
--- a/dpctl/utils/CMakeLists.txt
+++ b/dpctl/utils/CMakeLists.txt
@@ -4,3 +4,24 @@ foreach(_cy_file ${_cython_sources})
     get_filename_component(_trgt ${_cy_file} NAME_WLE)
     build_dpctl_ext(${_trgt} ${_cy_file} "dpctl/utils")
 endforeach()
+
+add_custom_target(_dpctl4pybind11_header_ready
+    DEPENDS
+    _usmarray_copy_capi_include
+    _memory_copy_capi_include
+    _sycl_device_copy_capi_include
+    _sycl_queue_copy_capi_include
+    _sycl_context_copy_capi_include
+    _sycl_event_copy_capi_include
+)
+
+set(python_module_name _device_queries)
+pybind11_add_module(${python_module_name} MODULE
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/device_queries.cpp
+)
+target_include_directories(${python_module_name}
+    PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../include
+)
+add_dependencies(${python_module_name} _dpctl4pybind11_header_ready)
+install(TARGETS ${python_module_name} DESTINATION "dpctl/utils")
diff --git a/dpctl/utils/__init__.py b/dpctl/utils/__init__.py
index 671564cda5..fb41b3b74c 100644
--- a/dpctl/utils/__init__.py
+++ b/dpctl/utils/__init__.py
@@ -18,18 +18,85 @@
 A collection of utility functions.
 """
 
+from .._sycl_device import SyclDevice
 from ._compute_follows_data import (
     ExecutionPlacementError,
     get_coerced_usm_type,
     get_execution_queue,
     validate_usm_type,
 )
+from ._device_queries import (
+    intel_device_info_device_id,
+    intel_device_info_gpu_eu_count,
+    intel_device_info_gpu_eu_count_per_subslice,
+    intel_device_info_gpu_eu_simd_width,
+    intel_device_info_gpu_hw_threads_per_eu,
+    intel_device_info_gpu_slices,
+    intel_device_info_gpu_subslices_per_slice,
+    intel_device_info_max_mem_bandwidth,
+)
 from ._onetrace_context import onetrace_enabled
 
 
+def intel_device_info(dev):
+    """intel_device_info(sycl_device)
+
+    For Intel(R) GPU devices, returns a dictionary with
+    architectural details of the device; an empty dictionary
+    is returned otherwise. The dictionary may contain the
+    following keys:
+
+    device_id: 32-bit device PCI identifier
+    gpu_eu_count: Total number of execution units (EUs)
+    gpu_hw_threads_per_eu: Number of hardware thread contexts per EU
+    gpu_eu_simd_width: Physical SIMD width of an EU
+    gpu_slices: Total number of slices
+    gpu_subslices_per_slice: Number of sub-slices per slice
+    gpu_eu_count_per_subslice: Number of EUs per sub-slice
+    max_mem_bandwidth: Maximum memory bandwidth in bytes/second
+
+    Unsupported descriptors are omitted from the dictionary.
+    Descriptors other than the PCI identifier are supported only
+    for SyclDevices with the Level-Zero backend.
+    """
+    if not isinstance(dev, SyclDevice):
+        raise TypeError(f"Expected dpctl.SyclDevice, got {type(dev)}")
+    dev_id = intel_device_info_device_id(dev)
+    if dev_id:
+        res = {
+            "device_id": dev_id,
+        }
+        if dev.has_aspect_gpu:
+            eu_count = intel_device_info_gpu_eu_count(dev)
+            if eu_count:
+                res["gpu_eu_count"] = eu_count
+            hw_threads = intel_device_info_gpu_hw_threads_per_eu(dev)
+            if hw_threads:
+                res["gpu_hw_threads_per_eu"] = hw_threads
+            simd_w = intel_device_info_gpu_eu_simd_width(dev)
+            if simd_w:
+                res["gpu_eu_simd_width"] = simd_w
+            n_slices = intel_device_info_gpu_slices(dev)
+            if n_slices:
+                res["gpu_slices"] = n_slices
+            n_subslices = intel_device_info_gpu_subslices_per_slice(dev)
+            if n_subslices:
+                res["gpu_subslices_per_slice"] = n_subslices
+            n_eu_per_subslice = intel_device_info_gpu_eu_count_per_subslice(dev)
+            if n_eu_per_subslice:
+                res["gpu_eu_count_per_subslice"] = n_eu_per_subslice
+            bw = intel_device_info_max_mem_bandwidth(dev)
+            if bw:
+                res["max_mem_bandwidth"] = bw
+        return res
+    return dict()
+
+
 __all__ = [
     "get_execution_queue",
     "get_coerced_usm_type",
     "validate_usm_type",
     "onetrace_enabled",
+    "intel_device_info",
     "ExecutionPlacementError",
 ]
diff --git a/dpctl/utils/src/device_queries.cpp b/dpctl/utils/src/device_queries.cpp
new file mode 100644
index 0000000000..6407e69dbb
--- /dev/null
+++ b/dpctl/utils/src/device_queries.cpp
@@ -0,0 +1,139 @@
+#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+
+#include <cstddef>
+#include <cstdint>
+
+namespace
+{
+
+std::uint32_t py_intel_device_id(const sycl::device &d)
+{
+    static constexpr std::uint32_t device_id_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_device_id)) {
+        return d.get_info<sycl::ext::intel::info::device::device_id>();
+    }
+
+    return device_id_unavailable;
+}
+
+std::uint32_t py_intel_gpu_eu_count(const sycl::device &d)
+{
+    static constexpr std::uint32_t eu_count_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_eu_count)) {
+        return d.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
+    }
+
+    return eu_count_unavailable;
+}
+
+std::uint32_t py_intel_gpu_hw_threads_per_eu(const sycl::device &d)
+{
+    static constexpr std::uint32_t thread_count_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
+        return d
+            .get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
+    }
+
+    return thread_count_unavailable;
+}
+
+std::uint32_t py_intel_gpu_eu_simd_width(const sycl::device &d)
+{
+    static constexpr std::uint32_t width_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_eu_simd_width)) {
+        return d.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>();
+    }
+
+    return width_unavailable;
+}
+
+std::uint32_t py_intel_gpu_slices(const sycl::device &d)
+{
+    static constexpr std::uint32_t count_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_slices)) {
+        return d.get_info<sycl::ext::intel::info::device::gpu_slices>();
+    }
+
+    return count_unavailable;
+}
+
+std::uint32_t py_intel_gpu_subslices_per_slice(const sycl::device &d)
+{
+    static constexpr std::uint32_t count_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_subslices_per_slice)) {
+        return d.get_info<
+            sycl::ext::intel::info::device::gpu_subslices_per_slice>();
+    }
+
+    return count_unavailable;
+}
+
+std::uint32_t py_intel_gpu_eu_count_per_subslice(const sycl::device &d)
+{
+    static constexpr std::uint32_t count_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_eu_count_per_subslice)) {
+        return d.get_info<
+            sycl::ext::intel::info::device::gpu_eu_count_per_subslice>();
+    }
+
+    return count_unavailable;
+}
+
+std::uint64_t py_intel_max_mem_bandwidth(const sycl::device &d)
+{
+    static constexpr std::uint64_t bandwidth_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_max_mem_bandwidth)) {
+        return d.get_info<sycl::ext::intel::info::device::max_mem_bandwidth>();
+    }
+
+    return bandwidth_unavailable;
+}
+
+}; // namespace
+
+PYBIND11_MODULE(_device_queries, m)
+{
+    m.def("intel_device_info_device_id", &py_intel_device_id,
+          "Returns ext_intel_device_id for the device, or zero if it is not "
+          "an Intel device.",
+          py::arg("device"));
+
+    m.def("intel_device_info_gpu_eu_count", &py_intel_gpu_eu_count,
+          "Returns the number of execution units (EUs) associated with the "
+          "Intel GPU.",
+          py::arg("device"));
+
+    m.def("intel_device_info_gpu_hw_threads_per_eu",
+          &py_intel_gpu_hw_threads_per_eu,
+          "Returns the number of hardware threads per EU.", py::arg("device"));
+
+    m.def("intel_device_info_gpu_eu_simd_width", &py_intel_gpu_eu_simd_width,
+          "Returns the physical SIMD width of the execution unit (EU).",
+          py::arg("device"));
+
+    m.def("intel_device_info_gpu_slices", &py_intel_gpu_slices,
+          "Returns the number of slices in the GPU device, or zero.",
+          py::arg("device"));
+
+    m.def("intel_device_info_gpu_subslices_per_slice",
+          &py_intel_gpu_subslices_per_slice,
+          "Returns the number of subslices per slice.", py::arg("device"));
+
+    m.def("intel_device_info_gpu_eu_count_per_subslice",
+          &py_intel_gpu_eu_count_per_subslice,
+          "Returns the number of EUs per subslice of the GPU.",
+          py::arg("device"));
+
+    m.def("intel_device_info_max_mem_bandwidth", &py_intel_max_mem_bandwidth,
+          "Returns the maximum memory bandwidth in units of bytes/second.",
+          py::arg("device"));
+}
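
Usage note (not part of the patch): a minimal sketch of how the new `dpctl.utils.intel_device_info` helper and the device-id masking used in the Gen9 test skips above could be exercised. All function and key names come from this diff; the choice of default device and the printed output are illustrative only.

```python
# Sketch only: query Intel device descriptors via the helper added above.
import dpctl
import dpctl.utils as du

try:
    dev = dpctl.select_default_device()
except dpctl.SyclDeviceCreationError:
    dev = None

if dev is not None:
    info = du.intel_device_info(dev)  # empty dict for non-Intel devices
    print("descriptors:", info)
    # The test skips above compare the high byte of the PCI device id
    # against known Gen9 family prefixes (0x3E00, 0x9B00).
    family = info.get("device_id", 0) & 0xFF00
    print(f"device family prefix: {family:#06x}")
```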
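
Design note, with a hedged example: each `_device_queries` binding returns 0 when the corresponding `ext_intel_*` aspect is absent, and `intel_device_info` omits such keys from its dictionary. The sketch below calls two of the low-level bindings directly under that assumption; the "unavailable" fallback strings are illustrative, not part of the API.

```python
# Sketch only: the low-level bindings use 0 as an "unavailable" sentinel.
import dpctl
from dpctl.utils._device_queries import (
    intel_device_info_gpu_eu_count,
    intel_device_info_max_mem_bandwidth,
)

dev = dpctl.select_default_device()
eu_count = intel_device_info_gpu_eu_count(dev)  # 0 if aspect is missing
bandwidth = intel_device_info_max_mem_bandwidth(dev)  # 0 if aspect is missing
print("gpu_eu_count:", eu_count or "unavailable")
print("max_mem_bandwidth:", bandwidth or "unavailable")
```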