diff --git a/libdevice/cmake/modules/SYCLLibdevice.cmake b/libdevice/cmake/modules/SYCLLibdevice.cmake index eb94db50095d8..dbb75e31104e0 100644 --- a/libdevice/cmake/modules/SYCLLibdevice.cmake +++ b/libdevice/cmake/modules/SYCLLibdevice.cmake @@ -167,12 +167,46 @@ add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-cmath-fp64.${lib-su DEPENDS device_math.h device.h clang clang-offload-bundler VERBATIM) +add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-itt-stubs.${lib-suffix} + COMMAND ${clang} -fsycl -c + ${compile_opts} ${sycl_targets_opt} + ${CMAKE_CURRENT_SOURCE_DIR}/itt_stubs.cpp + -o ${obj_binary_dir}/libsycl-itt-stubs.${lib-suffix} + MAIN_DEPENDENCY itt_stubs.cpp + DEPENDS device_itt.h spirv_vars.h device.h clang clang-offload-bundler + VERBATIM) + +add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-itt-compiler-wrappers.${lib-suffix} + COMMAND ${clang} -fsycl -c + ${compile_opts} ${sycl_targets_opt} + ${CMAKE_CURRENT_SOURCE_DIR}/itt_compiler_wrappers.cpp + -o ${obj_binary_dir}/libsycl-itt-compiler-wrappers.${lib-suffix} + MAIN_DEPENDENCY itt_compiler_wrappers.cpp + DEPENDS device_itt.h spirv_vars.h device.h clang clang-offload-bundler + VERBATIM) + +add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-itt-user-wrappers.${lib-suffix} + COMMAND ${clang} -fsycl -c + ${compile_opts} ${sycl_targets_opt} + ${CMAKE_CURRENT_SOURCE_DIR}/itt_user_wrappers.cpp + -o ${obj_binary_dir}/libsycl-itt-user-wrappers.${lib-suffix} + MAIN_DEPENDENCY itt_user_wrappers.cpp + DEPENDS device_itt.h spirv_vars.h device.h clang clang-offload-bundler + VERBATIM) + +set(devicelib-obj-itt-files + ${obj_binary_dir}/libsycl-itt-stubs.${lib-suffix} + ${obj_binary_dir}/libsycl-itt-compiler-wrappers.${lib-suffix} + ${obj_binary_dir}/libsycl-itt-user-wrappers.${lib-suffix} + ) + add_custom_target(libsycldevice-obj DEPENDS ${devicelib-obj-file} ${devicelib-obj-complex} ${devicelib-obj-complex-fp64} ${devicelib-obj-cmath} ${devicelib-obj-cmath-fp64} + ${devicelib-obj-itt-files} ) 
add_custom_target(libsycldevice-spv DEPENDS ${spv_binary_dir}/libsycl-fallback-cassert.spv @@ -212,7 +246,8 @@ install(FILES ${devicelib-obj-file} ${devicelib-obj-cmath} ${obj_binary_dir}/libsycl-fallback-cmath.${lib-suffix} ${devicelib-obj-cmath-fp64} - ${obj_binary_dir}/libsycl-fallback-cmath-fp64.${lib-suffix} + ${obj_binary_dir}/libsycl-fallback-cmath-fp64.${lib-suffix} + ${devicelib-obj-itt-files} DESTINATION ${install_dest_lib} COMPONENT libsycldevice) diff --git a/libdevice/device_itt.h b/libdevice/device_itt.h new file mode 100644 index 0000000000000..07614c6c532a6 --- /dev/null +++ b/libdevice/device_itt.h @@ -0,0 +1,103 @@ +//==------- device_itt.h - ITT devicelib functions declarations ------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//==------------------------------------------------------------------------==// + +#ifndef __LIBDEVICE_DEVICE_ITT_H__ +#define __LIBDEVICE_DEVICE_ITT_H__ + +#include "device.h" + +#ifdef __SPIR__ +#include "spirv_vars.h" + +#define ITT_STUB_ATTRIBUTES __attribute__((noinline, optnone)) + +/// Atomic operation type +enum __itt_atomic_mem_op_t { + __itt_mem_load = 0, + __itt_mem_store = 1, + __itt_mem_update = 2 +}; + +/// Memory operation ordering semantic type +enum __itt_atomic_mem_order_t { + __itt_mem_order_relaxed = 0, + __itt_mem_order_acquire = 1, + __itt_mem_order_release = 2, + __itt_mem_order_acquire_release = 3 +}; + +// FIXME: must be enabled via -fdeclare-spirv-builtins +DEVICE_EXTERN_C char __spirv_SpecConstant(int, char); + +#define ITT_SPEC_CONSTANT 0xFF747469 + +static inline bool isITTEnabled() { + return __spirv_SpecConstant(ITT_SPEC_CONSTANT, 0) != 0; +} + +// Wrapper APIs that may be called by compiler-generated code. 
+// These are just parameterless helper APIs that call the corresponding +// stub APIs after preparing the arguments for them. +// +// Note that we do not provide compiler wrappers for all stub APIs. +// For example, there is no compiler wrapper for +// __itt_offload_sync_acquired_stub, since the API's parameter cannot +// be computed in the wrapper itself and has to be passed from outside. +// If a compiler needs to invoke such an API, it has to use the user +// visible API directly (i.e. __itt_offload_sync_acquired). +DEVICE_EXTERN_C +void __itt_offload_wi_start_wrapper(); +DEVICE_EXTERN_C +void __itt_offload_wi_finish_wrapper(); +DEVICE_EXTERN_C +void __itt_offload_wg_barrier_wrapper(); +DEVICE_EXTERN_C +void __itt_offload_wi_resume_wrapper(); + +// Non-inlinable and non-optimizable APIs that are recognized +// by profiling tools. +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_wi_start_stub(size_t *group_id, size_t wi_id, uint32_t wg_size); +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_wi_finish_stub(size_t *group_id, size_t wi_id); +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_wg_barrier_stub(uintptr_t barrier_id); +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_wi_resume_stub(size_t *group_id, size_t wi_id); +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_sync_acquired_stub(uintptr_t sync_id); +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_sync_releasing_stub(uintptr_t sync_id); +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_wg_local_range_stub(void *ptr, size_t size); +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_atomic_op_start_stub(void *object, __itt_atomic_mem_op_t op_type, + __itt_atomic_mem_order_t mem_order); +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_atomic_op_finish_stub(void *object, __itt_atomic_mem_op_t op_type, + __itt_atomic_mem_order_t mem_order); + +// User visible APIs. These may be called both from user code and from +// compiler generated code. 
+DEVICE_EXTERN_C void __itt_offload_wi_start(size_t *group_id, size_t wi_id, + uint32_t wg_size); +DEVICE_EXTERN_C void __itt_offload_wi_finish(size_t *group_id, size_t wi_id); +DEVICE_EXTERN_C void __itt_offload_wg_barrier(uintptr_t barrier_id); +DEVICE_EXTERN_C void __itt_offload_wi_resume(size_t *group_id, size_t wi_id); +DEVICE_EXTERN_C void __itt_offload_sync_acquired(uintptr_t sync_id); +DEVICE_EXTERN_C void __itt_offload_sync_releasing(uintptr_t sync_id); +DEVICE_EXTERN_C void __itt_offload_wg_local_range(void *ptr, size_t size); +DEVICE_EXTERN_C void +__itt_offload_atomic_op_start(void *object, __itt_atomic_mem_op_t op_type, + __itt_atomic_mem_order_t mem_order); +DEVICE_EXTERN_C void +__itt_offload_atomic_op_finish(void *object, __itt_atomic_mem_op_t op_type, + __itt_atomic_mem_order_t mem_order); + +#endif // __SPIR__ +#endif // __LIBDEVICE_DEVICE_ITT_H__ diff --git a/libdevice/itt_compiler_wrappers.cpp b/libdevice/itt_compiler_wrappers.cpp new file mode 100644 index 0000000000000..f942cbc69ff97 --- /dev/null +++ b/libdevice/itt_compiler_wrappers.cpp @@ -0,0 +1,60 @@ +//==--- itt_compiler_wrappers.cpp - compiler wrappers for ITT --------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "device_itt.h" + +#ifdef __SPIR__ + +DEVICE_EXTERN_C +void __itt_offload_wi_start_wrapper() { + if (!isITTEnabled()) + return; + + size_t GroupID[3] = {__spirv_BuiltInWorkgroupId.x, + __spirv_BuiltInWorkgroupId.y, + __spirv_BuiltInWorkgroupId.z}; + size_t WIID = __spirv_BuiltInGlobalLinearId; + uint32_t WGSize = static_cast<uint32_t>(__spirv_BuiltInWorkgroupSize.x * + __spirv_BuiltInWorkgroupSize.y * + __spirv_BuiltInWorkgroupSize.z); + __itt_offload_wi_start_stub(GroupID, WIID, WGSize); +} + +DEVICE_EXTERN_C +void __itt_offload_wi_finish_wrapper() { + if (!isITTEnabled()) + return; + + size_t GroupID[3] = {__spirv_BuiltInWorkgroupId.x, + __spirv_BuiltInWorkgroupId.y, + __spirv_BuiltInWorkgroupId.z}; + size_t WIID = __spirv_BuiltInGlobalLinearId; + __itt_offload_wi_finish_stub(GroupID, WIID); +} + +DEVICE_EXTERN_C +void __itt_offload_wg_barrier_wrapper() { + if (!isITTEnabled()) + return; + + __itt_offload_wg_barrier_stub(0); +} + +DEVICE_EXTERN_C +void __itt_offload_wi_resume_wrapper() { + if (!isITTEnabled()) + return; + + size_t GroupID[3] = {__spirv_BuiltInWorkgroupId.x, + __spirv_BuiltInWorkgroupId.y, + __spirv_BuiltInWorkgroupId.z}; + size_t WIID = __spirv_BuiltInGlobalLinearId; + __itt_offload_wi_resume_stub(GroupID, WIID); +} + +#endif // __SPIR__ diff --git a/libdevice/itt_stubs.cpp b/libdevice/itt_stubs.cpp new file mode 100644 index 0000000000000..8eae2bf2d44dc --- /dev/null +++ b/libdevice/itt_stubs.cpp @@ -0,0 +1,38 @@ +//==--- itt_stubs.cpp - stub functions for ITT ----------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "device_itt.h" + +#ifdef __SPIR__ + +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_wi_start_stub(size_t *group_id, size_t wi_id, uint32_t wg_size) {} + +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_wi_finish_stub(size_t *group_id, size_t wi_id) {} + +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_wg_barrier_stub(uintptr_t barrier_id) {} + +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_wi_resume_stub(size_t *group_id, size_t wi_id) {} + +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_sync_acquired_stub(uintptr_t sync_id) {} +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_sync_releasing_stub(uintptr_t sync_id) {} +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_wg_local_range_stub(void *ptr, size_t size) {} +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_atomic_op_start_stub(void *object, __itt_atomic_mem_op_t op_type, + __itt_atomic_mem_order_t mem_order) {} +DEVICE_EXTERN_C ITT_STUB_ATTRIBUTES void +__itt_offload_atomic_op_finish_stub(void *object, __itt_atomic_mem_op_t op_type, + __itt_atomic_mem_order_t mem_order) {} + +#endif // __SPIR__ diff --git a/libdevice/itt_user_wrappers.cpp b/libdevice/itt_user_wrappers.cpp new file mode 100644 index 0000000000000..cedd2865eebb9 --- /dev/null +++ b/libdevice/itt_user_wrappers.cpp @@ -0,0 +1,63 @@ +//==--- itt_user_wrappers.cpp - user visible functions for ITT ------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "device_itt.h" + +#ifdef __SPIR__ + +DEVICE_EXTERN_C void __itt_offload_wi_start(size_t *group_id, size_t wi_id, + uint32_t wg_size) { + if (isITTEnabled()) + __itt_offload_wi_start_stub(group_id, wi_id, wg_size); +} + +DEVICE_EXTERN_C void __itt_offload_wi_finish(size_t *group_id, size_t wi_id) { + if (isITTEnabled()) + __itt_offload_wi_finish_stub(group_id, wi_id); +} + +DEVICE_EXTERN_C void __itt_offload_wg_barrier(uintptr_t barrier_id) { + if (isITTEnabled()) + __itt_offload_wg_barrier_stub(barrier_id); +} + +DEVICE_EXTERN_C void __itt_offload_wi_resume(size_t *group_id, size_t wi_id) { + if (isITTEnabled()) + __itt_offload_wi_resume_stub(group_id, wi_id); +} + +DEVICE_EXTERN_C void __itt_offload_sync_acquired(uintptr_t sync_id) { + if (isITTEnabled()) + __itt_offload_sync_acquired_stub(sync_id); +} + +DEVICE_EXTERN_C void __itt_offload_sync_releasing(uintptr_t sync_id) { + if (isITTEnabled()) + __itt_offload_sync_releasing_stub(sync_id); +} + +DEVICE_EXTERN_C void __itt_offload_wg_local_range(void *ptr, size_t size) { + if (isITTEnabled()) + __itt_offload_wg_local_range_stub(ptr, size); +} + +DEVICE_EXTERN_C void +__itt_offload_atomic_op_start(void *object, __itt_atomic_mem_op_t op_type, + __itt_atomic_mem_order_t mem_order) { + if (isITTEnabled()) + __itt_offload_atomic_op_start_stub(object, op_type, mem_order); +} + +DEVICE_EXTERN_C void +__itt_offload_atomic_op_finish(void *object, __itt_atomic_mem_op_t op_type, + __itt_atomic_mem_order_t mem_order) { + if (isITTEnabled()) + __itt_offload_atomic_op_finish_stub(object, op_type, mem_order); +} + +#endif // __SPIR__ diff --git a/libdevice/spirv_vars.h b/libdevice/spirv_vars.h index 5a1a1d0d96705..eab02e7a860be 100644 --- a/libdevice/spirv_vars.h +++ b/libdevice/spirv_vars.h @@ -16,10 +16,21 @@ #include <cstddef> #include <cstdint> +#define __SPIRV_VAR_QUALIFIERS EXTERN_C 
const typedef size_t size_t_vec __attribute__((ext_vector_type(3))); -extern "C" const size_t_vec __spirv_BuiltInGlobalInvocationId; -extern "C" const size_t_vec __spirv_BuiltInLocalInvocationId; - +__SPIRV_VAR_QUALIFIERS size_t_vec __spirv_BuiltInGlobalInvocationId; +__SPIRV_VAR_QUALIFIERS size_t __spirv_BuiltInGlobalLinearId; +__SPIRV_VAR_QUALIFIERS size_t_vec __spirv_BuiltInLocalInvocationId; +__SPIRV_VAR_QUALIFIERS size_t_vec __spirv_BuiltInWorkgroupId; +__SPIRV_VAR_QUALIFIERS size_t_vec __spirv_BuiltInWorkgroupSize; + +// FIXME: change DEVICE_EXTERNAL to static and rename the functions, +// when #3311 is fixed. +// These are just internal functions used within libdevice. +// We must not intrude on the __spirv "namespace", so we'd better +// use names like getGlobalInvocationIdX. +// Libdevice must not export these APIs either, but it currently +// exports them due to DEVICE_EXTERNAL. DEVICE_EXTERNAL inline size_t __spirv_GlobalInvocationId_x() { return __spirv_BuiltInGlobalInvocationId.x; } diff --git a/sycl/doc/extensions/ITTAnnotations/ITTAnnotations.rst b/sycl/doc/extensions/ITTAnnotations/ITTAnnotations.rst new file mode 100644 index 0000000000000..0b72d138549ce --- /dev/null +++ b/sycl/doc/extensions/ITTAnnotations/ITTAnnotations.rst @@ -0,0 +1,79 @@ +ITT annotations support +======================= + +This extension enables a set of functions implementing +the Instrumentation and Tracing Technology (ITT) functionality +in SYCL device code. + +There are three sets of functions defined by this extension, +and they serve different purposes. + +User APIs +--------- + +The user code calling these functions must include the corresponding header +file(s) provided by the ``ittnotify`` project (TBD: reference ITT repo here). + +These functions are named using the ``__itt_offload_`` prefix. 
+ +Stub APIs +--------- + +These functions are not defined in any header file, and their declarations +follow exactly the declarations of the corresponding user APIs, except that +they have an extra ``_stub`` suffix in their names. + +These functions implement the ITT functionality in a way that allows +the tools, such as Intel(R) Inspector, to recognize the ITT annotations +and run their analysis methods based on that. + +For SYCL device code these functions are implemented as ``noinline`` and +``optnone`` functions so that the corresponding calls may be distinguished +in the execution trace. This is just one way for implementing them, +and the actual implementation may change in the future. + +Compiler wrapper APIs +--------------------- + +These functions are not defined in any header file, and they are supposed +to be called from the compiler generated code. These thin wrappers +just provide a convenient way for compilers to produce ITT annotations +without generating too much code in the compilers' IR. + +These functions have ``_wrapper`` suffix in their names. + +Example +~~~~~~~ + +.. code-block:: c++ + DEVICE_EXTERN_C void __itt_offload_wi_start_stub( + size_t[3], size_t, uint32_t); + + DEVICE_EXTERN_C void __itt_offload_wi_start_wrapper() { + if (__spirv_SpecConstant(0xFF747469, 0)) { + size_t GroupID[3] = ...; + size_t WIId = ...; + uint32_t WGSize = ...; + __itt_offload_wi_start_stub(GroupID, WIId, WGSize); + } + } + +A compiler may generate a simple call to ``__itt_offload_wi_start_wrapper`` +to annotate a kernel entry point. Compare this to the code inside the wrapper +function, which a compiler would have to generate if there were no such +wrapper. + +Conditional compilation +----------------------- + +To minimize the effect of ITT annotations on the performance of the device code, +the implementation is guarded with a specialization constant check. 
This allows +users and tools to have one version of the annotated code that may be built +with and without ITT annotations "enabled". When the ITT annotations are not +enabled, we expect that the overall effect of the annotations will be minimized +by the dead code elimination optimization(s) made by the device compilers. + +For this purpose we reserve a 1-byte specialization constant numbered +``4285822057`` (``0xFF747469``). The users/tools/runtimes should set this +specialization constant to a non-zero value to enable the ITT annotations +in SYCL device code. diff --git a/sycl/doc/extensions/README.md b/sycl/doc/extensions/README.md index 1db502a579f93..e4b4a7bdb52be 100755 --- a/sycl/doc/extensions/README.md +++ b/sycl/doc/extensions/README.md @@ -36,6 +36,7 @@ DPC++ extensions status: | [Unified Shared Memory](USM/USM.adoc) | Supported(OpenCL) | | | [Use Pinned Memory Property](UsePinnedMemoryProperty/UsePinnedMemoryPropery.adoc) | Supported | | | [Level-Zero backend specification](LevelZeroBackend/LevelZeroBackend.md) | Supported | | +| [ITT annotations support](ITTAnnotations/ITTAnnotations.rst) | Supported | | Legend: