From acc7ae67909d69b143877c1b328472ca995b5eb2 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 21 Apr 2020 11:36:29 -0700 Subject: [PATCH 1/6] [SYCL][Doc] Update max_sub_group_size query Signed-off-by: John Pennycook --- .../SubGroup/SYCL_INTEL_sub_group.asciidoc | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc b/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc index 546bed21a0187..d291f0b6a21d9 100755 --- a/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc +++ b/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc @@ -118,26 +118,32 @@ An additional query for sub-group information is added to the +kernel+ class: |+template typename info::param_traits::return_type get_sub_group_info(const device &dev) const+ |Query information from the sub-group from a kernel using the +info::kernel_sub_group+ descriptor for a specific device. +|+template typename info::param_traits::return_type get_sub_group_info(const device &dev, typename info::param_traits::input_type value) const+ +|Query information from the sub-group from a kernel using the +info::kernel_sub_group+ descriptor for a specific device and input parameter. The expected value of the input parameter depends on the information being queried. |=== The kernel descriptors below are added as part of a new +info::kernel_sub_group+ enumeration class: |=== -|Kernel Descriptors|Return Type|Description +|Kernel Descriptors|Input Type|Return Type|Description |+info::kernel_sub_group::max_num_sub_groups+ +|N/A |+uint32_t+ |Returns the maximum number of sub-groups for this kernel. |+info::kernel_sub_group::compile_num_sub_groups+ +|N/A |+uint32_t+ |Returns the number of sub-groups specified by the kernel, or 0 (if not specified). |+info::kernel_sub_group::max_sub_group_size+ +|+range+ |+uint32_t+ -|Returns the maximum sub-group size for this kernel. 
+|Returns the maximum sub-group size for this kernel launched with the specified work-group size. |+info::kernel_sub_group::compile_sub_group_size+ +|N/A |+uint32_t+ |Returns the required sub-group size specified by the kernel, or 0 (if not specified). |=== @@ -230,7 +236,12 @@ struct sub_group { == Issues -None. +. Should sub-group query results for specific kernels depend on work-group size? ++ +-- +*RESOLVED*: +Yes, this is required by OpenCL devices. Devices that do not require the work-group size can ignore the parameter. +-- //. asd //+ @@ -247,6 +258,7 @@ None. |Rev|Date|Author|Changes |1|2019-04-19|John Pennycook|*Initial public working draft* |2|2020-03-16|John Pennycook|*Separate class definition from algorithms* +|3|2020-04-21|John Pennycook|*Update max_sub_group_size query* |======================================== //************************************************************************ From 66bf6307d97950f99f1fed89e3b4b75dca541fba Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 21 Apr 2020 11:41:16 -0700 Subject: [PATCH 2/6] [SYCL][Doc] Restore missing barrier function Signed-off-by: John Pennycook --- .../SubGroup/SYCL_INTEL_sub_group.asciidoc | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc b/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc index d291f0b6a21d9..01060e4518808 100755 --- a/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc +++ b/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc @@ -161,7 +161,9 @@ To provide access to the +sub_group+ class, a new member function is added to th |Return the sub-group to which the work-item belongs. |=== -The member functions of the sub-group class provide a mechanism for a developer to query properties of a sub-group and a work-item's position in it. 
+==== Core Member Functions + +The core member functions of the sub-group class provide a mechanism for a developer to query properties of a sub-group and a work-item's position in it. |=== |Member Functions|Description @@ -205,6 +207,17 @@ parallel_for(..., [&](nd_item item) }); ---- +==== Synchronization Functions + +A sub-group barrier synchronizes all work-items in a sub-group, and orders memory operations to the specified address space(s). + +|=== +|Member Functions|Description + +|+void barrier(access::fence_space accessSpace = access::fence_space::global_and_local) const+ +|Execute a sub-group barrier with an optional memory fence specified by _accessSpace_. +|=== + ==== Sample Header [source, c++] @@ -228,6 +241,8 @@ struct sub_group { linear_id_type get_group_linear_id() const; range_type get_group_range() const; + void barrier(access::fence_space accessSpace = access::fence_space::global_and_local) const; + }; } // intel } // sycl @@ -259,6 +274,7 @@ Yes, this is required by OpenCL devices. 
Devices that do not require the work-g |1|2019-04-19|John Pennycook|*Initial public working draft* |2|2020-03-16|John Pennycook|*Separate class definition from algorithms* |3|2020-04-21|John Pennycook|*Update max_sub_group_size query* +|4|2020-04-21|John Pennycook|*Restore missing barrier function* |======================================== //************************************************************************ From e50761b6bc92ef8079cb96cbef7ea6bd2c970649 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 21 Apr 2020 11:49:54 -0700 Subject: [PATCH 3/6] [SYCL][Doc] Restore sub-group shuffles as members Signed-off-by: John Pennycook --- .../SubGroup/SYCL_INTEL_sub_group.asciidoc | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc b/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc index 01060e4518808..fd22324832e89 100755 --- a/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc +++ b/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc @@ -218,6 +218,26 @@ A sub-group barrier synchronizes all work-items in a sub-group, and orders memor |Execute a sub-group barrier with an optional memory fence specified by _accessSpace_. |=== +==== Shuffles + +The shuffle sub-group functions perform arbitrary communication between pairs of work-items in a sub-group. Common patterns -- such as shifting all values in a sub-group by a fixed number of work-items -- are exposed as specialized shuffles that may be accelerated in hardware. + +|=== +|Member Functions|Description + +|+template T shuffle(T x, id<1> local_id) const+ +|Exchange values of _x_ between work-items in the sub-group in an arbitrary pattern. Returns the value of _x_ from the work-item with the specified id. The value of _local_id_ must be between 0 and the sub-group size. + +|+template T shuffle_down(T x, uint32_t delta) const+ +|Exchange values of _x_ between work-items in the sub-group via a shift. 
Returns the value of _x_ from the work-item whose id is _delta_ larger than the calling work-item. The value returned when the result of id + _delta_ is greater than or equal to the sub-group size is undefined. + +|+template T shuffle_up(T x, uint32_t delta) const+ +|Exchange values of _x_ between work-items in the sub-group via a shift. Returns the value of _x_ from the work-item whose id is _delta_ smaller than the calling work-item. The value returned when the result of id - _delta_ is less than zero is undefined. + +|+template T shuffle_xor(T x, id<1> mask) const+ +|Exchange pairs of values of _x_ between work-items in the sub-group. Returns the value of _x_ from the work-item whose id is equal to the exclusive-or of the calling work-item's id and _mask_. _mask_ must be a compile-time constant value that is the same for all work-items in the sub-group. +|=== + ==== Sample Header [source, c++] @@ -243,6 +263,18 @@ struct sub_group { void barrier(access::fence_space accessSpace = access::fence_space::global_and_local) const; + template + T shuffle(T x, id<1> local_id) const; + + template + T shuffle_down(T x, uint32_t delta) const; + + template + T shuffle_up(T x, uint32_t delta) const; + + template + T shuffle_xor(T x, id<1> mask) const; + }; } // intel } // sycl @@ -258,6 +290,13 @@ struct sub_group { Yes, this is required by OpenCL devices. Devices that do not require the work-group size can ignore the parameter. -- +. Should sub-group "shuffles" be member functions? ++ +-- +*RESOLVED*: +Yes, the four shuffles in this extension are a defining feature of sub-groups. Higher-level algorithms (such as those in the +SubGroupAlgorithms+ proposal) may build on them, the same way as higher-level algorithms using work-groups build on work-group local memory. +-- + //. asd //+ //-- @@ -275,6 +314,7 @@ Yes, this is required by OpenCL devices. 
Devices that do not require the work-g |2|2020-03-16|John Pennycook|*Separate class definition from algorithms* |3|2020-04-21|John Pennycook|*Update max_sub_group_size query* |4|2020-04-21|John Pennycook|*Restore missing barrier function* +|5|2020-04-21|John Pennycook|*Restore sub-group shuffles as member functions* |======================================== //************************************************************************ From 450eb9c1002b9d454b2a1c7c309e87a0ef1584bd Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Wed, 22 Apr 2020 07:18:15 -0700 Subject: [PATCH 4/6] [SYCL][Doc] Align sub-groups with other extensions Sub-group queries build on SYCL_device_specific_kernel_queries. Signed-off-by: John Pennycook --- .../SubGroup/SYCL_INTEL_sub_group.asciidoc | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc b/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc index fd22324832e89..3399310730b1d 100755 --- a/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc +++ b/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc @@ -51,7 +51,7 @@ John Pennycook, Intel (john 'dot' pennycook 'at' intel 'dot' com) == Dependencies -This extension is written against the SYCL 1.2.1 specification, Revision 6. +This extension is written against the SYCL 1.2.1 specification, Revision 6 and the SYCL_INTEL_device_specific_kernel_queries extension. == Overview @@ -111,38 +111,36 @@ The device descriptors below are added to the +info::device+ enumeration class: |Returns a vector_class of +size_t+ containing the set of sub-group sizes supported by the device. 
|=== -An additional query for sub-group information is added to the +kernel+ class: +An additional query is added to the +kernel+ class, enabling an input value to be passed to `get_info`: |=== |Member Functions|Description -|+template typename info::param_traits::return_type get_sub_group_info(const device &dev) const+ -|Query information from the sub-group from a kernel using the +info::kernel_sub_group+ descriptor for a specific device. -|+template typename info::param_traits::return_type get_sub_group_info(const device &dev, typename info::param_traits::input_type value) const+ -|Query information from the sub-group from a kernel using the +info::kernel_sub_group+ descriptor for a specific device and input parameter. The expected value of the input parameter depends on the information being queried. +|+template typename info::param_traits::return_type get_info(const device &dev, typename info::param_traits::input_type value) const+ +|Query information from a kernel using the +info::kernel_device_specific+ descriptor for a specific device and input parameter. The expected value of the input parameter depends on the information being queried. |=== -The kernel descriptors below are added as part of a new +info::kernel_sub_group+ enumeration class: +The kernel descriptors below are added to the +info::kernel_device_specific+ enumeration class: |=== |Kernel Descriptors|Input Type|Return Type|Description -|+info::kernel_sub_group::max_num_sub_groups+ +|+info::kernel_device_specific::max_num_sub_groups+ |N/A |+uint32_t+ |Returns the maximum number of sub-groups for this kernel. -|+info::kernel_sub_group::compile_num_sub_groups+ +|+info::kernel_device_specific::compile_num_sub_groups+ |N/A |+uint32_t+ |Returns the number of sub-groups specified by the kernel, or 0 (if not specified). 
-|+info::kernel_sub_group::max_sub_group_size+ +|+info::kernel_device_specific::max_sub_group_size+ |+range+ |+uint32_t+ |Returns the maximum sub-group size for this kernel launched with the specified work-group size. -|+info::kernel_sub_group::compile_sub_group_size+ +|+info::kernel_device_specific::compile_sub_group_size+ |N/A |+uint32_t+ |Returns the required sub-group size specified by the kernel, or 0 (if not specified). @@ -315,6 +313,7 @@ Yes, the four shuffles in this extension are a defining feature of sub-groups. |3|2020-04-21|John Pennycook|*Update max_sub_group_size query* |4|2020-04-21|John Pennycook|*Restore missing barrier function* |5|2020-04-21|John Pennycook|*Restore sub-group shuffles as member functions* +|6|2020-04-22|John Pennycook|*Align with SYCL_INTEL_device_specific_kernel_queries* |======================================== //************************************************************************ From 35f47936454d349a7dc31e96bfa68f8db7bc4f7a Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 28 Apr 2020 07:27:40 -0700 Subject: [PATCH 5/6] [SYCL][Doc] Clarify usage of two get_info calls Signed-off-by: John Pennycook --- sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc b/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc index 3399310730b1d..bf115392d206f 100755 --- a/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc +++ b/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc @@ -111,7 +111,7 @@ The device descriptors below are added to the +info::device+ enumeration class: |Returns a vector_class of +size_t+ containing the set of sub-group sizes supported by the device. |=== -An additional query is added to the +kernel+ class, enabling an input value to be passed to `get_info`: +An additional query is added to the +kernel+ class, enabling an input value to be passed to `get_info`. 
The original `get_info` query from the SYCL_INTEL_device_specific_kernel_queries extension should be used for queries that do not specify an input type. |=== |Member Functions|Description From 7f44b71abeeb6dc032243862fe006643c3e9ccdd Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Thu, 18 Jun 2020 08:28:20 -0700 Subject: [PATCH 6/6] [SYCL] Remove fence_space from sub-group barrier Incompatible with memory model introduced by ExtendedAtomics extension. Signed-off-by: John Pennycook --- .../doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc b/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc index bf115392d206f..7b36b219062e3 100755 --- a/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc +++ b/sycl/doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc @@ -207,13 +207,13 @@ parallel_for(..., [&](nd_item item) ==== Synchronization Functions -A sub-group barrier synchronizes all work-items in a sub-group, and orders memory operations to the specified address space(s). +A sub-group barrier synchronizes all work-items in a sub-group, and orders memory operations with a memory fence to all address spaces. |=== |Member Functions|Description -|+void barrier(access::fence_space accessSpace = access::fence_space::global_and_local) const+ -|Execute a sub-group barrier with an optional memory fence specified by _accessSpace_. +|+void barrier() const+ +|Execute a sub-group barrier. |=== ==== Shuffles @@ -259,7 +259,7 @@ struct sub_group { linear_id_type get_group_linear_id() const; range_type get_group_range() const; - void barrier(access::fence_space accessSpace = access::fence_space::global_and_local) const; + void barrier() const; template T shuffle(T x, id<1> local_id) const;