diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp index 420ec280cffeb..d27342a16c930 100644 --- a/sycl/include/CL/sycl/detail/cg.hpp +++ b/sycl/include/CL/sycl/detail/cg.hpp @@ -36,18 +36,119 @@ namespace sycl { // Forward declarations class queue; -namespace detail { -class queue_impl; -} // namespace detail namespace detail { +// Periodically there is a need to extend handler and CG classes to hold more +// data(members) than it has now. But any modification of the layout of those +// classes is an ABI break. To have an ability to have more data the following +// approach is implemented: +// +// Those classes have a member - MSharedPtrStorage which is an std::vector of +// std::shared_ptr's and is supposed to hold reference counters of user +// provided shared_ptr's. +// +// The first element of this vector is reused to store a vector of additional +// members handler and CG need to have. +// +// These additional arguments are represented using "ExtendedMemberT" structure +// which has a pointer to an arbitrary value and an integer which is used to +// understand how the value the pointer points to should be interpreted. +// +// ======== ======== ======== +// | | | | ... | | std::vector> +// ======== ======== ======== +// || || || +// || \/ \/ +// || user user +// || data data +// \/ +// ======== ======== ======== +// | Type | | Type | ... | Type | std::vector +// | | | | | | +// | Ptr | | Ptr | ... | Ptr | +// ======== ======== ======== +// +// Prior to this change this vector was supposed to have user's values only, so +// it is not legal to expect that the first argument is a special one. +// Versioning is implemented to overcome this problem - if the first element of +// the MSharedPtrStorage is a pointer to the special vector then CGType value +// has version "1" encoded. +// +// The version of CG type is encoded in the highest byte of the value: +// +// 0x00000001 - CG type KERNEL version 0 +// 0x01000001 - CG type KERNEL version 1 +// /\ +// || +// The byte specifies the version +// +// A user of this vector should not expect that a specific data is stored at a +// specific position, but iterate over all looking for an ExtendedMemberT value +// with the desired type. +// This allows changing/extending the contents of this vector without changing +// the version. +// + +// Used to represent a type of an extended member +enum class ExtendedMembersType : unsigned int { + HANDLER_KERNEL_BUNDLE = 0, +}; + +// Holds a pointer to an object of an arbitrary type and an ID value which +// should be used to understand what type pointer points to. +// Used as to extend handler class without introducing new class members which +// would change handler layout. +struct ExtendedMemberT { + ExtendedMembersType MType; + std::shared_ptr MData; +}; + +static std::shared_ptr> +convertToExtendedMembers(const std::shared_ptr &SPtr) { + return std::const_pointer_cast>( + std::static_pointer_cast>(SPtr)); +} + class stream_impl; +class queue_impl; +class kernel_bundle_impl; + +// The constant is used to left shift a CG type value to access it's version +constexpr unsigned int ShiftBitsForVersion = 24; + +// Constructs versioned type +constexpr unsigned int getVersionedCGType(unsigned int Type, + unsigned char Version) { + return Type | (static_cast(Version) << ShiftBitsForVersion); +} + +// Returns the type without version encoded +constexpr unsigned char getUnversionedCGType(unsigned int Type) { + unsigned int Mask = -1; + Mask >>= (sizeof(Mask) * 8 - ShiftBitsForVersion); + return Type & Mask; +} + +// Returns the version encoded to the type +constexpr unsigned char getCGTypeVersion(unsigned int Type) { + return Type >> ShiftBitsForVersion; +} + /// Base class for all types of command groups. class CG { public: + // Used to version CG and handler classes. Using unsigned char as the version + // is encoded in the highest byte of CGType value. So it is not possible to + // encode a value > 255 anyway which should be big enough room for version + // bumping. + enum class CG_VERSION : unsigned char { + V0 = 0, + V1 = 1, + }; + /// Type of the command group. - enum CGTYPE { + enum CGTYPE : unsigned int { NONE = 0, KERNEL = 1, COPY_ACC_TO_PTR = 2, @@ -62,7 +163,9 @@ class CG { FILL_USM = 11, PREFETCH_USM = 12, CODEPLAY_INTEROP_TASK = 13, - CODEPLAY_HOST_TASK = 14 + CODEPLAY_HOST_TASK = 14, + KERNEL_V1 = + getVersionedCGType(KERNEL, static_cast(CG_VERSION::V1)), }; CG(CGTYPE Type, vector_class> ArgsStorage, @@ -87,7 +190,17 @@ class CG { CG(CG &&CommandGroup) = default; - CGTYPE getType() { return MType; } + CGTYPE getType() { return static_cast(getUnversionedCGType(MType)); } + + std::shared_ptr> getExtendedMembers() { + if (getCGTypeVersion(MType) == static_cast(CG_VERSION::V0) || + MSharedPtrStorage.empty()) + return nullptr; + + // The first value in shared_ptr storage is supposed to store a vector of + // extended members. + return convertToExtendedMembers(MSharedPtrStorage[0]); + } virtual ~CG() = default; @@ -146,7 +259,8 @@ class CGExecKernel : public CG { MSyclKernel(std::move(SyclKernel)), MArgs(std::move(Args)), MKernelName(std::move(KernelName)), MOSModuleHandle(OSModuleHandle), MStreams(std::move(Streams)) { - assert((getType() == RUN_ON_HOST_INTEL || getType() == KERNEL) && + assert((getType() == RUN_ON_HOST_INTEL || getType() == KERNEL || + getType() == KERNEL_V1) && "Wrong type of exec kernel CG."); } @@ -155,6 +269,19 @@ class CGExecKernel : public CG { vector_class> getStreams() const { return MStreams; } + + std::shared_ptr getKernelBundle() { + const std::shared_ptr> &ExtendedMembers = + getExtendedMembers(); + if (!ExtendedMembers) + return nullptr; + for (const ExtendedMemberT &EMember : *ExtendedMembers) + if (ExtendedMembersType::HANDLER_KERNEL_BUNDLE == EMember.MType) + return std::static_pointer_cast( + EMember.MData); + return nullptr; + } + void clearStreams() { MStreams.clear(); } }; diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index ef60964e3d332..c11235a90fe91 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -313,8 +314,7 @@ class __SYCL_EXPORT handler { /// /// \param Queue is a SYCL queue. /// \param IsHost indicates if this handler is created for SYCL host device. - handler(shared_ptr_class Queue, bool IsHost) - : MQueue(std::move(Queue)), MIsHost(IsHost) {} + handler(shared_ptr_class Queue, bool IsHost); /// Stores copy of Arg passed to the MArgsStorage. template ( std::move(Wrapper)); - MCGType = detail::CG::KERNEL; + MCGType = detail::CG::KERNEL_V1; #endif } else #endif // !SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING && \ @@ -834,7 +834,7 @@ class __SYCL_EXPORT handler { MNDRDesc.set(std::move(NumWorkItems)); StoreLambda( std::move(KernelFunc)); - MCGType = detail::CG::KERNEL; + MCGType = detail::CG::KERNEL_V1; #endif } } @@ -853,7 +853,7 @@ class __SYCL_EXPORT handler { MKernel = detail::getSyclObjImpl(std::move(Kernel)); detail::checkValueRange(NumWorkItems); MNDRDesc.set(std::move(NumWorkItems)); - MCGType = detail::CG::KERNEL; + MCGType = detail::CG::KERNEL_V1; extractArgsAndReqs(); MKernelName = getKernelName(); } @@ -898,12 +898,50 @@ class __SYCL_EXPORT handler { #endif + std::shared_ptr + getOrInsertHandlerKernelBundle(bool Insert) const; + + void setHandlerKernelBundle( + const std::shared_ptr &NewKernelBundleImpPtr); + public: handler(const handler &) = delete; handler(handler &&) = delete; handler &operator=(const handler &) = delete; handler &operator=(handler &&) = delete; +#if __cplusplus > 201402L + template + void set_specialization_constant( + typename std::remove_reference_t::value_type Value) { + + std::shared_ptr KernelBundleImplPtr = + getOrInsertHandlerKernelBundle(/*Insert=*/true); + + detail::createSyclObjFromImpl>( + KernelBundleImplPtr) + .set_specialization_constant(Value); + } + + template + typename std::remove_reference_t::type + get_specialization_constant() const { + + std::shared_ptr KernelBundleImplPtr = + getOrInsertHandlerKernelBundle(/*Insert=*/true); + + return detail::createSyclObjFromImpl>( + KernelBundleImplPtr) + .get_specialization_constant(); + } + +#endif + + void + use_kernel_bundle(const kernel_bundle &ExecBundle) { + setHandlerKernelBundle(detail::getSyclObjImpl(ExecBundle)); + } + /// Requires access to the memory object associated with the placeholder /// accessor. /// @@ -1011,7 +1049,7 @@ class __SYCL_EXPORT handler { MNDRDesc.set(range<1>{1}); StoreLambda(KernelFunc); - MCGType = detail::CG::KERNEL; + MCGType = detail::CG::KERNEL_V1; #endif } @@ -1123,7 +1161,7 @@ class __SYCL_EXPORT handler { detail::checkValueRange(NumWorkItems, WorkItemOffset); MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset)); StoreLambda(std::move(KernelFunc)); - MCGType = detail::CG::KERNEL; + MCGType = detail::CG::KERNEL_V1; #endif } @@ -1155,7 +1193,7 @@ class __SYCL_EXPORT handler { detail::checkValueRange(ExecutionRange); MNDRDesc.set(std::move(ExecutionRange)); StoreLambda(std::move(KernelFunc)); - MCGType = detail::CG::KERNEL; + MCGType = detail::CG::KERNEL_V1; #endif } @@ -1376,7 +1414,7 @@ class __SYCL_EXPORT handler { detail::checkValueRange(NumWorkGroups); MNDRDesc.setNumWorkGroups(NumWorkGroups); StoreLambda(std::move(KernelFunc)); - MCGType = detail::CG::KERNEL; + MCGType = detail::CG::KERNEL_V1; #endif // __SYCL_DEVICE_ONLY__ } @@ -1412,7 +1450,7 @@ class __SYCL_EXPORT handler { detail::checkValueRange(ExecRange); MNDRDesc.set(std::move(ExecRange)); StoreLambda(std::move(KernelFunc)); - MCGType = detail::CG::KERNEL; + MCGType = detail::CG::KERNEL_V1; #endif // __SYCL_DEVICE_ONLY__ } @@ -1429,7 +1467,7 @@ class __SYCL_EXPORT handler { // known constant MNDRDesc.set(range<1>{1}); MKernel = detail::getSyclObjImpl(std::move(Kernel)); - MCGType = detail::CG::KERNEL; + MCGType = detail::CG::KERNEL_V1; extractArgsAndReqs(); MKernelName = getKernelName(); } @@ -1462,7 +1500,7 @@ class __SYCL_EXPORT handler { MKernel = detail::getSyclObjImpl(std::move(Kernel)); detail::checkValueRange(NumWorkItems, WorkItemOffset); MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset)); - MCGType = detail::CG::KERNEL; + MCGType = detail::CG::KERNEL_V1; extractArgsAndReqs(); MKernelName = getKernelName(); } @@ -1481,7 +1519,7 @@ class __SYCL_EXPORT handler { MKernel = detail::getSyclObjImpl(std::move(Kernel)); detail::checkValueRange(NDRange); MNDRDesc.set(std::move(NDRange)); - MCGType = detail::CG::KERNEL; + MCGType = detail::CG::KERNEL_V1; extractArgsAndReqs(); MKernelName = getKernelName(); } @@ -1505,7 +1543,7 @@ class __SYCL_EXPORT handler { // known constant MNDRDesc.set(range<1>{1}); MKernel = detail::getSyclObjImpl(std::move(Kernel)); - MCGType = detail::CG::KERNEL; + MCGType = detail::CG::KERNEL_V1; if (!MIsHost && !lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); @@ -1546,7 +1584,7 @@ class __SYCL_EXPORT handler { detail::checkValueRange(NumWorkItems); MNDRDesc.set(std::move(NumWorkItems)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); - MCGType = detail::CG::KERNEL; + MCGType = detail::CG::KERNEL_V1; if (!MIsHost && !lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); @@ -1582,7 +1620,7 @@ class __SYCL_EXPORT handler { detail::checkValueRange(NumWorkItems, WorkItemOffset); MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); - MCGType = detail::CG::KERNEL; + MCGType = detail::CG::KERNEL_V1; if (!MIsHost && !lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); @@ -1618,7 +1656,7 @@ class __SYCL_EXPORT handler { detail::checkValueRange(NDRange); MNDRDesc.set(std::move(NDRange)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); - MCGType = detail::CG::KERNEL; + MCGType = detail::CG::KERNEL_V1; if (!MIsHost && !lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); @@ -1659,7 +1697,7 @@ class __SYCL_EXPORT handler { MNDRDesc.setNumWorkGroups(NumWorkGroups); MKernel = detail::getSyclObjImpl(std::move(Kernel)); StoreLambda(std::move(KernelFunc)); - MCGType = detail::CG::KERNEL; + MCGType = detail::CG::KERNEL_V1; #endif // __SYCL_DEVICE_ONLY__ } @@ -1700,7 +1738,7 @@ class __SYCL_EXPORT handler { MNDRDesc.set(std::move(ExecRange)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); StoreLambda(std::move(KernelFunc)); - MCGType = detail::CG::KERNEL; + MCGType = detail::CG::KERNEL_V1; #endif // __SYCL_DEVICE_ONLY__ } @@ -2024,7 +2062,7 @@ class __SYCL_EXPORT handler { vector_class MAccStorage; vector_class MLocalAccStorage; vector_class> MStreamStorage; - vector_class> MSharedPtrStorage; + mutable vector_class> MSharedPtrStorage; /// The list of arguments for the kernel. vector_class MArgs; /// The list of associated accessors with this handler. diff --git a/sycl/include/CL/sycl/kernel_bundle.hpp b/sycl/include/CL/sycl/kernel_bundle.hpp index ab10d2179e359..41ab9c3405899 100644 --- a/sycl/include/CL/sycl/kernel_bundle.hpp +++ b/sycl/include/CL/sycl/kernel_bundle.hpp @@ -158,22 +158,6 @@ class __SYCL_EXPORT kernel_bundle_plain { bool native_specialization_constant() const noexcept; protected: - // \returns true if the kernel_bundle has the specialization constant with - // specified ID - bool has_specialization_constant(unsigned int SpecID) const noexcept; - - // Sets the specialization constant with specified ID to the value pointed by - // Value + ValueSize - void set_specialization_constant_raw_value(unsigned int SpecID, - const void *Value, - size_t ValueSize); - - // \returns pointer to the value of the specialization constant with specified - // ID - void get_specialization_constant_raw_value(unsigned int SpecID, - void *ValueRet, - size_t ValueSize) const; - // \returns a kernel object which represents the kernel identified by // kernel_id passed kernel get_kernel(const kernel_id &KernelID) const; @@ -263,9 +247,9 @@ class kernel_bundle : public detail::kernel_bundle_plain { /// \returns true if any device image in the kernel_bundle uses specialization /// constant whose address is SpecName template bool has_specialization_constant() const noexcept { - assert(false && "has_specialization_constant is not implemented yet"); - unsigned int SpecID = 0; // TODO: Convert SpecName to a numeric ID - return kernel_bundle_plain::has_specialization_constant(SpecID); + throw sycl::runtime_error( + "kernel_bundle::has_specialization_constant is not implemented yet", + PI_INVALID_OPERATION); } /// Sets the value of the specialization constant whose address is SpecName @@ -274,24 +258,21 @@ class kernel_bundle : public detail::kernel_bundle_plain { template > void set_specialization_constant( - typename std::remove_reference_t::type Value) { - assert(false && "set_specialization_constant is not implemented yet"); - unsigned int SpecID = 0; // TODO: Convert SpecName to a numeric ID - return kernel_bundle_plain::set_specialization_constant_raw_value( - SpecID, &Value, sizeof(Value)); + typename std::remove_reference_t::value_type Value) { + (void)Value; + throw sycl::runtime_error( + "kernel_bundle::set_specialization_constant is not implemented yet", + PI_INVALID_OPERATION); } /// The value of the specialization constant whose address is SpecName for /// this kernel bundle. template - typename std::remove_reference_t::type + typename std::remove_reference_t::value_type get_specialization_constant() const { - assert(false && "get_specialization_constant is not implemented yet"); - unsigned int SpecID = 0; // TODO: Convert SpecName to a numeric ID - typename std::remove_reference_t::type Value; - kernel_bundle_plain::get_specialization_constant_raw_value( - SpecID, (void *)&Value, sizeof(Value)); - return Value; + throw sycl::runtime_error( + "kernel_bundle::get_specialization_constant is not implemented yet", + PI_INVALID_OPERATION); } #endif @@ -352,6 +333,7 @@ kernel_bundle get_kernel_bundle(const context &Ctx, const std::vector &Devs) { detail::KernelBundleImplPtr Impl = detail::get_kernel_bundle_impl(Ctx, Devs, State); + return detail::createSyclObjFromImpl>(Impl); } @@ -430,7 +412,6 @@ kernel_bundle get_kernel_bundle(const context &Ctx, detail::createSyclObjFromImpl>(DevImg)); }; - std::vector EmptyKernelIDs; detail::KernelBundleImplPtr Impl = detail::get_kernel_bundle_impl(Ctx, Devs, State, SelectorWrapper); @@ -448,12 +429,14 @@ kernel_bundle get_kernel_bundle(const context &Ctx, SelectorT Selector) { namespace detail { -bool has_kernel_bundle_impl(const context &Ctx, const std::vector &Devs, - bundle_state State); +__SYCL_EXPORT bool has_kernel_bundle_impl(const context &Ctx, + const std::vector &Devs, + bundle_state State); -bool has_kernel_bundle_impl(const context &Ctx, const std::vector &Devs, - const std::vector &kernelIds, - bundle_state State); +__SYCL_EXPORT bool +has_kernel_bundle_impl(const context &Ctx, const std::vector &Devs, + const std::vector &kernelIds, + bundle_state State); } // namespace detail /// \returns true if the following is true: @@ -515,9 +498,8 @@ template bool is_compatible(const device &Dev) { namespace detail { -std::shared_ptr +__SYCL_EXPORT std::shared_ptr join_impl(const std::vector &Bundles); - } /// \returns a new kernel bundle that represents the union of all the device @@ -525,9 +507,10 @@ join_impl(const std::vector &Bundles); template sycl::kernel_bundle join(const std::vector> &Bundles) { + // Convert kernel_bundle to impls to abstract template parameter away std::vector KernelBundleImpls; KernelBundleImpls.reserve(Bundles.size()); - for (sycl::kernel_bundle &Bundle : Bundles) + for (const sycl::kernel_bundle &Bundle : Bundles) KernelBundleImpls.push_back(detail::getSyclObjImpl(Bundle)); std::shared_ptr Impl = @@ -541,7 +524,7 @@ join(const std::vector> &Bundles) { namespace detail { -std::shared_ptr +__SYCL_EXPORT std::shared_ptr compile_impl(const kernel_bundle &InputBundle, const std::vector &Devs, const property_list &PropList); } @@ -572,6 +555,10 @@ compile(const kernel_bundle &InputBundle, namespace detail { std::vector find_device_intersection( const std::vector> &ObjectBundles); + +__SYCL_EXPORT std::shared_ptr +link_impl(const std::vector> &ObjectBundles, + const std::vector &Devs, const property_list &PropList); } /// \returns a new kernel_bundle which contains the device images from the @@ -579,9 +566,14 @@ std::vector find_device_intersection( /// state bundle_state::executable The new bundle represents all of the kernels /// in ObjectBundles that are compatible with at least one of the devices in /// Devs. -kernel_bundle +inline kernel_bundle link(const std::vector> &ObjectBundles, - const std::vector &Devs, const property_list &PropList = {}); + const std::vector &Devs, const property_list &PropList = {}) { + detail::KernelBundleImplPtr Impl = + detail::link_impl(ObjectBundles, Devs, PropList); + return detail::createSyclObjFromImpl< + kernel_bundle>(Impl); +} inline kernel_bundle link(const kernel_bundle &ObjectBundle, @@ -609,13 +601,24 @@ link(const kernel_bundle &ObjectBundle, // build API ///////////////////////// +namespace detail { +__SYCL_EXPORT std::shared_ptr +build_impl(const kernel_bundle &InputBundle, + const std::vector &Devs, const property_list &PropList); +} + /// \returns a new kernel_bundle which contains device images that are /// translated into one ore more new device images of state /// bundle_state::executable. The new bundle represents all of the kernels in /// InputBundle that are compatible with at least one of the devices in Devs. -kernel_bundle +inline kernel_bundle build(const kernel_bundle &InputBundle, - const std::vector &Devs, const property_list &PropList = {}); + const std::vector &Devs, const property_list &PropList = {}) { + detail::KernelBundleImplPtr Impl = + detail::build_impl(InputBundle, Devs, PropList); + return detail::createSyclObjFromImpl< + kernel_bundle>(Impl); +} inline kernel_bundle build(const kernel_bundle &InputBundle, diff --git a/sycl/source/detail/device_image_impl.hpp b/sycl/source/detail/device_image_impl.hpp index 931710765b433..dbfc545bcea3d 100644 --- a/sycl/source/detail/device_image_impl.hpp +++ b/sycl/source/detail/device_image_impl.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -28,41 +29,17 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { -// Used for sorting vector of kernel_id's -struct LessByNameComp { - bool operator()(const sycl::kernel_id &LHS, const sycl::kernel_id &RHS) { - return std::strcmp(LHS.get_name(), RHS.get_name()) < 0; - } -}; - // The class is impl counterpart for sycl::device_image // It can represent a program in different states, kernel_id's it has and state // of specialization constants for it class device_image_impl { public: - device_image_impl(RTDeviceBinaryImage *BinImage, context Context, - std::vector Devices, bundle_state State) + device_image_impl(const RTDeviceBinaryImage *BinImage, context Context, + std::vector Devices, bundle_state State, + std::vector KernelIDs, RT::PiProgram Program) : MBinImage(BinImage), MContext(std::move(Context)), - MDevices(std::move(Devices)), MState(State) { - - // Collect kernel names for the image - pi_device_binary DevBin = - const_cast(&MBinImage->getRawData()); - for (_pi_offload_entry EntriesIt = DevBin->EntriesBegin; - EntriesIt != DevBin->EntriesEnd; ++EntriesIt) { - - std::shared_ptr KernleIDImpl = - std::make_shared(EntriesIt->name); - - sycl::kernel_id KernelID = - detail::createSyclObjFromImpl(KernleIDImpl); - - // Insert new element keeping MKernelIDs sorted. - auto It = std::lower_bound(MKernelIDs.begin(), MKernelIDs.end(), KernelID, - LessByNameComp{}); - MKernelIDs.insert(It, std::move(KernelID)); - } - } + MDevices(std::move(Devices)), MState(State), MProgram(Program), + MKernelIDs(std::move(KernelIDs)) {} bool has_kernel(const kernel_id &KernelIDCand) const noexcept { return std::binary_search(MKernelIDs.begin(), MKernelIDs.end(), @@ -93,27 +70,30 @@ class device_image_impl { // The struct maps specialization ID to offset in the binary blob where value // for this spec const should be. - struct SpecConstIDOffset { + struct SpecConstDescT { unsigned int ID = 0; unsigned int Offset = 0; + bool IsSet = false; }; bool has_specialization_constant(unsigned int SpecID) const noexcept { - return std::any_of( - MSpecConstOffsets.begin(), MSpecConstOffsets.end(), - [SpecID](const SpecConstIDOffset &Pair) { return Pair.ID == SpecID; }); + return std::any_of(MSpecConstDescs.begin(), MSpecConstDescs.end(), + [SpecID](const SpecConstDescT &SpecConstDesc) { + return SpecConstDesc.ID == SpecID; + }); } void set_specialization_constant_raw_value(unsigned int SpecID, const void *Value, size_t ValueSize) noexcept { - for (const SpecConstIDOffset &Pair : MSpecConstOffsets) - if (Pair.ID == SpecID) { + for (const SpecConstDescT &SpecConstDesc : MSpecConstDescs) + if (SpecConstDesc.ID == SpecID) { // Lock the mutex to prevent when one thread in the middle of writing a // new value while another thread is reading the value to pass it to // JIT compiler. const std::lock_guard SpecConstLock(MSpecConstAccessMtx); - std::memcpy(MSpecConstsBlob.data() + Pair.Offset, Value, ValueSize); + std::memcpy(MSpecConstsBlob.data() + SpecConstDesc.Offset, Value, + ValueSize); return; } } @@ -121,13 +101,14 @@ class device_image_impl { void get_specialization_constant_raw_value(unsigned int SpecID, void *ValueRet, size_t ValueSize) const noexcept { - for (const SpecConstIDOffset &Pair : MSpecConstOffsets) - if (Pair.ID == SpecID) { + for (const SpecConstDescT &SpecConstDesc : MSpecConstDescs) + if (SpecConstDesc.ID == SpecID) { // Lock the mutex to prevent when one thread in the middle of writing a // new value while another thread is reading the value to pass it to // JIT compiler. const std::lock_guard SpecConstLock(MSpecConstAccessMtx); - std::memcpy(ValueRet, MSpecConstsBlob.data() + Pair.Offset, ValueSize); + std::memcpy(ValueRet, MSpecConstsBlob.data() + SpecConstDesc.Offset, + ValueSize); return; } } @@ -136,11 +117,45 @@ class device_image_impl { void set_state(bundle_state NewState) noexcept { MState = NewState; } + const std::vector &get_devices() const noexcept { return MDevices; } + + bool compatible_with_device(const device &Dev) const { + return std::any_of( + MDevices.begin(), MDevices.end(), + [&Dev](const device &DevCand) { return Dev == DevCand; }); + } + + const RT::PiProgram &get_program_ref() const noexcept { return MProgram; } + + const RTDeviceBinaryImage *&get_bin_image_ref() noexcept { return MBinImage; } + + const context &get_context() const noexcept { return MContext; } + + std::vector &get_kernel_ids_ref() noexcept { return MKernelIDs; } + + std::vector &get_spec_const_blob_ref() noexcept { + return MSpecConstsBlob; + } + + std::vector &get_spec_const_offsets_ref() noexcept { + return MSpecConstDescs; + } + + ~device_image_impl() { + + if (MProgram) { + const detail::plugin &Plugin = getSyclObjImpl(MContext)->getPlugin(); + Plugin.call(MProgram); + } + } + private: - RTDeviceBinaryImage *MBinImage = nullptr; + const RTDeviceBinaryImage *MBinImage = nullptr; context MContext; std::vector MDevices; bundle_state MState; + // Native program handler which this device image represents + RT::PiProgram MProgram = nullptr; // List of kernel ids available in this image, elements should be sorted // according to LessByNameComp std::vector MKernelIDs; @@ -152,7 +167,7 @@ class device_image_impl { // image std::vector MSpecConstsBlob; // Contains list of spec ID + their offsets in the MSpecConstsBlob - std::vector MSpecConstOffsets; + std::vector MSpecConstDescs; }; } // namespace detail diff --git a/sycl/source/detail/global_handler.cpp b/sycl/source/detail/global_handler.cpp index b9b46c9c4aa96..06cf90b3e11f9 100644 --- a/sycl/source/detail/global_handler.cpp +++ b/sycl/source/detail/global_handler.cpp @@ -114,6 +114,17 @@ GlobalHandler::getDeviceFilterList(const std::string &InitValue) { return *MDeviceFilterList; } +std::mutex &GlobalHandler::getHandlerExtendedMembersMutex() { + if (MHandlerExtendedMembersMutex) + return *MHandlerExtendedMembersMutex; + + const std::lock_guard Lock{MFieldsLock}; + if (!MHandlerExtendedMembersMutex) + MHandlerExtendedMembersMutex = std::make_unique(); + + return *MHandlerExtendedMembersMutex; +} + void shutdown() { // First, release resources, that may access plugins. GlobalHandler::instance().MScheduler.reset(nullptr); diff --git a/sycl/source/detail/global_handler.hpp b/sycl/source/detail/global_handler.hpp index b3511445c5406..9b2c582b164ae 100644 --- a/sycl/source/detail/global_handler.hpp +++ b/sycl/source/detail/global_handler.hpp @@ -56,6 +56,7 @@ class GlobalHandler { std::mutex &getFilterMutex(); std::vector &getPlugins(); device_filter_list &getDeviceFilterList(const std::string &InitValue); + std::mutex &getHandlerExtendedMembersMutex(); private: friend void shutdown(); @@ -76,6 +77,8 @@ class GlobalHandler { std::unique_ptr MFilterMutex; std::unique_ptr> MPlugins; std::unique_ptr MDeviceFilterList; + // The mutex for synchronizing accesses to handlers extended members + std::unique_ptr MHandlerExtendedMembersMutex; }; } // namespace detail } // namespace sycl diff --git a/sycl/source/detail/kernel_bundle_impl.hpp b/sycl/source/detail/kernel_bundle_impl.hpp index c992f5f14a8ec..a7a5a93990725 100644 --- a/sycl/source/detail/kernel_bundle_impl.hpp +++ b/sycl/source/detail/kernel_bundle_impl.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -26,6 +27,12 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { +template struct LessByHash { + bool operator()(const T &LHS, const T &RHS) { + return getSyclObjImpl(LHS) < getSyclObjImpl(RHS); + } +}; + // The class is an impl counterpart of the sycl::kernel_bundle. // It provides an access and utilities to manage set of sycl::device_images // objects. @@ -39,33 +46,107 @@ class kernel_bundle_impl { MContext, MDevices, State); } - kernel_bundle_impl(const context &Ctx, const std::vector &Devs, + // Matches sycl::build and sycl::compile + // Have one constructor because sycl::build and sycl::compile have the same + // signature + kernel_bundle_impl(const kernel_bundle &InputBundle, + std::vector Devs, const property_list &PropList, + bundle_state TargetState) + : MContext(InputBundle.get_context()), MDevices(std::move(Devs)) { + + for (const device_image_plain &DeviceImage : InputBundle) { + // Skip images which are not compatible with devices provided + if (std::none_of( + MDevices.begin(), MDevices.end(), + [&DeviceImage](const device &Dev) { + return getSyclObjImpl(DeviceImage)->compatible_with_device(Dev); + })) + continue; + + switch (TargetState) { + case bundle_state::object: + MDeviceImages.push_back(detail::ProgramManager::getInstance().compile( + DeviceImage, MDevices, PropList)); + break; + case bundle_state::executable: + MDeviceImages.push_back(detail::ProgramManager::getInstance().build( + DeviceImage, MDevices, PropList)); + break; + case bundle_state::input: + throw sycl::runtime_error( + "Internal error. The target state should not be input", + PI_INVALID_OPERATION); + break; + } + } + } + + // Matches sycl::link + kernel_bundle_impl( + const std::vector> &ObjectBundles, + std::vector Devs, const property_list &PropList) + : MContext(ObjectBundles[0].get_context()), MDevices(std::move(Devs)) { + + // TODO: Unify with c'tor for sycl::comile and sycl::build by calling + // sycl::join on vector of kernel_bundles + + std::vector DeviceImages; + for (const kernel_bundle &ObjectBundle : + ObjectBundles) { + for (const device_image_plain &DeviceImage : ObjectBundle) { + + // Skip images which are not compatible with devices provided + if (std::none_of(MDevices.begin(), MDevices.end(), + [&DeviceImage](const device &Dev) { + return getSyclObjImpl(DeviceImage) + ->compatible_with_device(Dev); + })) + continue; + + DeviceImages.insert(DeviceImages.end(), DeviceImage); + } + } + + MDeviceImages = detail::ProgramManager::getInstance().link( + std::move(DeviceImages), MDevices, PropList); + } + + kernel_bundle_impl(context Ctx, std::vector Devs, const std::vector &KernelIDs, bundle_state State) - : kernel_bundle_impl(Ctx, Devs, State) { + : MContext(std::move(Ctx)), MDevices(std::move(Devs)) { - // Filter out images that have no kernel_ids specified - auto It = std::remove_if(MDeviceImages.begin(), MDeviceImages.end(), - [&KernelIDs](const device_image_plain &Image) { - return std::none_of( - KernelIDs.begin(), KernelIDs.end(), - [&Image](const sycl::kernel_id &KernelID) { - return Image.has_kernel(KernelID); - }); - }); - MDeviceImages.erase(It, MDeviceImages.end()); + MDeviceImages = detail::ProgramManager::getInstance().getSYCLDeviceImages( + MContext, MDevices, KernelIDs, State); } - kernel_bundle_impl(const context &Ctx, const std::vector &Devs, + kernel_bundle_impl(context Ctx, std::vector Devs, const DevImgSelectorImpl &Selector, bundle_state State) - : kernel_bundle_impl(Ctx, Devs, State) { + : MContext(std::move(Ctx)), MDevices(std::move(Devs)) { - // Filter out images that are rejected by Selector - auto It = std::remove_if(MDeviceImages.begin(), MDeviceImages.end(), - [&Selector](const device_image_plain &Image) { - return !Selector(getSyclObjImpl(Image)); - }); - MDeviceImages.erase(It, MDeviceImages.end()); + MDeviceImages = detail::ProgramManager::getInstance().getSYCLDeviceImages( + MContext, MDevices, Selector, State); + } + + // C'tor matches sycl::join API + kernel_bundle_impl(const std::vector &Bundles) { + MContext = Bundles[0]->MContext; + for (const detail::KernelBundleImplPtr &Bundle : Bundles) { + MDevices.insert(MDevices.end(), Bundle->MDevices.begin(), + Bundle->MDevices.end()); + MDeviceImages.insert(MDeviceImages.end(), Bundle->MDeviceImages.begin(), + Bundle->MDeviceImages.end()); + } + + std::sort(MDevices.begin(), MDevices.end(), LessByHash{}); + const auto DevIt = std::unique(MDevices.begin(), MDevices.end()); + MDevices.erase(DevIt, MDevices.end()); + + std::sort(MDeviceImages.begin(), MDeviceImages.end(), + LessByHash{}); + const auto DevImgIt = + std::unique(MDeviceImages.begin(), MDeviceImages.end()); + MDeviceImages.erase(DevImgIt, MDeviceImages.end()); } bool empty() const noexcept { return MDeviceImages.empty(); } @@ -76,7 +157,7 @@ class kernel_bundle_impl { context get_context() const noexcept { return MContext; } - std::vector get_devices() const noexcept { return MDevices; } + const std::vector &get_devices() const noexcept { return MDevices; } std::vector get_kernel_ids() const { // Collect kernel ids from all device images, then remove duplicates @@ -88,20 +169,42 @@ class kernel_bundle_impl { Result.insert(Result.end(), KernelIDs.begin(), KernelIDs.end()); } - std::sort(Result.begin(), Result.end(), - [](const kernel_id &LHS, const kernel_id &RHS) { - return detail::getSyclObjImpl(LHS) < - detail::getSyclObjImpl(RHS); - }); - auto LastIt = std::unique(Result.begin(), Result.end()); - Result.erase(LastIt, Result.end()); + std::sort(Result.begin(), Result.end(), LessByNameComp{}); + + auto NewIt = + std::unique(Result.begin(), Result.end(), + [](const sycl::kernel_id &LHS, const sycl::kernel_id &RHS) { + return strcmp(LHS.get_name(), RHS.get_name()) == 0; + } + + ); + + Result.erase(NewIt, Result.end()); return Result; } - kernel get_kernel(const kernel_id &KernelID) const { - (void)KernelID; - throw sycl::runtime_error("Not implemented", PI_INVALID_OPERATION); + kernel + get_kernel(const kernel_id &KernelID, + const std::shared_ptr &Self) const { + + auto It = std::find_if(MDeviceImages.begin(), MDeviceImages.end(), + [&KernelID](const device_image_plain &DeviceImage) { + return DeviceImage.has_kernel(KernelID); + }); + const std::shared_ptr &DeviceImageImpl = + detail::getSyclObjImpl(*It); + + RT::PiKernel Kernel = nullptr; + std::tie(Kernel, std::ignore) = + detail::ProgramManager::getInstance().getOrCreateKernel( + MContext, KernelID.get_name(), /*PropList=*/{}, + DeviceImageImpl->get_program_ref()); + + std::shared_ptr KernelImpl = std::make_shared( + Kernel, detail::getSyclObjImpl(MContext), DeviceImageImpl, Self); + + return detail::createSyclObjFromImpl(KernelImpl); } bool has_kernel(const kernel_id &KernelID) const noexcept { @@ -168,6 +271,15 @@ class kernel_bundle_impl { const device_image_plain *end() const { return &MDeviceImages.back() + 1; } + size_t size() const noexcept { return MDeviceImages.size(); } + + bundle_state get_bundle_state() const { + // All device images are expected to have the same state + return MDeviceImages.empty() + ? bundle_state::input + : detail::getSyclObjImpl(MDeviceImages[0])->get_state(); + } + private: context MContext; std::vector MDevices; diff --git a/sycl/source/detail/kernel_id_impl.hpp b/sycl/source/detail/kernel_id_impl.hpp index 43d7193c8bc6c..69f9b425453eb 100644 --- a/sycl/source/detail/kernel_id_impl.hpp +++ b/sycl/source/detail/kernel_id_impl.hpp @@ -12,6 +12,13 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { +// Used for sorting vector of kernel_id's +struct LessByNameComp { + bool operator()(const sycl::kernel_id &LHS, const sycl::kernel_id &RHS) { + return std::strcmp(LHS.get_name(), RHS.get_name()) < 0; + } +}; + // The class is impl counterpart for sycl::kernel_id which represent a kernel // identificator class kernel_id_impl { diff --git a/sycl/source/detail/kernel_impl.cpp b/sycl/source/detail/kernel_impl.cpp index c5e23399783cd..ffda1739c238b 100644 --- a/sycl/source/detail/kernel_impl.cpp +++ b/sycl/source/detail/kernel_impl.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include +#include #include #include @@ -48,6 +49,19 @@ kernel_impl::kernel_impl(RT::PiKernel Kernel, ContextImplPtr ContextImpl, PI_INVALID_CONTEXT); } +kernel_impl::kernel_impl(RT::PiKernel Kernel, ContextImplPtr ContextImpl, + DeviceImageImplPtr DeviceImageImpl, + KernelBundleImplPtr KernelBundleImpl) + : MKernel(Kernel), MContext(std::move(ContextImpl)), MProgramImpl(nullptr), + MCreatedFromSource(false), MDeviceImageImpl(std::move(DeviceImageImpl)), + MKernelBundleImpl(std::move(KernelBundleImpl)) { + + // kernel_impl shared ownership of kernel handle + if (!is_host()) { + getPlugin().call(MKernel); + } +} + kernel_impl::kernel_impl(ContextImplPtr Context, ProgramImplPtr ProgramImpl) : MContext(Context), MProgramImpl(std::move(ProgramImpl)) {} diff --git a/sycl/source/detail/kernel_impl.hpp b/sycl/source/detail/kernel_impl.hpp index 0109eb12fe70d..4760b78e6b009 100644 --- a/sycl/source/detail/kernel_impl.hpp +++ b/sycl/source/detail/kernel_impl.hpp @@ -25,9 +25,11 @@ namespace sycl { namespace detail { // Forward declaration class program_impl; +class kernel_bundle_impl; using ContextImplPtr = std::shared_ptr; using ProgramImplPtr = std::shared_ptr; +using KernelBundleImplPtr = std::shared_ptr; class kernel_impl { public: /// Constructs a SYCL kernel instance from a PiKernel @@ -55,6 +57,16 @@ class kernel_impl { kernel_impl(RT::PiKernel Kernel, ContextImplPtr ContextImpl, ProgramImplPtr ProgramImpl, bool IsCreatedFromSource); + /// Constructs a SYCL kernel_impl instance from a SYCL device_image, + /// kernel_bundle and / PiKernel. + /// + /// \param Kernel is a valid PiKernel instance + /// \param ContextImpl is a valid SYCL context + /// \param ProgramImpl is a valid instance of kernel_bundle_impl + kernel_impl(RT::PiKernel Kernel, ContextImplPtr ContextImpl, + DeviceImageImplPtr DeviceImageImpl, + KernelBundleImplPtr KernelBundleImpl); + /// Constructs a SYCL kernel for host device /// /// \param Context is a valid SYCL context @@ -171,11 +183,15 @@ class kernel_impl { /// \return true if kernel was created from source. bool isCreatedFromSource() const; + const DeviceImageImplPtr &getDeviceImage() const { return MDeviceImageImpl; } + private: RT::PiKernel MKernel; const ContextImplPtr MContext; const ProgramImplPtr MProgramImpl; bool MCreatedFromSource = true; + const DeviceImageImplPtr MDeviceImageImpl; + const KernelBundleImplPtr MKernelBundleImpl; }; template diff --git a/sycl/source/detail/program_impl.cpp b/sycl/source/detail/program_impl.cpp index ef8442c7c8666..ec2393b99c421 100644 --- a/sycl/source/detail/program_impl.cpp +++ b/sycl/source/detail/program_impl.cpp @@ -489,7 +489,7 @@ void program_impl::create_pi_program_with_kernel_name( const device FirstDevice = get_devices()[0]; RTDeviceBinaryImage &Img = PM.getDeviceImage( Module, KernelName, get_context(), FirstDevice, JITCompilationIsRequired); - MProgram = PM.createPIProgram(Img, get_context(), FirstDevice); + MProgram = PM.createPIProgram(Img, get_context(), {FirstDevice}); } template <> diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index aec28cf0809d3..dd00f357032a2 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -1141,7 +1141,7 @@ ProgramManager::KernelArgMask ProgramManager::getEliminatedKernelArgMask( return {}; } -static bundle_state getBinImageState(RTDeviceBinaryImage *BinImage) { +static bundle_state getBinImageState(const RTDeviceBinaryImage *BinImage) { auto IsAOTBinary = [](const char *Format) { return ( (strcmp(Format, __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_X86_64) == 0) || @@ -1178,9 +1178,9 @@ static bool compatibleWithDevice(RTDeviceBinaryImage *BinImage, } std::vector -ProgramManager::getSYCLDeviceImages(const context &Ctx, - const std::vector &Devs, - bundle_state TargetState) { +ProgramManager::getSYCLDeviceImagesWithCompatibleState( + const context &Ctx, const std::vector &Devs, + bundle_state TargetState) { // Collect raw device images std::vector BinImages; @@ -1188,55 +1188,477 @@ ProgramManager::getSYCLDeviceImages(const context &Ctx, std::lock_guard Guard(Sync::getGlobalLock()); for (auto &ImagesSets : m_DeviceImages) { auto &ImagesUPtrs = *ImagesSets.second.get(); - for (auto &ImageUPtr : ImagesUPtrs) + for (auto &ImageUPtr : ImagesUPtrs) { + const RTDeviceBinaryImage *BinImage = ImageUPtr.get(); + const bundle_state ImgState = getBinImageState(BinImage); + + // Ignore images with incompatible state. Image is considered compatible + // with a target state if an image is already in the target state or can + // be brought to target state by compiling/linking/building. + // + // Example: an image in "executable" state is not compatible with + // "input" target state - there is no operation to convert the image it + // to "input" state. An image in "input" state is compatible with + // "executable" target state because it can be built to get into + // "executable" state. + if (ImgState > TargetState) + continue; + BinImages.push_back(ImageUPtr.get()); + } } } + // TODO: Add a diagnostic on multiple device images with conflicting kernel + // names, and remove OSModuleHandle usage, as conflicting kernel names will be + // an error. + // TODO: Cache device_image objects // Create SYCL device image from those that have compatible state and at least // one device std::vector SYCLDeviceImages; for (RTDeviceBinaryImage *BinImage : BinImages) { const bundle_state ImgState = getBinImageState(BinImage); - // Ignore images with incompatible state. Image is considered compatible - // with a target state if an image is already in the target state or can be - // brought to target state by compiling/linking/building. - // - // Example: an image in "executable" state is not compatbile with "input" - // target state - there is no operation to convert the image it to "input" - // state. - // An image in "input" state is compatible with "executable" target state - // because it can be built to get into "executable" state. - if (ImgState > TargetState) - continue; - for (const sycl::device &Dev : Devs) - if (compatibleWithDevice(BinImage, Dev)) { - DeviceImageImplPtr Impl = std::make_shared( - BinImage, Ctx, Devs, ImgState); + for (const sycl::device &Dev : Devs) { + if (!compatibleWithDevice(BinImage, Dev)) + continue; + + // TODO: Cache kernel_ids + std::vector KernelIDs; + // Collect kernel names for the image + pi_device_binary DevBin = + const_cast(&BinImage->getRawData()); + for (_pi_offload_entry EntriesIt = DevBin->EntriesBegin; + EntriesIt != DevBin->EntriesEnd; ++EntriesIt) { + + std::shared_ptr KernelIDImpl = + std::make_shared(EntriesIt->name); + + KernelIDs.push_back( + detail::createSyclObjFromImpl(KernelIDImpl)); + } + // device_image_impl expects kernel ids to be sorted for fast search + std::sort(KernelIDs.begin(), KernelIDs.end(), LessByNameComp{}); + + DeviceImageImplPtr Impl = std::make_shared( + BinImage, Ctx, Devs, ImgState, KernelIDs, /*PIProgram=*/nullptr); - SYCLDeviceImages.push_back( - createSyclObjFromImpl(Impl)); + SYCLDeviceImages.push_back( + createSyclObjFromImpl(Impl)); + break; + } + } + + return SYCLDeviceImages; +} + +void ProgramManager::bringSYCLDeviceImagesToState( + std::vector &DeviceImages, bundle_state TargetState) { + + for (device_image_plain &DevImage : DeviceImages) { + const bundle_state DevImageState = getSyclObjImpl(DevImage)->get_state(); + + switch (TargetState) { + case bundle_state::input: + // Do nothing since there is no state which can be upgraded to the input. + assert(DevImageState == bundle_state::input); + break; + case bundle_state::object: + if (DevImageState == bundle_state::input) { + DevImage = compile(DevImage, getSyclObjImpl(DevImage)->get_devices(), + /*PropList=*/{}); break; } + // Device image is expected to be object state then. + assert(DevImageState == bundle_state::object); + break; + case bundle_state::executable: { + switch (DevImageState) { + case bundle_state::input: + DevImage = build(DevImage, getSyclObjImpl(DevImage)->get_devices(), + /*PropList=*/{}); + break; + case bundle_state::object: { + std::vector LinkedDevImages = + link({DevImage}, getSyclObjImpl(DevImage)->get_devices(), + /*PropList=*/{}); + // Since only one device image is passed here one output device image is + // expected + assert(LinkedDevImages.size() == 1 && "Expected one linked image here"); + DevImage = LinkedDevImages[0]; + break; + } + case bundle_state::executable: + // Device image is already in the desired state. + break; + } + break; + } + } + } +} + +std::vector +ProgramManager::getSYCLDeviceImages(const context &Ctx, + const std::vector &Devs, + bundle_state TargetState) { + // Collect device images with compatible state + std::vector DeviceImages = + getSYCLDeviceImagesWithCompatibleState(Ctx, Devs, TargetState); + // Brind device images with compatible state to desired state + bringSYCLDeviceImagesToState(DeviceImages, TargetState); + return DeviceImages; +} + +std::vector ProgramManager::getSYCLDeviceImages( + const context &Ctx, const std::vector &Devs, + const DevImgSelectorImpl &Selector, bundle_state TargetState) { + // Collect device images with compatible state + std::vector DeviceImages = + getSYCLDeviceImagesWithCompatibleState(Ctx, Devs, TargetState); + + // Filter out images that are rejected by Selector + auto It = std::remove_if(DeviceImages.begin(), DeviceImages.end(), + [&Selector](const device_image_plain &Image) { + return !Selector(getSyclObjImpl(Image)); + }); + DeviceImages.erase(It, DeviceImages.end()); + + // The spec says that the function should not call online compiler or linker + // to translate device images into target state + return DeviceImages; +} + +std::vector ProgramManager::getSYCLDeviceImages( + const context &Ctx, const std::vector &Devs, + const std::vector &KernelIDs, bundle_state TargetState) { + // Collect device images with compatible state + std::vector DeviceImages = + getSYCLDeviceImagesWithCompatibleState(Ctx, Devs, TargetState); + + // Filter out images that have no kernel_ids specified + auto It = std::remove_if(DeviceImages.begin(), DeviceImages.end(), + [&KernelIDs](const device_image_plain &Image) { + return std::none_of( + KernelIDs.begin(), KernelIDs.end(), + [&Image](const sycl::kernel_id &KernelID) { + return Image.has_kernel(KernelID); + }); + }); + + DeviceImages.erase(It, DeviceImages.end()); + + // Brind device images with compatible state to desired state + bringSYCLDeviceImagesToState(DeviceImages, TargetState); + return DeviceImages; +} + +device_image_plain +ProgramManager::compile(const device_image_plain &DeviceImage, + const std::vector &Devs, + const property_list &) { + + // TODO: Extract compile options from property list once the Spec clarifies + // how they can be passed. + + // TODO: Probably we could have cached compiled device images. + const std::shared_ptr &InputImpl = + getSyclObjImpl(DeviceImage); + + const detail::plugin &Plugin = + getSyclObjImpl(InputImpl->get_context())->getPlugin(); + + // TODO: Add support for creating non-SPIRV programs from multiple devices. + if (InputImpl->get_bin_image_ref()->getFormat() != + PI_DEVICE_BINARY_TYPE_SPIRV && + Devs.size() > 1) + sycl::runtime_error( + "Creating a program from AOT binary for multiple device is not " + "supported", + PI_INVALID_OPERATION); + + // Device is not used when creating program from SPIRV, so passing only one + // device is OK. + RT::PiProgram Prog = createPIProgram(*InputImpl->get_bin_image_ref(), + InputImpl->get_context(), Devs[0]); + + DeviceImageImplPtr ObjectImpl = std::make_shared( + InputImpl->get_bin_image_ref(), InputImpl->get_context(), Devs, + bundle_state::object, InputImpl->get_kernel_ids_ref(), Prog); + + std::vector PIDevices; + PIDevices.reserve(Devs.size()); + for (const device &Dev : Devs) + PIDevices.push_back(getSyclObjImpl(Dev)->getHandleRef()); + + // TODO: Set spec constatns here. + + // TODO: Handle zero sized Device list. + Plugin.call( + ObjectImpl->get_program_ref(), /*num devices=*/Devs.size(), + PIDevices.data(), + /*options=*/nullptr, + /*num_input_headers=*/0, /*input_headers=*/nullptr, + /*header_include_names=*/nullptr, + /*pfn_notify=*/nullptr, /*user_data*/ nullptr); + + return createSyclObjFromImpl(ObjectImpl); +} + +std::vector +ProgramManager::link(const std::vector &DeviceImages, + const std::vector &Devs, + const property_list &PropList) { + (void)PropList; + + std::vector PIPrograms; + PIPrograms.reserve(DeviceImages.size()); + for (const device_image_plain &DeviceImage : DeviceImages) + PIPrograms.push_back(getSyclObjImpl(DeviceImage)->get_program_ref()); + + std::vector PIDevices; + PIDevices.reserve(Devs.size()); + for (const device &Dev : Devs) + PIDevices.push_back(getSyclObjImpl(Dev)->getHandleRef()); + + const context &Context = getSyclObjImpl(DeviceImages[0])->get_context(); + + const detail::plugin &Plugin = getSyclObjImpl(Context)->getPlugin(); + + RT::PiProgram LinkedProg = nullptr; + RT::PiResult Error = Plugin.call_nocheck( + getSyclObjImpl(Context)->getHandleRef(), PIDevices.size(), + PIDevices.data(), + /*options=*/nullptr, PIPrograms.size(), PIPrograms.data(), + /*pfn_notify=*/nullptr, + /*user_data=*/nullptr, &LinkedProg); + + (void)Error; + // TODO: Add error handling + + std::vector KernelIDs; + for (const device_image_plain &DeviceImage : DeviceImages) { + // Duplicates are not expected here, otherwise piProgramLink should fail + KernelIDs.insert(KernelIDs.end(), + getSyclObjImpl(DeviceImage)->get_kernel_ids().begin(), + getSyclObjImpl(DeviceImage)->get_kernel_ids().end()); } + // device_image_impl expects kernel ids to be sorted for fast search + std::sort(KernelIDs.begin(), KernelIDs.end(), LessByNameComp{}); - // Make it so that SYCL device images have required state by compiling or - // linking them if needed - switch (TargetState) { - case bundle_state::input: - // Do nothing as the check above should make sure that resulting images are - // in input state already - break; - case bundle_state::object: - assert(false && "Not implemented yet"); - break; - case bundle_state::executable: - assert(false && "Not implemented yet"); - break; + DeviceImageImplPtr ExecutableImpl = + std::make_shared( + /*BinImage=*/nullptr, Context, Devs, bundle_state::object, + std::move(KernelIDs), LinkedProg); + + // TODO: Make multiple sets of device images organized by devices they are + // compiled for. + return {createSyclObjFromImpl(ExecutableImpl)}; +} + +// The function duplicates most of the code from existing getBuiltPIProgram. +// The differences are: +// Different API - uses different objects to extract required info +// Supports caching of a program built for multiple devices +device_image_plain ProgramManager::build(const device_image_plain &DeviceImage, + const std::vector &Devs, + const property_list &PropList) { + (void)PropList; + + const std::shared_ptr &InputImpl = + getSyclObjImpl(DeviceImage); + + const context Context = InputImpl->get_context(); + + const ContextImplPtr ContextImpl = getSyclObjImpl(Context); + + using PiProgramT = KernelProgramCache::PiProgramT; + using ProgramCacheT = KernelProgramCache::ProgramCacheT; + + KernelProgramCache &Cache = ContextImpl->getKernelProgramCache(); + + auto AcquireF = [](KernelProgramCache &Cache) { + return Cache.acquireCachedPrograms(); + }; + auto GetF = [](const Locked &LockedCache) -> ProgramCacheT & { + return LockedCache.get(); + }; + + std::string CompileOpts; + std::string LinkOpts; + // Build options are overridden if environment variables are present. + // Environment variables are not changed during program lifecycle so it + // is reasonable to use static here to read them only once. + static const char *CompileOptsEnv = + SYCLConfig::get(); + if (CompileOptsEnv) + CompileOpts = CompileOptsEnv; + + static const char *LinkOptsEnv = SYCLConfig::get(); + if (LinkOptsEnv) { + LinkOpts = LinkOptsEnv; } - return SYCLDeviceImages; + const RTDeviceBinaryImage *ImgPtr = InputImpl->get_bin_image_ref(); + const RTDeviceBinaryImage &Img = *ImgPtr; + + // TODO: Unify this code with getBuiltPIProgram + auto BuildF = [this, &Context, Img, &Devs, &CompileOpts, &LinkOpts, + &InputImpl] { + // Update only if compile options are not overwritten by environment + // variable + if (!CompileOptsEnv) { + CompileOpts += Img.getCompileOptions(); + pi_device_binary_property isEsimdImage = Img.getProperty("isEsimdImage"); + + if (isEsimdImage && pi::DeviceBinaryProperty(isEsimdImage).asUint32()) { + if (!CompileOpts.empty()) + CompileOpts += " "; + CompileOpts += "-vc-codegen"; + } + } + + // Update only if link options are not overwritten by environment variable + if (!LinkOptsEnv) + LinkOpts += Img.getLinkOptions(); + ContextImplPtr ContextImpl = getSyclObjImpl(Context); + const detail::plugin &Plugin = ContextImpl->getPlugin(); + + // TODO: Add support for creating non-SPIRV programs from multiple devices. + if (InputImpl->get_bin_image_ref()->getFormat() != + PI_DEVICE_BINARY_TYPE_SPIRV && + Devs.size() > 1) + sycl::runtime_error( + "Creating a program from AOT binary for multiple device is not " + "supported", + PI_INVALID_OPERATION); + + // Device is not used when creating program from SPIRV, so passing only one + // device is OK. + RT::PiProgram NativePrg = createPIProgram(Img, Context, Devs[0]); + + const std::vector &SpecConstsBlob = + InputImpl->get_spec_const_blob_ref(); + + std::vector &SpecConstOffsets = + InputImpl->get_spec_const_offsets_ref(); + + unsigned int PrevOffset = 0; + for (const device_image_impl::SpecConstDescT &SpecIDDesc : + SpecConstOffsets) { + + Plugin.call( + NativePrg, SpecIDDesc.ID, SpecIDDesc.Offset - PrevOffset, + SpecConstsBlob.data() + SpecIDDesc.Offset); + PrevOffset = SpecIDDesc.Offset; + } + + ProgramPtr ProgramManaged( + NativePrg, Plugin.getPiPlugin().PiFunctionTable.piProgramRelease); + + // Link a fallback implementation of device libraries if they are not + // supported by a device compiler. + // Pre-compiled programs are supposed to be already linked. + // If device image is not SPIR-V, DeviceLibReqMask will be 0 which means + // no fallback device library will be linked. + uint32_t DeviceLibReqMask = 0; + if (Img.getFormat() == PI_DEVICE_BINARY_TYPE_SPIRV && + !SYCLConfig::get()) + DeviceLibReqMask = getDeviceLibReqMask(Img); + + ProgramPtr BuiltProgram = + build(std::move(ProgramManaged), ContextImpl, CompileOpts, LinkOpts, + getRawSyclObjImpl(Devs[0])->getHandleRef(), + ContextImpl->getCachedLibPrograms(), DeviceLibReqMask); + + { + std::lock_guard Lock(MNativeProgramsMutex); + NativePrograms[BuiltProgram.get()] = &Img; + } + + return BuiltProgram.release(); + }; + + SerializedObj SpecConsts = InputImpl->get_spec_const_blob_ref(); + + const RT::PiDevice PiDevice = getRawSyclObjImpl(Devs[0])->getHandleRef(); + auto BuildResult = getOrBuild( + Cache, + std::make_pair(std::make_pair(std::move(SpecConsts), (size_t)ImgPtr), + std::make_pair(PiDevice, CompileOpts + LinkOpts)), + AcquireF, GetF, BuildF); + + RT::PiProgram ResProgram = BuildResult->Ptr.load(); + + // Cache supports key with once device only, but here we have multiple + // devices a program is built for, so add the program to the cache for all + // other devices. + auto CacheOtherDevices = [ResProgram]() { return ResProgram; }; + + // The program for device "0" is already added to the cache during the first + // call to getOrBuild, so starting with "1" + for (size_t Idx = 1; Idx < Devs.size(); ++Idx) { + const RT::PiDevice PiDeviceAdd = + getRawSyclObjImpl(Devs[Idx])->getHandleRef(); + + getOrBuild( + Cache, + std::make_pair(std::make_pair(std::move(SpecConsts), (size_t)ImgPtr), + std::make_pair(PiDeviceAdd, CompileOpts + LinkOpts)), + AcquireF, GetF, CacheOtherDevices); + } + + // devive_image_impl shares ownership of PIProgram with, at least, program + // cache. The ref counter will be descremented in the destructor of + // device_image_impl + const detail::plugin &Plugin = ContextImpl->getPlugin(); + Plugin.call(ResProgram); + + DeviceImageImplPtr ExecImpl = std::make_shared( + InputImpl->get_bin_image_ref(), Context, Devs, bundle_state::executable, + InputImpl->get_kernel_ids_ref(), ResProgram); + + return createSyclObjFromImpl(ExecImpl); +} + +std::pair ProgramManager::getOrCreateKernel( + const context &Context, const string_class &KernelName, + const property_list &PropList, RT::PiProgram Program) { + + (void)PropList; + + const ContextImplPtr Ctx = getSyclObjImpl(Context); + + using PiKernelT = KernelProgramCache::PiKernelT; + using KernelCacheT = KernelProgramCache::KernelCacheT; + using KernelByNameT = KernelProgramCache::KernelByNameT; + + KernelProgramCache &Cache = Ctx->getKernelProgramCache(); + + auto AcquireF = [](KernelProgramCache &Cache) { + return Cache.acquireKernelsPerProgramCache(); + }; + auto GetF = + [&Program](const Locked &LockedCache) -> KernelByNameT & { + return LockedCache.get()[Program]; + }; + auto BuildF = [&Program, &KernelName, &Ctx] { + PiKernelT *Result = nullptr; + + const detail::plugin &Plugin = Ctx->getPlugin(); + Plugin.call(Program, KernelName.c_str(), + &Result); + + Plugin.call(Result, PI_USM_INDIRECT_ACCESS, + sizeof(pi_bool), &PI_TRUE); + + return Result; + }; + + auto BuildResult = getOrBuild( + Cache, KernelName, AcquireF, GetF, BuildF); + return std::make_pair(BuildResult->Ptr.load(), + &(BuildResult->MBuildResultMutex)); } } // namespace detail diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index 77572330c2fda..7a706700c9a2d 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -91,10 +91,18 @@ class ProgramManager { const string_class &KernelName, const program_impl *Prg = nullptr, bool JITCompilationIsRequired = false); + + RT::PiProgram getBuiltPIProgram(OSModuleHandle M, const context &Context, + const device &Device, + const string_class &KernelName, + const property_list &PropList, + bool JITCompilationIsRequired = false); + std::pair getOrCreateKernel(OSModuleHandle M, const context &Context, const device &Device, const string_class &KernelName, const program_impl *Prg); + RT::PiProgram getPiProgramFromPiKernel(RT::PiKernel Kernel, const ContextImplPtr Context); @@ -135,20 +143,61 @@ class ProgramManager { const device &Device, pi::PiProgram NativePrg, const string_class &KernelName, bool KnownProgram); + // The function returns a vector of SYCL device images that are compiled with + // the required state and at least one device from the passed list of devices. + std::vector + getSYCLDeviceImagesWithCompatibleState(const context &Ctx, + const std::vector &Devs, + bundle_state TargetState); + + // Brind images in the passed vector to the required state. Does it inplace + void + bringSYCLDeviceImagesToState(std::vector &DeviceImages, + bundle_state TargetState); + // The function returns a vector of SYCL device images in required state, // which are compatible with at least one of the device from Devs. std::vector getSYCLDeviceImages(const context &Ctx, const std::vector &Devs, bundle_state State); + // The function returns a vector of SYCL device images, for which Selector + // callable returns true, in required state, which are compatible with at + // least one of the device from Devs. + std::vector + getSYCLDeviceImages(const context &Ctx, const std::vector &Devs, + const DevImgSelectorImpl &Selector, + bundle_state TargetState); + + // The function returns a vector of SYCL device images which represent at + // least one kernel from kernel ids vector in required state, which are + // compatible with at least one of the device from Devs. + std::vector + getSYCLDeviceImages(const context &Ctx, const std::vector &Devs, + const std::vector &KernelIDs, + bundle_state TargetState); + + // Produces new device image by convering input device image to the object + // state device_image_plain compile(const device_image_plain &DeviceImage, - property_list PropList); + const std::vector &Devs, + const property_list &PropList); - device_image_plain link(const std::vector &DeviceImage, - property_list PropList); + // Produces set of device images by convering input device images to object + // the executable state + std::vector + link(const std::vector &DeviceImages, + const std::vector &Devs, const property_list &PropList); + // Produces new device image by converting input device image to the + // executable state device_image_plain build(const device_image_plain &DeviceImage, - property_list PropList); + const std::vector &Devs, + const property_list &PropList); + + std::pair + getOrCreateKernel(const context &Context, const string_class &KernelName, + const property_list &PropList, RT::PiProgram Program); ProgramManager(); ~ProgramManager() = default; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 2e0e28a3538f7..99b6f13a4c823 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1880,7 +1881,8 @@ cl_int ExecCGCommand::enqueueImp() { "Enqueueing run_on_host_intel task has failed.", Error); } } - case CG::CGTYPE::KERNEL: { + case CG::CGTYPE::KERNEL: + case CG::CGTYPE::KERNEL_V1: { CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get(); NDRDescT &NDRDesc = ExecKernel->MNDRDesc; @@ -1903,6 +1905,9 @@ cl_int ExecCGCommand::enqueueImp() { return CL_SUCCESS; } + const std::shared_ptr &KernelBundleImplPtr = + ExecKernel->getKernelBundle(); + // Run OpenCL kernel sycl::context Context = MQueue->get_context(); RT::PiKernel Kernel = nullptr; @@ -1910,7 +1915,31 @@ cl_int ExecCGCommand::enqueueImp() { RT::PiProgram Program = nullptr; bool KnownProgram = true; - if (nullptr != ExecKernel->MSyclKernel) { + std::shared_ptr SyclKernelImpl; + // Use kernel_bundle is available + if (KernelBundleImplPtr) { + + std::shared_ptr KernelIDImpl = + std::make_shared(ExecKernel->MKernelName); + + kernel SyclKernel = KernelBundleImplPtr->get_kernel( + detail::createSyclObjFromImpl(KernelIDImpl), + KernelBundleImplPtr); + + SyclKernelImpl = detail::getSyclObjImpl(SyclKernel); + + Kernel = SyclKernelImpl->getHandleRef(); + + std::shared_ptr DeviceImageImpl = + SyclKernelImpl->getDeviceImage(); + + Program = DeviceImageImpl->get_program_ref(); + + std::tie(Kernel, KernelMutex) = + detail::ProgramManager::getInstance().getOrCreateKernel( + KernelBundleImplPtr->get_context(), ExecKernel->MKernelName, + /*PropList=*/{}, Program); + } else if (nullptr != ExecKernel->MSyclKernel) { assert(ExecKernel->MSyclKernel->get_info() == Context); Kernel = ExecKernel->MSyclKernel->getHandleRef(); diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 5e63addf8e446..68ad4f9094d1e 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include #include @@ -21,6 +23,75 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +handler::handler(shared_ptr_class Queue, bool IsHost) + : MQueue(std::move(Queue)), MIsHost(IsHost) { + MSharedPtrStorage.emplace_back( + std::make_shared>()); +} + +// Returns a shared_ptr to kernel_bundle stored in the extended members vector. +// If there is no kernel_bundle created: +// returns newly created kernel_bundle if Insert is true +// returns shared_ptr(nullptr) if Insert is false +std::shared_ptr +handler::getOrInsertHandlerKernelBundle(bool Insert) const { + + std::lock_guard Lock( + detail::GlobalHandler::instance().getHandlerExtendedMembersMutex()); + + assert(!MSharedPtrStorage.empty()); + + std::shared_ptr> ExendedMembersVec = + detail::convertToExtendedMembers(MSharedPtrStorage[0]); + + // Look for the kernel bundle in extended members + std::shared_ptr KernelBundleImpPtr; + for (const detail::ExtendedMemberT &EMember : *ExendedMembersVec) + if (detail::ExtendedMembersType::HANDLER_KERNEL_BUNDLE == EMember.MType) { + KernelBundleImpPtr = + std::static_pointer_cast(EMember.MData); + break; + } + + // No kernel bundle yet, create one + if (!KernelBundleImpPtr && Insert) { + KernelBundleImpPtr = detail::getSyclObjImpl( + get_kernel_bundle(MQueue->get_context())); + + detail::ExtendedMemberT EMember = { + detail::ExtendedMembersType::HANDLER_KERNEL_BUNDLE, KernelBundleImpPtr}; + + ExendedMembersVec->push_back(EMember); + } + + return KernelBundleImpPtr; +} + +// Sets kernel bundle to the provided one. Either replaces existing one or +// create a new entry in the extended members vector. +void handler::setHandlerKernelBundle( + const std::shared_ptr &NewKernelBundleImpPtr) { + assert(!MSharedPtrStorage.empty()); + + std::lock_guard Lock( + detail::GlobalHandler::instance().getHandlerExtendedMembersMutex()); + + std::shared_ptr> ExendedMembersVec = + detail::convertToExtendedMembers(MSharedPtrStorage[0]); + + for (detail::ExtendedMemberT &EMember : *ExendedMembersVec) + if (detail::ExtendedMembersType::HANDLER_KERNEL_BUNDLE == EMember.MType) { + EMember.MData = NewKernelBundleImpPtr; + return; + } + + detail::ExtendedMemberT EMember = { + detail::ExtendedMembersType::HANDLER_KERNEL_BUNDLE, + NewKernelBundleImpPtr}; + + ExendedMembersVec->push_back(EMember); +} + event handler::finalize() { // This block of code is needed only for reduction implementation. // It is harmless (does nothing) for everything else. @@ -28,9 +99,37 @@ event handler::finalize() { return MLastEvent; MIsFinalized = true; + // Kernel_bundles could not be used before CGType version 1 + if (getCGTypeVersion(MCGType) > + static_cast(detail::CG::CG_VERSION::V0)) { + // If there were uses of set_specialization_constant build the kernel_bundle + std::shared_ptr KernelBundleImpPtr = + getOrInsertHandlerKernelBundle(/*Insert=*/false); + if (KernelBundleImpPtr) { + switch (KernelBundleImpPtr->get_bundle_state()) { + case bundle_state::input: { + // Underlying level expects kernel_bundle to be in executable state + kernel_bundle ExecBundle = build( + detail::createSyclObjFromImpl>( + KernelBundleImpPtr)); + setHandlerKernelBundle(detail::getSyclObjImpl(ExecBundle)); + break; + } + case bundle_state::executable: + // Nothing to do + break; + case bundle_state::object: + assert(0 && "Expected that the bundle is either in input or executable " + "states."); + break; + } + } + } + unique_ptr_class CommandGroup; switch (MCGType) { case detail::CG::KERNEL: + case detail::CG::KERNEL_V1: case detail::CG::RUN_ON_HOST_INTEL: { CommandGroup.reset(new detail::CGExecKernel( std::move(MNDRDesc), std::move(MHostKernel), std::move(MKernel), diff --git a/sycl/source/kernel_bundle.cpp b/sycl/source/kernel_bundle.cpp index 191933c286d6e..7f35903792bf7 100644 --- a/sycl/source/kernel_bundle.cpp +++ b/sycl/source/kernel_bundle.cpp @@ -62,23 +62,8 @@ bool kernel_bundle_plain::native_specialization_constant() const noexcept { return impl->native_specialization_constant(); } -bool kernel_bundle_plain::has_specialization_constant( - unsigned int SpecID) const noexcept { - return impl->has_specialization_constant(SpecID); -} - -void kernel_bundle_plain::set_specialization_constant_raw_value( - unsigned int SpecID, const void *Value, size_t ValueSize) { - impl->set_specialization_constant_raw_value(SpecID, Value, ValueSize); -} - -void kernel_bundle_plain::get_specialization_constant_raw_value( - unsigned int SpecID, void *ValueRet, size_t ValueSize) const { - impl->get_specialization_constant_raw_value(SpecID, ValueRet, ValueSize); -} - kernel kernel_bundle_plain::get_kernel(const kernel_id &KernelID) const { - return impl->get_kernel(KernelID); + return impl->get_kernel(KernelID, impl); } const device_image_plain *kernel_bundle_plain::begin() const { @@ -98,6 +83,10 @@ bool kernel_bundle_plain::has_kernel(const kernel_id &KernelID, return impl->has_kernel(KernelID, Dev); } +//////////////////////////// +///// free functions +/////////////////////////// + detail::KernelBundleImplPtr get_kernel_bundle_impl(const context &Ctx, const std::vector &Devs, bundle_state State) { @@ -115,11 +104,52 @@ get_kernel_bundle_impl(const context &Ctx, const std::vector &Devs, detail::KernelBundleImplPtr get_kernel_bundle_impl(const context &Ctx, const std::vector &Devs, bundle_state State, const DevImgSelectorImpl &Selector) { - return std::make_shared(Ctx, Devs, Selector, State); } +std::shared_ptr +join_impl(const std::vector &Bundles) { + return std::make_shared(Bundles); +} + +bool has_kernel_bundle_impl(const context &Ctx, const std::vector &Devs, + bundle_state State) { + // Just create a kernel_bundle and check if it has any device_images inside. + detail::kernel_bundle_impl KernelBundleImpl(Ctx, Devs, State); + return KernelBundleImpl.size(); +} + +bool has_kernel_bundle_impl(const context &Ctx, const std::vector &Devs, + const std::vector &KernelIds, + bundle_state State) { + // Just create a kernel_bundle and check if it has any device_images inside. + detail::kernel_bundle_impl KernelBundleImpl(Ctx, Devs, KernelIds, State); + return KernelBundleImpl.size(); +} + +std::shared_ptr +compile_impl(const kernel_bundle &InputBundle, + const std::vector &Devs, const property_list &PropList) { + return std::make_shared( + InputBundle, Devs, PropList, bundle_state::object); +} + +std::shared_ptr +link_impl(const std::vector> &ObjectBundles, + const std::vector &Devs, const property_list &PropList) { + return std::make_shared(ObjectBundles, Devs, + PropList); +} + +std::shared_ptr +build_impl(const kernel_bundle &InputBundle, + const std::vector &Devs, const property_list &PropList) { + return std::make_shared( + InputBundle, Devs, PropList, bundle_state::executable); +} + } // namespace detail + } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index f9540b8637696..4601549761311 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3654,6 +3654,7 @@ _ZN2cl4sycl6ONEAPI15filter_selectorC1ERKNSt7__cxx1112basic_stringIcSt11char_trai _ZN2cl4sycl6ONEAPI15filter_selectorC2ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE _ZN2cl4sycl6ONEAPI6detail16reduGetMaxWGSizeESt10shared_ptrINS0_6detail10queue_implEEm _ZN2cl4sycl6ONEAPI6detail17reduComputeWGSizeEmmRm +_ZN2cl4sycl6detail10build_implERKNS0_13kernel_bundleILNS0_12bundle_stateE0EEERKSt6vectorINS0_6deviceESaIS8_EERKNS0_13property_listE _ZN2cl4sycl6detail10image_implILi1EE10getDevicesESt10shared_ptrINS1_12context_implEE _ZN2cl4sycl6detail10image_implILi1EE10setPitchesEv _ZN2cl4sycl6detail10image_implILi1EE11allocateMemESt10shared_ptrINS1_12context_implEEbPvRP9_pi_event @@ -3735,6 +3736,7 @@ _ZN2cl4sycl6detail11stream_impl20accessGlobalFlushBufERNS0_7handlerE _ZN2cl4sycl6detail11stream_impl5flushEv _ZN2cl4sycl6detail11stream_implC1EmmRNS0_7handlerE _ZN2cl4sycl6detail11stream_implC2EmmRNS0_7handlerE +_ZN2cl4sycl6detail12compile_implERKNS0_13kernel_bundleILNS0_12bundle_stateE0EEERKSt6vectorINS0_6deviceESaIS8_EERKNS0_13property_listE _ZN2cl4sycl6detail12isOutOfRangeENS0_3vecIiLi4EEENS0_15addressing_modeENS0_5rangeILi3EEE _ZN2cl4sycl6detail12make_contextEmRKSt8functionIFvNS0_14exception_listEEENS0_7backendE _ZN2cl4sycl6detail12sampler_impl18getOrCreateSamplerERKNS0_7contextE @@ -3784,7 +3786,6 @@ _ZN2cl4sycl6detail18stringifyErrorCodeEi _ZN2cl4sycl6detail19convertChannelOrderE23_pi_image_channel_order _ZN2cl4sycl6detail19convertChannelOrderENS0_19image_channel_orderE _ZN2cl4sycl6detail19getImageElementSizeEhNS0_18image_channel_typeE -_ZN2cl4sycl6detail19kernel_bundle_plain37set_specialization_constant_raw_valueEjPKvm _ZN2cl4sycl6detail20associateWithHandlerERNS0_7handlerEPNS1_16AccessorBaseHostENS0_6access6targetE _ZN2cl4sycl6detail20getDeviceFromHandlerERNS0_7handlerE _ZN2cl4sycl6detail22addHostAccessorAndWaitEPNS1_16AccessorImplHostE @@ -3792,6 +3793,8 @@ _ZN2cl4sycl6detail22getImageNumberChannelsENS0_19image_channel_orderE _ZN2cl4sycl6detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EENS0_12bundle_stateE _ZN2cl4sycl6detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EENS0_12bundle_stateERKSt8functionIFbRKSt10shared_ptrINS1_17device_image_implEEEE _ZN2cl4sycl6detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EERKS5_INS0_9kernel_idESaISB_EENS0_12bundle_stateE +_ZN2cl4sycl6detail22has_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EENS0_12bundle_stateE +_ZN2cl4sycl6detail22has_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EERKS5_INS0_9kernel_idESaISB_EENS0_12bundle_stateE _ZN2cl4sycl6detail27getPixelCoordLinearFiltModeENS0_3vecIfLi4EEENS0_15addressing_modeENS0_5rangeILi3EEERS3_ _ZN2cl4sycl6detail28getDeviceFunctionPointerImplERNS0_6deviceERNS0_7programEPKc _ZN2cl4sycl6detail28getPixelCoordNearestFiltModeENS0_3vecIfLi4EEENS0_15addressing_modeENS0_5rangeILi3EEE @@ -3804,6 +3807,8 @@ _ZN2cl4sycl6detail6OSUtil12alignedAllocEmm _ZN2cl4sycl6detail6OSUtil12getOSMemSizeEv _ZN2cl4sycl6detail6OSUtil16getCurrentDSODirB5cxx11Ev _ZN2cl4sycl6detail6OSUtil17getOSModuleHandleEPKv +_ZN2cl4sycl6detail9join_implERKSt6vectorISt10shared_ptrINS1_18kernel_bundle_implEESaIS5_EE +_ZN2cl4sycl6detail9link_implERKSt6vectorINS0_13kernel_bundleILNS0_12bundle_stateE1EEESaIS5_EERKS2_INS0_6deviceESaISA_EERKNS0_13property_listE _ZN2cl4sycl6device11get_devicesENS0_4info11device_typeE _ZN2cl4sycl6deviceC1EP13_cl_device_id _ZN2cl4sycl6deviceC1ERKNS0_15device_selectorE @@ -3849,6 +3854,7 @@ _ZN2cl4sycl7handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmbb _ZN2cl4sycl7handler13getKernelNameB5cxx11Ev _ZN2cl4sycl7handler18extractArgsAndReqsEv _ZN2cl4sycl7handler20associateWithHandlerEPNS0_6detail16AccessorBaseHostENS0_6access6targetE +_ZN2cl4sycl7handler22setHandlerKernelBundleERKSt10shared_ptrINS0_6detail18kernel_bundle_implEE _ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tE _ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tEb _ZN2cl4sycl7handler6memcpyEPvPKvm @@ -3856,6 +3862,8 @@ _ZN2cl4sycl7handler6memsetEPvim _ZN2cl4sycl7handler7barrierERKSt6vectorINS0_5eventESaIS3_EE _ZN2cl4sycl7handler8finalizeEv _ZN2cl4sycl7handler8prefetchEPKvm +_ZN2cl4sycl7handlerC1ESt10shared_ptrINS0_6detail10queue_implEEb +_ZN2cl4sycl7handlerC2ESt10shared_ptrINS0_6detail10queue_implEEb _ZN2cl4sycl7program17build_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_ _ZN2cl4sycl7program19compile_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_ _ZN2cl4sycl7program22build_with_kernel_nameENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_l @@ -3968,10 +3976,8 @@ _ZNK2cl4sycl6detail19kernel_bundle_plain11get_backendEv _ZNK2cl4sycl6detail19kernel_bundle_plain11get_contextEv _ZNK2cl4sycl6detail19kernel_bundle_plain11get_devicesEv _ZNK2cl4sycl6detail19kernel_bundle_plain14get_kernel_idsEv -_ZNK2cl4sycl6detail19kernel_bundle_plain27has_specialization_constantEj _ZNK2cl4sycl6detail19kernel_bundle_plain30native_specialization_constantEv _ZNK2cl4sycl6detail19kernel_bundle_plain33contains_specialization_constantsEv -_ZNK2cl4sycl6detail19kernel_bundle_plain37get_specialization_constant_raw_valueEjPvm _ZNK2cl4sycl6detail19kernel_bundle_plain3endEv _ZNK2cl4sycl6detail19kernel_bundle_plain5beginEv _ZNK2cl4sycl6detail19kernel_bundle_plain5emptyEv @@ -4142,6 +4148,7 @@ _ZNK2cl4sycl7context8get_infoILNS0_4info7contextE4224EEENS3_12param_traitsIS4_XT _ZNK2cl4sycl7context8get_infoILNS0_4info7contextE4225EEENS3_12param_traitsIS4_XT_EE11return_typeEv _ZNK2cl4sycl7context8get_infoILNS0_4info7contextE4228EEENS3_12param_traitsIS4_XT_EE11return_typeEv _ZNK2cl4sycl7context9getNativeEv +_ZNK2cl4sycl7handler30getOrInsertHandlerKernelBundleEb _ZNK2cl4sycl7program10get_kernelENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE _ZNK2cl4sycl7program10get_kernelENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEb _ZNK2cl4sycl7program10has_kernelENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE diff --git a/sycl/test/on-device/basic_tests/kernel_bundle/kernel_bundle_api.cpp b/sycl/test/on-device/basic_tests/kernel_bundle/kernel_bundle_api.cpp index 058e266771ea0..cb7e21c82113a 100644 --- a/sycl/test/on-device/basic_tests/kernel_bundle/kernel_bundle_api.cpp +++ b/sycl/test/on-device/basic_tests/kernel_bundle/kernel_bundle_api.cpp @@ -1,15 +1,17 @@ // RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %t.out +// RUN: env SYCL_PI_TRACE=2 %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER // // -fsycl-device-code-split is not supported for cuda // UNSUPPORTED: cuda #include +#include #include class Kernel1Name; class Kernel2Name; +class Kernel3Name; int main() { sycl::queue Q; @@ -21,8 +23,11 @@ int main() { const sycl::context Ctx = Q.get_context(); const sycl::device Dev = Q.get_device(); - Q.submit([](sycl::handler &CGH) { CGH.single_task([]() {}); }); - Q.submit([](sycl::handler &CGH) { CGH.single_task([]() {}); }); + // The code is needed to just have device images in the executable + if (0) { + Q.submit([](sycl::handler &CGH) { CGH.single_task([]() {}); }); + Q.submit([](sycl::handler &CGH) { CGH.single_task([]() {}); }); + } sycl::kernel_id Kernel1ID = sycl::get_kernel_id(); sycl::kernel_id Kernel2ID = sycl::get_kernel_id(); @@ -78,11 +83,12 @@ int main() { // of kernel_id's and Selector. { - sycl::kernel_bundle KernelBundle2 = + // Test get_kernel_bundle with filters, join and get_kernel_ids API. + sycl::kernel_bundle KernelBundleInput1 = sycl::get_kernel_bundle(Ctx, {Dev}, {Kernel1ID}); - assert(KernelBundle2.has_kernel(Kernel1ID)); - assert(!KernelBundle2.has_kernel(Kernel2ID)); + assert(KernelBundleInput1.has_kernel(Kernel1ID)); + assert(!KernelBundleInput1.has_kernel(Kernel2ID)); auto Selector = [&Kernel2ID]( @@ -90,11 +96,161 @@ int main() { return DevImage.has_kernel(Kernel2ID); }; - sycl::kernel_bundle KernelBundle3 = + sycl::kernel_bundle KernelBundleInput2 = sycl::get_kernel_bundle(Ctx, {Dev}, Selector); - assert(!KernelBundle3.has_kernel(Kernel1ID)); - assert(KernelBundle3.has_kernel(Kernel2ID)); + assert(!KernelBundleInput2.has_kernel(Kernel1ID)); + assert(KernelBundleInput2.has_kernel(Kernel2ID)); + + sycl::kernel_bundle KernelBundleJoint = + sycl::join(std::vector>{ + KernelBundleInput1, KernelBundleInput2}); + + assert(KernelBundleJoint.has_kernel(Kernel1ID)); + assert(KernelBundleJoint.has_kernel(Kernel2ID)); + + std::vector KernelIDs = KernelBundleJoint.get_kernel_ids(); + + assert(KernelIDs.size() == 2); + } + + { + // Test compile, link, build + sycl::kernel_bundle KernelBundleInput1 = + sycl::get_kernel_bundle(Ctx, {Dev}, + {Kernel1ID}); + + sycl::kernel_bundle KernelBundleInput2 = + sycl::get_kernel_bundle(Ctx, {Dev}, + {Kernel2ID}); + + sycl::kernel_bundle KernelBundleObject1 = + sycl::compile(KernelBundleInput1, KernelBundleInput1.get_devices()); + // CHECK:---> piProgramCreate + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: ) ---> pi_result : PI_SUCCESS + // CHECK-NEXT: [out] ** : {{.*}}[ [[PROGRAM_HANDLE1:[0-9a-fA-Fx]]] + // + // CHECK:---> piProgramCompile( + // CHECK-Next: : [[PROGRAM_HANDLE1]] + + + + sycl::kernel_bundle KernelBundleObject2 = + sycl::compile(KernelBundleInput2, KernelBundleInput2.get_devices()); + // CHECK:---> piProgramCreate + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: ) ---> pi_result : PI_SUCCESS + // CHECK-NEXT: [out] ** : {{.*}}[ [[PROGRAM_HANDLE2:[0-9a-fA-Fx]]] + // + // CHECK:---> piProgramCompile( + // CHECK-Next: : [[PROGRAM_HANDLE2]] + + + // TODO: Pass more kernel bundles + sycl::kernel_bundle KernelBundleExecutable = + sycl::link({KernelBundleObject1, KernelBundleObject2}, + KernelBundleObject1.get_devices()); + // CHECK:---> piProgramLink( + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: + // CHECK-NEXT: + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT:---> pi_result : PI_SUCCESS + // CHECK-NEXT: [out] ** : {{.*}} + // PI tracing doesn't allow checking for all input programs so far. + + assert(KernelBundleExecutable.has_kernel(Kernel1ID)); + assert(KernelBundleExecutable.has_kernel(Kernel2ID)); + + sycl::kernel_bundle + KernelBundleExecutable2 = + sycl::build(KernelBundleInput1, KernelBundleInput1.get_devices()); + + // CHECK:---> piProgramCreate + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: ) ---> pi_result : PI_SUCCESS + // CHECK-NEXT: [out] ** : {{.*}}[ [[PROGRAM_HANDLE3:[0-9a-fA-Fx]]] + // + // CHECK:---> piProgramBuild( + // CHECK-NEXT: : [[PROGRAM_HANDLE3]] + // + // CHECK:---> piProgramRetain( + // CHECK-NEXT: : [[PROGRAM_HANDLE3]] + // CHECK-NEXT:---> pi_result : PI_SUCCESS + } + + { + // Test handle::use_kernel_bundle APIs. + sycl::kernel_id Kernel3ID = sycl::get_kernel_id(); + + sycl::kernel_bundle KernelBundleExecutable = + sycl::get_kernel_bundle(Ctx, {Dev}, + {Kernel3ID}); + // 3 SPIRV images - 3 calls to piextDeviceSelectBinary are expected + // CHECK:---> piextDeviceSelectBinary + // CHECK:---> piextDeviceSelectBinary + // CHECK:---> piextDeviceSelectBinary + // CHECK:---> piProgramCreate + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: ) ---> pi_result : PI_SUCCESS + // CHECK-NEXT: [out] ** : {{.*}}[ [[PROGRAM_HANDLE4:[0-9a-fA-Fx]]] + // + // CHECK:---> piProgramBuild( + // CHECK-NEXT: : [[PROGRAM_HANDLE4]] + // + // CHECK:---> piProgramRetain( + // CHECK-NEXT: : [[PROGRAM_HANDLE4]] + // CHECK-NEXT:---> pi_result : PI_SUCCESS + // + // CHECK:---> piKernelCreate( + // CHECK-NEXT: : [[PROGRAM_HANDLE4]] + // CHECK-NEXT:: _ZTS11Kernel3Name + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: ---> pi_result : PI_SUCCESS + // CHECK-NEXT: [out] ** : {{.*}}[ [[KERNEL_HANDLE:[0-9a-fA-Fx]]] + // + // CHECK:---> piKernelRetain( + // CHECK-NEXT: : [[KERNEL_HANDLE]] + // CHECK-NEXT:---> pi_result : PI_SUCCESS + // + // CHECK:---> piEnqueueKernelLaunch( + // CHECK-NEXT: : {{.*}} + // CHECK-NEXT: : [[KERNEL_HANDLE]] + // + // CHECK:---> piKernelRelease( + // CHECK-NEXT: : [[KERNEL_HANDLE]] + // CHECK-NEXT:---> pi_result : PI_SUCCESS + + cl::sycl::buffer Buf(sycl::range<1>{1}); + + Q.submit([&](sycl::handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.use_kernel_bundle(KernelBundleExecutable); + CGH.single_task([=]() { Acc[0] = 42; }); + }); + + { + auto HostAcc = Buf.get_access(); + assert(HostAcc[0] == 42); + } } return 0;