From 8694e6ec1dfa1300641854945c86b15c8d63966e Mon Sep 17 00:00:00 2001
From: Alex Duran
Date: Tue, 10 Jun 2025 10:39:29 +0200
Subject: [PATCH] [OFFLOAD][OPENMP] 6.0 compatible interop interface

This patch introduces a new interop interface implementation with the
following characteristics:
* It supports the new 6.0 prefer_type specification.
* It supports both explicit objects (from interop constructs) and implicit
  objects (from variant calls).
* It implements a per-thread reuse mechanism for implicit objects to reduce
  overheads.
* It provides a plugin interface for selecting the supported interop types
  and managing all backend-related interop operations (init, sync, ...).
* It enables cooperation with the OpenMP runtime to allow progress on OpenMP
  synchronizations.
* It cleans up some vendor/fr_id mismatches in the current query routines.
* It supports an extension to define interop callbacks for library cleanup.
---
 offload/include/OpenMP/InteropAPI.h        | 149 ++++++-
 offload/include/OpenMP/omp.h               |  51 +--
 offload/include/PerThreadTable.h           | 109 +++++
 offload/include/PluginManager.h            |   7 +-
 offload/include/Shared/APITypes.h          |   1 +
 offload/libomptarget/OffloadRTL.cpp        |   6 +
 offload/libomptarget/OpenMP/API.cpp        |  12 +
 offload/libomptarget/OpenMP/InteropAPI.cpp | 371 ++++++++++++------
 offload/libomptarget/PluginManager.cpp     |   6 +
 offload/libomptarget/exports               |   5 +-
 .../common/include/PluginInterface.h       |  55 +++
 openmp/runtime/src/kmp.h                   |   7 +
 openmp/runtime/src/kmp_barrier.cpp         |   8 +
 openmp/runtime/src/kmp_runtime.cpp         |  15 +
 openmp/runtime/src/kmp_tasking.cpp         |  29 ++
 15 files changed, 688 insertions(+), 143 deletions(-)
 create mode 100644 offload/include/PerThreadTable.h

diff --git a/offload/include/OpenMP/InteropAPI.h b/offload/include/OpenMP/InteropAPI.h
index 71c78760a3226..61cbedf06a9a6 100644
--- a/offload/include/OpenMP/InteropAPI.h
+++ b/offload/include/OpenMP/InteropAPI.h
@@ -13,17 +13,70 @@

 #include "omp.h"

+#include "PerThreadTable.h"
 #include "omptarget.h"

 extern "C" {

 typedef enum kmp_interop_type_t {
   kmp_interop_type_unknown = -1,
-  kmp_interop_type_platform,
-  kmp_interop_type_device,
-  kmp_interop_type_tasksync,
+  kmp_interop_type_target,
+  kmp_interop_type_targetsync,
 } kmp_interop_type_t;

+struct interop_attrs_t {
+  bool inorder : 1;
+  int reserved : 31;
+
+  /* Check if the supported attributes are compatible with the current
+     attributes.
Only if an attribute is supported can the value be true, + otherwise it needs to be false + */ + bool checkSupportedOnly(interop_attrs_t supported) const { + return supported.inorder || (!supported.inorder && !inorder); + } +}; + +struct interop_spec_t { + int32_t fr_id; + interop_attrs_t attrs; // Common attributes + int64_t impl_attrs; // Implementation specific attributes (recognized by each + // plugin) +}; + +struct interop_flags_t { + bool implicit : 1; // dispatch (true) or interop (false) + bool nowait : 1; // has nowait flag + int reserved : 30; +}; + +struct interop_ctx_t { + uint16_t version; // version of the interface (current is 0) + interop_flags_t flags; + int gtid; +}; + +struct dep_pack_t { + int32_t ndeps; + kmp_depend_info_t *deplist; + int32_t ndeps_noalias; + kmp_depend_info_t *noalias_deplist; +}; + +struct omp_interop_val_t; + +typedef void ompx_interop_cb_t(omp_interop_val_t *interop, void *data); + +struct omp_interop_cb_instance_t { + ompx_interop_cb_t *cb; + void *data; + + omp_interop_cb_instance_t(ompx_interop_cb_t *cb, void *data) + : cb(cb), data(data) {} + + void operator()(omp_interop_val_t *interop) { cb(interop, data); } +}; + /// The interop value type, aka. the interop object. typedef struct omp_interop_val_t { /// Device and interop-type are determined at construction time and fix. @@ -34,10 +87,96 @@ typedef struct omp_interop_val_t { __tgt_device_info device_info; const kmp_interop_type_t interop_type; const intptr_t device_id; - const omp_foreign_runtime_ids_t vendor_id = cuda; - const intptr_t backend_type_id = omp_interop_backend_type_cuda_1; + omp_vendor_id_t vendor_id = omp_vendor_llvm; + omp_foreign_runtime_id_t fr_id = omp_fr_none; + interop_attrs_t attrs{false, 0}; // Common prefer specification attributes + int64_t impl_attrs = 0; // Implementation prefer specification attributes + + void *RTLProperty = nullptr; // Plugin dependent information + // For implicitly created Interop objects (e.g., from a dispatch construct) + // who owns the object + int OwnerGtid = -1; + // Marks whether the object was requested since the last time it was synced + bool Clean = true; + + typedef llvm::SmallVector callback_list_t; + + callback_list_t CompletionCbs; + + void reset() { + OwnerGtid = -1; + markClean(); + clearCompletionCbs(); + } + + bool hasOwner() const { return OwnerGtid != -1; } + + void setOwner(int gtid) { OwnerGtid = gtid; } + bool isOwnedBy(int gtid) { return OwnerGtid == gtid; } + bool isCompatibleWith(int32_t InteropType, const interop_spec_t &Spec); + bool isCompatibleWith(int32_t InteropType, const interop_spec_t &Spec, + int64_t DeviceNum, int gtid); + void markClean() { Clean = true; } + void markDirty() { Clean = false; } + bool isClean() const { return Clean; } + + int32_t flush(DeviceTy &Device); + int32_t sync_barrier(DeviceTy &Device); + int32_t async_barrier(DeviceTy &Device); + int32_t release(DeviceTy &Device); + + int32_t flush(); + int32_t syncBarrier(); + int32_t asyncBarrier(); + int32_t release(); + + void addCompletionCb(ompx_interop_cb_t *cb, void *data) { + CompletionCbs.push_back(omp_interop_cb_instance_t(cb, data)); + } + + int numCompletionCbs() const { return CompletionCbs.size(); } + void clearCompletionCbs() { CompletionCbs.clear(); } + + void runCompletionCbs() { + for (auto &cbInstance : CompletionCbs) + cbInstance(this); + clearCompletionCbs(); + } } omp_interop_val_t; } // extern "C" +struct InteropTableEntry { + using ContainerTy = typename std::vector; + using iterator = typename ContainerTy::iterator; + + 
ContainerTy Interops; + + const int reservedEntriesPerThread = + 20; // reserve some entries to avoid reallocation + + void add(omp_interop_val_t *obj) { + if (Interops.capacity() == 0) + Interops.reserve(reservedEntriesPerThread); + Interops.push_back(obj); + } + + template void clear(ClearFuncTy f) { + for (auto &Obj : Interops) { + f(Obj); + } + } + + /* vector interface */ + int size() const { return Interops.size(); } + iterator begin() { return Interops.begin(); } + iterator end() { return Interops.end(); } + iterator erase(iterator it) { return Interops.erase(it); } +}; + +struct InteropTblTy + : public PerThreadTable { + void clear(); +}; + #endif // OMPTARGET_OPENMP_INTEROP_API_H diff --git a/offload/include/OpenMP/omp.h b/offload/include/OpenMP/omp.h index b44c6aff1b289..67b3bab9e8599 100644 --- a/offload/include/OpenMP/omp.h +++ b/offload/include/OpenMP/omp.h @@ -80,15 +80,18 @@ typedef enum omp_interop_rc { omp_irc_other = -6 } omp_interop_rc_t; -typedef enum omp_interop_fr { - omp_ifr_cuda = 1, - omp_ifr_cuda_driver = 2, - omp_ifr_opencl = 3, - omp_ifr_sycl = 4, - omp_ifr_hip = 5, - omp_ifr_level_zero = 6, - omp_ifr_last = 7 -} omp_interop_fr_t; +/* Foreign runtime values from OpenMP Additional Definitions document v2.1 */ +typedef enum omp_foreign_runtime_id_t { + omp_fr_none = 0, + omp_fr_cuda = 1, + omp_fr_cuda_driver = 2, + omp_fr_opencl = 3, + omp_fr_sycl = 4, + omp_fr_hip = 5, + omp_fr_level_zero = 6, + omp_fr_hsa = 7, + omp_fr_last = 8 +} omp_foreign_runtime_id_t; typedef void *omp_interop_t; @@ -134,19 +137,23 @@ omp_get_interop_type_desc(const omp_interop_t, omp_interop_property_t); extern const char *__KAI_KMPC_CONVENTION omp_get_interop_rc_desc(const omp_interop_t, omp_interop_rc_t); -typedef enum omp_interop_backend_type_t { - // reserve 0 - omp_interop_backend_type_cuda_1 = 1, -} omp_interop_backend_type_t; - -typedef enum omp_foreign_runtime_ids { - cuda = 1, - cuda_driver = 2, - opencl = 3, - sycl = 4, - hip = 5, - level_zero = 6, -} omp_foreign_runtime_ids_t; +/* Vendor defined values from OpenMP Additional Definitions document v2.1*/ +typedef enum omp_vendor_id { + omp_vendor_unknown = 0, + omp_vendor_amd = 1, + omp_vendor_arm = 2, + omp_vendor_bsc = 3, + omp_vendor_fujitsu = 4, + omp_vendor_gnu = 5, + omp_vendor_hpe = 6, + omp_vendor_ibm = 7, + omp_vendor_intel = 8, + omp_vendor_llvm = 9, + omp_vendor_nec = 10, + omp_vendor_nvidia = 11, + omp_vendor_ti = 12, + omp_vendor_last = 13 +} omp_vendor_id_t; ///} InteropAPI diff --git a/offload/include/PerThreadTable.h b/offload/include/PerThreadTable.h new file mode 100644 index 0000000000000..1e20b56c734d2 --- /dev/null +++ b/offload/include/PerThreadTable.h @@ -0,0 +1,109 @@ +//===-- PerThreadTable.h -- PerThread Storage Structure ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Table indexed with one entry per thread. +// +//===----------------------------------------------------------------------===// + +#ifndef OFFLOAD_PERTHREADTABLE_H +#define OFFLOAD_PERTHREADTABLE_H + +#include +#include +#include + +// Using an STL container (such as std::vector) indexed by thread ID has +// too many race conditions issues so we store each thread entry into a +// thread_local variable. 
+// T is the container type used to store the objects, e.g., std::vector, +// std::set, etc. by each thread. O is the type of the stored objects e.g., +// omp_interop_val_t *, ... + +template struct PerThreadTable { + using iterator = typename ContainerType::iterator; + + struct PerThreadData { + size_t NElements = 0; + std::unique_ptr ThEntry; + }; + + std::mutex Mtx; + std::list ThreadDataList; + + // define default constructors, disable copy and move constructors + PerThreadTable() = default; + PerThreadTable(const PerThreadTable &) = delete; + PerThreadTable(PerThreadTable &&) = delete; + PerThreadTable &operator=(const PerThreadTable &) = delete; + PerThreadTable &operator=(PerThreadTable &&) = delete; + ~PerThreadTable() { + std::lock_guard Lock(Mtx); + ThreadDataList.clear(); + } + +private: + PerThreadData &getThreadData() { + static thread_local PerThreadData ThData; + return ThData; + } + +protected: + ContainerType &getThreadEntry() { + auto &ThData = getThreadData(); + if (ThData.ThEntry) + return *ThData.ThEntry; + ThData.ThEntry = std::make_unique(); + std::lock_guard Lock(Mtx); + ThreadDataList.push_back(&ThData); + return *ThData.ThEntry; + } + + size_t &getThreadNElements() { + auto &ThData = getThreadData(); + return ThData.NElements; + } + +public: + void add(ObjectType obj) { + auto &Entry = getThreadEntry(); + auto &NElements = getThreadNElements(); + NElements++; + Entry.add(obj); + } + + iterator erase(iterator it) { + auto &Entry = getThreadEntry(); + auto &NElements = getThreadNElements(); + NElements--; + return Entry.erase(it); + } + + size_t size() { return getThreadNElements(); } + + // Iterators to traverse objects owned by + // the current thread + iterator begin() { + auto &Entry = getThreadEntry(); + return Entry.begin(); + } + iterator end() { + auto &Entry = getThreadEntry(); + return Entry.end(); + } + + template void clear(F f) { + std::lock_guard Lock(Mtx); + for (auto ThData : ThreadDataList) { + ThData->ThEntry->clear(f); + ThData->NElements = 0; + } + ThreadDataList.clear(); + } +}; + +#endif diff --git a/offload/include/PluginManager.h b/offload/include/PluginManager.h index ec3adadf0819b..ea1f3b6406ce7 100644 --- a/offload/include/PluginManager.h +++ b/offload/include/PluginManager.h @@ -35,6 +35,8 @@ #include #include +#include "OpenMP/InteropAPI.h" + using GenericPluginTy = llvm::omp::target::plugin::GenericPluginTy; /// Struct for the data required to handle plugins @@ -88,6 +90,9 @@ struct PluginManager { HostPtrToTableMapTy HostPtrToTableMap; std::mutex TblMapMtx; ///< For HostPtrToTableMap + /// Table of cached implicit interop objects + InteropTblTy InteropTbl; + // Work around for plugins that call dlopen on shared libraries that call // tgt_register_lib during their initialisation. Stash the pointers in a // vector until the plugins are all initialised and then register them. 
@@ -185,5 +190,5 @@ void initRuntime(); void deinitRuntime(); extern PluginManager *PM; - +extern std::atomic RTLAlive; // Indicates if the RTL has been initialized #endif // OMPTARGET_PLUGIN_MANAGER_H diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h index 978b53d5d69b9..f376c7dc861f9 100644 --- a/offload/include/Shared/APITypes.h +++ b/offload/include/Shared/APITypes.h @@ -36,6 +36,7 @@ struct __tgt_device_image { struct __tgt_device_info { void *Context = nullptr; void *Device = nullptr; + void *Platform = nullptr; }; /// This struct is a record of all the host code that may be offloaded to a diff --git a/offload/libomptarget/OffloadRTL.cpp b/offload/libomptarget/OffloadRTL.cpp index 29b573a27d087..134ab7c95ac0b 100644 --- a/offload/libomptarget/OffloadRTL.cpp +++ b/offload/libomptarget/OffloadRTL.cpp @@ -22,6 +22,7 @@ extern void llvm::omp::target::ompt::connectLibrary(); static std::mutex PluginMtx; static uint32_t RefCount = 0; +std::atomic RTLAlive{false}; void initRuntime() { std::scoped_lock Lock(PluginMtx); @@ -41,6 +42,9 @@ void initRuntime() { PM->init(); PM->registerDelayedLibraries(); + + // RTL initialization is complete + RTLAlive = true; } } @@ -50,6 +54,8 @@ void deinitRuntime() { if (RefCount == 1) { DP("Deinit offload library!\n"); + // RTL deinitialization has started + RTLAlive = false; PM->deinit(); delete PM; PM = nullptr; diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp index 4576f9bd06121..f61f56772504b 100644 --- a/offload/libomptarget/OpenMP/API.cpp +++ b/offload/libomptarget/OpenMP/API.cpp @@ -683,3 +683,15 @@ EXTERN void *omp_get_mapped_ptr(const void *Ptr, int DeviceNum) { return TPR.TargetPointer; } + +void syncImplicitInterops(int gtid, void *event); +// This routine gets called from the Host RTL at sync points (taskwait, barrier, +// ...) so we can synchronize the necessary objects from the offload side. 
+EXTERN void __tgt_target_sync(ident_t *loc_ref, int gtid, void *current_task, + void *event) { + + if (!RTLAlive) + return; + + syncImplicitInterops(gtid, event); +} diff --git a/offload/libomptarget/OpenMP/InteropAPI.cpp b/offload/libomptarget/OpenMP/InteropAPI.cpp index bdbc440c64a2c..55e47d87a865d 100644 --- a/offload/libomptarget/OpenMP/InteropAPI.cpp +++ b/offload/libomptarget/OpenMP/InteropAPI.cpp @@ -10,6 +10,7 @@ #include "OpenMP/InternalTypes.h" #include "OpenMP/omp.h" +#include "OffloadPolicy.h" #include "PluginManager.h" #include "device.h" #include "omptarget.h" @@ -56,22 +57,22 @@ void getTypeMismatch(omp_interop_property_t Property, int *Err) { *Err = getPropertyErrorType(Property); } -const char *getVendorIdToStr(const omp_foreign_runtime_ids_t VendorId) { - switch (VendorId) { - case cuda: - return ("cuda"); - case cuda_driver: - return ("cuda_driver"); - case opencl: - return ("opencl"); - case sycl: - return ("sycl"); - case hip: - return ("hip"); - case level_zero: - return ("level_zero"); - } - return ("unknown"); +static const char *VendorStrTbl[] = { + "unknown", "amd", "arm", "bsc", "fujitsu", "gnu", "hpe", + "ibm", "intel", "llvm", "nec", "nvidia", "ti"}; +const char *getVendorIdToStr(const omp_vendor_id_t VendorId) { + if (VendorId < omp_vendor_unknown || VendorId >= omp_vendor_last) + return ("unknown"); + return VendorStrTbl[VendorId]; +} + +static const char *ForeignRuntimeStrTbl[] = { + "none", "cuda", "cuda_driver", "opencl", + "sycl", "hip", "level_zero", "hsa"}; +const char *getForeignRuntimeIdToStr(const omp_foreign_runtime_id_t FrId) { + if (FrId < omp_fr_none || FrId >= omp_fr_last) + return ("unknown"); + return ForeignRuntimeStrTbl[FrId]; } template @@ -83,7 +84,7 @@ intptr_t getProperty(omp_interop_val_t &InteropVal, omp_interop_property_t Property, int *Err) { switch (Property) { case omp_ipr_fr_id: - return InteropVal.backend_type_id; + return InteropVal.fr_id; case omp_ipr_vendor: return InteropVal.vendor_id; case omp_ipr_device_num: @@ -99,10 +100,8 @@ const char *getProperty(omp_interop_val_t &InteropVal, omp_interop_property_t Property, int *Err) { switch (Property) { - case omp_ipr_fr_id: - return InteropVal.interop_type == kmp_interop_type_tasksync - ? 
"tasksync" - : "device+context"; + case omp_ipr_fr_name: + return getForeignRuntimeIdToStr(InteropVal.fr_id); case omp_ipr_vendor_name: return getVendorIdToStr(InteropVal.vendor_id); default: @@ -120,6 +119,8 @@ void *getProperty(omp_interop_val_t &InteropVal, return InteropVal.device_info.Device; *Err = omp_irc_no_value; return const_cast(InteropVal.err_str); + case omp_ipr_platform: + return InteropVal.device_info.Platform; case omp_ipr_device_context: return InteropVal.device_info.Context; case omp_ipr_targetsync: @@ -145,13 +146,13 @@ bool getPropertyCheck(omp_interop_val_t **InteropPtr, return false; } if (Property == omp_ipr_targetsync && - (*InteropPtr)->interop_type != kmp_interop_type_tasksync) { + (*InteropPtr)->interop_type != kmp_interop_type_targetsync) { if (Err) *Err = omp_irc_other; return false; } if ((Property == omp_ipr_device || Property == omp_ipr_device_context) && - (*InteropPtr)->interop_type == kmp_interop_type_tasksync) { + (*InteropPtr)->interop_type == kmp_interop_type_targetsync) { if (Err) *Err = omp_irc_other; return false; @@ -166,7 +167,7 @@ bool getPropertyCheck(omp_interop_val_t **InteropPtr, omp_interop_property_t property_id, \ int *err) { \ omp_interop_val_t *interop_val = (omp_interop_val_t *)interop; \ - assert((interop_val)->interop_type == kmp_interop_type_tasksync); \ + assert((interop_val)->interop_type == kmp_interop_type_targetsync); \ if (!getPropertyCheck(&interop_val, property_id, err)) { \ return (RETURN_TYPE)(0); \ } \ @@ -193,119 +194,263 @@ __OMP_GET_INTEROP_TY3(const char *, type_desc) __OMP_GET_INTEROP_TY3(const char *, rc_desc) #undef __OMP_GET_INTEROP_TY3 -static const char *copyErrorString(llvm::Error &&Err) { - // TODO: Use the error string while avoiding leaks. - std::string ErrMsg = llvm::toString(std::move(Err)); - char *UsrMsg = reinterpret_cast(malloc(ErrMsg.size() + 1)); - strcpy(UsrMsg, ErrMsg.c_str()); - return UsrMsg; -} - extern "C" { -void __tgt_interop_init(ident_t *LocRef, int32_t Gtid, - omp_interop_val_t *&InteropPtr, - kmp_interop_type_t InteropType, int32_t DeviceId, - int32_t Ndeps, kmp_depend_info_t *DepList, - int32_t HaveNowait) { - int32_t NdepsNoalias = 0; - kmp_depend_info_t *NoaliasDepList = NULL; - assert(InteropType != kmp_interop_type_unknown && - "Cannot initialize with unknown interop_type!"); - if (DeviceId == -1) { - DeviceId = omp_get_default_device(); +omp_interop_val_t *__tgt_interop_get(ident_t *LocRef, int32_t InteropType, + int64_t DeviceNum, int32_t NumPrefers, + interop_spec_t *Prefers, + interop_ctx_t *Ctx, dep_pack_t *Deps) { + + DP("Call to %s with device_num %" PRId64 ", interop type %" PRId32 + ", number of preferred specs %" PRId32 "%s%s\n", + __func__, DeviceNum, InteropType, NumPrefers, + Ctx->flags.implicit ? " (implicit)" : "", + Ctx->flags.nowait ? " (nowait)" : ""); + + if (OffloadPolicy::get(*PM).Kind == OffloadPolicy::DISABLED) + return omp_interop_none; + + // Now, try to create an interop with device_num. + if (DeviceNum == OFFLOAD_DEVICE_DEFAULT) + DeviceNum = omp_get_default_device(); + + auto gtid = Ctx->gtid; + + if (InteropType == kmp_interop_type_targetsync) { + if (Ctx->flags.nowait) + DP("Warning: nowait flag on interop creation not supported yet. 
" + "Ignored\n"); + if (Deps) + __kmpc_omp_wait_deps(LocRef, gtid, Deps->ndeps, Deps->deplist, + Deps->ndeps_noalias, Deps->noalias_deplist); } - if (InteropType == kmp_interop_type_tasksync) { - __kmpc_omp_wait_deps(LocRef, Gtid, Ndeps, DepList, NdepsNoalias, - NoaliasDepList); + auto DeviceOrErr = PM->getDevice(DeviceNum); + if (!DeviceOrErr) { + [[maybe_unused]] std::string ErrStr = toString(DeviceOrErr.takeError()); + DP("Couldn't find device %" PRId64 + " while constructing interop object: %s\n", + DeviceNum, ErrStr.c_str()); + return omp_interop_none; + } + auto &Device = *DeviceOrErr; + omp_interop_val_t *Interop = omp_interop_none; + auto InteropSpec = Device.RTL->select_interop_preference( + DeviceNum, InteropType, NumPrefers, Prefers); + if (InteropSpec.fr_id == omp_fr_none) { + DP("Interop request not supported by device %" PRId64 "\n", DeviceNum); + return omp_interop_none; + } + DP("Selected interop preference is fr_id=%s%s impl_attrs=%" PRId64 "\n", + getForeignRuntimeIdToStr((omp_foreign_runtime_id_t)InteropSpec.fr_id), + InteropSpec.attrs.inorder ? " inorder" : "", InteropSpec.impl_attrs); + + if (Ctx->flags.implicit) { + // This is a request for an RTL managed interop object. + // Get it from the InteropTbl if possible + if (PM->InteropTbl.size() > 0) { + for (auto iop : PM->InteropTbl) { + if (iop->isCompatibleWith(InteropType, InteropSpec, DeviceNum, gtid)) { + Interop = iop; + Interop->markDirty(); + DP("Reused interop " DPxMOD " from device number %" PRId64 + " for gtid %" PRId32 "\n", + DPxPTR(Interop), DeviceNum, gtid); + return Interop; + } + } + } } - InteropPtr = new omp_interop_val_t(DeviceId, InteropType); - - auto DeviceOrErr = PM->getDevice(DeviceId); - if (!DeviceOrErr) { - InteropPtr->err_str = copyErrorString(DeviceOrErr.takeError()); - return; + Interop = Device.RTL->create_interop(DeviceNum, InteropType, &InteropSpec); + DP("Created an interop " DPxMOD " from device number %" PRId64 "\n", + DPxPTR(Interop), DeviceNum); + + if (Ctx->flags.implicit) { + // register the new implicit interop in the RTL + Interop->setOwner(gtid); + Interop->markDirty(); + PM->InteropTbl.add(Interop); + } else { + Interop->setOwner(-1); } - DeviceTy &Device = *DeviceOrErr; - if (!Device.RTL || - Device.RTL->init_device_info(DeviceId, &(InteropPtr)->device_info, - &(InteropPtr)->err_str)) { - delete InteropPtr; - InteropPtr = omp_interop_none; + return Interop; +} + +int __tgt_interop_use(ident_t *LocRef, omp_interop_val_t *Interop, + interop_ctx_t *Ctx, dep_pack_t *Deps) { + bool nowait = Ctx->flags.nowait; + DP("Call to %s with interop " DPxMOD ", nowait %" PRId32 "\n", __func__, + DPxPTR(Interop), nowait); + if (OffloadPolicy::get(*PM).Kind == OffloadPolicy::DISABLED || !Interop) + return OFFLOAD_FAIL; + + if (!Interop) + return OFFLOAD_FAIL; + + if (Interop->interop_type == kmp_interop_type_targetsync) { + if (Ctx->flags.nowait) + DP("Warning: nowait flag on interop use not supported yet. 
" + "Ignored\n"); + if (Deps) + __kmpc_omp_wait_deps(LocRef, Ctx->gtid, Deps->ndeps, Deps->deplist, + Deps->ndeps_noalias, Deps->noalias_deplist); } - if (InteropType == kmp_interop_type_tasksync) { - if (!Device.RTL || - Device.RTL->init_async_info(DeviceId, &(InteropPtr)->async_info)) { - delete InteropPtr; - InteropPtr = omp_interop_none; + + if (Interop->async_info && Interop->async_info->Queue) { + if (nowait) + Interop->asyncBarrier(); + else { + Interop->flush(); + Interop->syncBarrier(); + Interop->markClean(); } } + + return OFFLOAD_SUCCESS; } -void __tgt_interop_use(ident_t *LocRef, int32_t Gtid, - omp_interop_val_t *&InteropPtr, int32_t DeviceId, - int32_t Ndeps, kmp_depend_info_t *DepList, - int32_t HaveNowait) { - int32_t NdepsNoalias = 0; - kmp_depend_info_t *NoaliasDepList = NULL; - assert(InteropPtr && "Cannot use nullptr!"); - omp_interop_val_t *InteropVal = InteropPtr; - if (DeviceId == -1) { - DeviceId = omp_get_default_device(); - } - assert(InteropVal != omp_interop_none && - "Cannot use uninitialized interop_ptr!"); - assert((DeviceId == -1 || InteropVal->device_id == DeviceId) && - "Inconsistent device-id usage!"); +int __tgt_interop_release(ident_t *LocRef, omp_interop_val_t *Interop, + interop_ctx_t *Ctx, dep_pack_t *Deps) { + DP("Call to %s with interop " DPxMOD "\n", __func__, DPxPTR(Interop)); - auto DeviceOrErr = PM->getDevice(DeviceId); - if (!DeviceOrErr) { - InteropPtr->err_str = copyErrorString(DeviceOrErr.takeError()); - return; + if (OffloadPolicy::get(*PM).Kind == OffloadPolicy::DISABLED || !Interop) + return OFFLOAD_FAIL; + + if (!Interop) + return OFFLOAD_FAIL; + + if (Interop->interop_type == kmp_interop_type_targetsync) { + if (Ctx->flags.nowait) + DP("Warning: nowait flag on interop destroy not supported yet. 
" + "Ignored\n"); + if (Deps) { + __kmpc_omp_wait_deps(LocRef, Ctx->gtid, Deps->ndeps, Deps->deplist, + Deps->ndeps_noalias, Deps->noalias_deplist); + } } - if (InteropVal->interop_type == kmp_interop_type_tasksync) { - __kmpc_omp_wait_deps(LocRef, Gtid, Ndeps, DepList, NdepsNoalias, - NoaliasDepList); + return Interop->release(); +} + +} // extern "C" + +bool omp_interop_val_t::isCompatibleWith(int32_t InteropType, + const interop_spec_t &Spec) { + if (interop_type != InteropType) + return false; + if (Spec.fr_id != fr_id) + return false; + if (Spec.attrs.inorder != attrs.inorder) + return false; + if (Spec.impl_attrs != impl_attrs) + return false; + + return true; +} + +bool omp_interop_val_t::isCompatibleWith(int32_t InteropType, + const interop_spec_t &Spec, + int64_t DeviceNum, int GTID) { + if (device_id != DeviceNum) + return false; + + if (GTID != OwnerGtid) + return false; + + return isCompatibleWith(InteropType, Spec); +} + +int32_t omp_interop_val_t::flush(DeviceTy &Device) { + return Device.RTL->flush_queue(this); +} + +int32_t omp_interop_val_t::sync_barrier(DeviceTy &Device) { + if (Device.RTL->sync_barrier(this) != OFFLOAD_SUCCESS) { + FATAL_MESSAGE(device_id, "Interop sync barrier failed for %p object\n", + this); } - // TODO Flush the queue associated with the interop through the plugin + DP("Calling completion callbacks for " DPxMOD "\n", DPxPTR(this)); + runCompletionCbs(); + return OFFLOAD_SUCCESS; +} + +int32_t omp_interop_val_t::async_barrier(DeviceTy &Device) { + return Device.RTL->async_barrier(this); } -void __tgt_interop_destroy(ident_t *LocRef, int32_t Gtid, - omp_interop_val_t *&InteropPtr, int32_t DeviceId, - int32_t Ndeps, kmp_depend_info_t *DepList, - int32_t HaveNowait) { - int32_t NdepsNoalias = 0; - kmp_depend_info_t *NoaliasDepList = NULL; - assert(InteropPtr && "Cannot use nullptr!"); - omp_interop_val_t *InteropVal = InteropPtr; - if (DeviceId == -1) { - DeviceId = omp_get_default_device(); +int32_t omp_interop_val_t::release(DeviceTy &Device) { + if (async_info != nullptr && (!hasOwner() || !isClean())) { + flush(); + syncBarrier(); } + return Device.RTL->release_interop(device_id, this); +} - if (InteropVal == omp_interop_none) - return; +int32_t omp_interop_val_t::flush() { + auto DeviceOrErr = PM->getDevice(device_id); + if (!DeviceOrErr) + FATAL_MESSAGE(device_id, "%s", toString(DeviceOrErr.takeError()).c_str()); + DeviceTy &Device = *DeviceOrErr; + return flush(Device); +} - assert((DeviceId == -1 || InteropVal->device_id == DeviceId) && - "Inconsistent device-id usage!"); - auto DeviceOrErr = PM->getDevice(DeviceId); - if (!DeviceOrErr) { - InteropPtr->err_str = copyErrorString(DeviceOrErr.takeError()); +int32_t omp_interop_val_t::syncBarrier() { + auto DeviceOrErr = PM->getDevice(device_id); + if (!DeviceOrErr) + FATAL_MESSAGE(device_id, "%s", toString(DeviceOrErr.takeError()).c_str()); + DeviceTy &Device = *DeviceOrErr; + return sync_barrier(Device); +} + +int32_t omp_interop_val_t::asyncBarrier() { + auto DeviceOrErr = PM->getDevice(device_id); + if (!DeviceOrErr) + FATAL_MESSAGE(device_id, "%s", toString(DeviceOrErr.takeError()).c_str()); + DeviceTy &Device = *DeviceOrErr; + return async_barrier(Device); +} + +int32_t omp_interop_val_t::release() { + auto DeviceOrErr = PM->getDevice(device_id); + if (!DeviceOrErr) + FATAL_MESSAGE(device_id, "%s", toString(DeviceOrErr.takeError()).c_str()); + DeviceTy &Device = *DeviceOrErr; + return release(Device); +} + +void syncImplicitInterops(int gtid, void *event) { + if (PM->InteropTbl.size() == 0) 
return; - } - if (InteropVal->interop_type == kmp_interop_type_tasksync) { - __kmpc_omp_wait_deps(LocRef, Gtid, Ndeps, DepList, NdepsNoalias, - NoaliasDepList); + DP("target_sync: syncing interops for gtid %" PRId32 ", event " DPxMOD "\n", + gtid, DPxPTR(event)); + + for (auto iop : PM->InteropTbl) { + if (iop->async_info && iop->async_info->Queue && iop->isOwnedBy(gtid) && + !iop->isClean()) { + + iop->flush(); + iop->syncBarrier(); + iop->markClean(); + + // TODO: Alternate implementation option + // Instead of using a synchronous barrier, queue an asynchronous + // barrier and create a proxy task associated to the event to handle + // OpenMP synchronizations. + // When the event is completed, fulfill the proxy task to notify the + // OpenMP runtime. + // event = iop->asyncBarrier(); + // ptask = createProxyTask(); + // Events->add(event,ptask); + } } - // TODO Flush the queue associated with the interop through the plugin - // TODO Signal out dependences - - delete InteropPtr; - InteropPtr = omp_interop_none; + // This would be needed for the alternate implementation + // processEvents(); } -} // extern "C" +void InteropTblTy::clear() { + DP("Clearing Interop Table\n"); + PerThreadTable::clear([](auto &IOP) { IOP->release(); }); +} diff --git a/offload/libomptarget/PluginManager.cpp b/offload/libomptarget/PluginManager.cpp index 93589960a426d..2cc1314e7a4f0 100644 --- a/offload/libomptarget/PluginManager.cpp +++ b/offload/libomptarget/PluginManager.cpp @@ -128,6 +128,12 @@ void PluginManager::initializeAllDevices() { initializeDevice(Plugin, DeviceId); } } + // After all plugins are initialized, register atExit cleanup handlers + std::atexit([]() { + // Interop cleanup should be done before the plugins are deinitialized as + // the backend libraries may be already unloaded. + PM->InteropTbl.clear(); + }); } // Returns a pointer to the binary descriptor, upgrading from a legacy format if diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports index 2406776c1fb5f..b40d9b22a1be9 100644 --- a/offload/libomptarget/exports +++ b/offload/libomptarget/exports @@ -67,9 +67,10 @@ VERS1.0 { omp_get_interop_int; omp_get_interop_name; omp_get_interop_type_desc; - __tgt_interop_init; + __tgt_interop_get; __tgt_interop_use; - __tgt_interop_destroy; + __tgt_interop_release; + __tgt_target_sync; __llvmPushCallConfiguration; __llvmPopCallConfiguration; llvmLaunchKernel; diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index d2437908a0a6f..40a428dbccb06 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -20,6 +20,7 @@ #include #include "ExclusiveAccess.h" +#include "OpenMP/InteropAPI.h" #include "Shared/APITypes.h" #include "Shared/Debug.h" #include "Shared/Environment.h" @@ -937,6 +938,21 @@ struct GenericDeviceTy : public DeviceAllocatorTy { bool useAutoZeroCopy(); virtual bool useAutoZeroCopyImpl() { return false; } + virtual omp_interop_val_t *createInterop(int32_t InteropType, + interop_spec_t &InteropSpec) { + return nullptr; + } + + virtual int32_t releaseInterop(omp_interop_val_t *Interop) { + return OFFLOAD_SUCCESS; + } + + virtual interop_spec_t selectInteropPreference(int32_t InteropType, + int32_t NumPrefers, + interop_spec_t *Prefers) { + return interop_spec_t{omp_fr_none, {false, 0}, 0}; + } + /// Allocate and construct a kernel object. 
virtual Expected constructKernel(const char *Name) = 0; @@ -1342,6 +1358,45 @@ struct GenericPluginTy { int32_t get_function(__tgt_device_binary Binary, const char *Name, void **KernelPtr); + /// Return the interop specification that the plugin supports + /// It might not be one of the user specified ones. + interop_spec_t select_interop_preference(int32_t ID, int32_t InteropType, + int32_t NumPrefers, + interop_spec_t *Prefers) { + auto &Device = getDevice(ID); + return Device.selectInteropPreference(InteropType, NumPrefers, Prefers); + } + + /// Create OpenMP interop with the given interop context + omp_interop_val_t *create_interop(int32_t ID, int32_t InteropContext, + interop_spec_t *InteropSpec) { + auto &Device = getDevice(ID); + return Device.createInterop(InteropContext, *InteropSpec); + } + + /// Release OpenMP interop object + int32_t release_interop(int32_t ID, omp_interop_val_t *Interop) { + auto &Device = getDevice(ID); + return Device.releaseInterop(Interop); + } + + /// Flush the queue associated with the interop object if necessary + virtual int32_t flush_queue(omp_interop_val_t *Interop) { + return OFFLOAD_SUCCESS; + } + + /// Queue a synchronous barrier in the queue associated with the interop + /// object and wait for it to complete. + virtual int32_t sync_barrier(omp_interop_val_t *Interop) { + return OFFLOAD_FAIL; + } + + /// Queue an asynchronous barrier in the queue associated with the interop + /// object and return immediately. + virtual int32_t async_barrier(omp_interop_val_t *Interop) { + return OFFLOAD_FAIL; + } + private: /// Indicates if the platform runtime has been fully initialized. bool Initialized = false; diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index a2cacc8792b15..9c4939b029861 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -4665,6 +4665,13 @@ static inline int __kmp_adjust_gtid_for_hidden_helpers(int gtid) { return adjusted_gtid; } +#if ENABLE_LIBOMPTARGET +// Pointers to callbacks registered by the offload library to be notified of +// task progress. +extern void (*kmp_target_sync_cb)(ident_t *loc_ref, int gtid, + void *current_task, void *event); +#endif // ENABLE_LIBOMPTARGET + // Support for error directive typedef enum kmp_severity_t { severity_warning = 1, diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp index d7ef57c608149..c6908c35fc3d9 100644 --- a/openmp/runtime/src/kmp_barrier.cpp +++ b/openmp/runtime/src/kmp_barrier.cpp @@ -1828,6 +1828,14 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, } #endif +#if ENABLE_LIBOMPTARGET + // Give an opportunity to the offload runtime to make progress and create + // proxy tasks if necessary + if (UNLIKELY(kmp_target_sync_cb != NULL)) + (*kmp_target_sync_cb)( + NULL, gtid, KMP_TASKDATA_TO_TASK(this_thr->th.th_current_task), NULL); +#endif + if (!team->t.t_serialized) { #if USE_ITT_BUILD // This value will be used in itt notify events below. 
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp index 417eceb8ebecc..d99d1a410b5d3 100644 --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -93,6 +93,9 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only = 0); #endif static void __kmp_do_serial_initialize(void); +#if ENABLE_LIBOMPTARGET +static void __kmp_target_init(void); +#endif // ENABLE_LIBOMPTARGET void __kmp_fork_barrier(int gtid, int tid); void __kmp_join_barrier(int gtid); void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, @@ -7173,6 +7176,9 @@ static void __kmp_do_serial_initialize(void) { #if KMP_MIC_SUPPORTED __kmp_check_mic_type(); #endif +#if ENABLE_LIBOMPTARGET + __kmp_target_init(); +#endif /* ENABLE_LIBOMPTARGET */ // Some global variable initialization moved here from kmp_env_initialize() #ifdef KMP_DEBUG @@ -9386,6 +9392,15 @@ void __kmp_set_nesting_mode_threads() { set__max_active_levels(thread, __kmp_nesting_mode_nlevels); } +#if ENABLE_LIBOMPTARGET +void (*kmp_target_sync_cb)(ident_t *loc_ref, int gtid, void *current_task, + void *event) = NULL; +void __kmp_target_init() { + // Look for hooks in the libomptarget library + *(void **)(&kmp_target_sync_cb) = KMP_DLSYM("__tgt_target_sync"); +} +#endif // ENABLE_LIBOMPTARGET + // Empty symbols to export (see exports_so.txt) when feature is disabled extern "C" { #if !KMP_STATS_ENABLED diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp index 3d85a29423540..d45e3d690510e 100644 --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -1378,6 +1378,13 @@ void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr, // thread: thread data structure corresponding to implicit task void __kmp_finish_implicit_task(kmp_info_t *thread) { kmp_taskdata_t *task = thread->th.th_current_task; +#if ENABLE_LIBOMPTARGET + // Give an opportunity to the offload runtime to synchronize any unfinished + // target async regions before finishing the implicit task + if (UNLIKELY(kmp_target_sync_cb != NULL)) + (*kmp_target_sync_cb)(NULL, thread->th.th_info.ds.ds_gtid, + KMP_TASKDATA_TO_TASK(task), NULL); +#endif // ENABLE_LIBOMPTARGET if (task->td_dephash) { int children; task->td_flags.complete = 1; @@ -2249,6 +2256,14 @@ static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, } #endif // OMPT_SUPPORT && OMPT_OPTIONAL +#if ENABLE_LIBOMPTARGET + // Give an opportunity to the offload runtime to make progress and create + // any necessary proxy tasks + if (UNLIKELY(kmp_target_sync_cb)) + (*kmp_target_sync_cb)(loc_ref, gtid, KMP_TASKDATA_TO_TASK(taskdata), + NULL); +#endif // ENABLE_LIBOMPTARGET + // Debugger: The taskwait is active. Store location and thread encountered the // taskwait. 
#if USE_ITT_BUILD @@ -2948,6 +2963,13 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) { } #endif +#if ENABLE_LIBOMPTARGET + // Give an opportunity to the offload runtime to make progress and create + // any necessary proxy tasks + if (UNLIKELY(kmp_target_sync_cb)) + (*kmp_target_sync_cb)(loc, gtid, KMP_TASKDATA_TO_TASK(taskdata), NULL); +#endif // ENABLE_LIBOMPTARGET + if (!taskdata->td_flags.team_serial || (thread->th.th_task_team != NULL && (thread->th.th_task_team->tt.tt_found_proxy_tasks || @@ -3391,6 +3413,13 @@ static inline int __kmp_execute_tasks_template( while (1) { // Outer loop keeps trying to find tasks in case of single thread // getting tasks from target constructs while (1) { // Inner loop to find a task and execute it +#if ENABLE_LIBOMPTARGET + // Give an opportunity to the offload runtime to make progress + if (UNLIKELY(kmp_target_sync_cb)) + (*kmp_target_sync_cb)(NULL, gtid, KMP_TASKDATA_TO_TASK(current_task), + NULL); +#endif // ENABLE_LIBOMPTARGET + task = NULL; if (task_team->tt.tt_num_task_pri) { // get priority task first task = __kmp_get_priority_task(gtid, task_team, is_constrained);
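
For reference, the user-visible behavior served by this interface needs
nothing beyond the standard OpenMP interop API. The following is a minimal
sketch, assuming the active device's plugin supports one of the preferred
foreign runtimes; it exercises the prefer_type selection and the fr_id/vendor
queries that this patch reworks.

// Minimal user-level example (standard OpenMP API only). Which foreign
// runtime is returned depends on the device plugin that handles the request.
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_interop_t obj = omp_interop_none;
  int err = 0;

  // Request a synchronization interop object, preferring CUDA, then HSA.
#pragma omp interop init(prefer_type("cuda", "hsa"), targetsync : obj)

  if (obj != omp_interop_none) {
    omp_intptr_t fr = omp_get_interop_int(obj, omp_ipr_fr_id, &err);
    const char *fr_name = omp_get_interop_str(obj, omp_ipr_fr_name, &err);
    const char *vendor = omp_get_interop_str(obj, omp_ipr_vendor_name, &err);
    void *queue = omp_get_interop_ptr(obj, omp_ipr_targetsync, &err);

    printf("fr_id=%ld (%s), vendor=%s, native queue=%p\n", (long)fr, fr_name,
           vendor, queue);
    // ... hand 'queue' to the foreign runtime (e.g., as a CUDA stream) ...

#pragma omp interop destroy(obj)
  }
  return 0;
}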
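
The new offload/include/PerThreadTable.h keeps one container per thread
behind a thread_local pointer and only takes the global lock when a thread
registers its container or when the whole table is cleared; InteropTblTy
above is its in-tree instantiation. Below is a minimal sketch of another
client, assuming the two template parameters are the per-thread container
type and the stored object type (as the header comment describes); IntEntry,
worker and shutdown are illustrative names only.

// Hypothetical PerThreadTable client. The per-thread container must provide
// add(), clear(F), begin()/end() and erase(), mirroring InteropTableEntry.
#include "PerThreadTable.h"

#include <vector>

struct IntEntry {
  std::vector<int> Items;
  using iterator = std::vector<int>::iterator;

  void add(int V) { Items.push_back(V); }
  template <typename F> void clear(F Func) {
    for (auto &V : Items)
      Func(V);
    Items.clear();
  }
  iterator begin() { return Items.begin(); }
  iterator end() { return Items.end(); }
  iterator erase(iterator It) { return Items.erase(It); }
};

// One table shared by all threads; each thread only ever sees its own entry.
static PerThreadTable<IntEntry, int> Table;

void worker(int Id) {
  Table.add(Id);      // goes into the calling thread's container
  for (int V : Table) // iterates the calling thread's objects only
    (void)V;
}

void shutdown() {
  // Applies the functor to every thread's objects, then empties the table.
  Table.clear([](int &V) { (void)V; });
}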
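
On the plugin side, the PluginInterface.h additions route interop handling
through three per-device virtual hooks (selectInteropPreference,
createInterop, releaseInterop) and three plugin-level queue operations
(flush_queue, sync_barrier, async_barrier). The sketch below shows only the
per-device hooks for a hypothetical device type; it is abbreviated (a real
GenericDeviceTy subclass must implement the rest of the interface), assumes
the existing (device-id, interop-type) constructor of omp_interop_val_t, and
leaves the native handles as placeholders.

// Hypothetical plugin device: interop-related overrides only.
struct MyDeviceTy : public GenericDeviceTy {
  // Accept the first preference naming a runtime this plugin can serve;
  // returning omp_fr_none tells libomptarget the request is unsupported.
  interop_spec_t selectInteropPreference(int32_t InteropType,
                                         int32_t NumPrefers,
                                         interop_spec_t *Prefers) override {
    for (int32_t I = 0; I < NumPrefers; ++I)
      if (Prefers[I].fr_id == omp_fr_hsa)
        return interop_spec_t{omp_fr_hsa, {Prefers[I].attrs.inorder, 0}, 0};
    return interop_spec_t{omp_fr_none, {false, 0}, 0};
  }

  omp_interop_val_t *createInterop(int32_t InteropType,
                                   interop_spec_t &InteropSpec) override {
    auto *Ret = new omp_interop_val_t(
        getDeviceId(), static_cast<kmp_interop_type_t>(InteropType));
    Ret->fr_id = static_cast<omp_foreign_runtime_id_t>(InteropSpec.fr_id);
    Ret->attrs = InteropSpec.attrs;
    Ret->impl_attrs = InteropSpec.impl_attrs;
    Ret->device_info.Device = nullptr;  // placeholder: native device handle
    Ret->device_info.Context = nullptr; // placeholder: native context handle
    // For targetsync interops the plugin would also attach its native queue
    // through Ret->async_info before returning.
    return Ret;
  }

  int32_t releaseInterop(omp_interop_val_t *Interop) override {
    delete Interop;
    return OFFLOAD_SUCCESS;
  }
};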