[Offload][CUDA] Allow CUDA kernels to use LLVM/Offload #94549
Conversation
@llvm/pr-subscribers-offload @llvm/pr-subscribers-clang

Author: Johannes Doerfert (jdoerfert)

Changes

Through the new `-foffload-via-llvm` flag, CUDA kernels can now be lowered to the LLVM/Offload API. On the Clang side, this is simply done by using the OpenMP offload toolchain and emitting calls to `llvm*` functions to orchestrate the kernel launch rather than `cuda*` functions. These `llvm*` functions are implemented on top of the existing LLVM/Offload API.

As we are about to redefine the Offload API, this will help us in the design process as a second offload language.

We do not support any CUDA APIs yet, however, we could: https://www.osti.gov/servlets/purl/1892137

For proper host execution we need to resurrect/rebase https://tianshilei.me/wp-content/uploads/2021/12/llpp-2021.pdf (which was designed for debugging).
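For orientation (not part of the patch): a minimal sketch of what a triple-chevron launch roughly lowers to on the host under `-foffload-via-llvm`, inferred from the `llvm` prefix logic and the declarations in `__llvm_offload_host.h` in the diff below. The generated stub also pops the call configuration before launching; that step and the exact argument marshalling are elided assumptions here.

```cpp
#include <stddef.h>

// Local stand-in for the dim3 the patch's header defines.
struct dim3 {
  unsigned x = 1, y = 1, z = 1;
};

extern "C" {
// As declared in __llvm_offload.h / __llvm_offload_host.h (default arguments
// dropped for brevity); implemented by the LLVM/Offload runtime.
unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim,
                                     size_t sharedMem, void *stream);
unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
                          void **args, size_t sharedMem, void *stream);
}

// Roughly what `square<<<1, 1>>>(Ptr)` becomes: push the launch configuration,
// then call the LLVM/Offload launch entry point with the kernel stub address
// and a packed argument array.
void launch_square_sketch(const void *squareStub, int *Ptr) {
  dim3 grid, block; // 1x1x1 in this sketch
  void *args[] = {&Ptr};
  __llvmPushCallConfiguration(grid, block, /*sharedMem=*/0, /*stream=*/nullptr);
  llvmLaunchKernel(squareStub, grid, block, args, /*sharedMem=*/0,
                   /*stream=*/nullptr);
}
```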
Patch is 30.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/94549.diff 23 Files Affected:
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 4061451b2150a..8aff98867202e 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -288,6 +288,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kern
LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP")
LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP")
LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.")
+LANGOPT(OffloadViaLLVM, 1, 0, "target LLVM/Offload as portable offloading runtime.")
LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device")
LANGOPT(SYCLIsHost , 1, 0, "SYCL host compilation")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 57f37c5023110..a09d75917ff98 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1275,6 +1275,12 @@ def no_offload_compress : Flag<["--"], "no-offload-compress">;
def offload_compression_level_EQ : Joined<["--"], "offload-compression-level=">,
Flags<[HelpHidden]>,
HelpText<"Compression level for offload device binaries (HIP only)">;
+
+defm offload_via_llvm : BoolFOption<"offload-via-llvm",
+ LangOpts<"OffloadViaLLVM">, DefaultFalse,
+ PosFlag<SetTrue, [], [ClangOption, CC1Option], "Use">,
+ NegFlag<SetFalse, [], [ClangOption], "Don't use">,
+ BothFlags<[], [ClangOption], " LLVM/Offload as portable offloading runtime.">>;
}
// CUDA options
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 43dfbbb90dd52..acbc7e9e43635 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -19,6 +19,7 @@
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -36,6 +37,11 @@ constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
class CGNVCUDARuntime : public CGCUDARuntime {
+ /// The prefix used for function calls and section names (CUDA, HIP, LLVM)
+ StringRef Prefix;
+ /// TODO: We should transition the OpenMP section to LLVM/Offload
+ StringRef SectionPrefix;
+
private:
llvm::IntegerType *IntTy, *SizeTy;
llvm::Type *VoidTy;
@@ -191,15 +197,11 @@ class CGNVCUDARuntime : public CGCUDARuntime {
} // end anonymous namespace
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
- if (CGM.getLangOpts().HIP)
- return ((Twine("hip") + Twine(FuncName)).str());
- return ((Twine("cuda") + Twine(FuncName)).str());
+ return (Prefix + FuncName).str();
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
- if (CGM.getLangOpts().HIP)
- return ((Twine("__hip") + Twine(FuncName)).str());
- return ((Twine("__cuda") + Twine(FuncName)).str());
+ return ("__" + Prefix + FuncName).str();
}
static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
@@ -227,6 +229,14 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
SizeTy = CGM.SizeTy;
VoidTy = CGM.VoidTy;
PtrTy = CGM.UnqualPtrTy;
+
+ if (CGM.getLangOpts().OffloadViaLLVM) {
+ Prefix = "llvm";
+ SectionPrefix = "omp";
+ } else if (CGM.getLangOpts().HIP)
+ SectionPrefix = Prefix = "hip";
+ else
+ SectionPrefix = Prefix = "cuda";
}
llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
@@ -305,7 +315,8 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
}
if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
CudaFeature::CUDA_USES_NEW_LAUNCH) ||
- (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
+ (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI) ||
+ (CGF.getLangOpts().OffloadViaLLVM))
emitDeviceStubBodyNew(CGF, Args);
else
emitDeviceStubBodyLegacy(CGF, Args);
@@ -1129,8 +1140,9 @@ void CGNVCUDARuntime::transformManagedVars() {
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
void CGNVCUDARuntime::createOffloadingEntries() {
- StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
- : "cuda_offloading_entries";
+ SmallVector<char, 32> Out;
+ StringRef Section = (SectionPrefix + "_offloading_entries").toStringRef(Out);
+
llvm::Module &M = CGM.getModule();
for (KernelInfo &I : EmittedKernels)
llvm::offloading::emitOffloadingEntry(
@@ -1199,7 +1211,9 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
}
return nullptr;
}
- if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
+ if (CGM.getLangOpts().OffloadViaLLVM)
+ createOffloadingEntries();
+ else if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
createOffloadingEntries();
else
return makeModuleCtorFunction();
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index f5ea73a04ae5c..815149a49d018 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -792,11 +792,13 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
}) ||
C.getInputArgs().hasArg(options::OPT_hip_link) ||
C.getInputArgs().hasArg(options::OPT_hipstdpar);
+ bool UseLLVMOffload = C.getInputArgs().hasArg(
+ options::OPT_foffload_via_llvm, options::OPT_fno_offload_via_llvm, false);
if (IsCuda && IsHIP) {
Diag(clang::diag::err_drv_mix_cuda_hip);
return;
}
- if (IsCuda) {
+ if (IsCuda && !UseLLVMOffload) {
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
const llvm::Triple &HostTriple = HostTC->getTriple();
auto OFK = Action::OFK_Cuda;
@@ -818,7 +820,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
CudaInstallation.WarnIfUnsupportedVersion();
}
C.addOffloadDeviceToolChain(CudaTC.get(), OFK);
- } else if (IsHIP) {
+ } else if (IsHIP && !UseLLVMOffload) {
if (auto *OMPTargetArg =
C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) {
Diag(clang::diag::err_drv_unsupported_opt_for_language_mode)
@@ -842,10 +844,11 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
// We need to generate an OpenMP toolchain if the user specified targets with
// the -fopenmp-targets option or used --offload-arch with OpenMP enabled.
bool IsOpenMPOffloading =
- C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
- options::OPT_fno_openmp, false) &&
- (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) ||
- C.getInputArgs().hasArg(options::OPT_offload_arch_EQ));
+ ((IsCuda || IsHIP) && UseLLVMOffload) ||
+ (C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
+ options::OPT_fno_openmp, false) &&
+ (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) ||
+ C.getInputArgs().hasArg(options::OPT_offload_arch_EQ)));
if (IsOpenMPOffloading) {
// We expect that -fopenmp-targets is always used in conjunction with the
// option -fopenmp specifying a valid runtime with offloading support, i.e.
@@ -873,7 +876,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
for (StringRef T : OpenMPTargets->getValues())
OpenMPTriples.insert(T);
} else if (C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) &&
- !IsHIP && !IsCuda) {
+ ((!IsHIP && !IsCuda) || UseLLVMOffload)) {
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
auto AMDTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs());
auto NVPTXTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(),
@@ -4138,6 +4141,8 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
bool UseNewOffloadingDriver =
C.isOffloadingHostKind(Action::OFK_OpenMP) ||
+ Args.hasFlag(options::OPT_foffload_via_llvm,
+ options::OPT_fno_offload_via_llvm, false) ||
Args.hasFlag(options::OPT_offload_new_driver,
options::OPT_no_offload_new_driver, false);
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 4e1c52462e584..07b7e60caa2b4 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1125,6 +1125,18 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
CmdArgs.push_back("__clang_openmp_device_functions.h");
}
+ if (Args.hasArg(options::OPT_foffload_via_llvm)) {
+ CmdArgs.push_back("-include");
+ SmallString<128> P(D.ResourceDir);
+ llvm::sys::path::append(P, "include");
+ llvm::sys::path::append(P, "openmp_wrappers");
+ if (JA.isDeviceOffloading(Action::OFK_OpenMP))
+ llvm::sys::path::append(P, "__llvm_offload_device.h");
+ else
+ llvm::sys::path::append(P, "__llvm_offload_host.h");
+ CmdArgs.push_back(Args.MakeArgString(P));
+ }
+
// Add -i* options, and automatically translate to
// -include-pch/-include-pth for transparent PCH support. It's
// wonky, but we include looking for .gch so we can support seamless
@@ -6672,11 +6684,17 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
Args.addOptOutFlag(CmdArgs, options::OPT_fopenmp_extensions,
options::OPT_fno_openmp_extensions);
}
-
- // Forward the new driver to change offloading code generation.
- if (Args.hasFlag(options::OPT_offload_new_driver,
- options::OPT_no_offload_new_driver, false))
+ // Forward the offload runtime change to code generation, liboffload implies
+ // new driver. Otherwise, check if we should forward the new driver to change
+ // offloading code generation.
+ if (Args.hasFlag(options::OPT_foffload_via_llvm,
+ options::OPT_fno_offload_via_llvm, false)) {
+ CmdArgs.push_back("--offload-new-driver");
+ CmdArgs.push_back("-foffload-via-llvm");
+ } else if (Args.hasFlag(options::OPT_offload_new_driver,
+ options::OPT_no_offload_new_driver, false)) {
CmdArgs.push_back("--offload-new-driver");
+ }
SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType);
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 71e993119436a..74a69f65f7ad5 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1144,8 +1144,13 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
bool ForceStaticHostRuntime, bool IsOffloadingHost,
bool GompNeedsRT) {
if (!Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
- options::OPT_fno_openmp, false))
+ options::OPT_fno_openmp, false)) {
+ // We need libomptarget (liboffload) if it's the choosen offloading runtime.
+ if (Args.hasFlag(options::OPT_foffload_via_llvm,
+ options::OPT_fno_offload_via_llvm, false))
+ CmdArgs.push_back("-lomptarget");
return false;
+ }
Driver::OpenMPRuntimeKind RTKind = TC.getDriver().getOpenMPRuntime(Args);
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index d3090e488306f..215dfe207c228 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -323,6 +323,9 @@ set(openmp_wrapper_files
openmp_wrappers/__clang_openmp_device_functions.h
openmp_wrappers/complex_cmath.h
openmp_wrappers/new
+ openmp_wrappers/__llvm_offload.h
+ openmp_wrappers/__llvm_offload_host.h
+ openmp_wrappers/__llvm_offload_device.h
)
set(llvm_libc_wrapper_files
diff --git a/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
index d5b6846b03488..889d607557ecb 100644
--- a/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
+++ b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
@@ -10,14 +10,11 @@
#ifndef __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
#define __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
-#ifndef _OPENMP
-#error "This file is for OpenMP compilation only."
-#endif
-
#ifdef __cplusplus
extern "C" {
#endif
+#ifdef __NVPTX__
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
@@ -34,6 +31,7 @@ extern "C" {
#undef __CUDA__
#pragma omp end declare variant
+#endif
#ifdef __AMDGCN__
#pragma omp begin declare variant match(device = {arch(amdgcn)})
diff --git a/clang/lib/Headers/openmp_wrappers/__llvm_offload.h b/clang/lib/Headers/openmp_wrappers/__llvm_offload.h
new file mode 100644
index 0000000000000..d78e3b41f99a5
--- /dev/null
+++ b/clang/lib/Headers/openmp_wrappers/__llvm_offload.h
@@ -0,0 +1,31 @@
+/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#include <stdlib.h>
+
+#define __host__ __attribute__((host))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+#define __managed__ __attribute__((managed))
+
+extern "C" {
+
+typedef struct dim3 {
+ dim3() {}
+ dim3(unsigned x) : x(x) {}
+ unsigned x = 0, y = 0, z = 0;
+} dim3;
+
+// TODO: For some reason the CUDA device compilation requires this declaration
+// to be present but it should not.
+unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim,
+ size_t sharedMem = 0, void *stream = 0);
+}
diff --git a/clang/lib/Headers/openmp_wrappers/__llvm_offload_device.h b/clang/lib/Headers/openmp_wrappers/__llvm_offload_device.h
new file mode 100644
index 0000000000000..1a813b331515b
--- /dev/null
+++ b/clang/lib/Headers/openmp_wrappers/__llvm_offload_device.h
@@ -0,0 +1,10 @@
+/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#include "__llvm_offload.h"
diff --git a/clang/lib/Headers/openmp_wrappers/__llvm_offload_host.h b/clang/lib/Headers/openmp_wrappers/__llvm_offload_host.h
new file mode 100644
index 0000000000000..160289d169b55
--- /dev/null
+++ b/clang/lib/Headers/openmp_wrappers/__llvm_offload_host.h
@@ -0,0 +1,15 @@
+/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#include "__llvm_offload.h"
+
+extern "C" {
+unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+ void **args, size_t sharedMem = 0, void *stream = 0);
+}
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 80ea43dc5316e..b507e19556363 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -1059,6 +1059,9 @@ void SemaCUDA::inheritTargetAttrs(FunctionDecl *FD,
}
std::string SemaCUDA::getConfigureFuncName() const {
+ if (getLangOpts().OffloadViaLLVM)
+ return "__llvmPushCallConfiguration";
+
if (getLangOpts().HIP)
return getLangOpts().HIPUseNewLaunchAPI ? "__hipPushCallConfiguration"
: "hipConfigureCall";
diff --git a/clang/test/Driver/cuda-via-liboffload.cu b/clang/test/Driver/cuda-via-liboffload.cu
new file mode 100644
index 0000000000000..68dc963e906b2
--- /dev/null
+++ b/clang/test/Driver/cuda-via-liboffload.cu
@@ -0,0 +1,23 @@
+// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \
+// RUN: --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \
+// RUN: | FileCheck -check-prefix BINDINGS %s
+
+// BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_BC:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_35:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_35]]"], output: "[[CUBIN_SM_35:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_70:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_70:.+]]"], output: "[[CUBIN_SM_70:.+]]"
+// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[CUBIN_SM_35]]", "[[CUBIN_SM_70]]"], output: "[[BINARY:.+]]"
+// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[BINARY]]"], output: "[[HOST_OBJ:.+]]"
+// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out"
+
+// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \
+// RUN: --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \
+// RUN: | FileCheck -check-prefix BINDINGS-DEVICE %s
+
+// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[PTX:.+]]"
+// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX]]"], output: "[[CUBIN:.+]]"
+
+// RUN: %clang -### -target x86_64-linux-gnu -ccc-print-bindings --offload-link -foffload-via-llvm %s 2>&1 | FileCheck -check-prefix DEVICE-LINK %s
+
+// DEVICE-LINK: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[INPUT:.+]]"], output: "a.out"
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index e8fc27785b6c2..2c41a6e2ba43c 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -89,22 +89,27 @@ struct __tgt_async_info {
/// This struct contains all of the arguments to a target kernel region launch.
struct KernelArgsTy {
- uint32_t Version; // Version of this struct for ABI compatibility.
- uint32_t NumArgs; // Number of arguments in each input pointer.
- void **ArgBasePtrs; // Base pointer of each argument (e.g. a struct).
- void **ArgPtrs; // Pointer to the argument data.
- int64_t *ArgSizes; // Size of the argument data in bytes.
- int64_t *ArgTypes; // Type of the data (e.g. to / from).
- void **ArgNames; // Name of the data for debugging, possibly null.
- void **ArgMappers; // User-defined mappers, possibly null.
- uint64_t Tripcount; // Tripcount for the teams / distribute loop, 0 otherwise.
+ uint32_t Version = 0; // Version of this struct for ABI compatibility.
+ uint32_t NumArgs = 0; // Number of arguments in each input pointer.
+ void **ArgBasePtrs =
+ nullptr; // Base pointer of each argument (e.g. a struct).
+ void **ArgPtrs = nullptr; // Pointer to the argument data.
+ int64_t *ArgSizes = nullptr; // Size of the argument data in bytes.
+ int64_t *ArgTypes = nullptr; // Type of the data (e.g. to / from).
+ void **ArgNames = nullptr; // Name of the data for debugging, possibly null.
+ void **ArgMappers = nullptr; // User-defined mappers, possibly null.
+ uint64_t Tripcount =
+ 0; // Tripcount for the teams / distribute loop, 0 otherwise.
struct {
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
- uint64_t Unused : 63;
- } Flags;
- uint32_t NumTeams[3]; // The number of teams (for x,y,z dimension).
- ...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
LGTM in principle.
Will kernels in TUs compiled with `-foffload-via-llvm` be interoperable with code that wants to launch them from another TU compiled w/o `-foffload-via-llvm`?
E.g.:
- a.cu: `__global__ void kernel() { ... }`
- b.cu: `extern __global__ void kernel(); void func() { kernel<<<1,1>>>(); }`
This could use a test in the testsuite to actually check whether it works.
There seems to be some trouble with NVIDIA offload (I tested mainly AMDGPU) and f128; I'll make sure that works too.
I'll look into this. Intuitively, the kernel launch needs `-foffload-via-llvm` (which implies `-foffload-new-driver`) and the kernel definition needs `-foffload-new-driver`. Similarly, with the new driver flag, device code should link fine. Right now, this defaults to gpu-rdc, as OpenMP does, but we can change that. On that note, non-rdc should actually internalize all but the kernels and thereby help the middle end as well.
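A hedged sketch of the two-TU setup being discussed; the compile invocations in the comments are assumptions pieced together from this thread and the driver test in the diff (the architecture is a placeholder), not a verified recipe.

```cuda
// a.cu -- kernel definition, built with the new offloading driver only:
//   clang++ -c a.cu --offload-new-driver --offload-arch=sm_70 -o a.o
__global__ void kernel(int *A) { *A = 42; }

// b.cu -- launch site, built with -foffload-via-llvm (which implies the new
// driver), then linked together:
//   clang++ -c b.cu -foffload-via-llvm --offload-arch=sm_70 -o b.o
//   clang++ a.o b.o -foffload-via-llvm -o main
extern __global__ void kernel(int *A);
void func(int *A) { kernel<<<1, 1>>>(A); }
```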
Should the NFCI changes (like initializing struct fields) be put into a separate PR?
Done.
I don't think they are/reasonably can be, at least not for now. I added a test to show 2 TUs with
This provides the header overlay for cuda_runtime.h which is found before any CUDA installation (none is necessary). Some basic APIs are defined in terms of the omp_target_* ones, but with the API redesign the requirements of CUDA should be taken into account. Based on: llvm#94549
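As a rough illustration of "basic APIs are defined in terms of the omp_target_* ones": a minimal sketch, not the overlay's actual code; the error values and signatures are simplified assumptions.

```cpp
#include <omp.h>
#include <stddef.h>

typedef int cudaError_t;                            // simplified stand-in
static const cudaError_t cudaSuccess = 0;
static const cudaError_t cudaErrorMemoryAllocation = 2;

// Allocate/free "device" memory by delegating to the OpenMP target API.
static inline cudaError_t cudaMalloc(void **devPtr, size_t size) {
  *devPtr = omp_target_alloc(size, omp_get_default_device());
  return *devPtr ? cudaSuccess : cudaErrorMemoryAllocation;
}

static inline cudaError_t cudaFree(void *devPtr) {
  omp_target_free(devPtr, omp_get_default_device());
  return cudaSuccess;
}
```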
Through the new `-foffload-via-llvm` flag, CUDA kernels can now be lowered to the LLVM/Offload API. On the Clang side, this is simply done by using the OpenMP offload toolchain and emitting calls to `llvm*` functions to orchestrate the kernel launch rather than `cuda*` functions. These `llvm*` functions are implemented on top of the existing LLVM/Offload API.

As we are about to redefine the Offload API, this will help us in the design process as a second offload language.

We do not support any CUDA APIs yet, however, we could: https://www.osti.gov/servlets/purl/1892137

For proper host execution we need to resurrect/rebase https://tianshilei.me/wp-content/uploads/2021/12/llpp-2021.pdf (which was designed for debugging).

```
❯❯❯ cat test.cu
extern "C" {
void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
}

__global__ void square(int *A) { *A = 42; }

int main(int argc, char **argv) {
  int DevNo = 0;
  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
  *Ptr = 7;
  printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr);
  square<<<1, 1>>>(Ptr);
  printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr);
  llvm_omp_target_free_shared(Ptr, DevNo);
}

❯❯❯ clang++ test.cu -O3 -o test123 -foffload-via-llvm --offload-arch=native

❯❯❯ llvm-objdump --offloading test123
test123:        file format elf64-x86-64

OFFLOADING IMAGE [0]:
kind            elf
arch            gfx90a
triple          amdgcn-amd-amdhsa
producer        openmp

❯❯❯ LIBOMPTARGET_INFO=16 ./test123
Ptr 0x155448ac8000, *Ptr 7
Ptr 0x155448ac8000, *Ptr 42
```
LLVM Buildbot has detected a new failure on a builder.

Full details are available at: https://lab.llvm.org/buildbot/#/builders/30/builds/3871

Here is the relevant piece of the build log for the reference:
LLVM Buildbot has detected a new failure on a builder.

Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/3628

Here is the relevant piece of the build log for the reference:
LLVM Buildbot has detected a new failure on a builder.

Full details are available at: https://lab.llvm.org/buildbot/#/builders/193/builds/1824

Here is the relevant piece of the build log for the reference:
One buildbot error was unrelated; the others were fixed by making the tests more robust.
Through the new `-foffload-via-llvm` flag, CUDA kernels can now be lowered to the LLVM/Offload API. On the Clang side, this is simply done by using the OpenMP offload toolchain and emitting calls to `llvm*` functions to orchestrate the kernel launch rather than `cuda*` functions. These `llvm*` functions are implemented on top of the existing LLVM/Offload API.

As we are about to redefine the Offload API, this will help us in the design process as a second offload language.

We do not support any CUDA APIs yet, however, we could: https://www.osti.gov/servlets/purl/1892137

For proper host execution we need to resurrect/rebase https://tianshilei.me/wp-content/uploads/2021/12/llpp-2021.pdf (which was designed for debugging).