
[Offload][CUDA] Allow CUDA kernels to use LLVM/Offload #94549


Merged · 1 commit · Aug 13, 2024
1 change: 1 addition & 0 deletions clang/include/clang/Basic/LangOptions.def
@@ -298,6 +298,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kernel launch")
LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP")
LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP")
LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.")
LANGOPT(OffloadViaLLVM, 1, 0, "target LLVM/Offload as portable offloading runtime.")

LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device")
LANGOPT(SYCLIsHost , 1, 0, "SYCL host compilation")
6 changes: 6 additions & 0 deletions clang/include/clang/Driver/Options.td
@@ -1302,6 +1302,12 @@ def no_offload_compress : Flag<["--"], "no-offload-compress">;
def offload_compression_level_EQ : Joined<["--"], "offload-compression-level=">,
Flags<[HelpHidden]>,
HelpText<"Compression level for offload device binaries (HIP only)">;

defm offload_via_llvm : BoolFOption<"offload-via-llvm",
LangOpts<"OffloadViaLLVM">, DefaultFalse,
PosFlag<SetTrue, [], [ClangOption, CC1Option], "Use">,
NegFlag<SetFalse, [], [ClangOption], "Don't use">,
BothFlags<[], [ClangOption], " LLVM/Offload as portable offloading runtime.">>;
}

// CUDA options
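
As a usage sketch (not part of the diff): a minimal CUDA program compiled with the new flag. The invocation below is illustrative and assumes an offload-capable clang build; --offload-arch=native requires a visible GPU.

    // square.cu — exercises the LLVM/Offload launch path added in this patch.
    // Hypothetical invocation:
    //   clang++ square.cu -foffload-via-llvm --offload-arch=native -o square
    #include <cstdio>

    __global__ void square(int *A) { *A = *A * *A; }

    int main() {
      int *A;
      cudaMallocManaged(&A, sizeof(int)); // managed memory keeps the example short
      *A = 7;
      square<<<1, 1>>>(A); // lowered through the device stub in CGCUDANV.cpp
      cudaDeviceSynchronize();
      printf("%d\n", *A); // prints 49
      cudaFree(A);
      return 0;
    }
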
97 changes: 82 additions & 15 deletions clang/lib/CodeGen/CGCUDANV.cpp
@@ -15,10 +15,12 @@
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/CharUnits.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -36,6 +38,11 @@ constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

/// The prefix used for function calls and section names (CUDA, HIP, LLVM)
StringRef Prefix;
/// TODO: We should transition the OpenMP section to LLVM/Offload
StringRef SectionPrefix;

private:
llvm::IntegerType *IntTy, *SizeTy;
llvm::Type *VoidTy;
@@ -132,6 +139,9 @@ class CGNVCUDARuntime : public CGCUDARuntime {
return DummyFunc;
}

Address prepareKernelArgs(CodeGenFunction &CGF, FunctionArgList &Args);
Address prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
FunctionArgList &Args);
void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
std::string getDeviceSideName(const NamedDecl *ND) override;
@@ -191,15 +201,11 @@ class CGNVCUDARuntime : public CGCUDARuntime {
} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
if (CGM.getLangOpts().HIP)
return ((Twine("hip") + Twine(FuncName)).str());
return ((Twine("cuda") + Twine(FuncName)).str());
return (Prefix + FuncName).str();
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
if (CGM.getLangOpts().HIP)
return ((Twine("__hip") + Twine(FuncName)).str());
return ((Twine("__cuda") + Twine(FuncName)).str());
return ("__" + Prefix + FuncName).str();
}

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
@@ -227,6 +233,14 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
SizeTy = CGM.SizeTy;
VoidTy = CGM.VoidTy;
PtrTy = CGM.UnqualPtrTy;

if (CGM.getLangOpts().OffloadViaLLVM) {
Prefix = "llvm";
SectionPrefix = "omp";
} else if (CGM.getLangOpts().HIP)
SectionPrefix = Prefix = "hip";
else
SectionPrefix = Prefix = "cuda";
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
@@ -305,18 +319,58 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
}
if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
CudaFeature::CUDA_USES_NEW_LAUNCH) ||
(CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
(CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI) ||
(CGF.getLangOpts().OffloadViaLLVM))
emitDeviceStubBodyNew(CGF, Args);
else
emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
FunctionArgList &Args) {
// Build the shadow stack entry at the very start of the function.
/// CUDA passes the arguments with a level of indirection. For example, a
/// (void*, short, void*) is passed as {void **, short *, void **} to the launch
/// function. For the LLVM/offload launch we flatten the arguments into the
/// struct directly. In addition, we include the size of the arguments, thus
/// pass {sizeof({void *, short, void *}), ptr to {void *, short, void *},
/// nullptr}. The last nullptr needs to be initialized to an array of pointers
/// pointing to the arguments if we want to offload to the host.
Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
FunctionArgList &Args) {
SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes;
for (auto &Arg : Args)
ArgTypes.push_back(CGF.ConvertTypeForMem(Arg->getType()));
llvm::StructType *KernelArgsTy = llvm::StructType::create(ArgTypes);

auto *Int64Ty = CGF.Builder.getInt64Ty();
KernelLaunchParamsTypes.push_back(Int64Ty);
KernelLaunchParamsTypes.push_back(PtrTy);
KernelLaunchParamsTypes.push_back(PtrTy);

llvm::StructType *KernelLaunchParamsTy =
llvm::StructType::create(KernelLaunchParamsTypes);
Address KernelArgs = CGF.CreateTempAllocaWithoutCast(
KernelArgsTy, CharUnits::fromQuantity(16), "kernel_args");
Address KernelLaunchParams = CGF.CreateTempAllocaWithoutCast(
KernelLaunchParamsTy, CharUnits::fromQuantity(16),
"kernel_launch_params");

auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(KernelArgsTy);
CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSize),
CGF.Builder.CreateStructGEP(KernelLaunchParams, 0));
CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF),
CGF.Builder.CreateStructGEP(KernelLaunchParams, 1));
CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy),
CGF.Builder.CreateStructGEP(KernelLaunchParams, 2));

for (unsigned i = 0; i < Args.size(); ++i) {
auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i]));
CGF.Builder.CreateStore(ArgVal, CGF.Builder.CreateStructGEP(KernelArgs, i));
}

return KernelLaunchParams;
}

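To make the marshalling concrete, here is an editor's sketch (not code from the patch) of the two layouts for a kernel taking (void *, short, void *), matching the comment above:

    #include <cstdint>

    // LLVM/Offload path, built by prepareKernelArgsLLVMOffload:
    struct KernelArgs {    // flattened argument values, 16-byte-aligned alloca
      void *P;
      short S;
      void *Q;
    };
    struct KernelLaunchParams {
      uint64_t Size;  // sizeof(KernelArgs), via DataLayout::getTypeAllocSize
      void *Args;     // pointer to the KernelArgs alloca
      void *Ptrs;     // nullptr for GPU launches; an array of per-argument
                      // pointers is only needed when offloading to the host
    };

    // Legacy CUDA/HIP path, built by prepareKernelArgs below: one level of
    // indirection per argument, i.e. an array {&P, &S, &Q}.
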
Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF,
FunctionArgList &Args) {
// Calculate amount of space we will need for all arguments. If we have no
// args, allocate a single pointer so we still have a valid pointer to the
// argument array that we can pass to runtime, even if it will be unused.
@@ -331,6 +385,17 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
PtrTy, KernelArgs.emitRawPointer(CGF), i));
}
return KernelArgs;
}

// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
FunctionArgList &Args) {
// Build the shadow stack entry at the very start of the function.
Address KernelArgs = CGF.getLangOpts().OffloadViaLLVM
? prepareKernelArgsLLVMOffload(CGF, Args)
: prepareKernelArgs(CGF, Args);

llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

@@ -1129,8 +1194,9 @@ void CGNVCUDARuntime::transformManagedVars() {
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
void CGNVCUDARuntime::createOffloadingEntries() {
StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
: "cuda_offloading_entries";
SmallVector<char, 32> Out;
StringRef Section = (SectionPrefix + "_offloading_entries").toStringRef(Out);

llvm::Module &M = CGM.getModule();
for (KernelInfo &I : EmittedKernels)
llvm::offloading::emitOffloadingEntry(
@@ -1199,7 +1265,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
}
return nullptr;
}
if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
if (CGM.getLangOpts().OffloadViaLLVM ||
(CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
createOffloadingEntries();
else
return makeModuleCtorFunction();
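
For orientation, the prefix selection in the constructor and the new section naming change the emitted names roughly as follows (derived from addPrefixToName, addUnderscoredPrefixToName, and createOffloadingEntries above; an illustration, not patch code):

    // -foffload-via-llvm (Prefix = "llvm", SectionPrefix = "omp"):
    //   addPrefixToName("LaunchKernel")                     -> "llvmLaunchKernel"
    //   addUnderscoredPrefixToName("PushCallConfiguration") -> "__llvmPushCallConfiguration"
    //   offload entries section (SectionPrefix)             -> "omp_offloading_entries"
    // HIP:  "hipLaunchKernel",  "__hipPushCallConfiguration",  "hip_offloading_entries"
    // CUDA: "cudaLaunchKernel", "__cudaPushCallConfiguration", "cuda_offloading_entries"
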
3 changes: 2 additions & 1 deletion clang/lib/CodeGen/CodeGenFunction.cpp
@@ -1013,7 +1013,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
}

if (FD && (getLangOpts().OpenCL ||
(getLangOpts().HIP && getLangOpts().CUDAIsDevice))) {
((getLangOpts().HIP || getLangOpts().OffloadViaLLVM) &&
getLangOpts().CUDAIsDevice))) {
// Add metadata for a kernel function.
EmitKernelMetadata(FD, Fn);
}
19 changes: 12 additions & 7 deletions clang/lib/Driver/Driver.cpp
@@ -786,11 +786,13 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
}) ||
C.getInputArgs().hasArg(options::OPT_hip_link) ||
C.getInputArgs().hasArg(options::OPT_hipstdpar);
bool UseLLVMOffload = C.getInputArgs().hasArg(
options::OPT_foffload_via_llvm, options::OPT_fno_offload_via_llvm, false);
if (IsCuda && IsHIP) {
Diag(clang::diag::err_drv_mix_cuda_hip);
return;
}
if (IsCuda) {
if (IsCuda && !UseLLVMOffload) {
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
const llvm::Triple &HostTriple = HostTC->getTriple();
auto OFK = Action::OFK_Cuda;
@@ -812,7 +814,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
CudaInstallation.WarnIfUnsupportedVersion();
}
C.addOffloadDeviceToolChain(CudaTC.get(), OFK);
} else if (IsHIP) {
} else if (IsHIP && !UseLLVMOffload) {
if (auto *OMPTargetArg =
C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) {
Diag(clang::diag::err_drv_unsupported_opt_for_language_mode)
@@ -836,10 +838,11 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
// We need to generate an OpenMP toolchain if the user specified targets with
// the -fopenmp-targets option or used --offload-arch with OpenMP enabled.
bool IsOpenMPOffloading =
C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
options::OPT_fno_openmp, false) &&
(C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) ||
C.getInputArgs().hasArg(options::OPT_offload_arch_EQ));
((IsCuda || IsHIP) && UseLLVMOffload) ||
(C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
options::OPT_fno_openmp, false) &&
(C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) ||
C.getInputArgs().hasArg(options::OPT_offload_arch_EQ)));
if (IsOpenMPOffloading) {
// We expect that -fopenmp-targets is always used in conjunction with the
// option -fopenmp specifying a valid runtime with offloading support, i.e.
@@ -867,7 +870,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
for (StringRef T : OpenMPTargets->getValues())
OpenMPTriples.insert(T);
} else if (C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) &&
!IsHIP && !IsCuda) {
((!IsHIP && !IsCuda) || UseLLVMOffload)) {
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
auto AMDTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs());
auto NVPTXTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(),
@@ -4152,6 +4155,8 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,

bool UseNewOffloadingDriver =
C.isOffloadingHostKind(Action::OFK_OpenMP) ||
Args.hasFlag(options::OPT_foffload_via_llvm,
options::OPT_fno_offload_via_llvm, false) ||
Args.hasFlag(options::OPT_offload_new_driver,
options::OPT_no_offload_new_driver, false);

31 changes: 27 additions & 4 deletions clang/lib/Driver/ToolChains/Clang.cpp
@@ -1095,6 +1095,18 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
CmdArgs.push_back("__clang_openmp_device_functions.h");
}

if (Args.hasArg(options::OPT_foffload_via_llvm)) {
// Add llvm_offload_wrappers/* to our system include path. This lets us wrap
// standard library headers and other headers.
SmallString<128> P(D.ResourceDir);
llvm::sys::path::append(P, "include", "llvm_offload_wrappers");
CmdArgs.append({"-internal-isystem", Args.MakeArgString(P), "-include"});
if (JA.isDeviceOffloading(Action::OFK_OpenMP))
CmdArgs.push_back("__llvm_offload_device.h");
else
CmdArgs.push_back("__llvm_offload_host.h");
}

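Spelled out, the cc1 flags injected here look roughly like the following (an illustration; the resource directory is installation-specific):

    // -internal-isystem <resource-dir>/include/llvm_offload_wrappers
    // -include __llvm_offload_device.h   // for an OpenMP device-offload job
    // -include __llvm_offload_host.h     // otherwise (host side)
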
// Add -i* options, and automatically translate to
// -include-pch/-include-pth for transparent PCH support. It's
// wonky, but we include looking for .gch so we can support seamless
@@ -6665,6 +6677,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
// device offloading action other than OpenMP.
if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
options::OPT_fno_openmp, false) &&
!Args.hasFlag(options::OPT_foffload_via_llvm,
options::OPT_fno_offload_via_llvm, false) &&
(JA.isDeviceOffloading(Action::OFK_None) ||
JA.isDeviceOffloading(Action::OFK_OpenMP))) {
switch (D.getOpenMPRuntime(Args)) {
@@ -6742,11 +6756,16 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
Args.addOptOutFlag(CmdArgs, options::OPT_fopenmp_extensions,
options::OPT_fno_openmp_extensions);
}

// Forward the new driver to change offloading code generation.
if (Args.hasFlag(options::OPT_offload_new_driver,
options::OPT_no_offload_new_driver, false))
// Forward the offload runtime change to code generation; liboffload implies
// the new driver. Otherwise, check if we should forward the new driver to change
// offloading code generation.
if (Args.hasFlag(options::OPT_foffload_via_llvm,
options::OPT_fno_offload_via_llvm, false)) {
CmdArgs.append({"--offload-new-driver", "-foffload-via-llvm"});
} else if (Args.hasFlag(options::OPT_offload_new_driver,
options::OPT_no_offload_new_driver, false)) {
CmdArgs.push_back("--offload-new-driver");
}

SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType);

@@ -7778,6 +7797,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
// so that only the relevant declarations are emitted.
if (IsOpenMPDevice) {
CmdArgs.push_back("-fopenmp-is-target-device");
// If we are offloading cuda/hip via llvm, it's also "cuda device code".
if (Args.hasArg(options::OPT_foffload_via_llvm))
CmdArgs.push_back("-fcuda-is-device");

if (OpenMPDeviceInput) {
CmdArgs.push_back("-fopenmp-host-ir-file-path");
CmdArgs.push_back(Args.MakeArgString(OpenMPDeviceInput->getFilename()));
7 changes: 6 additions & 1 deletion clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1199,8 +1199,13 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
bool ForceStaticHostRuntime, bool IsOffloadingHost,
bool GompNeedsRT) {
if (!Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
options::OPT_fno_openmp, false))
options::OPT_fno_openmp, false)) {
// We need libomptarget (liboffload) if it's the chosen offloading runtime.
if (Args.hasFlag(options::OPT_foffload_via_llvm,
options::OPT_fno_offload_via_llvm, false))
CmdArgs.push_back("-lomptarget");
return false;
}

Driver::OpenMPRuntimeKind RTKind = TC.getDriver().getOpenMPRuntime(Args);

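
As a sketch of the effect (an assumption, not output from the patch): even without -fopenmp, a host link driven by -foffload-via-llvm gains the runtime library, roughly:

    // clang++ square.cu -foffload-via-llvm
    //   ... ld ... -lomptarget ...   // liboffload entry points for kernel launch
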
27 changes: 16 additions & 11 deletions clang/lib/Driver/ToolChains/Cuda.cpp
@@ -839,17 +839,15 @@ void CudaToolChain::addClangTargetOptions(
DeviceOffloadingKind == Action::OFK_Cuda) &&
"Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs.");

if (DeviceOffloadingKind == Action::OFK_Cuda) {
CC1Args.append(
{"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"});

// Unsized function arguments used for variadics were introduced in CUDA-9.0
// We still do not support generating code that actually uses variadic
// arguments yet, but we do need to allow parsing them as recent CUDA
// headers rely on that. https://github.com/llvm/llvm-project/issues/58410
if (CudaInstallation.version() >= CudaVersion::CUDA_90)
CC1Args.push_back("-fcuda-allow-variadic-functions");
}
CC1Args.append(
{"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"});

// Unsized function arguments used for variadics were introduced in CUDA-9.0
// We still do not support generating code that actually uses variadic
// arguments yet, but we do need to allow parsing them as recent CUDA
// headers rely on that. https://github.com/llvm/llvm-project/issues/58410
if (CudaInstallation.version() >= CudaVersion::CUDA_90)
CC1Args.push_back("-fcuda-allow-variadic-functions");

if (DriverArgs.hasArg(options::OPT_nogpulib))
return;
@@ -867,6 +865,13 @@
CC1Args.push_back("-mlink-builtin-bitcode");
CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));

// For now, we don't use any Offload/OpenMP device runtime when we offload
// CUDA via LLVM/Offload. We should split the Offload/OpenMP device runtime
// and include the "generic" (or CUDA-specific) parts.
if (DriverArgs.hasFlag(options::OPT_foffload_via_llvm,
options::OPT_fno_offload_via_llvm, false))
return;

clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();

if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr,