diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index e5f78dfdc22ab..56c6b70f18704 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -1156,6 +1156,10 @@ class ASTContext : public RefCountedBase { /// host code. llvm::DenseSet CUDAExternalDeviceDeclODRUsedByHost; + /// Keep track of CUDA/HIP implicit host device functions used on device side + /// in device compilation. + llvm::DenseSet CUDAImplicitHostDeviceFunUsedByDevice; + ASTContext(LangOptions &LOpts, SourceManager &SM, IdentifierTable &idents, SelectorTable &sels, Builtin::Context &builtins, TranslationUnitKind TUKind); diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def index cf626d0120cc7..da77aee8de369 100644 --- a/clang/include/clang/Basic/Features.def +++ b/clang/include/clang/Basic/Features.def @@ -283,6 +283,7 @@ FEATURE(cxx_abi_relative_vtable, LangOpts.CPlusPlus && LangOpts.RelativeCXXABIVT // CUDA/HIP Features FEATURE(cuda_noinline_keyword, LangOpts.CUDA) +EXTENSION(cuda_implicit_host_device_templates, LangOpts.CUDA && LangOpts.OffloadImplicitHostDeviceTemplates) #undef EXTENSION #undef FEATURE diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index c0ea4ecb9806a..8f09d714d498c 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -268,6 +268,7 @@ LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA d LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__") LANGOPT(GPUDeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions") LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code") +LANGOPT(OffloadImplicitHostDeviceTemplates, 1, 0, "assume template functions to be implicitly host device by default for CUDA/HIP") LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP") LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kernel launch bounds for HIP") LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index c8b730e0f7ecd..759aee8022387 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1146,6 +1146,14 @@ defm gpu_rdc : BoolFOption<"gpu-rdc", "Generate relocatable device code, also known as separate compilation mode">, NegFlag>; +defm offload_implicit_host_device_templates : + BoolFOption<"offload-implicit-host-device-templates", + LangOpts<"OffloadImplicitHostDeviceTemplates">, DefaultFalse, + PosFlag, + NegFlag>; + def fgpu_default_stream_EQ : Joined<["-"], "fgpu-default-stream=">, HelpText<"Specify default stream. The default value is 'legacy'. (CUDA/HIP only)">, Visibility<[ClangOption, CC1Option]>, diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 91a4211a5cf5c..79b5472a2e747 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -13489,6 +13489,10 @@ class Sema final { /// host or device attribute. void CUDASetLambdaAttrs(CXXMethodDecl *Method); + /// Record \p FD if it is a CUDA/HIP implicit host device function used on + /// device side in device compilation. + void CUDARecordImplicitHostDeviceFuncUsedByDevice(const FunctionDecl *FD); + /// Finds a function in \p Matches with highest calling priority /// from \p Caller context and erases all functions with lower /// calling priority. diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 4203a6218aba6..99ba025ef27d9 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -28,6 +28,7 @@ #include "CoverageMappingGen.h" #include "TargetInfo.h" #include "clang/AST/ASTContext.h" +#include "clang/AST/ASTLambda.h" #include "clang/AST/CharUnits.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/DeclObjC.h" @@ -3565,6 +3566,14 @@ ConstantAddress CodeGenModule::GetWeakRefReference(const ValueDecl *VD) { return ConstantAddress(Aliasee, DeclTy, Alignment); } +template static bool hasImplicitAttr(const ValueDecl *D) { + if (!D) + return false; + if (auto *A = D->getAttr()) + return A->isImplicit(); + return D->isImplicit(); +} + void CodeGenModule::EmitGlobal(GlobalDecl GD) { const auto *Global = cast(GD.getDecl()); @@ -3586,16 +3595,23 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) { return emitCPUDispatchDefinition(GD); // If this is CUDA, be selective about which declarations we emit. + // Non-constexpr non-lambda implicit host device functions are not emitted + // unless they are used on device side. if (LangOpts.CUDA) { if (LangOpts.CUDAIsDevice) { - if (!Global->hasAttr() && + const auto *FD = dyn_cast(Global); + if ((!Global->hasAttr() || + (LangOpts.OffloadImplicitHostDeviceTemplates && FD && + hasImplicitAttr(FD) && + hasImplicitAttr(FD) && !FD->isConstexpr() && + !isLambdaCallOperator(FD) && + !getContext().CUDAImplicitHostDeviceFunUsedByDevice.count(FD))) && !Global->hasAttr() && !Global->hasAttr() && !Global->hasAttr() && !Global->getType()->isCUDADeviceBuiltinSurfaceType() && !Global->getType()->isCUDADeviceBuiltinTextureType() && - !(LangOpts.HIPStdPar && - isa(Global) && + !(LangOpts.HIPStdPar && isa(Global) && !Global->hasAttr())) return; } else { diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 79f7fba225707..6570ed1bf3981 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7395,6 +7395,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.AddLastArg(CmdArgs, options::OPT_foffload_uniform_block, options::OPT_fno_offload_uniform_block); + Args.AddLastArg(CmdArgs, options::OPT_foffload_implicit_host_device_templates, + options::OPT_fno_offload_implicit_host_device_templates); + if (IsCudaDevice || IsHIPDevice) { StringRef InlineThresh = Args.getLastArgValue(options::OPT_fgpu_inline_threshold_EQ); diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp index d993499cf4a6e..318174f7be8fa 100644 --- a/clang/lib/Sema/SemaCUDA.cpp +++ b/clang/lib/Sema/SemaCUDA.cpp @@ -678,6 +678,27 @@ void Sema::checkAllowedCUDAInitializer(VarDecl *VD) { } } +void Sema::CUDARecordImplicitHostDeviceFuncUsedByDevice( + const FunctionDecl *Callee) { + FunctionDecl *Caller = getCurFunctionDecl(/*AllowLambda=*/true); + if (!Caller) + return; + + if (!isCUDAImplicitHostDeviceFunction(Callee)) + return; + + CUDAFunctionTarget CallerTarget = IdentifyCUDATarget(Caller); + + // Record whether an implicit host device function is used on device side. + if (CallerTarget != CFT_Device && CallerTarget != CFT_Global && + (CallerTarget != CFT_HostDevice || + (isCUDAImplicitHostDeviceFunction(Caller) && + !getASTContext().CUDAImplicitHostDeviceFunUsedByDevice.count(Caller)))) + return; + + getASTContext().CUDAImplicitHostDeviceFunUsedByDevice.insert(Callee); +} + // With -fcuda-host-device-constexpr, an unattributed constexpr function is // treated as implicitly __host__ __device__, unless: // * it is a variadic function (device-side variadic functions are not @@ -702,6 +723,18 @@ void Sema::maybeAddCUDAHostDeviceAttrs(FunctionDecl *NewD, return; } + // If a template function has no host/device/global attributes, + // make it implicitly host device function. + if (getLangOpts().OffloadImplicitHostDeviceTemplates && + !NewD->hasAttr() && !NewD->hasAttr() && + !NewD->hasAttr() && + (NewD->getDescribedFunctionTemplate() || + NewD->isFunctionTemplateSpecialization())) { + NewD->addAttr(CUDAHostAttr::CreateImplicit(Context)); + NewD->addAttr(CUDADeviceAttr::CreateImplicit(Context)); + return; + } + if (!getLangOpts().CUDAHostDeviceConstexpr || !NewD->isConstexpr() || NewD->isVariadic() || NewD->hasAttr() || NewD->hasAttr() || NewD->hasAttr()) @@ -950,7 +983,14 @@ void Sema::checkCUDATargetOverload(FunctionDecl *NewFD, // HD/global functions "exist" in some sense on both the host and device, so // should have the same implementation on both sides. if (NewTarget != OldTarget && - ((NewTarget == CFT_HostDevice) || (OldTarget == CFT_HostDevice) || + ((NewTarget == CFT_HostDevice && + !(LangOpts.OffloadImplicitHostDeviceTemplates && + isCUDAImplicitHostDeviceFunction(NewFD) && + OldTarget == CFT_Device)) || + (OldTarget == CFT_HostDevice && + !(LangOpts.OffloadImplicitHostDeviceTemplates && + isCUDAImplicitHostDeviceFunction(OldFD) && + NewTarget == CFT_Device)) || (NewTarget == CFT_Global) || (OldTarget == CFT_Global)) && !IsOverload(NewFD, OldFD, /* UseMemberUsingDeclRules = */ false, /* ConsiderCudaAttrs = */ false)) { diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 2a3cd7a00806d..23683ebf4c788 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -19096,6 +19096,13 @@ void Sema::MarkFunctionReferenced(SourceLocation Loc, FunctionDecl *Func, if (FPT && isUnresolvedExceptionSpec(FPT->getExceptionSpecType())) ResolveExceptionSpec(Loc, FPT); + // A callee could be called by a host function then by a device function. + // If we only try recording once, we will miss recording the use on device + // side. Therefore keep trying until it is recorded. + if (LangOpts.OffloadImplicitHostDeviceTemplates && LangOpts.CUDAIsDevice && + !getASTContext().CUDAImplicitHostDeviceFunUsedByDevice.count(Func)) + CUDARecordImplicitHostDeviceFuncUsedByDevice(Func); + // If this is the first "real" use, act on that. if (OdrUse == OdrUseContext::Used && !Func->isUsed(/*CheckUsedAttr=*/false)) { // Keep track of used but undefined functions. diff --git a/clang/test/CodeGenCUDA/implicit-host-device-fun.cu b/clang/test/CodeGenCUDA/implicit-host-device-fun.cu new file mode 100644 index 0000000000000..19c13b38b5096 --- /dev/null +++ b/clang/test/CodeGenCUDA/implicit-host-device-fun.cu @@ -0,0 +1,118 @@ +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu \ +// RUN: -foffload-implicit-host-device-templates \ +// RUN: -emit-llvm -o - -x hip %s 2>&1 | \ +// RUN: FileCheck -check-prefixes=COMM,HOST %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: -target-cpu gfx1100 \ +// RUN: -foffload-implicit-host-device-templates \ +// RUN: -emit-llvm -o - -x hip %s 2>&1 | \ +// RUN: FileCheck -check-prefixes=COMM,DEV %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: -target-cpu gfx1100 \ +// RUN: -foffload-implicit-host-device-templates \ +// RUN: -emit-llvm -o - -x hip %s 2>&1 | \ +// RUN: FileCheck -check-prefixes=DEV-NEG %s + +#include "Inputs/cuda.h" + +// Implicit host device template not overloaded by device template. +// Used by both device and host function. +// Emitted on both host and device. + +// COMM-LABEL: define {{.*}}@_Z20template_no_overloadIiET_S0_( +// COMM: ret i32 1 +template +T template_no_overload(T x) { + return 1; +} + +// Implicit host device template overloaded by device template. +// Used by both device and host function. +// Implicit host device template emitted on host. +// Device template emitted on device. + +// COMM-LABEL: define {{.*}}@_Z22template_with_overloadIiET_S0_( +// HOST: ret i32 2 +// DEV: ret i32 3 +template +T template_with_overload(T x) { + return 2; +} + +template +__device__ T template_with_overload(T x) { + return 3; +} + +// Implicit host device template used by host function only. +// Emitted on host only. +// HOST-LABEL: define {{.*}}@_Z21template_used_by_hostIiET_S0_( +// DEV-NEG-NOT: define {{.*}}@_Z21template_used_by_hostIiET_S0_( +// HOST: ret i32 10 +template +T template_used_by_host(T x) { + return 10; +} + +// Implicit host device template indirectly used by host function only. +// Emitted on host only. +// HOST-LABEL: define {{.*}}@_Z32template_indirectly_used_by_hostIiET_S0_( +// DEV-NEG-NOT: define {{.*}}@_Z32template_indirectly_used_by_hostIiET_S0_( +// HOST: ret i32 11 +template +T template_indirectly_used_by_host(T x) { + return 11; +} + +template +T template_in_middle_by_host(T x) { + template_indirectly_used_by_host(x); + return 12; +} + +// Implicit host device template indirectly used by device function only. +// Emitted on device. +// DEVICE-LABEL: define {{.*}}@_Z34template_indirectly_used_by_deviceIiET_S0_( +// DEVICE: ret i32 21 +template +T template_indirectly_used_by_device(T x) { + return 21; +} + +template +T template_in_middle_by_device(T x) { + template_indirectly_used_by_device(x); + return 22; +} + +// Implicit host device template indirectly used by host device function only. +// Emitted on host and device. +// COMMON-LABEL: define {{.*}}@_Z39template_indirectly_used_by_host_deviceIiET_S0_( +// COMMON: ret i32 31 +template +T template_indirectly_used_by_host_device(T x) { + return 31; +} + +template +T template_in_middle_by_host_device(T x) { + template_indirectly_used_by_host_device(x); + return 32; +} + +void host_fun() { + template_no_overload(0); + template_with_overload(0); + template_used_by_host(0); + template_in_middle_by_host(0); +} + +__device__ void device_fun() { + template_no_overload(0); + template_with_overload(0); + template_in_middle_by_device(0); +} + +__host__ __device__ void host_device_fun() { + template_in_middle_by_host_device(0); +} diff --git a/clang/test/Lexer/has_extension.cu b/clang/test/Lexer/has_extension.cu new file mode 100644 index 0000000000000..fd5083e84b887 --- /dev/null +++ b/clang/test/Lexer/has_extension.cu @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -E -triple x86_64-linux-gnu %s -o - \ +// RUN: | FileCheck -check-prefix=NOHDT %s +// RUN: %clang_cc1 -E -triple x86_64-linux-gnu %s -o - \ +// RUN: -foffload-implicit-host-device-templates \ +// RUN: | FileCheck -check-prefix=HDT %s + +// NOHDT: no_implicit_host_device_templates +// HDT: has_implicit_host_device_templates +#if __has_extension(cuda_implicit_host_device_templates) +int has_implicit_host_device_templates(); +#else +int no_implicit_host_device_templates(); +#endif diff --git a/clang/test/SemaCUDA/implicit-host-device-fun.cu b/clang/test/SemaCUDA/implicit-host-device-fun.cu new file mode 100644 index 0000000000000..f73a48f2f11ea --- /dev/null +++ b/clang/test/SemaCUDA/implicit-host-device-fun.cu @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -isystem %S/Inputs -fsyntax-only %s +// RUN: %clang_cc1 -isystem %S/Inputs -fcuda-is-device -fsyntax-only %s +// RUN: %clang_cc1 -isystem %S/Inputs -foffload-implicit-host-device-templates -fsyntax-only %s +// RUN: %clang_cc1 -isystem %S/Inputs -foffload-implicit-host-device-templates -fcuda-is-device -fsyntax-only %s + +#include + +template +void tempf(T x) { +} + +template +__device__ void tempf(T x) { +} + +void host_fun() { + tempf(1); +} + +__device__ void device_fun() { + tempf(1); +}