diff --git a/clang/include/clang/Sema/ScopeInfo.h b/clang/include/clang/Sema/ScopeInfo.h
index 958d65055fa9b..6bf9ae8d074fb 100644
--- a/clang/include/clang/Sema/ScopeInfo.h
+++ b/clang/include/clang/Sema/ScopeInfo.h
@@ -949,6 +949,9 @@ class LambdaScopeInfo final :
   SourceLocation PotentialThisCaptureLocation;
 
+  /// Variables that are potentially ODR-used in CUDA/HIP.
+  llvm::SmallPtrSet<VarDecl *, 4> CUDAPotentialODRUsedVars;
+
   LambdaScopeInfo(DiagnosticsEngine &Diag)
       : CapturingScopeInfo(Diag, ImpCap_None) {
     Kind = SK_Lambda;
diff --git a/clang/include/clang/Sema/SemaCUDA.h b/clang/include/clang/Sema/SemaCUDA.h
index 71f05e88fb539..dbc1432860d89 100644
--- a/clang/include/clang/Sema/SemaCUDA.h
+++ b/clang/include/clang/Sema/SemaCUDA.h
@@ -274,6 +274,10 @@ class SemaCUDA : public SemaBase {
   /// parameters specified via <<<>>>.
   std::string getConfigureFuncName() const;
 
+  /// Record variables that are potentially ODR-used in CUDA/HIP.
+  void recordPotentialODRUsedVariable(MultiExprArg Args,
+                                      OverloadCandidateSet &CandidateSet);
+
 private:
   unsigned ForceHostDeviceDepth = 0;
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 0e5fc5e1a40b4..0a8c24f8be537 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -18,6 +18,7 @@
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/Lookup.h"
+#include "clang/Sema/Overload.h"
 #include "clang/Sema/ScopeInfo.h"
 #include "clang/Sema/Sema.h"
 #include "clang/Sema/Template.h"
@@ -1100,3 +1101,49 @@ std::string SemaCUDA::getConfigureFuncName() const {
   // Legacy CUDA kernel configuration call
   return "cudaConfigureCall";
 }
+
+// Record any local constexpr variables that are passed one way on the host
+// and another on the device.
+void SemaCUDA::recordPotentialODRUsedVariable(
+    MultiExprArg Arguments, OverloadCandidateSet &Candidates) {
+  sema::LambdaScopeInfo *LambdaInfo = SemaRef.getCurLambda();
+  if (!LambdaInfo)
+    return;
+
+  for (unsigned I = 0; I < Arguments.size(); ++I) {
+    auto *DeclRef = dyn_cast<DeclRefExpr>(Arguments[I]);
+    if (!DeclRef)
+      continue;
+    auto *Variable = dyn_cast<VarDecl>(DeclRef->getDecl());
+    if (!Variable || !Variable->isLocalVarDecl() || !Variable->isConstexpr())
+      continue;
+
+    bool HostByValue = false, HostByRef = false;
+    bool DeviceByValue = false, DeviceByRef = false;
+
+    for (OverloadCandidate &Candidate : Candidates) {
+      FunctionDecl *Callee = Candidate.Function;
+      if (!Callee || I >= Callee->getNumParams())
+        continue;
+
+      CUDAFunctionTarget Target = IdentifyTarget(Callee);
+      if (Target == CUDAFunctionTarget::InvalidTarget ||
+          Target == CUDAFunctionTarget::Global)
+        continue;
+
+      bool CoversHost = (Target == CUDAFunctionTarget::Host ||
+                         Target == CUDAFunctionTarget::HostDevice);
+      bool CoversDevice = (Target == CUDAFunctionTarget::Device ||
+                           Target == CUDAFunctionTarget::HostDevice);
+
+      bool IsRef = Callee->getParamDecl(I)->getType()->isReferenceType();
+      HostByValue |= CoversHost && !IsRef;
+      HostByRef |= CoversHost && IsRef;
+      DeviceByValue |= CoversDevice && !IsRef;
+      DeviceByRef |= CoversDevice && IsRef;
+    }
+
+    if ((HostByValue && DeviceByRef) || (HostByRef && DeviceByValue))
+      LambdaInfo->CUDAPotentialODRUsedVars.insert(Variable);
+  }
+}
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 2e6ce17f8bf91..41869995f90d3 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -19517,11 +19517,29 @@ static ExprResult rebuildPotentialResultsAsNonOdrUsed(Sema &S, Expr *E,
     return false;
   };
 
+  // Check whether this expression may be odr-used in CUDA/HIP.
+  auto MaybeCUDAODRUsed = [&]() -> bool {
+    if (!S.LangOpts.CUDA)
+      return false;
+    LambdaScopeInfo *LSI = S.getCurLambda();
+    if (!LSI)
+      return false;
+    auto *DRE = dyn_cast<DeclRefExpr>(E);
+    if (!DRE)
+      return false;
+    auto *VD = dyn_cast<VarDecl>(DRE->getDecl());
+    if (!VD)
+      return false;
+    return LSI->CUDAPotentialODRUsedVars.count(VD);
+  };
+
   // Mark that this expression does not constitute an odr-use.
   auto MarkNotOdrUsed = [&] {
-    S.MaybeODRUseExprs.remove(E);
-    if (LambdaScopeInfo *LSI = S.getCurLambda())
-      LSI->markVariableExprAsNonODRUsed(E);
+    if (!MaybeCUDAODRUsed()) {
+      S.MaybeODRUseExprs.remove(E);
+      if (LambdaScopeInfo *LSI = S.getCurLambda())
+        LSI->markVariableExprAsNonODRUsed(E);
+    }
   };
 
   // C++2a [basic.def.odr]p2:
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 5b224b6c08fef..042de8d8a821a 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -14706,6 +14706,8 @@ ExprResult Sema::BuildOverloadedCallExpr(Scope *S, Expr *Fn,
   // the UnresolvedLookupExpr was type-dependent.
   if (OverloadResult == OR_Success) {
     const FunctionDecl *FDecl = Best->Function;
+    if (LangOpts.CUDA)
+      CUDA().recordPotentialODRUsedVariable(Args, CandidateSet);
     if (FDecl && FDecl->isTemplateInstantiation() &&
         FDecl->getReturnType()->isUndeducedType()) {
diff --git a/clang/test/CodeGenCUDA/lambda-constexpr-capture.cu b/clang/test/CodeGenCUDA/lambda-constexpr-capture.cu
new file mode 100644
index 0000000000000..1a1db63ceb717
--- /dev/null
+++ b/clang/test/CodeGenCUDA/lambda-constexpr-capture.cu
@@ -0,0 +1,135 @@
+// RUN: %clang_cc1 -emit-llvm -x hip %s -o - -triple x86_64-linux-gnu \
+// RUN:   | FileCheck -check-prefixes=CHECK,HOST %s
+// RUN: %clang_cc1 -emit-llvm -x hip %s -o - -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN:   | FileCheck -check-prefixes=CHECK,DEV %s
+
+#include "Inputs/cuda.h"
+
+// CHECK: %class.anon = type { ptr, float, ptr, ptr }
+// CHECK: %class.anon.0 = type { ptr, float, ptr, ptr }
+// CHECK: %class.anon.1 = type { ptr, ptr, ptr }
+// CHECK: %class.anon.2 = type { ptr, float, ptr, ptr }
+
+// HOST: call void @_ZN8DevByVal21__device_stub__kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr noundef byval(%class.anon)
+// DEV: define amdgpu_kernel void @_ZN8DevByVal6kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr addrspace(4) noundef byref(%class.anon)
+
+// Only the device function passes arguments by value.
+namespace DevByVal {
+__device__ float fun(float x, float y) {
+  return x;
+}
+
+float fun(const float &x, const float &y) {
+  return x;
+}
+
+template <class F>
+void __global__ kernel(F f)
+{
+  f(1);
+}
+
+void test(float const * fl, float const * A, float * Vf)
+{
+  float constexpr small(1.0e-25);
+
+  auto lambda = [=] __device__ __host__ (unsigned int n) {
+    float const value = fun(small, fl[0]);
+    Vf[0] = value * A[0];
+  };
+  kernel<<<1, 1>>>(lambda);
+}
+}
+
+// HOST: call void @_ZN9HostByVal21__device_stub__kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr noundef byval(%class.anon.0)
+// DEV: define amdgpu_kernel void @_ZN9HostByVal6kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr addrspace(4) noundef byref(%class.anon.0)
+
+// Only the host function passes arguments by value.
+namespace HostByVal {
+float fun(float x, float y) {
+  return x;
+}
+
+__device__ float fun(const float &x, const float &y) {
+  return x;
+}
+
+template <class F>
+void __global__ kernel(F f)
+{
+  f(1);
+}
+
+void test(float const * fl, float const * A, float * Vf)
+{
+  float constexpr small(1.0e-25);
+
+  auto lambda = [=] __device__ __host__ (unsigned int n) {
+    float const value = fun(small, fl[0]);
+    Vf[0] = value * A[0];
+  };
+  kernel<<<1, 1>>>(lambda);
+}
+}
+
+// HOST: call void @_ZN9BothByVal21__device_stub__kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr noundef byval(%class.anon.1)
+// DEV: define amdgpu_kernel void @_ZN9BothByVal6kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr addrspace(4) noundef byref(%class.anon.1)
+
+// Both the host and device functions pass arguments by value.
+namespace BothByVal {
+float fun(float x, float y) {
+  return x;
+}
+
+__device__ float fun(float x, float y) {
+  return x;
+}
+
+template <class F>
+void __global__ kernel(F f)
+{
+  f(1);
+}
+
+void test(float const * fl, float const * A, float * Vf)
+{
+  float constexpr small(1.0e-25);
+
+  auto lambda = [=] __device__ __host__ (unsigned int n) {
+    float const value = fun(small, fl[0]);
+    Vf[0] = value * A[0];
+  };
+  kernel<<<1, 1>>>(lambda);
+}
+}
+
+// HOST: call void @_ZN12NeitherByVal21__device_stub__kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr noundef byval(%class.anon.2)
+// DEV: define amdgpu_kernel void @_ZN12NeitherByVal6kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr addrspace(4) noundef byref(%class.anon.2)
+
+// Neither the host nor device function passes arguments by value.
+namespace NeitherByVal {
+float fun(const float& x, const float& y) {
+  return x;
+}
+
+__device__ float fun(const float& x, const float& y) {
+  return x;
+}
+
+template <class F>
+void __global__ kernel(F f)
+{
+  f(1);
+}
+
+void test(float const * fl, float const * A, float * Vf)
+{
+  float constexpr small(1.0e-25);
+
+  auto lambda = [=] __device__ __host__ (unsigned int n) {
+    float const value = fun(small, fl[0]);
+    Vf[0] = value * A[0];
+  };
+  kernel<<<1, 1>>>(lambda);
+}
+}