-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[OpenMP] Team reduction work specialization #70766
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[OpenMP] Team reduction work specialization #70766
Conversation
@llvm/pr-subscribers-flang-openmp @llvm/pr-subscribers-backend-amdgpu Author: Johannes Doerfert (jdoerfert) ChangesLast commit, the others are part of existing PRs. Patch is 4.73 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70766.diff 186 Files Affected:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 9d00ebae702802a..de028b0209c171a 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -803,8 +803,30 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
if (!IsSPMD)
emitGenericVarsEpilog(CGF);
+ // This is temporary until we remove the fixed sized buffer.
+ ASTContext &C = CGM.getContext();
+ RecordDecl *StaticRD = C.buildImplicitRecord(
+ "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
+ StaticRD->startDefinition();
+ for (const RecordDecl *TeamReductionRec : TeamsReductions) {
+ QualType RecTy = C.getRecordType(TeamReductionRec);
+ auto *Field = FieldDecl::Create(
+ C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
+ C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
+ /*BW=*/nullptr, /*Mutable=*/false,
+ /*InitStyle=*/ICIS_NoInit);
+ Field->setAccess(AS_public);
+ StaticRD->addDecl(Field);
+ }
+ StaticRD->completeDefinition();
+ QualType StaticTy = C.getRecordType(StaticRD);
+ llvm::Type *LLVMReductionsBufferTy =
+ CGM.getTypes().ConvertTypeForMem(StaticTy);
+ const auto &DL = CGM.getModule().getDataLayout();
+ uint64_t BufferSize =
+ DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
CGBuilderTy &Bld = CGF.Builder;
- OMPBuilder.createTargetDeinit(Bld);
+ OMPBuilder.createTargetDeinit(Bld, BufferSize);
}
void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
@@ -2998,15 +3020,10 @@ void CGOpenMPRuntimeGPU::emitReduction(
CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
C.getLangOpts().OpenMPCUDAReductionBufNum);
TeamsReductions.push_back(TeamReductionRec);
- if (!KernelTeamsReductionPtr) {
- KernelTeamsReductionPtr = new llvm::GlobalVariable(
- CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
- llvm::GlobalValue::InternalLinkage, nullptr,
- "_openmp_teams_reductions_buffer_$_$ptr");
- }
- llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
- Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
- /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
+ auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
+ {}, "_openmp_teams_reductions_buffer_$_$ptr");
llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
@@ -3021,7 +3038,7 @@ void CGOpenMPRuntimeGPU::emitReduction(
llvm::Value *Args[] = {
RTLoc,
ThreadId,
- GlobalBufferPtr,
+ KernelTeamsReductionPtr,
CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
RL,
ShuffleAndReduceFn,
@@ -3654,42 +3671,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
CGOpenMPRuntime::processRequiresDirective(D);
}
-void CGOpenMPRuntimeGPU::clear() {
-
- if (!TeamsReductions.empty()) {
- ASTContext &C = CGM.getContext();
- RecordDecl *StaticRD = C.buildImplicitRecord(
- "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
- StaticRD->startDefinition();
- for (const RecordDecl *TeamReductionRec : TeamsReductions) {
- QualType RecTy = C.getRecordType(TeamReductionRec);
- auto *Field = FieldDecl::Create(
- C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
- C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
- /*BW=*/nullptr, /*Mutable=*/false,
- /*InitStyle=*/ICIS_NoInit);
- Field->setAccess(AS_public);
- StaticRD->addDecl(Field);
- }
- StaticRD->completeDefinition();
- QualType StaticTy = C.getRecordType(StaticRD);
- llvm::Type *LLVMReductionsBufferTy =
- CGM.getTypes().ConvertTypeForMem(StaticTy);
- // FIXME: nvlink does not handle weak linkage correctly (object with the
- // different size are reported as erroneous).
- // Restore CommonLinkage as soon as nvlink is fixed.
- auto *GV = new llvm::GlobalVariable(
- CGM.getModule(), LLVMReductionsBufferTy,
- /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
- llvm::Constant::getNullValue(LLVMReductionsBufferTy),
- "_openmp_teams_reductions_buffer_$_");
- KernelTeamsReductionPtr->setInitializer(
- llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
- CGM.VoidPtrTy));
- }
- CGOpenMPRuntime::clear();
-}
-
llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
llvm::Module *M = &CGF.CGM.getModule();
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
index 46e1361f2f895ba..141436f26230dde 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -130,7 +130,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
public:
explicit CGOpenMPRuntimeGPU(CodeGenModule &CGM);
- void clear() override;
bool isGPU() const override { return true; };
@@ -386,7 +385,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
/// Maps the function to the list of the globalized variables with their
/// addresses.
llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls;
- llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr;
/// List of the records with the list of fields for the reductions across the
/// teams. Used to build the intermediate buffer for the fast teams
/// reductions.
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 75f9e152dca9297..145f4dc4670081d 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -4249,12 +4249,15 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
getCurCapturedRegion()->TheCapturedDecl->addAttr(
AlwaysInlineAttr::CreateImplicit(
Context, {}, AlwaysInlineAttr::Keyword_forceinline));
- Sema::CapturedParamNameType ParamsTarget[] = {
- std::make_pair(StringRef(), QualType()) // __context with shared vars
- };
+ SmallVector<Sema::CapturedParamNameType, 2> ParamsTarget;
+ if (getLangOpts().OpenMPIsTargetDevice)
+ ParamsTarget.push_back(std::make_pair(StringRef("dyn_ptr"), VoidPtrTy));
+ ParamsTarget.push_back(
+ std::make_pair(StringRef(), QualType())); // __context with shared vars;
// Start a captured region for 'target' with no implicit parameters.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
- ParamsTarget, /*OpenMPCaptureLevel=*/1);
+ ParamsTarget,
+ /*OpenMPCaptureLevel=*/1);
Sema::CapturedParamNameType ParamsTeamsOrParallel[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
@@ -4293,8 +4296,13 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
getCurCapturedRegion()->TheCapturedDecl->addAttr(
AlwaysInlineAttr::CreateImplicit(
Context, {}, AlwaysInlineAttr::Keyword_forceinline));
+ SmallVector<Sema::CapturedParamNameType, 2> ParamsTarget;
+ if (getLangOpts().OpenMPIsTargetDevice)
+ ParamsTarget.push_back(std::make_pair(StringRef("dyn_ptr"), VoidPtrTy));
+ ParamsTarget.push_back(
+ std::make_pair(StringRef(), QualType())); // __context with shared vars;
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
- std::make_pair(StringRef(), QualType()),
+ ParamsTarget,
/*OpenMPCaptureLevel=*/1);
break;
}
@@ -4499,9 +4507,11 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
getCurCapturedRegion()->TheCapturedDecl->addAttr(
AlwaysInlineAttr::CreateImplicit(
Context, {}, AlwaysInlineAttr::Keyword_forceinline));
- Sema::CapturedParamNameType ParamsTarget[] = {
- std::make_pair(StringRef(), QualType()) // __context with shared vars
- };
+ SmallVector<Sema::CapturedParamNameType, 2> ParamsTarget;
+ if (getLangOpts().OpenMPIsTargetDevice)
+ ParamsTarget.push_back(std::make_pair(StringRef("dyn_ptr"), VoidPtrTy));
+ ParamsTarget.push_back(
+ std::make_pair(StringRef(), QualType())); // __context with shared vars;
// Start a captured region for 'target' with no implicit parameters.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsTarget, /*OpenMPCaptureLevel=*/1);
diff --git a/clang/test/OpenMP/amdgcn_target_codegen.cpp b/clang/test/OpenMP/amdgcn_target_codegen.cpp
index 90d2ebdf26bd645..3ea2d107f072adb 100644
--- a/clang/test/OpenMP/amdgcn_target_codegen.cpp
+++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp
@@ -29,15 +29,18 @@ int test_amdgcn_target_tid_threads_simd() {
#endif
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z30test_amdgcn_target_tid_threadsv_l14
-// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4000) [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[ARR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
// CHECK-NEXT: [[ARR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ARR_ADDR]] to ptr
// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[ARR]], ptr [[ARR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARR_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z30test_amdgcn_target_tid_threadsv_l14_kernel_environment to ptr))
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z30test_amdgcn_target_tid_threadsv_l14_kernel_environment to ptr), ptr [[DYN_PTR]])
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
@@ -66,19 +69,22 @@ int test_amdgcn_target_tid_threads_simd() {
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z35test_amdgcn_target_tid_threads_simdv_l23
-// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4000) [[ARR:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[ARR:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[ARR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
// CHECK-NEXT: [[ARR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ARR_ADDR]] to ptr
// CHECK-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[ARR]], ptr [[ARR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARR_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z35test_amdgcn_target_tid_threads_simdv_l23_kernel_environment to ptr))
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z35test_amdgcn_target_tid_threads_simdv_l23_kernel_environment to ptr), ptr [[DYN_PTR]])
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
diff --git a/clang/test/OpenMP/amdgcn_target_device_vla.cpp b/clang/test/OpenMP/amdgcn_target_device_vla.cpp
index b2b630b546713dd..de150a0fcb4afd2 100644
--- a/clang/test/OpenMP/amdgcn_target_device_vla.cpp
+++ b/clang/test/OpenMP/amdgcn_target_device_vla.cpp
@@ -97,21 +97,24 @@ int main() {
#endif
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1v_l12
-// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[N:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8, addrspace(5)
// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[I1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
// CHECK-NEXT: [[N_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N]] to ptr
// CHECK-NEXT: [[__VLA_EXPR0_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__VLA_EXPR0]] to ptr
// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
// CHECK-NEXT: [[I1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I1]] to ptr
+// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1v_l12_kernel_environment to ptr))
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1v_l12_kernel_environment to ptr), ptr [[DYN_PTR]])
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
@@ -174,26 +177,29 @@ int main() {
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l30
-// CHECK-SAME: (i64 noundef [[M:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT:%.*]]) #[[ATTR0]] {
+// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[M:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[M_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
// CHECK-NEXT: [[RESULT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[M_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
// CHECK-NEXT: [[M_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M_ADDR]] to ptr
// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr
// CHECK-NEXT: [[RESULT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT_ADDR]] to ptr
// CHECK-NEXT: [[M_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M_CASTED]] to ptr
// CHECK-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
// CHECK-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
+// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[M]], ptr [[M_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[RESULT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l30_kernel_environment to ptr))
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l30_kernel_environment to ptr), ptr [[DYN_PTR]])
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
@@ -540,26 +546,29 @@ int main() {
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l52
-// CHECK-SAME: (i64 noundef [[M:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT:%.*]]) #[[ATTR0]] {
+// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[M:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[M_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
// CHECK-NEXT: [[RESULT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[M_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
// CHECK-NEXT: [[M_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M_ADDR]] to ptr
// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr
// CHECK-NEXT: [[RESULT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT_ADDR]] to ptr
// CHECK-NEXT: [[M_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M_CASTED]] to ptr
// CHECK-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
// CHECK-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
+// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[M]], ptr [[M_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load...
[truncated]
|
9c557e1
to
e28dfce
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG
We default to < 1024 teams if the user did not specify otherwise. As such we can avoid the extra logic in the teams reduction that handles more than num_of_records (default 1024) teams. This is a stopgap but still shaves of 33% of the runtime in some simple reduction examples.
e28dfce
to
33b1a34
Compare
patch seems to break these 4 sollve tests ./sollve_vv/tests/4.5/target_teams_distribute/test_target_teams_distribute_reduction_and.c fail in latest build of llvm, passes if patch reverted locally i nbuild |
…case (llvm#70766)" fails 4 sollve tests: test_target_teams_distribute_reduction_and.c test_target_teams_distribute_reduction_multiply.c test_target_teams_distribute_reduction_and.c test_target_teams_distribute_reduction_multiply.c This reverts commit eab828d. Change-Id: If6beb31e12531c9232ccf9a711fbb2a1cbe99898
This commit breaks minimization and multiplication reductions. |
The patch has been reverted @AntonRydahl |
When? I just found it with git bisect on main. Maybe something is wrong in my fork. |
i dont see it reverted either. i do recall Shilei , you reverted a different reduction related patch with 3 sollve failures, this one has 4. |
I think there were multiple reduction commits on the same day. It is in the history here: https://github.com/llvm/llvm-project/commits/main/openmp/libomptarget/DeviceRTL/src/Reduction.cpp |
@AntonRydahl @ronlieb Sorry I was looking at the wrong one:
I'll revert it right now. |
Thanks a bunch, @shiltian! |
shilei, are you willing to revert this one also ? it breaks spec accel v1.4 552.pep |
That one has to be reverted by @jdoerfert as I tried but there are way too many conflicts. |
I'll revert this one. |
Based on #70766 I think it would be good to have a few more offloading reduction tests, so we do not accidentally break minimum and maximum reductions another time.
It also caused problems for OpenMC. I used OpenMC to verify my fix worked. I did not think this patch changed much but I forgot that they used the type for offset calculations, not only for type adjustment. The fix will make sure we properly adjust the layout change everywhere. |
Based on llvm/llvm-project#70766 I think it would be good to have a few more offloading reduction tests, so we do not accidentally break minimum and maximum reductions another time.
Last commit, the others are part of existing PRs.