diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 9932074830866..5b2b191fe43e7 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1720,6 +1720,10 @@ The AMDGPU backend supports the following LLVM IR attributes.
      "amdgpu-sgpr-hazard-mem-wait-cull-threshold"     Sets the number of active SGPR hazards that must be present before
                                                       inserting a cull sequence at a memory wait.

+     "amdgpu-promote-alloca-to-vector-max-regs"       Maximum vector size (in 32-bit registers) to create when promoting alloca.
+
+     "amdgpu-promote-alloca-to-vector-vgpr-ratio"     Divisor applied to the available VGPRs to form the budget for promoting alloca to vectors (default 4, i.e. up to 1/4 of the VGPRs).
+
      ================================================ ==========================================================

 Calling Conventions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 69ddb384e1a40..51fee8e02126c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -66,6 +66,19 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
     cl::desc("Maximum byte size to consider promote alloca to vector"),
     cl::init(0));

+static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
+    "amdgpu-promote-alloca-to-vector-max-regs",
+    cl::desc(
+        "Maximum vector size (in 32b registers) to use when promoting alloca"),
+    cl::init(16));
+
+// Use up to 1/4 of available register budget for vectorization.
+// FIXME: Increase the limit for whole function budgets? Perhaps x2?
+static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
+    "amdgpu-promote-alloca-to-vector-vgpr-ratio",
+    cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
+    cl::init(4));
+
 static cl::opt<unsigned>
     LoopUserWeight("promote-alloca-vector-loop-user-weight",
                    cl::desc("The bonus weight of users of allocas within loop "
@@ -84,6 +97,8 @@ class AMDGPUPromoteAllocaImpl {
   uint32_t LocalMemLimit = 0;
   uint32_t CurrentLocalMemUsage = 0;
   unsigned MaxVGPRs;
+  unsigned VGPRBudgetRatio; // Divisor applied to the VGPR budget.
+  unsigned MaxVectorRegs;   // Promoted vector size cap, in 32-bit registers.

   bool IsAMDGCN = false;
   bool IsAMDHSA = false;
@@ -112,6 +127,8 @@ class AMDGPUPromoteAllocaImpl {

   void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);

+  void setFunctionLimits(const Function &F);
+
 public:
   AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {

@@ -298,6 +315,25 @@ void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
   // clang-format on
 }

+/// Compute the per-function promotion limits (MaxVectorRegs and
+/// VGPRBudgetRatio). A function attribute overrides the option default, and an
+/// option given explicitly on the command line overrides both.
+void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
+  MaxVectorRegs = F.getFnAttributeAsParsedInteger(
+      "amdgpu-promote-alloca-to-vector-max-regs", PromoteAllocaToVectorMaxRegs);
+  if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
+    MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
+  VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
+      "amdgpu-promote-alloca-to-vector-vgpr-ratio",
+      PromoteAllocaToVectorVGPRRatio);
+  if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
+    VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
+  // The ratio is the divisor of the vectorization budget computed in run();
+  // treat 0 as 1 (no reduction) instead of dividing by zero.
+  if (VGPRBudgetRatio == 0)
+    VGPRBudgetRatio = 1;
+}
+
 bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
   Mod = F.getParent();
   DL = &Mod->getDataLayout();
@@ -307,15 +343,14 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
     return false;

   MaxVGPRs = getMaxVGPRs(TM, F);
+  setFunctionLimits(F);

   bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;

-  // Use up to 1/4 of available register budget for vectorization.
-  // FIXME: Increase the limit for whole function budgets? Perhaps x2?
   unsigned VectorizationBudget =
       (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                   : (MaxVGPRs * 32)) /
-      4;
+      VGPRBudgetRatio;

   SmallVector<AllocaInst *, 16> Allocas;
   for (Instruction &I : F.getEntryBlock()) {
@@ -398,7 +433,10 @@ calculateVectorIndex(Value *Ptr,
 }

+/// \p NewInsts receives every arithmetic instruction emitted while building
+/// the index so that the caller can erase them if promotion is abandoned.
 static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
-                               Type *VecElemTy, const DataLayout &DL) {
+                               Type *VecElemTy, const DataLayout &DL,
+                               SmallVector<Instruction *> &NewInsts) {
   // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
   // helper.
   unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
@@ -412,22 +450,47 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
   if (VarOffsets.size() > 1)
     return nullptr;

-  if (VarOffsets.size() == 1) {
-    // Only handle cases where we don't need to insert extra arithmetic
-    // instructions.
-    const auto &VarOffset = VarOffsets.front();
-    if (!ConstOffset.isZero() || VarOffset.second != VecElemSize)
-      return nullptr;
-    return VarOffset.first;
-  }
-
   APInt Quot;
   uint64_t Rem;
   APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem);
   if (Rem != 0)
     return nullptr;

-  return ConstantInt::get(GEP->getContext(), Quot);
+  if (VarOffsets.size() == 0)
+    return ConstantInt::get(GEP->getContext(), Quot);
+
+  const auto &VarOffset = VarOffsets.front();
+  APInt Scale;
+  APInt::udivrem(VarOffset.second, VecElemSize, Scale, Rem);
+  if (Rem != 0 || Scale.isZero())
+    return nullptr;
+
+  // The variable index keeps its own integer type, which is not guaranteed to
+  // be BW bits wide. Build the constants in that type so the operands of the
+  // mul/add below agree.
+  Value *Offset = VarOffset.first;
+  auto *OffsetTy = dyn_cast<IntegerType>(Offset->getType());
+  if (!OffsetTy)
+    return nullptr;
+  const unsigned OffsetBW = OffsetTy->getBitWidth();
+
+  IRBuilder<> Builder(GEP);
+  if (!Scale.isOne()) {
+    ConstantInt *ConstMul =
+        ConstantInt::get(GEP->getContext(), Scale.zextOrTrunc(OffsetBW));
+    Offset = Builder.CreateMul(Offset, ConstMul);
+    if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
+      NewInsts.push_back(NewInst);
+  }
+  if (ConstOffset.isZero())
+    return Offset;
+
+  ConstantInt *ConstIndex =
+      ConstantInt::get(GEP->getContext(), Quot.zextOrTrunc(OffsetBW));
+  Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset);
+  if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
+    NewInsts.push_back(NewInst);
+  return IndexAdd;
 }

 /// Promotes a single user of the alloca to a vector form.
@@ -735,23 +798,48 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   Type *AllocaTy = Alloca.getAllocatedType();
   auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
   if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
-    if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
-        ArrayTy->getNumElements() > 0)
-      VectorTy = FixedVectorType::get(ArrayTy->getElementType(),
-                                      ArrayTy->getNumElements());
+    uint64_t NumElems = 1;
+    Type *ElemTy;
+    do {
+      NumElems *= ArrayTy->getNumElements();
+      ElemTy = ArrayTy->getElementType();
+    } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
+
+    // Check for array of vectors
+    auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
+    if (InnerVectorTy) {
+      NumElems *= InnerVectorTy->getNumElements();
+      ElemTy = InnerVectorTy->getElementType();
+    }
+
+    if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
+      unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
+      // Element types narrower than a byte (e.g. i1) yield a zero byte size;
+      // skip them rather than dividing by zero below.
+      if (ElementSize > 0) {
+        unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
+        // Expand vector if required to match padding of inner type,
+        // i.e. odd size subvectors.
+        // Storage size of new vector must match that of alloca for correct
+        // behaviour of byte offsets and GEP computation.
+        if (NumElems * ElementSize != AllocaSize)
+          NumElems = AllocaSize / ElementSize;
+        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
+          VectorTy = FixedVectorType::get(ElemTy, NumElems);
+      }
+    }
   }

-  // FIXME: There is no reason why we can't support larger arrays, we
-  // are just being conservative for now.
-  // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
-  // equivalent. Potentially these could also be promoted but we don't currently
-  // handle this case
   if (!VectorTy) {
     LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
     return false;
   }

-  if (VectorTy->getNumElements() > 16 || VectorTy->getNumElements() < 2) {
+  const unsigned MaxElements =
+      (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
+
+  if (VectorTy->getNumElements() > MaxElements ||
+      VectorTy->getNumElements() < 2) {
     LLVM_DEBUG(dbgs() << " " << *VectorTy
                       << " has an unsupported number of elements\n");
     return false;
@@ -761,11 +849,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   SmallVector<Instruction *> WorkList;
   SmallVector<Instruction *> UsersToRemove;
   SmallVector<Instruction *> DeferredInsts;
+  SmallVector<Instruction *> NewGEPInsts;
   DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;

   const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
     LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
                       << " " << *Inst << "\n");
+    for (auto *NewInst : reverse(NewGEPInsts))
+      NewInst->eraseFromParent();
     return false;
   };

@@ -815,7 +906,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
     if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
       // If we can't compute a vector index from this GEP, then we can't
       // promote this alloca to vector.
-      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL);
+      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
       if (!Index)
         return RejectUser(Inst, "cannot compute vector index for GEP");

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 82832277b1aba..a663d451cad35 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -258,7 +258,7 @@ entry:
 ; FUNC-LABEL: {{^}}no_overlap:
 ;
 ; A total of 5 bytes should be allocated and used.
-; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ; +; SI-ALLOCA: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ; define amdgpu_kernel void @no_overlap(ptr addrspace(1) %out, i32 %in) #0 { entry: %0 = alloca [3 x i8], align 1, addrspace(5) @@ -281,6 +281,7 @@ entry: ret void } +; FUNC-LABEL: {{^}}char_array_array: define amdgpu_kernel void @char_array_array(ptr addrspace(1) %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i8]], addrspace(5) @@ -294,6 +295,7 @@ entry: ret void } +; FUNC-LABEL: {{^}}i32_array_array: define amdgpu_kernel void @i32_array_array(ptr addrspace(1) %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i32]], addrspace(5) @@ -306,6 +308,7 @@ entry: ret void } +; FUNC-LABEL: {{^}}i64_array_array: define amdgpu_kernel void @i64_array_array(ptr addrspace(1) %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i64]], addrspace(5) @@ -319,7 +322,7 @@ entry: } %struct.pair32 = type { i32, i32 } - +; FUNC-LABEL: {{^}}struct_array_array: define amdgpu_kernel void @struct_array_array(ptr addrspace(1) %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x %struct.pair32]], addrspace(5) @@ -333,6 +336,7 @@ entry: ret void } +; FUNC-LABEL: {{^}}struct_pair32_array: define amdgpu_kernel void @struct_pair32_array(ptr addrspace(1) %out, i32 %index) #0 { entry: %alloca = alloca [2 x %struct.pair32], addrspace(5) @@ -346,6 +350,7 @@ entry: ret void } +; FUNC-LABEL: {{^}}select_private: define amdgpu_kernel void @select_private(ptr addrspace(1) %out, i32 %in) nounwind { entry: %tmp = alloca [2 x i32], addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll index ac8782e1e542f..e1bbc243344b0 100644 --- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll @@ -22,13 +22,7 @@ declare void @llvm.amdgcn.s.barrier() #2 ; SI-ALLOCA: s_barrier ; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], 
s[{{[0-9]+:[0-9]+}}], 0 offen offset:64 ; -; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this -; alloca to a vector. It currently fails because it does not know how -; to interpret: -; getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 1, i32 %b - -; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64 -; SI-PROMOTE: ds_write_b32 [[PTRREG]] +; SI-PROMOTE: LDSByteSize: 0 define amdgpu_kernel void @test_private_array_ptr_calc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) #0 { %alloca = alloca [16 x i32], align 16, addrspace(5) %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0); diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll new file mode 100644 index 0000000000000..f9bdfc51f61ff --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll @@ -0,0 +1,236 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE --check-prefix=DEFAULT %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=24 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX24 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=32 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX32 + +define amdgpu_kernel void @i32_24_elements(ptr %out) #0 { +; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements( +; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; DEFAULT-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; DEFAULT-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; 
DEFAULT-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; DEFAULT-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; DEFAULT-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; DEFAULT-NEXT: [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5) +; DEFAULT-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false) +; DEFAULT-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; DEFAULT-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20 +; DEFAULT-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; DEFAULT-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; DEFAULT-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 +; DEFAULT-NEXT: ret void +; +; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements( +; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; MAX24-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; MAX24-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; MAX24-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; MAX24-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; MAX24-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; MAX24-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; MAX24-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> , i32 [[SEL2]] +; MAX24-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; MAX24-NEXT: ret void +; +; MAX32-LABEL: define amdgpu_kernel void @i32_24_elements( +; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; MAX32-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; MAX32-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; MAX32-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; MAX32-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; MAX32-NEXT: 
[[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; MAX32-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; MAX32-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> , i32 [[SEL2]] +; MAX32-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; MAX32-NEXT: ret void +; + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %c1 = icmp uge i32 %x, 3 + %c2 = icmp uge i32 %y, 3 + %sel1 = select i1 %c1, i32 1, i32 2 + %sel2 = select i1 %c2, i32 0, i32 %sel1 + %alloca = alloca [24 x i32], align 16, addrspace(5) + call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false) + %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0 + %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20 + store i32 42, ptr addrspace(5) %gep.0 + store i32 43, ptr addrspace(5) %gep.1 + %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2 + %load = load i32, ptr addrspace(5) %gep + store i32 %load, ptr %out + ret void +} + +define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 { +; BASE-LABEL: define amdgpu_kernel void @i32_24_elements_attrib( +; BASE-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { +; BASE-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; BASE-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; BASE-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; BASE-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; BASE-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; BASE-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; BASE-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> , i32 [[SEL2]] +; BASE-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; BASE-NEXT: ret void +; + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %c1 = icmp uge i32 %x, 3 + %c2 = icmp uge i32 %y, 3 + %sel1 = select i1 %c1, i32 1, i32 2 + %sel2 = select i1 %c2, i32 0, i32 %sel1 + 
%alloca = alloca [24 x i32], align 16, addrspace(5) + call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false) + %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0 + %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20 + store i32 42, ptr addrspace(5) %gep.0 + store i32 43, ptr addrspace(5) %gep.1 + %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2 + %load = load i32, ptr addrspace(5) %gep + store i32 %load, ptr %out + ret void +} + +define amdgpu_kernel void @i32_32_elements(ptr %out) #0 { +; DEFAULT-LABEL: define amdgpu_kernel void @i32_32_elements( +; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] { +; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; DEFAULT-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; DEFAULT-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; DEFAULT-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; DEFAULT-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; DEFAULT-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; DEFAULT-NEXT: [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5) +; DEFAULT-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false) +; DEFAULT-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; DEFAULT-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30 +; DEFAULT-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; DEFAULT-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; DEFAULT-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 +; DEFAULT-NEXT: ret void +; +; MAX24-LABEL: define amdgpu_kernel void @i32_32_elements( +; MAX24-SAME: ptr 
[[OUT:%.*]]) #[[ATTR0]] { +; MAX24-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; MAX24-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; MAX24-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; MAX24-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; MAX24-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; MAX24-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; MAX24-NEXT: [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5) +; MAX24-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false) +; MAX24-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; MAX24-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30 +; MAX24-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; MAX24-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; MAX24-NEXT: [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; MAX24-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; MAX24-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 +; MAX24-NEXT: ret void +; +; MAX32-LABEL: define amdgpu_kernel void @i32_32_elements( +; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] { +; MAX32-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; MAX32-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; MAX32-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; MAX32-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; MAX32-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; MAX32-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; MAX32-NEXT: [[TMP1:%.*]] = extractelement <32 x i32> , i32 [[SEL2]] +; MAX32-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; MAX32-NEXT: ret void +; + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %c1 = icmp uge i32 %x, 3 + %c2 = icmp uge i32 %y, 3 + %sel1 = select i1 %c1, i32 1, 
i32 2 + %sel2 = select i1 %c2, i32 0, i32 %sel1 + %alloca = alloca [32 x i32], align 16, addrspace(5) + call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 128, i1 false) + %gep.0 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 0 + %gep.1 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 30 + store i32 42, ptr addrspace(5) %gep.0 + store i32 43, ptr addrspace(5) %gep.1 + %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2 + %load = load i32, ptr addrspace(5) %gep + store i32 %load, ptr %out + ret void +} + +define amdgpu_kernel void @i32_32_elements_attrib(ptr %out) #2 { +; DEFAULT-LABEL: define amdgpu_kernel void @i32_32_elements_attrib( +; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] { +; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; DEFAULT-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; DEFAULT-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; DEFAULT-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; DEFAULT-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; DEFAULT-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; DEFAULT-NEXT: [[TMP1:%.*]] = extractelement <32 x i32> , i32 [[SEL2]] +; DEFAULT-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; DEFAULT-NEXT: ret void +; +; MAX24-LABEL: define amdgpu_kernel void @i32_32_elements_attrib( +; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] { +; MAX24-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; MAX24-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; MAX24-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; MAX24-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; MAX24-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; MAX24-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; MAX24-NEXT: [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5) +; MAX24-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false) 
+; MAX24-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; MAX24-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30 +; MAX24-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; MAX24-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; MAX24-NEXT: [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; MAX24-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; MAX24-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 +; MAX24-NEXT: ret void +; +; MAX32-LABEL: define amdgpu_kernel void @i32_32_elements_attrib( +; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] { +; MAX32-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; MAX32-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; MAX32-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; MAX32-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; MAX32-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; MAX32-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; MAX32-NEXT: [[TMP1:%.*]] = extractelement <32 x i32> , i32 [[SEL2]] +; MAX32-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; MAX32-NEXT: ret void +; + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %c1 = icmp uge i32 %x, 3 + %c2 = icmp uge i32 %y, 3 + %sel1 = select i1 %c1, i32 1, i32 2 + %sel2 = select i1 %c2, i32 0, i32 %sel1 + %alloca = alloca [32 x i32], align 16, addrspace(5) + call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 128, i1 false) + %gep.0 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 0 + %gep.1 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 30 + store i32 42, ptr addrspace(5) %gep.0 + store i32 43, ptr addrspace(5) %gep.1 + %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2 + %load = load i32, ptr 
addrspace(5) %gep + store i32 %load, ptr %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() +declare i32 @llvm.amdgcn.workitem.id.y() +declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg) + +attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" } +attributes #1 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-regs"="24" } +attributes #2 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-regs"="32" } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll index 74651e10c7809..bb51c1d27b336 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll @@ -110,10 +110,7 @@ define amdgpu_kernel void @memset_vector_ptr_alloca(ptr %out) { define amdgpu_kernel void @memset_array_of_array_ptr_alloca(ptr %out) { ; CHECK-LABEL: @memset_array_of_array_ptr_alloca( -; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x [3 x ptr]], align 16, addrspace(5) -; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[ALLOCA]], i8 0, i64 48, i1 false) -; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(5) [[ALLOCA]], align 8 -; CHECK-NEXT: store i64 [[LOAD]], ptr [[OUT:%.*]], align 8 +; CHECK-NEXT: store i64 0, ptr [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; %alloca = alloca [2 x [3 x ptr]], align 16, addrspace(5) @@ -125,14 +122,11 @@ define amdgpu_kernel void @memset_array_of_array_ptr_alloca(ptr %out) { define amdgpu_kernel void @memset_array_of_vec_ptr_alloca(ptr %out) { ; CHECK-LABEL: @memset_array_of_vec_ptr_alloca( -; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x <3 x ptr>], align 16, addrspace(5) -; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[ALLOCA]], i8 0, i64 48, i1 false) -; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(5) 
[[ALLOCA]], align 8 -; CHECK-NEXT: store i64 [[LOAD]], ptr [[OUT:%.*]], align 8 +; CHECK-NEXT: store i64 0, ptr [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; %alloca = alloca [2 x <3 x ptr>], align 16, addrspace(5) - call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false) + call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 64, i1 false) %load = load i64, ptr addrspace(5) %alloca store i64 %load, ptr %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll new file mode 100644 index 0000000000000..5198597c935fd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll @@ -0,0 +1,391 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck %s + +define amdgpu_kernel void @i32_2d_load_store(ptr %out) { +; CHECK-LABEL: define amdgpu_kernel void @i32_2d_load_store( +; CHECK-SAME: ptr [[OUT:%.*]]) { +; CHECK-NEXT: store i32 3, ptr [[OUT]], align 4 +; CHECK-NEXT: ret void +; + %alloca = alloca [2 x [3 x i32]], align 16, addrspace(5) + %gep.00 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0 + %gep.01 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1 + %gep.02 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2 + %gep.10 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0 + %gep.11 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1 + %gep.12 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2 + store i32 0, ptr addrspace(5) %gep.00 + store i32 1, ptr addrspace(5) %gep.01 + store i32 2, ptr addrspace(5) %gep.02 + store i32 3, ptr addrspace(5) %gep.10 + store i32 4, 
ptr addrspace(5) %gep.11 + store i32 5, ptr addrspace(5) %gep.12 + %gep = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 1 + %load = load i32, ptr addrspace(5) %gep + store i32 %load, ptr %out + ret void +} + +define amdgpu_kernel void @i64_2d_load_store(ptr %out) { +; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store( +; CHECK-SAME: ptr [[OUT:%.*]]) { +; CHECK-NEXT: store i64 3, ptr [[OUT]], align 8 +; CHECK-NEXT: ret void +; + %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5) + %gep.00 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0 + %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1 + %gep.02 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2 + %gep.10 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0 + %gep.11 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1 + %gep.12 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2 + store i64 0, ptr addrspace(5) %gep.00 + store i64 1, ptr addrspace(5) %gep.01 + store i64 2, ptr addrspace(5) %gep.02 + store i64 3, ptr addrspace(5) %gep.10 + store i64 4, ptr addrspace(5) %gep.11 + store i64 5, ptr addrspace(5) %gep.12 + %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1 + %load = load i64, ptr addrspace(5) %gep + store i64 %load, ptr %out + ret void +} + +define amdgpu_kernel void @i32_2d_alloca_store_partial(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) { +; CHECK-LABEL: define amdgpu_kernel void @i32_2d_alloca_store_partial( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[DUMMY_LDS:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[C1:%.*]] = icmp uge 
i32 [[X]], 3 +; CHECK-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i32> , i32 [[SEL2]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[TMP0]] to float +; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; +bb: + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %c1 = icmp uge i32 %x, 3 + %c2 = icmp uge i32 %y, 3 + %sel1 = select i1 %c1, i32 1, i32 2 + %sel2 = select i1 %c2, i32 0, i32 %sel1 + %alloca = alloca [2 x [4 x i32]], align 4, addrspace(5) + %gep = getelementptr inbounds <4 x i32>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 + store <4 x i32> , ptr addrspace(5) %alloca, align 4 + %load = load float, ptr addrspace(5) %gep, align 4 + store float %load, ptr addrspace(1) %out, align 4 + ret void +} + + +define amdgpu_kernel void @i64_2d_load_store_cast(ptr %out) { +; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_cast( +; CHECK-SAME: ptr [[OUT:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; CHECK-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x i64> , i32 [[SEL2]] +; CHECK-NEXT: store i64 [[TMP1]], ptr [[OUT]], align 8 +; CHECK-NEXT: ret void +; + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %c1 = icmp uge i32 %x, 3 + %c2 = icmp uge i32 %y, 3 + %sel1 = select i1 %c1, i32 1, i32 2 + %sel2 = select i1 %c2, i32 0, i32 %sel1 + %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5) + %gep.00 = getelementptr inbounds [2 x 
[3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0 + %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1 + %gep.02 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2 + %gep.10 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0 + %gep.11 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1 + %gep.12 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2 + store i64 0, ptr addrspace(5) %gep.00 + store i64 1, ptr addrspace(5) %gep.01 + store i64 2, ptr addrspace(5) %gep.02 + store i64 3, ptr addrspace(5) %gep.10 + store i64 4, ptr addrspace(5) %gep.11 + store i64 5, ptr addrspace(5) %gep.12 + %gep = getelementptr inbounds [6 x i64], ptr addrspace(5) %alloca, i32 0, i32 %sel2 + %load = load i64, ptr addrspace(5) %gep + store i64 %load, ptr %out + ret void +} + +define amdgpu_kernel void @i64_2d_load_store_subvec_1(ptr %out) { +; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_1( +; CHECK-SAME: ptr [[OUT:%.*]]) { +; CHECK-NEXT: [[ELEM:%.*]] = extractelement <3 x i64> , i32 2 +; CHECK-NEXT: store i64 [[ELEM]], ptr [[OUT]], align 8 +; CHECK-NEXT: ret void +; + %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5) + %gep.00 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0 + %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0 + store <3 x i64> , ptr addrspace(5) %gep.00 + store <3 x i64> , ptr addrspace(5) %gep.01 + %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1 + %load = load <3 x i64>, ptr addrspace(5) %gep + %elem = extractelement <3 x i64> %load, i32 2 + store i64 %elem, ptr %out + ret void +} + +define amdgpu_kernel void @i64_2d_load_store_subvec_2(ptr %out) { +; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_2( +; CHECK-SAME: ptr 
[[OUT:%.*]]) { +; CHECK-NEXT: [[ELEM:%.*]] = extractelement <3 x i64> , i32 2 +; CHECK-NEXT: store i64 [[ELEM]], ptr [[OUT]], align 8 +; CHECK-NEXT: ret void +; + %alloca = alloca [2 x <3 x i64>], align 16, addrspace(5) + %gep.00 = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0 + %gep.01 = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0 + store <3 x i64> , ptr addrspace(5) %gep.00 + store <3 x i64> , ptr addrspace(5) %gep.01 + %gep = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0, i32 1 + %load = load <3 x i64>, ptr addrspace(5) %gep + %elem = extractelement <3 x i64> %load, i32 2 + store i64 %elem, ptr %out + ret void +} + +define amdgpu_kernel void @i64_2d_load_store_subvec_3(ptr %out) { +; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_3( +; CHECK-SAME: ptr [[OUT:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; CHECK-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[SEL2]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x i64> , i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <6 x i64> , i32 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <3 x i64> [[TMP3]], i64 [[TMP5]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <6 x i64> , i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x i64> [[TMP6]], i64 [[TMP8]], i64 2 +; CHECK-NEXT: [[ELEM:%.*]] = extractelement <3 x i64> [[TMP9]], i32 2 +; CHECK-NEXT: store i64 [[ELEM]], ptr [[OUT]], align 8 
+; CHECK-NEXT: ret void +; + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %c1 = icmp uge i32 %x, 3 + %c2 = icmp uge i32 %y, 3 + %sel1 = select i1 %c1, i32 1, i32 2 + %sel2 = select i1 %c2, i32 0, i32 %sel1 + %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5) + %gep.00 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0 + %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0 + store <3 x i64> , ptr addrspace(5) %gep.00 + store <3 x i64> , ptr addrspace(5) %gep.01 + %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 %sel2 + %load = load <3 x i64>, ptr addrspace(5) %gep + %elem = extractelement <3 x i64> %load, i32 2 + store i64 %elem, ptr %out + ret void +} + +define amdgpu_kernel void @i64_2d_load_store_subvec_4(ptr %out) { +; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_4( +; CHECK-SAME: ptr [[OUT:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; CHECK-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[SEL2]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i64> , i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> , i32 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <3 x i64> [[TMP3]], i64 [[TMP5]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i64> , i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x i64> [[TMP6]], i64 [[TMP8]], i64 2 +; CHECK-NEXT: 
[[ELEM:%.*]] = extractelement <3 x i64> [[TMP9]], i32 2 +; CHECK-NEXT: store i64 [[ELEM]], ptr [[OUT]], align 8 +; CHECK-NEXT: ret void +; + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %c1 = icmp uge i32 %x, 3 + %c2 = icmp uge i32 %y, 3 + %sel1 = select i1 %c1, i32 1, i32 2 + %sel2 = select i1 %c2, i32 0, i32 %sel1 + %alloca = alloca [2 x <3 x i64>], align 16, addrspace(5) + %gep.00 = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0 + %gep.01 = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0 + store <3 x i64> , ptr addrspace(5) %gep.00 + store <3 x i64> , ptr addrspace(5) %gep.01 + %gep = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0, i32 %sel2 + %load = load <3 x i64>, ptr addrspace(5) %gep + %elem = extractelement <3 x i64> %load, i32 2 + store i64 %elem, ptr %out + ret void +} + +define amdgpu_kernel void @i32_3d_load_store(ptr %out) { +; CHECK-LABEL: define amdgpu_kernel void @i32_3d_load_store( +; CHECK-SAME: ptr [[OUT:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; CHECK-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <12 x i32> , i32 [[SEL2]] +; CHECK-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; CHECK-NEXT: ret void +; + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %c1 = icmp uge i32 %x, 3 + %c2 = icmp uge i32 %y, 3 + %sel1 = select i1 %c1, i32 1, i32 2 + %sel2 = select i1 %c2, i32 0, i32 %sel1 + %alloca = alloca [2 x [2 x [3 x i32]]], align 16, addrspace(5) + %gep.000 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, 
i32 0, i32 0, i32 0, i32 0 + %gep.001 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0, i32 1 + %gep.002 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0, i32 2 + %gep.010 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1, i32 0 + %gep.011 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1, i32 1 + %gep.012 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1, i32 2 + %gep.100 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0, i32 0 + %gep.101 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0, i32 1 + %gep.102 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0, i32 2 + %gep.110 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1, i32 0 + %gep.111 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1, i32 1 + %gep.112 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1, i32 2 + store i32 0, ptr addrspace(5) %gep.000 + store i32 1, ptr addrspace(5) %gep.001 + store i32 2, ptr addrspace(5) %gep.002 + store i32 3, ptr addrspace(5) %gep.010 + store i32 4, ptr addrspace(5) %gep.011 + store i32 5, ptr addrspace(5) %gep.012 + store i32 6, ptr addrspace(5) %gep.100 + store i32 7, ptr addrspace(5) %gep.101 + store i32 8, ptr addrspace(5) %gep.102 + store i32 9, ptr addrspace(5) %gep.110 + store i32 10, ptr addrspace(5) %gep.111 + store i32 11, ptr addrspace(5) %gep.112 + %gep = getelementptr inbounds [12 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2 + %load = load i32, ptr addrspace(5) %gep + store i32 %load, ptr %out + ret void +} + +define amdgpu_kernel void @i16_2d_load_store(ptr %out, i32 %sel) { +; 
CHECK-LABEL: define amdgpu_kernel void @i16_2d_load_store( +; CHECK-SAME: ptr [[OUT:%.*]], i32 [[SEL:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = add i32 3, [[SEL]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x i16> , i32 [[TMP1]] +; CHECK-NEXT: store i16 [[TMP2]], ptr [[OUT]], align 2 +; CHECK-NEXT: ret void +; + %alloca = alloca [2 x [3 x i16]], align 16, addrspace(5) + %gep.00 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0 + %gep.01 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1 + %gep.02 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2 + %gep.10 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0 + %gep.11 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1 + %gep.12 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2 + store i16 0, ptr addrspace(5) %gep.00 + store i16 1, ptr addrspace(5) %gep.01 + store i16 2, ptr addrspace(5) %gep.02 + store i16 3, ptr addrspace(5) %gep.10 + store i16 4, ptr addrspace(5) %gep.11 + store i16 5, ptr addrspace(5) %gep.12 + %gep = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 %sel + %load = load i16, ptr addrspace(5) %gep + store i16 %load, ptr %out + ret void +} + +define amdgpu_kernel void @float_2d_load_store(ptr %out, i32 %sel) { +; CHECK-LABEL: define amdgpu_kernel void @float_2d_load_store( +; CHECK-SAME: ptr [[OUT:%.*]], i32 [[SEL:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = add i32 3, [[SEL]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x float> , i32 [[TMP1]] +; CHECK-NEXT: store float [[TMP2]], ptr [[OUT]], align 4 +; CHECK-NEXT: ret void +; + %alloca = alloca [2 x [3 x float]], align 16, addrspace(5) + %gep.00 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0 + %gep.01 = getelementptr inbounds [2 x [3 x float]], ptr 
addrspace(5) %alloca, i32 0, i32 0, i32 1 + %gep.02 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2 + %gep.10 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0 + %gep.11 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1 + %gep.12 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2 + store float 0.0, ptr addrspace(5) %gep.00 + store float 1.0, ptr addrspace(5) %gep.01 + store float 2.0, ptr addrspace(5) %gep.02 + store float 3.0, ptr addrspace(5) %gep.10 + store float 4.0, ptr addrspace(5) %gep.11 + store float 5.0, ptr addrspace(5) %gep.12 + %gep = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 %sel + %load = load float, ptr addrspace(5) %gep + store float %load, ptr %out + ret void +} + +define amdgpu_kernel void @ptr_2d_load_store(ptr %out, i32 %sel) { +; CHECK-LABEL: define amdgpu_kernel void @ptr_2d_load_store( +; CHECK-SAME: ptr [[OUT:%.*]], i32 [[SEL:%.*]]) { +; CHECK-NEXT: [[PTR_0:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 0 +; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 1 +; CHECK-NEXT: [[PTR_2:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 2 +; CHECK-NEXT: [[PTR_3:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 3 +; CHECK-NEXT: [[PTR_4:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 4 +; CHECK-NEXT: [[PTR_5:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 5 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <6 x ptr> {{.*}}, ptr [[PTR_0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <6 x ptr> [[TMP1]], ptr [[PTR_1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <6 x ptr> [[TMP2]], ptr [[PTR_2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <6 x ptr> [[TMP3]], ptr [[PTR_3]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <6 x ptr> [[TMP4]], ptr [[PTR_4]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = 
insertelement <6 x ptr> [[TMP5]], ptr [[PTR_5]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 3, [[SEL]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <6 x ptr> [[TMP6]], i32 [[TMP7]] +; CHECK-NEXT: store ptr [[TMP8]], ptr [[OUT]], align 8 +; CHECK-NEXT: ret void +; + %alloca = alloca [2 x [3 x ptr]], align 16, addrspace(5) + %gep.00 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0 + %gep.01 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1 + %gep.02 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2 + %gep.10 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0 + %gep.11 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1 + %gep.12 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2 + %ptr.0 = getelementptr inbounds ptr, ptr %out, i32 0 + %ptr.1 = getelementptr inbounds ptr, ptr %out, i32 1 + %ptr.2 = getelementptr inbounds ptr, ptr %out, i32 2 + %ptr.3 = getelementptr inbounds ptr, ptr %out, i32 3 + %ptr.4 = getelementptr inbounds ptr, ptr %out, i32 4 + %ptr.5 = getelementptr inbounds ptr, ptr %out, i32 5 + store ptr %ptr.0, ptr addrspace(5) %gep.00 + store ptr %ptr.1, ptr addrspace(5) %gep.01 + store ptr %ptr.2, ptr addrspace(5) %gep.02 + store ptr %ptr.3, ptr addrspace(5) %gep.10 + store ptr %ptr.4, ptr addrspace(5) %gep.11 + store ptr %ptr.5, ptr addrspace(5) %gep.12 + %gep = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 %sel + %load = load ptr, ptr addrspace(5) %gep + store ptr %load, ptr %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() +declare i32 @llvm.amdgcn.workitem.id.y() diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll index 28b923243b6db..ac28d9c20e2fa 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll +++ 
b/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll @@ -1,5 +1,5 @@ -; RUN: llc -O0 -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=+promote-alloca < %s | FileCheck -check-prefix=NOOPTS -check-prefix=ALL %s -; RUN: llc -O1 -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=+promote-alloca < %s | FileCheck -check-prefix=OPTS -check-prefix=ALL %s +; RUN: llc -O0 -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -disable-promote-alloca-to-vector=1 -mattr=+promote-alloca < %s | FileCheck -check-prefix=NOOPTS -check-prefix=ALL %s +; RUN: llc -O1 -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -disable-promote-alloca-to-vector=1 -mattr=+promote-alloca < %s | FileCheck -check-prefix=OPTS -check-prefix=ALL %s ; ALL-LABEL: {{^}}promote_alloca_i32_array_array: ; NOOPTS: .amdhsa_group_segment_fixed_size 0 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll index 7c5410004ed5b..e5d0c563b74c4 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -amdgpu-promote-alloca-to-vector-max-regs=32 < %s | FileCheck %s target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll new file mode 100644 index 0000000000000..e06b491f5986c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll @@ -0,0 +1,276 @@ +; NOTE: Assertions have 
been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE --check-prefix=DEFAULT %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-vgpr-ratio=2 < %s | FileCheck --check-prefix=BASE %s --check-prefix=RATIO2 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-vgpr-ratio=8 < %s | FileCheck --check-prefix=BASE %s --check-prefix=RATIO8 + +define amdgpu_kernel void @i32_24_elements(ptr %out) #0 { +; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements( +; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; DEFAULT-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; DEFAULT-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; DEFAULT-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; DEFAULT-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; DEFAULT-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; DEFAULT-NEXT: [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5) +; DEFAULT-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false) +; DEFAULT-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; DEFAULT-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20 +; DEFAULT-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; DEFAULT-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; DEFAULT-NEXT: store i32 [[LOAD]], ptr [[OUT]], 
align 4 +; DEFAULT-NEXT: ret void +; +; RATIO2-LABEL: define amdgpu_kernel void @i32_24_elements( +; RATIO2-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; RATIO2-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; RATIO2-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; RATIO2-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; RATIO2-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; RATIO2-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; RATIO2-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; RATIO2-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> , i32 [[SEL2]] +; RATIO2-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; RATIO2-NEXT: ret void +; +; RATIO8-LABEL: define amdgpu_kernel void @i32_24_elements( +; RATIO8-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; RATIO8-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; RATIO8-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; RATIO8-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; RATIO8-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; RATIO8-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; RATIO8-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; RATIO8-NEXT: [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5) +; RATIO8-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false) +; RATIO8-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; RATIO8-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20 +; RATIO8-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; RATIO8-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; RATIO8-NEXT: [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; RATIO8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; RATIO8-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 +; RATIO8-NEXT: ret void +; + %x = tail call i32 
@llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %c1 = icmp uge i32 %x, 3 + %c2 = icmp uge i32 %y, 3 + %sel1 = select i1 %c1, i32 1, i32 2 + %sel2 = select i1 %c2, i32 0, i32 %sel1 + %alloca = alloca [24 x i32], align 16, addrspace(5) + call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false) + %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0 + %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20 + store i32 42, ptr addrspace(5) %gep.0 + store i32 43, ptr addrspace(5) %gep.1 + %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2 + %load = load i32, ptr addrspace(5) %gep + store i32 %load, ptr %out + ret void +} + +define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 { +; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements_attrib( +; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { +; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; DEFAULT-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; DEFAULT-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; DEFAULT-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; DEFAULT-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; DEFAULT-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; DEFAULT-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> , i32 [[SEL2]] +; DEFAULT-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; DEFAULT-NEXT: ret void +; +; RATIO2-LABEL: define amdgpu_kernel void @i32_24_elements_attrib( +; RATIO2-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { +; RATIO2-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; RATIO2-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; RATIO2-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; RATIO2-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; RATIO2-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; RATIO2-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] 
+; RATIO2-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> , i32 [[SEL2]] +; RATIO2-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; RATIO2-NEXT: ret void +; +; RATIO8-LABEL: define amdgpu_kernel void @i32_24_elements_attrib( +; RATIO8-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { +; RATIO8-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; RATIO8-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; RATIO8-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; RATIO8-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; RATIO8-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; RATIO8-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; RATIO8-NEXT: [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5) +; RATIO8-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false) +; RATIO8-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; RATIO8-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20 +; RATIO8-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; RATIO8-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; RATIO8-NEXT: [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; RATIO8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; RATIO8-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 +; RATIO8-NEXT: ret void +; + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %c1 = icmp uge i32 %x, 3 + %c2 = icmp uge i32 %y, 3 + %sel1 = select i1 %c1, i32 1, i32 2 + %sel2 = select i1 %c2, i32 0, i32 %sel1 + %alloca = alloca [24 x i32], align 16, addrspace(5) + call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false) + %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0 + %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20 + 
store i32 42, ptr addrspace(5) %gep.0 + store i32 43, ptr addrspace(5) %gep.1 + %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2 + %load = load i32, ptr addrspace(5) %gep + store i32 %load, ptr %out + ret void +} + +define amdgpu_kernel void @i32_16_elements(ptr %out) #0 { +; DEFAULT-LABEL: define amdgpu_kernel void @i32_16_elements( +; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] { +; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; DEFAULT-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; DEFAULT-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; DEFAULT-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; DEFAULT-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; DEFAULT-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; DEFAULT-NEXT: [[TMP1:%.*]] = extractelement <16 x i32> , i32 [[SEL2]] +; DEFAULT-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; DEFAULT-NEXT: ret void +; +; RATIO2-LABEL: define amdgpu_kernel void @i32_16_elements( +; RATIO2-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] { +; RATIO2-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; RATIO2-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; RATIO2-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; RATIO2-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; RATIO2-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; RATIO2-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; RATIO2-NEXT: [[TMP1:%.*]] = extractelement <16 x i32> , i32 [[SEL2]] +; RATIO2-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; RATIO2-NEXT: ret void +; +; RATIO8-LABEL: define amdgpu_kernel void @i32_16_elements( +; RATIO8-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] { +; RATIO8-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; RATIO8-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; RATIO8-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; RATIO8-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; RATIO8-NEXT: [[SEL1:%.*]] = select i1 [[C1]], 
i32 1, i32 2 +; RATIO8-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; RATIO8-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 16, addrspace(5) +; RATIO8-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false) +; RATIO8-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; RATIO8-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15 +; RATIO8-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; RATIO8-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; RATIO8-NEXT: [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; RATIO8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; RATIO8-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 +; RATIO8-NEXT: ret void +; + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %c1 = icmp uge i32 %x, 3 + %c2 = icmp uge i32 %y, 3 + %sel1 = select i1 %c1, i32 1, i32 2 + %sel2 = select i1 %c2, i32 0, i32 %sel1 + %alloca = alloca [16 x i32], align 16, addrspace(5) + call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 64, i1 false) + %gep.0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 0 + %gep.1 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 15 + store i32 42, ptr addrspace(5) %gep.0 + store i32 43, ptr addrspace(5) %gep.1 + %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2 + %load = load i32, ptr addrspace(5) %gep + store i32 %load, ptr %out + ret void +} + +define amdgpu_kernel void @i32_16_elements_attrib(ptr %out) #2 { +; DEFAULT-LABEL: define amdgpu_kernel void @i32_16_elements_attrib( +; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] { +; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; DEFAULT-NEXT: [[Y:%.*]] = tail call i32 
@llvm.amdgcn.workitem.id.y() +; DEFAULT-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; DEFAULT-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; DEFAULT-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; DEFAULT-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; DEFAULT-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 16, addrspace(5) +; DEFAULT-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false) +; DEFAULT-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; DEFAULT-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15 +; DEFAULT-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; DEFAULT-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; DEFAULT-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 +; DEFAULT-NEXT: ret void +; +; RATIO2-LABEL: define amdgpu_kernel void @i32_16_elements_attrib( +; RATIO2-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] { +; RATIO2-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; RATIO2-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; RATIO2-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; RATIO2-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; RATIO2-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; RATIO2-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; RATIO2-NEXT: [[TMP1:%.*]] = extractelement <16 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43>, i32 [[SEL2]] +; RATIO2-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; RATIO2-NEXT: ret void +; +; RATIO8-LABEL: define amdgpu_kernel void @i32_16_elements_attrib( +; RATIO8-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] { +; RATIO8-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; RATIO8-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; 
RATIO8-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; RATIO8-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; RATIO8-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; RATIO8-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; RATIO8-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 16, addrspace(5) +; RATIO8-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false) +; RATIO8-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; RATIO8-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15 +; RATIO8-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; RATIO8-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; RATIO8-NEXT: [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; RATIO8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; RATIO8-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 +; RATIO8-NEXT: ret void +; + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %c1 = icmp uge i32 %x, 3 + %c2 = icmp uge i32 %y, 3 + %sel1 = select i1 %c1, i32 1, i32 2 + %sel2 = select i1 %c2, i32 0, i32 %sel1 + %alloca = alloca [16 x i32], align 16, addrspace(5) + call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 64, i1 false) + %gep.0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 0 + %gep.1 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 15 + store i32 42, ptr addrspace(5) %gep.0 + store i32 43, ptr addrspace(5) %gep.1 + %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2 + %load = load i32, ptr addrspace(5) %gep + store i32 %load, ptr %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() +declare i32 @llvm.amdgcn.workitem.id.y() +declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg) 
+ +attributes #0 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" } +attributes #1 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="2" } +attributes #2 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="8" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; BASE: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll index 64d7c7868ca8d..fee045359e751 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll @@ -1,5 +1,5 @@ -; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -amdgpu-promote-alloca-to-vector-limit=32 < %s | FileCheck -check-prefix=LIMIT32 %s +; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -amdgpu-promote-alloca-to-vector-max-regs=64 < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -amdgpu-promote-alloca-to-vector-limit=32 -amdgpu-promote-alloca-to-vector-max-regs=64 < %s | FileCheck -check-prefix=LIMIT32 %s target datalayout = "A5"