From 365f4acab9530dc85511b331b4bd9319171cab3b Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Wed, 2 Jul 2025 11:14:59 -0400 Subject: [PATCH 1/3] [AMDGPU][Attributor] Infer inreg attribute in `AMDGPUAttributor` This patch introduces `AAAMDGPUUniformArgument` that can infer `inreg` function argument attribute. The idea is, for a function argument, if the corresponding call site arguments are always uniform, we can mark it as `inreg` thus pass it via SGPR. In addition, this AA is also able to propagate the inreg attribute if feasible. --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 120 +++++++++++++++++- .../test/CodeGen/AMDGPU/aa-inreg-inference.ll | 74 +++++++++++ 2 files changed, 193 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 59cc1df292f46..67bf3f3bb88b5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -13,6 +13,11 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" +<<<<<<< HEAD +======= +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" +>>>>>>> 844ed4358374 ([AMDGPU][Attributor] Infer inreg attribute in `AMDGPUAttributor`) #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/Target/TargetMachine.h" @@ -1296,6 +1301,114 @@ struct AAAMDGPUNoAGPR const char AAAMDGPUNoAGPR::ID = 0; +struct AAAMDGPUUniform : public StateWrapper<BooleanState, AbstractAttribute> { + using Base = StateWrapper<BooleanState, AbstractAttribute>; + AAAMDGPUUniform(const IRPosition &IRP, Attributor &A) : Base(IRP) {} + + /// Create an abstract attribute view for the position \p IRP.
+ static AAAMDGPUUniform &createForPosition(const IRPosition &IRP, + Attributor &A); + + /// See AbstractAttribute::getName() + StringRef getName() const override { return "AAAMDGPUUniform"; } + + const std::string getAsStr(Attributor *A) const override { + return getAssumed() ? "uniform" : "divergent"; + } + + void trackStatistics() const override {} + + /// See AbstractAttribute::getIdAddr() + const char *getIdAddr() const override { return &ID; } + + /// This function should return true if the type of the \p AA is + /// AAAMDGPUUniform + static bool classof(const AbstractAttribute *AA) { + return (AA->getIdAddr() == &ID); + } + + /// Unique ID (due to the unique address) + static const char ID; +}; + +const char AAAMDGPUUniform::ID = 0; + +/// This AA is to infer the inreg attribute for a function argument. +struct AAAMDGPUUniformArgument : public AAAMDGPUUniform { + AAAMDGPUUniformArgument(const IRPosition &IRP, Attributor &A) + : AAAMDGPUUniform(IRP, A) {} + + void initialize(Attributor &A) override { + Argument *Arg = getAssociatedArgument(); + CallingConv::ID CC = Arg->getParent()->getCallingConv(); + if (Arg->hasAttribute(Attribute::InReg)) { + indicateOptimisticFixpoint(); + return; + } + + if (AMDGPU::isEntryFunctionCC(CC)) { + // We only use isArgPassedInSGPR on kernel entry function argument, so + // even if we will use SGPR for non-uniform i1 argument passing, it will + // not affect this.
+ if (AMDGPU::isArgPassedInSGPR(Arg)) + indicateOptimisticFixpoint(); + else + indicatePessimisticFixpoint(); + } + } + + ChangeStatus updateImpl(Attributor &A) override { + unsigned ArgNo = getAssociatedArgument()->getArgNo(); + TargetMachine &TM = + static_cast<AMDGPUInformationCache &>(A.getInfoCache()).TM; + + auto isUniform = [&](AbstractCallSite ACS) -> bool { + CallBase *CB = ACS.getInstruction(); + Value *V = CB->getArgOperand(ArgNo); + if (auto *Arg = dyn_cast<Argument>(V)) { + auto *AA = A.getOrCreateAAFor<AAAMDGPUUniform>( + IRPosition::argument(*Arg), this, DepClassTy::REQUIRED); + return AA && AA->isValidState(); + } + TargetTransformInfo TTI = TM.getTargetTransformInfo(*CB->getFunction()); + return TTI.isAlwaysUniform(V); + }; + + bool UsedAssumedInformation = true; + if (!A.checkForAllCallSites(isUniform, *this, /*RequireAllCallSites=*/true, + UsedAssumedInformation)) + return indicatePessimisticFixpoint(); + + if (!UsedAssumedInformation) + return indicateOptimisticFixpoint(); + + return ChangeStatus::UNCHANGED; + } + + ChangeStatus manifest(Attributor &A) override { + Argument *Arg = getAssociatedArgument(); + // If the argument already has inreg attribute, we will not do anything + // about it. + if (Arg->hasAttribute(Attribute::InReg)) + return ChangeStatus::UNCHANGED; + if (AMDGPU::isEntryFunctionCC(Arg->getParent()->getCallingConv())) + return ChangeStatus::UNCHANGED; + LLVMContext &Ctx = Arg->getContext(); + return A.manifestAttrs(getIRPosition(), + {Attribute::get(Ctx, Attribute::InReg)}); + } +}; + +AAAMDGPUUniform &AAAMDGPUUniform::createForPosition(const IRPosition &IRP, + Attributor &A) { + switch (IRP.getPositionKind()) { + case IRPosition::IRP_ARGUMENT: + return *new (A.Allocator) AAAMDGPUUniformArgument(IRP, A); + default: + llvm_unreachable("not a valid position for AAAMDGPUUniform"); + } +} + /// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute /// based on the finalized 'amdgpu-flat-work-group-size' attribute.
/// Both attributes start with narrow ranges that expand during iteration. @@ -1382,7 +1495,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID, - &AAIndirectCallInfo::ID}); + &AAIndirectCallInfo::ID, &AAAMDGPUUniform::ID}); AttributorConfig AC(CGUpdater); AC.IsClosedWorldModule = Options.IsClosedWorld; @@ -1435,6 +1548,11 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr)); A.getOrCreateAAFor<AANoAliasAddrSpace>(IRPosition::value(*Ptr)); } + + if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + for (auto &Arg : F->args()) + A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(Arg)); + } } } diff --git a/llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll b/llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll new file mode 100644 index 0000000000000..91dc5618b2989 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s -o - | FileCheck %s + +@g1 = protected addrspace(1) externally_initialized global i32 0, align 4 +@g2 = protected addrspace(1) externally_initialized global i32 0, align 4 +@g3 = protected addrspace(1) externally_initialized global i32 0, align 4 +@g4 = protected addrspace(1) externally_initialized global i32 0, align 4 + +define internal void @callee_with_always_uniform_argument(ptr addrspace(1) %x, i32 %y) { +; CHECK-LABEL: define internal void @callee_with_always_uniform_argument( +; CHECK-SAME: ptr addrspace(1) inreg [[X:%.*]], i32 inreg [[Y:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4 +; CHECK-NEXT: store i32
[[X_VAL]], ptr addrspace(1) @g3, align 4 +; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4 +; CHECK-NEXT: ret void +; +entry: + %x.val = load i32, ptr addrspace(1) %x, align 4 + store i32 %x.val, ptr addrspace(1) @g3, align 4 + store i32 %y, ptr addrspace(1) @g4, align 4 + ret void +} + +define amdgpu_kernel void @kernel_with_readfirstlane(ptr addrspace(1) %p, i32 %x) { +; CHECK-LABEL: define amdgpu_kernel void @kernel_with_readfirstlane( +; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P0:%.*]] = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) [[P]]) +; CHECK-NEXT: call void @callee_with_always_uniform_argument(ptr addrspace(1) [[P0]], i32 [[X]]) +; CHECK-NEXT: ret void +; +entry: + %p0 = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) %p) + call void @callee_with_always_uniform_argument(ptr addrspace(1) %p0, i32 %x) + ret void +} + +define internal void @callee_without_always_uniform_argument(ptr addrspace(1) %x, i32 %y) { +; CHECK-LABEL: define internal void @callee_without_always_uniform_argument( +; CHECK-SAME: ptr addrspace(1) [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4 +; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4 +; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4 +; CHECK-NEXT: ret void +; +entry: + %x.val = load i32, ptr addrspace(1) %x, align 4 + store i32 %x.val, ptr addrspace(1) @g3, align 4 + store i32 %y, ptr addrspace(1) @g4, align 4 + ret void +} + +define amdgpu_kernel void @kernel_with_divergent_callsite_argument(ptr addrspace(1) %p, i32 %x) { +; CHECK-LABEL: define amdgpu_kernel void @kernel_with_divergent_callsite_argument( +; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; 
CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[P]], i32 [[ID_X]] +; CHECK-NEXT: [[D:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4 +; CHECK-NEXT: call void @callee_without_always_uniform_argument(ptr addrspace(1) [[GEP]], i32 [[D]]) +; CHECK-NEXT: ret void +; +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, ptr addrspace(1) %p, i32 %id.x + %d = load i32, ptr addrspace(1) %gep + call void @callee_without_always_uniform_argument(ptr addrspace(1) %gep, i32 %d) + ret void +} + +declare ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1)) +declare noundef i32 @llvm.amdgcn.workitem.id.x() From 7b99ff9a5c815ec5b25d8d303959687635197e16 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 21 Jul 2025 09:59:55 -0400 Subject: [PATCH 2/3] fix comments --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 14 ++++----- .../test/CodeGen/AMDGPU/aa-inreg-inference.ll | 29 +++++++++++++------ .../AMDGPU/attributor-noalias-addrspace.ll | 4 +-- 3 files changed, 28 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 67bf3f3bb88b5..8d420022f0f2f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -13,11 +13,7 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" -<<<<<<< HEAD -======= #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/CodeGen/TargetPassConfig.h" ->>>>>>> 844ed4358374 ([AMDGPU][Attributor] Infer inreg attribute in `AMDGPUAttributor`) #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/Target/TargetMachine.h" @@ -1359,19 +1355,21 @@ struct AAAMDGPUUniformArgument : public AAAMDGPUUniform { ChangeStatus updateImpl(Attributor &A) override { unsigned ArgNo = getAssociatedArgument()->getArgNo(); - TargetMachine &TM = - static_cast<AMDGPUInformationCache &>(A.getInfoCache()).TM; auto isUniform = [&](AbstractCallSite
ACS) -> bool { CallBase *CB = ACS.getInstruction(); Value *V = CB->getArgOperand(ArgNo); + if (isa<Constant>(V)) + return true; if (auto *Arg = dyn_cast<Argument>(V)) { auto *AA = A.getOrCreateAAFor<AAAMDGPUUniform>( IRPosition::argument(*Arg), this, DepClassTy::REQUIRED); return AA && AA->isValidState(); } - TargetTransformInfo TTI = TM.getTargetTransformInfo(*CB->getFunction()); - return TTI.isAlwaysUniform(V); + TargetTransformInfo *TTI = + A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>( + *CB->getFunction()); + return TTI->isAlwaysUniform(V); }; bool UsedAssumedInformation = true; diff --git a/llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll b/llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll index 91dc5618b2989..22cfd4827e5da 100644 --- a/llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll +++ b/llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll @@ -4,21 +4,20 @@ @g1 = protected addrspace(1) externally_initialized global i32 0, align 4 @g2 = protected addrspace(1) externally_initialized global i32 0, align 4 @g3 = protected addrspace(1) externally_initialized global i32 0, align 4 -@g4 = protected addrspace(1) externally_initialized global i32 0, align 4 define internal void @callee_with_always_uniform_argument(ptr addrspace(1) %x, i32 %y) { ; CHECK-LABEL: define internal void @callee_with_always_uniform_argument( ; CHECK-SAME: ptr addrspace(1) inreg [[X:%.*]], i32 inreg [[Y:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4 -; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4 -; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4 +; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g2, align 4 +; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g3, align 4 ; CHECK-NEXT: ret void ; entry: %x.val = load i32, ptr addrspace(1) %x, align 4 - store i32 %x.val, ptr addrspace(1) @g3, align 4 - store i32 %y, ptr addrspace(1) @g4, align 4 + store i32 %x.val, ptr addrspace(1) @g2, align 4 + store i32 %y, ptr addrspace(1) @g3,
align 4 ret void } @@ -36,19 +35,31 @@ entry: ret void } +define amdgpu_kernel void @kernel_with_constant(i32 %x) { +; CHECK-LABEL: define amdgpu_kernel void @kernel_with_constant( +; CHECK-SAME: i32 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @callee_with_always_uniform_argument(ptr addrspace(1) @g1, i32 [[X]]) +; CHECK-NEXT: ret void +; +entry: + call void @callee_with_always_uniform_argument(ptr addrspace(1) @g1, i32 %x) + ret void +} + define internal void @callee_without_always_uniform_argument(ptr addrspace(1) %x, i32 %y) { ; CHECK-LABEL: define internal void @callee_without_always_uniform_argument( ; CHECK-SAME: ptr addrspace(1) [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4 -; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4 -; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4 +; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g2, align 4 +; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g3, align 4 ; CHECK-NEXT: ret void ; entry: %x.val = load i32, ptr addrspace(1) %x, align 4 - store i32 %x.val, ptr addrspace(1) @g3, align 4 - store i32 %y, ptr addrspace(1) @g4, align 4 + store i32 %x.val, ptr addrspace(1) @g2, align 4 + store i32 %y, ptr addrspace(1) @g3, align 4 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll index d91b2117c7ad9..d4e213fecddf8 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll @@ -480,7 +480,7 @@ bb.2.end: define internal void @callee_no_alias_addr_space_select(ptr %ptr1, ptr %ptr2, ptr %ptr3, i1 %cond1, i1 %cond2, i32 %val) #0 { ; CHECK-LABEL: define internal void @callee_no_alias_addr_space_select( -; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 
[[VAL:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 inreg [[COND1:%.*]], i1 inreg [[COND2:%.*]], i32 inreg [[VAL:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[PTR4:%.*]] = select i1 [[COND1]], ptr addrspacecast (ptr addrspace(1) @gptr to ptr), ptr addrspacecast (ptr addrspace(4) @gptr2 to ptr) ; CHECK-NEXT: [[PTR5:%.*]] = select i1 [[COND2]], ptr [[PTR4]], ptr addrspacecast (ptr addrspace(3) @gptr3 to ptr) ; CHECK-NEXT: store i32 [[VAL]], ptr [[PTR5]], align 4, !noalias.addrspace [[META1:![0-9]+]] @@ -516,7 +516,7 @@ define internal void @callee_no_alias_addr_space_select(ptr %ptr1, ptr %ptr2, pt define internal void @callee_alias_addr_space_branch(ptr %ptr1, ptr %ptr2, ptr %ptr3, i1 %cond1, i1 %cond2, i32 %val) #0 { ; CHECK-LABEL: define internal void @callee_alias_addr_space_branch( -; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[VAL:%.*]]) #[[ATTR1]] { +; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 inreg [[COND1:%.*]], i1 inreg [[COND2:%.*]], i32 inreg [[VAL:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: br i1 [[COND1]], label %[[BB_1_TRUE:.*]], label %[[BB_1_FALSE:.*]] ; CHECK: [[BB_1_TRUE]]: ; CHECK-NEXT: br label %[[BB_1_END:.*]] From 3b36e243eff8570632c9e8409d8f716e4419223f Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 18 Aug 2025 18:05:17 -0400 Subject: [PATCH 3/3] fix comments --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 8d420022f0f2f..cad9a14661bf4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -1366,7 +1366,7 @@ struct AAAMDGPUUniformArgument : public AAAMDGPUUniform { IRPosition::argument(*Arg), this, DepClassTy::REQUIRED); return AA && AA->isValidState(); } - TargetTransformInfo *TTI = + const 
TargetTransformInfo *TTI = A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>( *CB->getFunction()); return TTI->isAlwaysUniform(V);