From ea3f6bbee6f732b6fd18f4ab1d00e01462e2ddad Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Sun, 28 Jul 2024 18:48:54 -0400 Subject: [PATCH] [Attributor][AMD] Enable AAIndirectCallInfo for AMDAttributorPass --- llvm/include/llvm/Transforms/IPO/Attributor.h | 9 +-- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 15 ++++- llvm/lib/Transforms/IPO/Attributor.cpp | 2 +- .../Transforms/IPO/AttributorAttributes.cpp | 3 +- .../CodeGen/AMDGPU/direct-indirect-call.ll | 4 +- .../AMDGPU/duplicate-attribute-indirect.ll | 4 +- .../CodeGen/AMDGPU/simple-indirect-call-2.ll | 66 +++++++++++++++++++ .../CodeGen/AMDGPU/simple-indirect-call.ll | 14 ++-- 8 files changed, 101 insertions(+), 16 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index 34557238ecb23..f6264c40ec5eb 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -1448,7 +1448,7 @@ struct AttributorConfig { /// Callback function to determine if an indirect call targets should be made /// direct call targets (with an if-cascade). std::function<bool(Attributor &A, const AbstractAttribute &AA, CallBase &CB, - Function &AssumedCallee)> + Function &AssumedCallee, unsigned NumAssumedCallees)> IndirectCalleeSpecializationCallback = nullptr; /// Helper to update an underlying call graph and to delete functions. @@ -1718,10 +1718,11 @@ struct Attributor { /// Return true if we should specialize the call site \b CB for the potential /// callee \p Fn. bool shouldSpecializeCallSiteForCallee(const AbstractAttribute &AA, - CallBase &CB, Function &Callee) { + CallBase &CB, Function &Callee, + unsigned NumAssumedCallees) { return Configuration.IndirectCalleeSpecializationCallback - ? Configuration.IndirectCalleeSpecializationCallback(*this, AA, - CB, Callee) + ? 
Configuration.IndirectCalleeSpecializationCallback( + *this, AA, CB, Callee, NumAssumedCallees) : true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index de1f3421cce4e..6548011cee537 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -14,6 +14,7 @@ #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/Analysis/CycleAnalysis.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" @@ -1038,12 +1039,24 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) { &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID, - &AAUnderlyingObjects::ID}); + &AAUnderlyingObjects::ID, &AAIndirectCallInfo::ID, &AAInstanceInfo::ID}); AttributorConfig AC(CGUpdater); AC.Allowed = &Allowed; AC.IsModulePass = true; AC.DefaultInitializeLiveInternals = false; + AC.IndirectCalleeSpecializationCallback = + [&TM](Attributor &A, const AbstractAttribute &AA, CallBase &CB, + Function &Callee, unsigned NumAssumedCallees) { + if (AMDGPU::isEntryFunctionCC(Callee.getCallingConv())) + return false; + // Singleton functions can be specialized. + if (NumAssumedCallees == 1) + return true; + // Otherwise specialize uniform values. 
+ const auto &TTI = TM.getTargetTransformInfo(*CB.getCaller()); + return TTI.isAlwaysUniform(CB.getCalledOperand()); + }; AC.IPOAmendableCB = [](const Function &F) { return F.getCallingConv() == CallingConv::AMDGPU_KERNEL; }; diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 910c0aeacc42e..38b61b6a88357 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -3836,7 +3836,7 @@ static bool runAttributorOnFunctions(InformationCache &InfoCache, if (MaxSpecializationPerCB.getNumOccurrences()) { AC.IndirectCalleeSpecializationCallback = [&](Attributor &, const AbstractAttribute &AA, CallBase &CB, - Function &Callee) { + Function &Callee, unsigned) { if (MaxSpecializationPerCB == 0) return false; auto &Set = IndirectCalleeTrackingMap[&CB]; diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index cd31c4be1c1da..2460fb222c3c3 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -12347,7 +12347,8 @@ struct AAIndirectCallInfoCallSite : public AAIndirectCallInfo { SmallVector<Function *, 8> SkippedAssumedCallees; SmallVector<std::pair<CallInst *, Instruction *>> NewCalls; for (Function *NewCallee : AssumedCallees) { - if (!A.shouldSpecializeCallSiteForCallee(*this, *CB, *NewCallee)) { + if (!A.shouldSpecializeCallSiteForCallee(*this, *CB, *NewCallee, + AssumedCallees.size())) { SkippedAssumedCallees.push_back(NewCallee); SpecializedForAllCallees = false; continue; diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll index 386f9cd3f9ce7..aa182b720c604 100644 --- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll @@ -15,7 +15,7 @@ define internal void @direct() { ; CHECK-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) ; CHECK-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8 ; 
CHECK-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8 -; CHECK-NEXT: call void [[FP]]() +; CHECK-NEXT: call void @indirect() ; CHECK-NEXT: ret void ; %fptr = alloca ptr, addrspace(5) @@ -36,5 +36,5 @@ define amdgpu_kernel void @test_direct_indirect_call() { } ;. ; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll index 05558c555c581..848019c872925 100644 --- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll +++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_simple_indirect_call() #0 { ; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) ; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8 ; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8 -; ATTRIBUTOR_GCN-NEXT: call void [[FP]]() +; ATTRIBUTOR_GCN-NEXT: call void @indirect() ; ATTRIBUTOR_GCN-NEXT: ret void ; %fptr = alloca ptr, addrspace(5) @@ -43,5 +43,5 @@ attributes #0 = { "amdgpu-no-dispatch-id" } ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll new file mode 100644 index 0000000000000..9c3457e87dbf3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck --check-prefixes=CHECK,OW %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -attributor-assume-closed-world=1 %s | FileCheck --check-prefixes=CHECK,CW %s + +target datalayout = "A5" + +@G = global i32 0, align 4 + +;. +; CHECK: @G = global i32 0, align 4 +;. +define void @bar() { +; CHECK-LABEL: define {{[^@]+}}@bar +; CHECK-SAME: () #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 1, ptr @G, align 4 +; CHECK-NEXT: ret void +; +entry: + store i32 1, ptr @G, align 4 + ret void +} + +define ptr @helper() { +; CHECK-LABEL: define {{[^@]+}}@helper +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret ptr @bar +; +entry: + ret ptr @bar +} + +define amdgpu_kernel void @foo(ptr noundef %fp) { +; OW-LABEL: define {{[^@]+}}@foo +; OW-SAME: (ptr noundef [[FP:%.*]]) #[[ATTR1:[0-9]+]] { +; OW-NEXT: entry: +; OW-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +; OW-NEXT: store ptr [[FP]], ptr addrspace(5) [[FP_ADDR]], align 8 +; OW-NEXT: call void [[FP]]() +; OW-NEXT: ret void +; +; CW-LABEL: define {{[^@]+}}@foo +; CW-SAME: (ptr noundef [[FP:%.*]]) #[[ATTR1:[0-9]+]] { +; CW-NEXT: entry: +; CW-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +; CW-NEXT: store ptr [[FP]], ptr addrspace(5) [[FP_ADDR]], align 8 +; CW-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8 +; CW-NEXT: call void @bar() +; CW-NEXT: ret void +; +entry: + %fp.addr = 
alloca ptr, addrspace(5) + store ptr %fp, ptr addrspace(5) %fp.addr + %load = load ptr, ptr addrspace(5) %fp.addr + call void %load() + ret void +} + +;. +; OW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; OW: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } +;. +; CW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index e86ee1adef3d0..971161a1c5985 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -37,7 +37,7 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; ATTRIBUTOR_GCN-NEXT: [[FPTR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[FPTR]] to ptr ; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr [[FPTR_CAST]], align 8 ; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load ptr, ptr [[FPTR_CAST]], align 8 -; ATTRIBUTOR_GCN-NEXT: call void [[FP]]() +; ATTRIBUTOR_GCN-NEXT: call void @indirect() ; ATTRIBUTOR_GCN-NEXT: ret void ; ; GFX9-LABEL: test_simple_indirect_call: @@ -75,12 +75,16 @@ define amdgpu_kernel void @test_simple_indirect_call() { ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +;. +; AKF_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} +;. +; ATTRIBUTOR_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. - -!llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}