From d1c949492ddf8b637b44e441b329e420e217606c Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Mon, 29 Jul 2024 18:05:27 +0000 Subject: [PATCH 1/2] [AArch64] Add streaming-mode stack hazard optimization remarks Emit an optimization remark when objects in the stack frame may cause hazards in a streaming mode function. The analysis requires either the `aarch64-stack-hazard-size` or `aarch64-stack-hazard-remark-size` flag to be set by the user, with the former flag taking precedence. --- .../Target/AArch64/AArch64FrameLowering.cpp | 205 +++++++++++++++++- .../lib/Target/AArch64/AArch64FrameLowering.h | 6 +- .../AArch64/ssve-stack-hazard-remarks.ll | 156 +++++++++++++ .../CodeGen/AArch64/sve-stack-frame-layout.ll | 4 +- 4 files changed, 360 insertions(+), 11 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index bd530903bb664..5b134f5e35324 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -240,6 +240,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -275,6 +276,10 @@ cl::opt EnableHomogeneousPrologEpilog( // Stack hazard padding size. 0 = disabled. static cl::opt StackHazardSize("aarch64-stack-hazard-size", cl::init(0), cl::Hidden); +// Stack hazard size for analysis remarks. StackHazardSize takes precedence. +static cl::opt + StackHazardRemarkSize("aarch64-stack-hazard-remark-size", cl::init(0), + cl::Hidden); // Whether to insert padding into non-streaming functions (for testing). 
static cl::opt StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming", @@ -2615,9 +2620,16 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, const auto &MFI = MF.getFrameInfo(); int64_t ObjectOffset = MFI.getObjectOffset(FI); + StackOffset SVEStackSize = getSVEStackSize(MF); + + // For VLA-area objects, just emit an offset at the end of the stack frame. + // Whilst not quite correct, these objects do live at the end of the frame and + // so it is more useful for analysis for the offset to reflect this. + if (MFI.isVariableSizedObjectIndex(FI)) { + return StackOffset::getFixed(-((int64_t)MFI.getStackSize())) - SVEStackSize; + } // This is correct in the absence of any SVE stack objects. - StackOffset SVEStackSize = getSVEStackSize(MF); if (!SVEStackSize) return StackOffset::getFixed(ObjectOffset - getOffsetOfLocalArea()); @@ -3528,13 +3540,9 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( return true; } -// Return the FrameID for a Load/Store instruction by looking at the MMO. -static std::optional getLdStFrameID(const MachineInstr &MI, - const MachineFrameInfo &MFI) { - if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1) - return std::nullopt; - - MachineMemOperand *MMO = *MI.memoperands_begin(); +// Return the FrameID for a MMO. +static std::optional getMMOFrameID(MachineMemOperand *MMO, + const MachineFrameInfo &MFI) { auto *PSV = dyn_cast_or_null(MMO->getPseudoValue()); if (PSV) @@ -3552,6 +3560,15 @@ static std::optional getLdStFrameID(const MachineInstr &MI, return std::nullopt; } +// Return the FrameID for a Load/Store instruction by looking at the first MMO. +static std::optional getLdStFrameID(const MachineInstr &MI, + const MachineFrameInfo &MFI) { + if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1) + return std::nullopt; + + return getMMOFrameID(*MI.memoperands_begin(), MFI); +} + // Check if a Hazard slot is needed for the current function, and if so create // one for it. 
The index is stored in AArch64FunctionInfo->StackHazardSlotIndex, // which can be used to determine if any hazard padding is needed. @@ -4626,6 +4643,10 @@ void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced( if (StackTaggingMergeSetTag) II = tryMergeAdjacentSTG(II, this, RS); } + + // Run remarks pass. + MachineOptimizationRemarkEmitter ORE(MF, nullptr); + emitRemarks(MF, ORE); } /// For Win64 AArch64 EH, the offset to the Unwind object is from the SP @@ -5029,3 +5050,171 @@ void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF, MI->eraseFromParent(); } } + +struct StackAccess { + enum AccessType { + NotAccessed = 0, // Stack object not accessed by load/store instructions. + GPR = 1 << 0, // A general purpose register. + PPR = 1 << 1, // A predicate register. + FPR = 1 << 2, // A floating point/Neon/SVE register. + }; + + int Idx; + StackOffset Offset; + int64_t Size; + unsigned AccessTypes; + + StackAccess() : Idx(0), Offset(), Size(0), AccessTypes(NotAccessed) {} + + bool operator<(const StackAccess &Rhs) const { + return std::make_tuple(start(), Idx) < + std::make_tuple(Rhs.start(), Rhs.Idx); + } + + bool isCPU() const { + // Predicate register load and store instructions execute on the CPU. + return AccessTypes & (AccessType::GPR | AccessType::PPR); + } + bool isSME() const { return AccessTypes & AccessType::FPR; } + bool isMixed() const { return ((AccessTypes & (AccessTypes - 1)) != 0); } + + int64_t start() const { return Offset.getFixed() + Offset.getScalable(); } + int64_t end() const { return start() + Size; } + + std::string getTypeString() const { + switch (AccessTypes) { + case AccessType::FPR: + return "FPR"; + case AccessType::PPR: + return "PPR"; + case AccessType::GPR: + return "GPR"; + case AccessType::NotAccessed: + return "NA"; + default: + return "Mixed"; + } + } + + void print(raw_ostream &OS) const { + OS << getTypeString() << " stack object at [SP" + << (Offset.getFixed() < 0 ? 
"" : "+") << Offset.getFixed(); + if (Offset.getScalable()) + OS << (Offset.getScalable() < 0 ? "" : "+") << Offset.getScalable() + << " * vscale"; + OS << "]"; + } +}; + +static inline raw_ostream &operator<<(raw_ostream &OS, const StackAccess &SA) { + SA.print(OS); + return OS; +} + +void AArch64FrameLowering::emitRemarks( + const MachineFunction &MF, MachineOptimizationRemarkEmitter &ORE) const { + + SMEAttrs Attrs(MF.getFunction()); + if (Attrs.hasNonStreamingInterfaceAndBody()) + return; + + const uint64_t HazardSize = + (StackHazardSize) ? StackHazardSize : StackHazardRemarkSize; + + if (HazardSize == 0) + return; + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + + std::vector StackAccesses(MFI.getNumObjects()); + + size_t NumFPLdSt = 0; + size_t NumNonFPLdSt = 0; + + // Collect stack accesses via Load/Store instructions. + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1) + continue; + for (MachineMemOperand *MMO : MI.memoperands()) { + std::optional FI = getMMOFrameID(MMO, MFI); + if (FI && !MFI.isDeadObjectIndex(*FI)) { + int FrameIdx = *FI; + + size_t ArrIdx = FrameIdx + MFI.getNumFixedObjects(); + if (StackAccesses[ArrIdx].AccessTypes == StackAccess::NotAccessed) { + StackAccesses[ArrIdx].Idx = FrameIdx; + StackAccesses[ArrIdx].Offset = + getFrameIndexReferenceFromSP(MF, FrameIdx); + StackAccesses[ArrIdx].Size = MFI.getObjectSize(FrameIdx); + } + + unsigned RegTy = StackAccess::AccessType::GPR; + if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) { + if (AArch64::PPRRegClass.contains(MI.getOperand(0).getReg())) + RegTy = StackAccess::PPR; + else + RegTy = StackAccess::FPR; + } else if (AArch64InstrInfo::isFpOrNEON(MI)) { + RegTy = StackAccess::FPR; + } + + StackAccesses[ArrIdx].AccessTypes |= RegTy; + + if (RegTy == StackAccess::FPR) + ++NumFPLdSt; + else + ++NumNonFPLdSt; + } + } + } + } + + if (NumFPLdSt == 0 || NumNonFPLdSt == 0) + return; + + 
llvm::sort(StackAccesses); + StackAccesses.erase(llvm::remove_if(StackAccesses, + [](const StackAccess &S) { + return S.AccessTypes == + StackAccess::NotAccessed; + }), + StackAccesses.end()); + + SmallVector MixedObjects; + SmallVector> HazardPairs; + + if (StackAccesses.front().isMixed()) + MixedObjects.push_back(&StackAccesses.front()); + + for (auto It = StackAccesses.begin(), End = StackAccesses.end(); + It != (End - 1); ++It) { + const auto &First = *It; + const auto &Second = *(It + 1); + + if (Second.isMixed()) + MixedObjects.push_back(&Second); + + if ((First.isSME() && Second.isCPU()) || + (First.isCPU() && Second.isSME())) { + uint64_t Distance = static_cast(Second.start() - First.end()); + if (Distance < HazardSize) + HazardPairs.emplace_back(&First, &Second); + } + } + + auto EmitRemark = [&](llvm::StringRef Str) { + ORE.emit([&]() { + auto R = MachineOptimizationRemarkAnalysis( + "sme", "StackHazard", MF.getFunction().getSubprogram(), &MF.front()); + return R << formatv("stack hazard in '{0}': ", MF.getName()).str() << Str; + }); + }; + + for (const auto &P : HazardPairs) + EmitRemark(formatv("{0} is too close to {1}", *P.first, *P.second).str()); + + for (const auto *Obj : MixedObjects) + EmitRemark( + formatv("{0} accessed by both GP and FP instructions", *Obj).str()); +} diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 0ebab1700e9ce..396caa6b04868 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -13,8 +13,9 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H -#include "llvm/Support/TypeSize.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/Support/TypeSize.h" namespace llvm { @@ -178,6 +179,9 @@ class AArch64FrameLowering : public TargetFrameLowering { 
inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI, int64_t NegProbeSize, Register TargetReg) const; + + void emitRemarks(const MachineFunction &MF, + MachineOptimizationRemarkEmitter &ORE) const; }; } // End llvm namespace diff --git a/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll new file mode 100644 index 0000000000000..94b915eb42cfd --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll @@ -0,0 +1,156 @@ +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -pass-remarks-analysis=sme -aarch64-stack-hazard-remark-size=64 -o /dev/null < %s 2>&1 | FileCheck %s --check-prefixes=CHECK +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -pass-remarks-analysis=sme -aarch64-stack-hazard-size=1024 -o /dev/null < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-PADDING + +; Don't emit remarks for non-streaming functions. +define float @csr_x20_stackargs_notsc(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i) { +; CHECK-NOT: remark: :0:0: stack hazard in 'csr_x20_stackargs_notsc': +; CHECK-PADDING-NOT: remark: :0:0: stack hazard in 'csr_x20_stackargs_notsc': +entry: + tail call void asm sideeffect "", "~{x20}"() #1 + ret float %i +} + +; Don't emit remarks for functions that only access GPR stack objects. +define i64 @stackargs_gpr(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h, i64 %i) #2 { +; CHECK-NOT: remark: :0:0: stack hazard in 'csr_x20_stackargs_gpr': +; CHECK-PADDING-NOT: remark: :0:0: stack hazard in 'csr_x20_stackargs_gpr': +entry: + ret i64 %i +} + +; Don't emit remarks for functions that only access FPR stack objects. 
+define double @stackargs_fpr(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i) #2 { +; CHECK-NOT: remark: :0:0: stack hazard in 'csr_x20_stackargs_fpr': +; CHECK-PADDING-NOT: remark: :0:0: stack hazard in 'csr_x20_stackargs_fpr': +entry: + ret double %i +} + +; As this case is handled by addition of stack hazard padding, only emit remarks when this is not switched on. +define i32 @csr_d8_alloci64(i64 %d) #2 { +; CHECK: remark: :0:0: stack hazard in 'csr_d8_alloci64': FPR stack object at [SP-16] is too close to GPR stack object at [SP-8] +; CHECK-PADDING-NOT: remark: :0:0: stack hazard in 'csr_d8_alloci64': +entry: + %a = alloca i64 + tail call void asm sideeffect "", "~{d8}"() #1 + store i64 %d, ptr %a + ret i32 0 +} + +; As this case is handled by addition of stack hazard padding, only emit remarks when this is not switched on. +define i32 @csr_d8_allocnxv4i32(i64 %d) #2 { +; CHECK: remark: :0:0: stack hazard in 'csr_d8_allocnxv4i32': FPR stack object at [SP-16] is too close to GPR stack object at [SP-8] +; CHECK-PADDING-NOT: remark: :0:0: stack hazard in 'csr_d8_allocnxv4i32': +entry: + %a = alloca + tail call void asm sideeffect "", "~{d8}"() #1 + store zeroinitializer, ptr %a + ret i32 0 +} + +define float @csr_x20_stackargs(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i) #2 { +; CHECK: remark: :0:0: stack hazard in 'csr_x20_stackargs': GPR stack object at [SP-16] is too close to FPR stack object at [SP+0] +; CHECK-PADDING: remark: :0:0: stack hazard in 'csr_x20_stackargs': GPR stack object at [SP-16] is too close to FPR stack object at [SP+0] +entry: + tail call void asm sideeffect "", "~{x20}"() #1 + ret float %i +} + +; In this case, addition of stack hazard padding triggers x29 (fp) spill, so a hazard occurs between the FPR argument and the GPR spill. 
+define float @csr_d8_stackargs(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i) #2 { +; CHECK-NOT: remark: :0:0: stack hazard in 'csr_d8_stackargs': +; CHECK-PADDING: remark: :0:0: stack hazard in 'csr_d8_stackargs': GPR stack object at [SP-8] is too close to FPR stack object at [SP+0] +entry: + tail call void asm sideeffect "", "~{d8}"() #1 + ret float %i +} + +; SVE calling conventions +; Predicate register spills end up in FP region, currently. + +define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, i16 %P4) #2 { +; CHECK: remark: :0:0: stack hazard in 'svecc_call': PPR stack object at [SP-48-258 * vscale] is too close to FPR stack object at [SP-48-256 * vscale] +; CHECK: remark: :0:0: stack hazard in 'svecc_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48] +; CHECK-PADDING: remark: :0:0: stack hazard in 'svecc_call': PPR stack object at [SP-1072-258 * vscale] is too close to FPR stack object at [SP-1072-256 * vscale] +; CHECK-PADDING-NOT: remark: :0:0: stack hazard in 'svecc_call': +entry: + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37) + ret i32 -396142473 +} + +define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, i16 %P4) #2 { +; CHECK: remark: :0:0: stack hazard in 'svecc_alloca_call': PPR stack object at [SP-48-258 * vscale] is too close to FPR stack object at [SP-48-256 * vscale] +; CHECK: remark: :0:0: stack hazard in 'svecc_alloca_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48] +; CHECK-PADDING: remark: :0:0: stack hazard in 'svecc_alloca_call': PPR stack object at [SP-1072-258 * vscale] is too close to FPR stack object at [SP-1072-256 * vscale] +; CHECK-PADDING-NOT: remark: :0:0: stack hazard in 'svecc_alloca_call': +entry: + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %0 = 
alloca [37 x i8], align 16 + %call = call ptr @memset(ptr noundef nonnull %0, i32 noundef 45, i32 noundef 37) + ret i32 -396142473 +} +declare ptr @memset(ptr, i32, i32) + +%struct.test_struct = type { i32, float, i32 } + +define i32 @mixed_stack_object(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i64 %mixed_obj) #2 { +; CHECK: remark: :0:0: stack hazard in 'mixed_stack_object': Mixed stack object at [SP+8] accessed by both GP and FP instructions +; CHECK-PADDING: remark: :0:0: stack hazard in 'mixed_stack_object': Mixed stack object at [SP+8] accessed by both GP and FP instructions +entry: + %t.sroa.0.0.extract.trunc = trunc i64 %mixed_obj to i32 + %t.sroa.2.0.extract.shift = lshr i64 %mixed_obj, 32 + %t.sroa.2.0.extract.trunc = trunc nuw i64 %t.sroa.2.0.extract.shift to i32 + %0 = bitcast i32 %t.sroa.2.0.extract.trunc to float + %conv = sitofp i32 %t.sroa.0.0.extract.trunc to float + %add = fadd float %conv, %0 + %conv2 = fptosi float %add to i32 + ret i32 %conv2 +} + +define i32 @mixed_stack_objects(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i64 %mixed_obj_0, i64 %mixed_obj_1) #2 { +; CHECK: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP+8] is too close to Mixed stack object at [SP+16] +; CHECK: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP+8] accessed by both GP and FP instructions +; CHECK: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP+16] accessed by both GP and FP instructions +; CHECK-PADDING: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP+8] is too close to Mixed stack object at [SP+16] +; CHECK-PADDING: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP+8] accessed by both GP and FP instructions +; CHECK-PADDING: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP+16] accessed by both GP and FP instructions +entry: + 
%t0.sroa.0.0.extract.trunc = trunc i64 %mixed_obj_0 to i32 + %t0.sroa.2.0.extract.shift = lshr i64 %mixed_obj_0, 32 + %t0.sroa.2.0.extract.trunc = trunc nuw i64 %t0.sroa.2.0.extract.shift to i32 + %t1.sroa.0.0.extract.trunc = trunc i64 %mixed_obj_1 to i32 + %t1.sroa.2.0.extract.shift = lshr i64 %mixed_obj_1, 32 + %t1.sroa.2.0.extract.trunc = trunc nuw i64 %t1.sroa.2.0.extract.shift to i32 + %0 = bitcast i32 %t0.sroa.2.0.extract.trunc to float + %1 = bitcast i32 %t1.sroa.2.0.extract.trunc to float + %conv0 = sitofp i32 %t0.sroa.0.0.extract.trunc to float + %conv1 = sitofp i32 %t1.sroa.0.0.extract.trunc to float + %add0 = fadd float %conv0, %0 + %add1 = fadd float %conv1, %1 + %add = fadd float %add0, %add1 + %conv2 = fptosi float %add to i32 + ret i32 %conv2 +} + +; VLA-area stack objects are not separated. +define i32 @csr_d8_allocnxv4i32i32f64_vlai32f64(double %d, i32 %i) #2 { +; CHECK: remark: :0:0: stack hazard in 'csr_d8_allocnxv4i32i32f64_vlai32f64': GPR stack object at [SP-48-16 * vscale] is too close to FPR stack object at [SP-48-16 * vscale] +; CHECK: remark: :0:0: stack hazard in 'csr_d8_allocnxv4i32i32f64_vlai32f64': FPR stack object at [SP-32] is too close to GPR stack object at [SP-24] +; CHECK-PADDING: remark: :0:0: stack hazard in 'csr_d8_allocnxv4i32i32f64_vlai32f64': GPR stack object at [SP-2096-16 * vscale] is too close to FPR stack object at [SP-2096-16 * vscale] +; CHECK-PADDING-NOT: remark: :0:0: stack hazard in 'csr_d8_allocnxv4i32i32f64_vlai32f64': +entry: + %a = alloca + %0 = zext i32 %i to i64 + %vla0 = alloca i32, i64 %0 + %vla1 = alloca double, i64 %0 + %c = alloca double + tail call void asm sideeffect "", "~{d8}"() #1 + store zeroinitializer, ptr %a + store i32 zeroinitializer, ptr %vla0 + store double %d, ptr %vla1 + store double %d, ptr %c + ret i32 0 +} + +attributes #2 = { "aarch64_pstate_sm_compatible" } diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll index 
431c9dc76508f..ec94198a08ca7 100644 --- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll +++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll @@ -150,8 +150,8 @@ entry: ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Variable, Align: 16, Size: vscale x 16 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40-16 x vscale], Type: Variable, Align: 8, Size: 8 -; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: VariableSized, Align: 1, Size: 0 -; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: VariableSized, Align: 1, Size: 0 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-16 x vscale], Type: VariableSized, Align: 1, Size: 0 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-16 x vscale], Type: VariableSized, Align: 1, Size: 0 define i32 @csr_d8_allocnxv4i32i32f64_vla(double %d, i32 %i) "aarch64_pstate_sm_compatible" { ; CHECK-LABEL: csr_d8_allocnxv4i32i32f64_vla: From e2caf98119ef61c19df339025f856e65f332876c Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Mon, 5 Aug 2024 10:38:46 +0000 Subject: [PATCH 2/2] Address Review comments. 
--- .../llvm/CodeGen/TargetFrameLowering.h | 6 ++ llvm/lib/CodeGen/PrologEpilogInserter.cpp | 3 + .../Target/AArch64/AArch64FrameLowering.cpp | 17 +++-- .../lib/Target/AArch64/AArch64FrameLowering.h | 2 +- .../AArch64/ssve-stack-hazard-remarks.ll | 64 +++++++++---------- 5 files changed, 48 insertions(+), 44 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index 0656c0d739fdf..d8c9d0a432ad8 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -15,6 +15,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/Support/TypeSize.h" #include @@ -473,6 +474,11 @@ class TargetFrameLowering { /// Return the frame base information to be encoded in the DWARF subprogram /// debug info. virtual DwarfFrameBase getDwarfFrameBase(const MachineFunction &MF) const; + + /// This method is called at the end of prolog/epilog code insertion, so + /// targets can emit remarks based on the final frame layout. + virtual void emitRemarks(const MachineFunction &MF, + MachineOptimizationRemarkEmitter *ORE) const {}; }; } // End llvm namespace diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index cd5d877e53d82..f4490873cfdcd 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -341,6 +341,9 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) { << ore::NV("Function", MF.getFunction().getName()) << "'"; }); + // Emit any remarks implemented for the target, based on final frame layout. 
+ TFI->emitRemarks(MF, ORE); + delete RS; SaveBlocks.clear(); RestoreBlocks.clear(); diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 5b134f5e35324..ba46ededc63a8 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -4643,10 +4643,6 @@ void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced( if (StackTaggingMergeSetTag) II = tryMergeAdjacentSTG(II, this, RS); } - - // Run remarks pass. - MachineOptimizationRemarkEmitter ORE(MF, nullptr); - emitRemarks(MF, ORE); } /// For Win64 AArch64 EH, the offset to the Unwind object is from the SP @@ -5076,7 +5072,7 @@ struct StackAccess { return AccessTypes & (AccessType::GPR | AccessType::PPR); } bool isSME() const { return AccessTypes & AccessType::FPR; } - bool isMixed() const { return ((AccessTypes & (AccessTypes - 1)) != 0); } + bool isMixed() const { return isCPU() && isSME(); } int64_t start() const { return Offset.getFixed() + Offset.getScalable(); } int64_t end() const { return start() + Size; } @@ -5112,7 +5108,7 @@ static inline raw_ostream &operator<<(raw_ostream &OS, const StackAccess &SA) { } void AArch64FrameLowering::emitRemarks( - const MachineFunction &MF, MachineOptimizationRemarkEmitter &ORE) const { + const MachineFunction &MF, MachineOptimizationRemarkEmitter *ORE) const { SMEAttrs Attrs(MF.getFunction()); if (Attrs.hasNonStreamingInterfaceAndBody()) @@ -5125,6 +5121,9 @@ void AArch64FrameLowering::emitRemarks( return; const MachineFrameInfo &MFI = MF.getFrameInfo(); + // Bail if function has no stack objects. 
+ if (!MFI.hasStackObjects()) + return; std::vector StackAccesses(MFI.getNumObjects()); @@ -5187,8 +5186,8 @@ void AArch64FrameLowering::emitRemarks( if (StackAccesses.front().isMixed()) MixedObjects.push_back(&StackAccesses.front()); - for (auto It = StackAccesses.begin(), End = StackAccesses.end(); - It != (End - 1); ++It) { + for (auto It = StackAccesses.begin(), End = std::prev(StackAccesses.end()); + It != End; ++It) { const auto &First = *It; const auto &Second = *(It + 1); @@ -5204,7 +5203,7 @@ void AArch64FrameLowering::emitRemarks( } auto EmitRemark = [&](llvm::StringRef Str) { - ORE.emit([&]() { + ORE->emit([&]() { auto R = MachineOptimizationRemarkAnalysis( "sme", "StackHazard", MF.getFunction().getSubprogram(), &MF.front()); return R << formatv("stack hazard in '{0}': ", MF.getName()).str() << Str; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 396caa6b04868..c197312496208 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -181,7 +181,7 @@ class AArch64FrameLowering : public TargetFrameLowering { Register TargetReg) const; void emitRemarks(const MachineFunction &MF, - MachineOptimizationRemarkEmitter &ORE) const; + MachineOptimizationRemarkEmitter *ORE) const override; }; } // End llvm namespace diff --git a/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll index 94b915eb42cfd..0b6bf3892a0c2 100644 --- a/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll +++ b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll @@ -92,45 +92,41 @@ entry: } declare ptr @memset(ptr, i32, i32) -%struct.test_struct = type { i32, float, i32 } +%struct.mixed_struct = type { i32, float } -define i32 @mixed_stack_object(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i64 %mixed_obj) #2 { -; CHECK: remark: :0:0: stack hazard in 'mixed_stack_object': Mixed 
stack object at [SP+8] accessed by both GP and FP instructions -; CHECK-PADDING: remark: :0:0: stack hazard in 'mixed_stack_object': Mixed stack object at [SP+8] accessed by both GP and FP instructions +define i32 @mixed_stack_object(i32 %a, float %b) #2 { +; CHECK: remark: :0:0: stack hazard in 'mixed_stack_object': Mixed stack object at [SP-8] accessed by both GP and FP instructions +; CHECK-PADDING: remark: :0:0: stack hazard in 'mixed_stack_object': Mixed stack object at [SP-8] accessed by both GP and FP instructions entry: - %t.sroa.0.0.extract.trunc = trunc i64 %mixed_obj to i32 - %t.sroa.2.0.extract.shift = lshr i64 %mixed_obj, 32 - %t.sroa.2.0.extract.trunc = trunc nuw i64 %t.sroa.2.0.extract.shift to i32 - %0 = bitcast i32 %t.sroa.2.0.extract.trunc to float - %conv = sitofp i32 %t.sroa.0.0.extract.trunc to float - %add = fadd float %conv, %0 - %conv2 = fptosi float %add to i32 - ret i32 %conv2 + %s = alloca %struct.mixed_struct + %s.i = getelementptr %struct.mixed_struct, ptr %s, i32 0, i32 0 + %s.f = getelementptr %struct.mixed_struct, ptr %s, i32 0, i32 1 + store i32 %a, ptr %s.i + store float %b, ptr %s.f + ret i32 %a } -define i32 @mixed_stack_objects(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i64 %mixed_obj_0, i64 %mixed_obj_1) #2 { -; CHECK: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP+8] is too close to Mixed stack object at [SP+16] -; CHECK: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP+8] accessed by both GP and FP instructions -; CHECK: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP+16] accessed by both GP and FP instructions -; CHECK-PADDING: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP+8] is too close to Mixed stack object at [SP+16] -; CHECK-PADDING: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP+8] accessed by both GP and FP instructions -; 
CHECK-PADDING: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP+16] accessed by both GP and FP instructions +define i32 @mixed_stack_objects(i32 %a, float %b) #2 { +; CHECK: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-16] is too close to Mixed stack object at [SP-8] +; CHECK: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-16] accessed by both GP and FP instructions +; CHECK: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-8] accessed by both GP and FP instructions +; CHECK-PADDING: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-16] is too close to Mixed stack object at [SP-8] +; CHECK-PADDING: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-16] accessed by both GP and FP instructions +; CHECK-PADDING: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-8] accessed by both GP and FP instructions entry: - %t0.sroa.0.0.extract.trunc = trunc i64 %mixed_obj_0 to i32 - %t0.sroa.2.0.extract.shift = lshr i64 %mixed_obj_0, 32 - %t0.sroa.2.0.extract.trunc = trunc nuw i64 %t0.sroa.2.0.extract.shift to i32 - %t1.sroa.0.0.extract.trunc = trunc i64 %mixed_obj_1 to i32 - %t1.sroa.2.0.extract.shift = lshr i64 %mixed_obj_1, 32 - %t1.sroa.2.0.extract.trunc = trunc nuw i64 %t1.sroa.2.0.extract.shift to i32 - %0 = bitcast i32 %t0.sroa.2.0.extract.trunc to float - %1 = bitcast i32 %t1.sroa.2.0.extract.trunc to float - %conv0 = sitofp i32 %t0.sroa.0.0.extract.trunc to float - %conv1 = sitofp i32 %t1.sroa.0.0.extract.trunc to float - %add0 = fadd float %conv0, %0 - %add1 = fadd float %conv1, %1 - %add = fadd float %add0, %add1 - %conv2 = fptosi float %add to i32 - ret i32 %conv2 + %s0 = alloca %struct.mixed_struct + %s0.i = getelementptr %struct.mixed_struct, ptr %s0, i32 0, i32 0 + %s0.f = getelementptr %struct.mixed_struct, ptr %s0, i32 0, i32 1 + store i32 %a, ptr %s0.i 
+ store float %b, ptr %s0.f + + %s1 = alloca %struct.mixed_struct + %s1.i = getelementptr %struct.mixed_struct, ptr %s1, i32 0, i32 0 + %s1.f = getelementptr %struct.mixed_struct, ptr %s1, i32 0, i32 1 + store i32 %a, ptr %s1.i + store float %b, ptr %s1.f + + ret i32 %a } ; VLA-area stack objects are not separated.