From 9898fd1ff6a6f5d96dee2f29681b8a731569d6d9 Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Mon, 6 Nov 2023 22:19:58 -0600 Subject: [PATCH 01/17] [AIX][TLS] Optimize the -maix-small-local-exec-tls local-exec access sequence for non-zero offsets This patch utilizes the -maix-small-local-exec-tls option to produce a faster, non-TOC-based access sequence for the local-exec TLS model, specifically for when the offsets from the TLS variable are non-zero. In particular, this patch produces either a single: - addi/la with a displacement off of R13 plus a non-zero offset for when an address is calculated, or - load or store off of R13 plus a non-zero offset for when an address is calculated and used for further access where R13 is the thread pointer. In order to produce a single addi or load/store off of the thread pointer with a non-zero offset, this patch also adds the necessary support in the assembly printer when printing these instructions. Specifically: - The non-zero offset is added to the TLS variable address when the address of the TLS variable + its offset is less than 32KB. - Otherwise, when the address of the TLS variable + its offset is 32KB or greater, a multiple of 64KB is subtracted from the non-zero offset before it is added to the TLS variable address. This handling in the assembly printer is necessary to ensure that the TLS address + the non-zero offset lies within [-32768, 32768), so that the total displacement can fit within the addi/load/store instructions. 
--- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 170 +++++++++++++- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 69 +++++- .../PowerPC/aix-small-local-exec-tls-char.ll | 6 +- .../aix-small-local-exec-tls-double.ll | 6 +- .../PowerPC/aix-small-local-exec-tls-float.ll | 6 +- .../PowerPC/aix-small-local-exec-tls-int.ll | 6 +- .../aix-small-local-exec-tls-largeaccess.ll | 211 ++++++++---------- .../aix-small-local-exec-tls-largeaccess2.ll | 160 +++++++++++++ .../PowerPC/aix-small-local-exec-tls-short.ll | 6 +- 9 files changed, 497 insertions(+), 143 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 528267cb01329..7859c231f32c3 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -66,6 +66,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/Process.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Threading.h" @@ -155,6 +156,11 @@ class PPCAsmPrinter : public AsmPrinter { TOC; const PPCSubtarget *Subtarget = nullptr; + // Keep track of the number of TLS variables and their corresponding + // addresses, which is then used for the assembly printing of + // non-TOC-based local-exec variables. 
+ MapVector TLSVarsToAddressMapping; + public: explicit PPCAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) @@ -199,6 +205,8 @@ class PPCAsmPrinter : public AsmPrinter { void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI); void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK); void EmitAIXTlsCallHelper(const MachineInstr *MI); + const MCExpr *getAdjustedLocalExecExpr(const MachineOperand &MO, + int64_t Offset); bool runOnMachineFunction(MachineFunction &MF) override { Subtarget = &MF.getSubtarget(); bool Changed = AsmPrinter::runOnMachineFunction(MF); @@ -1503,13 +1511,42 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { case PPC::LWA: { // Verify alignment is legal, so we don't create relocations // that can't be supported. - unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1; + unsigned OpNum; + if (Subtarget->hasAIXSmallLocalExecTLS()) + OpNum = 1; + else + OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1; const MachineOperand &MO = MI->getOperand(OpNum); if (MO.isGlobal()) { const DataLayout &DL = MO.getGlobal()->getParent()->getDataLayout(); if (MO.getGlobal()->getPointerAlignment(DL) < 4) llvm_unreachable("Global must be word-aligned for LD, STD, LWA!"); + + // A faster non-TOC-based local-exec sequence is represented by + // `lwa`/`ld`/`std` directingly loading or storing off of the thread + // pointer and with an immediate operand having the MO_TPREL_FLAG. + // Such instructions do not otherwise arise. + unsigned Flag = MO.getTargetFlags(); + if (Flag == PPCII::MO_TPREL_FLAG) { + assert(Subtarget->hasAIXSmallLocalExecTLS() && + "lwa/ld/std with thread-pointer only expected with " + "local-exec small TLS"); + int64_t Offset = MO.getOffset(); + // Non-zero offsets for lwa/ld/std require special handling and are + // handled here. 
+ if (!Offset) + break; + + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); + if (Offset) { + const MCExpr *Expr = getAdjustedLocalExecExpr(MO, Offset); + TmpInst.getOperand(1) = MCOperand::createExpr(Expr); + } + EmitToStreamer(*OutStreamer, TmpInst); + return; + } } + // Now process the instruction normally. break; } @@ -1523,19 +1560,58 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::EnforceIEIO)); return; } + case PPC::LBZ: + case PPC::LBZ8: + case PPC::LHA: + case PPC::LHA8: + case PPC::LHZ: + case PPC::LHZ8: + case PPC::LWZ: + case PPC::LWZ8: + case PPC::STB: + case PPC::STB8: + case PPC::STH: + case PPC::STH8: + case PPC::STW: + case PPC::STW8: + case PPC::LFS: + case PPC::STFS: + case PPC::LFD: + case PPC::STFD: case PPC::ADDI8: { - // The faster non-TOC-based local-exec sequence is represented by `addi` - // with an immediate operand having the MO_TPREL_FLAG. Such an instruction - // does not otherwise arise. - unsigned Flag = MI->getOperand(2).getTargetFlags(); + // A faster non-TOC-based local-exec sequence is represented by `addi` + // or a load/store instruction (that directly loads or stores off of the + // thread pointer) with an immediate operand having the MO_TPREL_FLAG. + // Such instructions do not otherwise arise. + bool IsMIADDI8 = MI->getOpcode() == PPC::ADDI8; + unsigned OpNum = IsMIADDI8 ? 2 : 1; + const MachineOperand &MO = MI->getOperand(OpNum); + unsigned Flag = MO.getTargetFlags(); if (Flag == PPCII::MO_TPREL_FLAG || Flag == PPCII::MO_GOT_TPREL_PCREL_FLAG || Flag == PPCII::MO_TPREL_PCREL_FLAG) { assert( Subtarget->hasAIXSmallLocalExecTLS() && - "addi with thread-pointer only expected with local-exec small TLS"); + "addi, or load/stores with thread-pointer only expected with " + "local-exec small TLS"); + + int64_t Offset = MO.getOffset(); + // Non-zero offsets for loads/stores require special handling and are + // handled here. For `addi`, all offsets are handled here. 
+ if (!Offset && !IsMIADDI8) + break; + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); - TmpInst.setOpcode(PPC::LA8); + + if (Offset) { + const MCExpr *Expr = getAdjustedLocalExecExpr(MO, Offset); + TmpInst.getOperand(OpNum) = MCOperand::createExpr(Expr); + } + + // Change the opcode to load address if the original opcode is an `addi`. + if (IsMIADDI8) + TmpInst.setOpcode(PPC::LA8); + EmitToStreamer(*OutStreamer, TmpInst); return; } @@ -1547,6 +1623,69 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); } +// For non-TOC-based local-exec variables that have a non-zero offset, +// we need to create a new MCExpr that adds the non-zero offset to the address +// of the local-exec variable that will be used in either an addi, load or +// store. However, the final displacement for these instructions must be +// between [-32768, 32768), so if the TLS address + it's non-zero offset is +// greater than 32KB, a new MCExpr is produced to accommodate this situation. +const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO, + int64_t Offset) { + assert(MO.isGlobal() && "Only expecting a global MachineOperand here!"); + const GlobalValue *GValue = MO.getGlobal(); + TLSModel::Model Model = TM.getTLSModel(GValue); + assert(Model == TLSModel::LocalExec && + "Only local-exec accesses are handled!"); + MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_PPC_AIX_TLSLE; + + const MCExpr *Expr = MCSymbolRefExpr::create(getSymbol(GValue), RefKind, + OutContext); + + bool IsGlobalADeclaration = GValue->isDeclarationForLinker(); + // Find the GlobalVariable that corresponds to the particular TLS variable + // in the TLS variable to address mapping. All TLS variables should exist + // within this map, with the exception of TLS variables marked as extern. 
+ const auto TLSVarsMapEntryIter = TLSVarsToAddressMapping.find(GValue); + if (TLSVarsMapEntryIter == TLSVarsToAddressMapping.end()) + assert(IsGlobalADeclaration && + "Only expecting to find extern TLS variables not present in the TLS " + "variables to address map!"); + + unsigned TLSVarAddress = TLSVarsMapEntryIter->second; + ptrdiff_t FinalAddress = (TLSVarAddress + Offset); + // If the address of the TLS variable + the offset is less than 32KB, + // or if the TLS variable is extern, we simply produce an MCExpr to add the + // non-zero offset to the TLS variable address. + // For when TLS variables are extern, this is safe to do because we can + // assume that the address of extern TLS variables are zero. + if ((FinalAddress < 32768) || IsGlobalADeclaration) + Expr = MCBinaryExpr::createAdd(Expr, + MCConstantExpr::create(Offset, OutContext), + OutContext); + else { + // Handle the written offset for cases where: + // address of the TLS variable + the offset is greater than 32KB. + + // Get the address in the range of 0 to 64KB. + FinalAddress = FinalAddress & 0xFFFF; + // If the highest bit in the calculated address is set, subtract + // additional 64KB to ensure that the final address fits within + // [-32768,32768). + if (FinalAddress & 0x8000) + FinalAddress = FinalAddress - 0x10000; + assert((FinalAddress < 32768) || (FinalAddress >= -32768) && + "Expecting the final address for local-exec TLS variables to be " + "between [-32768,32768)!"); + // Get the offset that is actually written out in assembly by adding back + // the original address of the TLS variable. 
+ ptrdiff_t WrittenOffset = FinalAddress - TLSVarAddress; + Expr = MCBinaryExpr::createAdd( + Expr, MCConstantExpr::create(WrittenOffset, OutContext), OutContext); + } + + return Expr; +} + void PPCLinuxAsmPrinter::emitGNUAttributes(Module &M) { // Emit float ABI into GNU attribute Metadata *MD = M.getModuleFlag("float-abi"); @@ -2757,6 +2896,23 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) { Csect->ensureMinAlignment(GOAlign); }; + // For all TLS variables, calculate their corresponding addresses and store + // them into TLSVarsToAddressMapping, which will be used to determine whether + // or not local-exec TLS variables require special assembly printing. + // This address calculation follows the same method seen within + // assignAddressesAndIndices() in XCOFFObjectWriter.cpp. + uint64_t Address = 0; + uint64_t TLSVarAddress = 0; + auto DL = M.getDataLayout(); + for (const auto &G : M.globals()) { + if (G.isThreadLocal() && !G.isDeclaration()) { + TLSVarAddress = alignTo(Address, getGVAlignment(&G, DL)); + unsigned GVSize = DL.getTypeAllocSize(G.getValueType()); + Address = TLSVarAddress + GVSize; + TLSVarsToAddressMapping[&G] = TLSVarAddress; + } + } + // We need to know, up front, the alignment of csects for the assembly path, // because once a .csect directive gets emitted, we could not change the // alignment value on it. diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 24e067f2ebfba..6e251cada2888 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -7565,8 +7565,64 @@ static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) { DAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), N->getOperand(0)); } +// For non-TOC-based local-exec access where an addi is feeding into another +// addi, fold this sequence into a single addi if possible. 
+static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) { + const PPCSubtarget &Subtarget = + DAG->getMachineFunction().getSubtarget(); + // This optimization is only performed for non-TOC-based local-exec accesses. + if (!Subtarget.hasAIXSmallLocalExecTLS()) + return; + + if (N->getMachineOpcode() != PPC::ADDI8) + return; + + // InitialADDI is the addi feeding into N (also an addi), and the addi that + // we want optimized out. + SDValue InitialADDI = N->getOperand(0); + if (!InitialADDI.isMachineOpcode()) + return; + if (InitialADDI.getMachineOpcode() != PPC::ADDI8) + return; + + // The first operand of the InitialADDI will be the thread pointer. + // This transformation is only performed if the first operand of the + // addi is the thread pointer. + SDValue TPRegNode = InitialADDI.getOperand(0); + RegisterSDNode *TPReg = + dyn_cast_or_null(TPRegNode.getNode()); + if (!TPReg) + return; + if (TPReg->getReg() != Subtarget.getThreadPointerRegister()) + return; + + // The second operand of the InitialADDI will be a TargetGlobalTLSAddress, + // (the local-exec TLS variable). We only perform the folding if the TLS + // variable is the second operand. + SDValue TLSVarNode = InitialADDI.getOperand(1); + GlobalAddressSDNode *GA = dyn_cast(TLSVarNode); + if (!GA) + return; + + unsigned TargetFlags = GA->getTargetFlags(); + if ((TargetFlags & PPCII::MO_TPREL_FLAG) == 0) + return; + // The second operand of the addi that we want to preserve will be an + // immediate. We add this immediate together with the address of the TLS + // variable found in InitialADDI in order to preserve the correct TLS address + // information during assembly printing. 
+ int Offset = N->getConstantOperandVal(1); + TLSVarNode = DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA), MVT::i64, + Offset, TargetFlags); + + (void)DAG->UpdateNodeOperands(N, TPRegNode, TLSVarNode); + if (InitialADDI.getNode()->use_empty()) + DAG->RemoveDeadNode(InitialADDI.getNode()); +} + void PPCDAGToDAGISel::PeepholePPC64() { SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); + bool HasAIXSmallLocalExecTLS = Subtarget->hasAIXSmallLocalExecTLS(); while (Position != CurDAG->allnodes_begin()) { SDNode *N = &*--Position; @@ -7577,6 +7633,8 @@ void PPCDAGToDAGISel::PeepholePPC64() { if (isVSXSwap(SDValue(N, 0))) reduceVSXSwap(N, CurDAG); + foldADDIForLocalExecAccesses(N, CurDAG); + unsigned FirstOp; unsigned StorageOpcode = N->getMachineOpcode(); bool RequiresMod4Offset = false; @@ -7733,7 +7791,16 @@ void PPCDAGToDAGISel::PeepholePPC64() { ImmOpnd = CurDAG->getTargetConstant(Offset, SDLoc(ImmOpnd), ImmOpnd.getValueType()); } else if (Offset != 0) { - continue; + if (!HasAIXSmallLocalExecTLS) + continue; + // Add the non-zero offset information into the load or store + // instruction to be used for non-TOC-based local-exec accesses. 
+ GlobalAddressSDNode *GA = dyn_cast(ImmOpnd); + if (!GA) + continue; + ImmOpnd = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA), + MVT::i64, Offset, + GA->getTargetFlags()); } } diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-char.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-char.ll index 6c05fb38ee16d..c938b9485c257 100644 --- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-char.ll +++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-char.ll @@ -16,14 +16,12 @@ declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1 define nonnull ptr @AddrTest1() local_unnamed_addr #0 { ; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: AddrTest1: ; SMALL-LOCAL-EXEC-SMALLCM64: # %bb.0: # %entry -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r3, c[TL]@le(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: addi r3, r3, 1 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r3, c[TL]@le+1(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: blr ; ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: AddrTest1: ; SMALL-LOCAL-EXEC-LARGECM64: # %bb.0: # %entry -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r3, c[TL]@le(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: addi r3, r3, 1 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r3, c[TL]@le+1(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: blr entry: %0 = tail call align 1 ptr @llvm.threadlocal.address.p0(ptr align 1 @c) diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-double.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-double.ll index 5cf359f68f8bd..02d794fec75cc 100644 --- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-double.ll +++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-double.ll @@ -16,14 +16,12 @@ declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1 define nonnull ptr @AddrTest1() local_unnamed_addr #0 { ; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: AddrTest1: ; SMALL-LOCAL-EXEC-SMALLCM64: # %bb.0: # %entry -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r3, f[TL]@le(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: addi r3, r3, 48 +; 
SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r3, f[TL]@le+48(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: blr ; ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: AddrTest1: ; SMALL-LOCAL-EXEC-LARGECM64: # %bb.0: # %entry -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r3, f[TL]@le(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: addi r3, r3, 48 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r3, f[TL]@le+48(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: blr entry: %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @f) diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-float.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-float.ll index 1fc014edaf2bb..a1f6f4f974bd8 100644 --- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-float.ll +++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-float.ll @@ -16,14 +16,12 @@ declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1 define nonnull ptr @AddrTest1() local_unnamed_addr #0 { ; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: AddrTest1: ; SMALL-LOCAL-EXEC-SMALLCM64: # %bb.0: # %entry -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r3, e[TL]@le(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: addi r3, r3, 16 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r3, e[TL]@le+16(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: blr ; ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: AddrTest1: ; SMALL-LOCAL-EXEC-LARGECM64: # %bb.0: # %entry -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r3, e[TL]@le(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: addi r3, r3, 16 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r3, e[TL]@le+16(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: blr entry: %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @e) diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-int.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-int.ll index 40adf27d7ee39..c74abe93c18bf 100644 --- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-int.ll +++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-int.ll @@ -18,14 +18,12 @@ declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) 
#1 define nonnull ptr @AddrTest1() local_unnamed_addr #0 { ; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: AddrTest1: ; SMALL-LOCAL-EXEC-SMALLCM64: # %bb.0: # %entry -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r3, a[TL]@le(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: addi r3, r3, 12 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r3, a[TL]@le+12(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: blr ; ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: AddrTest1: ; SMALL-LOCAL-EXEC-LARGECM64: # %bb.0: # %entry -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r3, a[TL]@le(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: addi r3, r3, 12 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r3, a[TL]@le+12(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: blr entry: %0 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @a) diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll index 55c69839515c4..3aa3ecc9f2b0d 100644 --- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll +++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll @@ -25,43 +25,33 @@ declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1 define signext i32 @StoreArrays1() { ; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: StoreArrays1: ; SMALL-LOCAL-EXEC-SMALLCM64: # %bb.0: # %entry -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r3, mySmallLocalExecTLSv1[TL]@le(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 1 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r5, 4 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, mySmallLocalExecTLSv1[TL]@le(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r4, mySmallLocalExecTLS2[TL]@le(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r5, 24(r3) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 1 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 4 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, mySmallLocalExecTLSv1[TL]@le(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 2 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, 320(r4) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r3, 
mySmallLocalExecTLS3[TL]@le(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 3 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, 324(r3) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r3, mySmallLocalExecTLS4[TL]@le(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 88 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r5, 328(r3) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r3, mySmallLocalExecTLS5[TL]@le(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, 332(r3) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, mySmallLocalExecTLSv1[TL]@le+24(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, mySmallLocalExecTLS2[TL]@le-65216(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 3 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, mySmallLocalExecTLS3[TL]@le-65212(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 88 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, mySmallLocalExecTLS4[TL]@le-65208(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, mySmallLocalExecTLS5[TL]@le-65204(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 102 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: blr ; ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: StoreArrays1: ; SMALL-LOCAL-EXEC-LARGECM64: # %bb.0: # %entry -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r3, mySmallLocalExecTLSv1[TL]@le(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 1 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r5, 4 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, mySmallLocalExecTLSv1[TL]@le(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r4, mySmallLocalExecTLS2[TL]@le(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r5, 24(r3) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 1 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 4 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLSv1[TL]@le(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 2 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, 320(r4) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r3, mySmallLocalExecTLS3[TL]@le(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 3 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, 324(r3) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r3, 
mySmallLocalExecTLS4[TL]@le(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 88 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r5, 328(r3) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r3, mySmallLocalExecTLS5[TL]@le(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, 332(r3) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, mySmallLocalExecTLSv1[TL]@le+24(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLS2[TL]@le-65216(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 3 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLS3[TL]@le-65212(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 88 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, mySmallLocalExecTLS4[TL]@le-65208(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLS5[TL]@le-65204(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 102 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: blr entry: @@ -98,46 +88,38 @@ entry: define signext i32 @StoreArrays2() { ; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: StoreArrays2: ; SMALL-LOCAL-EXEC-SMALLCM64: # %bb.0: # %entry -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: ld r3, L..C0(r2) # target-flags(ppc-tprel) @mySmallLocalExecTLSv2 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 1 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r5, 4 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: add r3, r13, r3 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, 0(r3) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r4, mySmallLocalExecTLS2[TL]@le(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r5, 24(r3) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 2 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, 320(r4) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r3, mySmallLocalExecTLS3[TL]@le(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: ld r4, L..C0(r2) # target-flags(ppc-tprel) @mySmallLocalExecTLSv2 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 1 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: add r4, r13, r4 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, 0(r4) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 4 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, 24(r4) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 
2 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, mySmallLocalExecTLS2[TL]@le-65216(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 3 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, 324(r3) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r3, mySmallLocalExecTLS4[TL]@le(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r4, mySmallLocalExecTLS5[TL]@le(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r5, 328(r3) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 88 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, 332(r4) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, mySmallLocalExecTLS3[TL]@le-65212(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 88 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, mySmallLocalExecTLS4[TL]@le-65208(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 102 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, mySmallLocalExecTLS5[TL]@le-65204(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: blr ; ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: StoreArrays2: ; SMALL-LOCAL-EXEC-LARGECM64: # %bb.0: # %entry -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: addis r4, L..C0@u(r2) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 1 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r5, 4 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: ld r4, L..C0@l(r4) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: add r4, r13, r4 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, 0(r4) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r3, mySmallLocalExecTLS2[TL]@le(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r5, 24(r4) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 2 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, 320(r3) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r3, mySmallLocalExecTLS3[TL]@le(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 3 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, 324(r3) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r3, mySmallLocalExecTLS4[TL]@le(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r4, mySmallLocalExecTLS5[TL]@le(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r5, 328(r3) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: addis r3, L..C0@u(r2) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 1 +; 
SMALL-LOCAL-EXEC-LARGECM64-NEXT: ld r3, L..C0@l(r3) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: add r3, r13, r3 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, 0(r3) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 4 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, 24(r3) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 2 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLS2[TL]@le-65216(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 3 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLS3[TL]@le-65212(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 88 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, 332(r4) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, mySmallLocalExecTLS4[TL]@le-65208(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLS5[TL]@le-65204(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 102 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: blr entry: @@ -173,77 +155,76 @@ entry: ; DIS: {{.*}}aix-small-local-exec-tls-largeaccess.ll.tmp.o: file format aix5coff64-rs6000 ; DIS: Disassembly of section .text: ; DIS: 0000000000000000 (idx: 3) .StoreArrays1: -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addi 3, 13, 0 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 1 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 4 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, 0(13) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: 15) mySmallLocalExecTLSv1[TL] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 1 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 5, 4 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 0(13) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 2 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 24(13) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: 15) mySmallLocalExecTLSv1[TL] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addi 4, 13, 32748 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, -32468(13) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: 17) mySmallLocalExecTLS2[TL] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 5, 24(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 2 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, 320(4) -; DIS-NEXT: [[#%x, ADDR:]]: 
{{.*}} addi 3, 13, -16788 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 3 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, -16464(13) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: 19) mySmallLocalExecTLS3[TL] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 3 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 324(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addi 3, 13, -788 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 88 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, -460(13) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: 21) mySmallLocalExecTLS4[TL] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 88 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 5, 328(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addi 3, 13, 15212 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, 15544(13) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: 23) mySmallLocalExecTLS5[TL] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 332(3) ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 102 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} blr -; DIS: 0000000000000050 (idx: 5) .StoreArrays2: -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addis 4, 2, 0 +; DIS: 0000000000000040 (idx: 5) .StoreArrays2: +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addis 3, 2, 0 ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCU (idx: 13) mySmallLocalExecTLSv2[TE] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 1 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 5, 4 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} ld 4, 0(4) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 1 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} ld 3, 0(3) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCL (idx: 13) mySmallLocalExecTLSv2[TE] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} add 4, 13, 4 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, 0(4) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addi 3, 13, 32748 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} add 3, 13, 3 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 0(3) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 4 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 24(3) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 2 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, -32468(13) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: 
17) mySmallLocalExecTLS2[TL] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 5, 24(4) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 2 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 320(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addi 3, 13, -16788 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 3 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, -16464(13) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: 19) mySmallLocalExecTLS3[TL] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 3 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 324(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addi 3, 13, -788 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 88 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, -460(13) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: 21) mySmallLocalExecTLS4[TL] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addi 4, 13, 15212 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, 15544(13) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: 23) mySmallLocalExecTLS5[TL] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 5, 328(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 88 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, 332(4) ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 102 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} blr ; DIS: Disassembly of section .data: -; DIS: 00000000000000a0 (idx: 7) StoreArrays1[DS]: +; DIS: 0000000000000080 (idx: 7) StoreArrays1[DS]: +; DIS-NEXT: 80: 00 00 00 00 +; DIS-NEXT: 0000000000000080: R_POS (idx: 3) .StoreArrays1 +; DIS-NEXT: 84: 00 00 00 00 +; DIS-NEXT: 88: 00 00 00 00 +; DIS-NEXT: 0000000000000088: R_POS (idx: 11) TOC[TC0] +; DIS-NEXT: 8c: 00 00 00 b0 + +; DIS: 0000000000000098 (idx: 9) StoreArrays2[DS]: +; DIS-NEXT: 98: 00 00 00 00 +; DIS-NEXT: 0000000000000098: R_POS (idx: 5) .StoreArrays2 +; DIS-NEXT: 9c: 00 00 00 40 ; DIS-NEXT: a0: 00 00 00 00 -; DIS-NEXT: 00000000000000a0: R_POS (idx: 3) .StoreArrays1 -; DIS-NEXT: a4: 00 00 00 00 -; DIS-NEXT: a8: 00 00 00 00 -; DIS-NEXT: 00000000000000a8: R_POS (idx: 11) TOC[TC0] -; DIS-NEXT: ac: 00 00 00 d0 +; DIS-NEXT: 00000000000000a0: R_POS (idx: 11) TOC[TC0] +; DIS-NEXT: a4: 00 00 00 b0 -; 
DIS: 00000000000000b8 (idx: 9) StoreArrays2[DS]: -; DIS-NEXT: b8: 00 00 00 00 -; DIS-NEXT: 00000000000000b8: R_POS (idx: 5) .StoreArrays2 -; DIS-NEXT: bc: 00 00 00 50 -; DIS-NEXT: c0: 00 00 00 00 -; DIS-NEXT: 00000000000000c0: R_POS (idx: 11) TOC[TC0] -; DIS-NEXT: c4: 00 00 00 d0 +; DIS: 00000000000000b0 (idx: 13) mySmallLocalExecTLSv2[TE]: +; DIS-NEXT: b0: 00 00 00 00 +; DIS-NEXT: 00000000000000b0: R_TLS_LE (idx: 25) mySmallLocalExecTLSv2[TL] +; DIS-NEXT: b4: 00 01 79 ec -; DIS: 00000000000000d0 (idx: 13) mySmallLocalExecTLSv2[TE]: -; DIS-NEXT: d0: 00 00 00 00 -; DIS-NEXT: 00000000000000d0: R_TLS_LE (idx: 25) mySmallLocalExecTLSv2[TL] -; DIS-NEXT: d4: 00 01 79 ec +; DIS: Disassembly of section .tdata: +; DIS: 0000000000000000 (idx: 15) mySmallLocalExecTLSv1[TL]: +; DIS: 0000000000007fec (idx: 17) mySmallLocalExecTLS2[TL]: +; DIS: 000000000000be6c (idx: 19) mySmallLocalExecTLS3[TL]: +; DIS: 000000000000fcec (idx: 21) mySmallLocalExecTLS4[TL]: +; DIS: 0000000000013b6c (idx: 23) mySmallLocalExecTLS5[TL]: +; DIS: 00000000000179ec (idx: 25) mySmallLocalExecTLSv2[TL]: diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll new file mode 100644 index 0000000000000..c87b7acb6211c --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll @@ -0,0 +1,160 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \ +; RUN: -mtriple powerpc64-ibm-aix-xcoff -mattr=+aix-small-local-exec-tls < %s \ +; RUN: | FileCheck %s --check-prefix=SMALL-LOCAL-EXEC-SMALLCM64 +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \ +; RUN: -mtriple powerpc64-ibm-aix-xcoff --code-model=large \ +; RUN: -mattr=+aix-small-local-exec-tls < %s | FileCheck %s \ +; RUN: --check-prefix=SMALL-LOCAL-EXEC-LARGECM64 + +; Test disassembly of object. 
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=+aix-small-local-exec-tls \ +; RUN: -mtriple powerpc64-ibm-aix-xcoff -xcoff-traceback-table=false \ +; RUN: --code-model=large -filetype=obj -o %t.o < %s +; RUN: llvm-objdump -D -r --symbol-description %t.o | FileCheck --check-prefix=DIS %s + +@mySmallLocalExecTLS6 = external thread_local(localexec) global [60 x i64], align 8 +@mySmallLocalExecTLS2 = thread_local(localexec) global [3000 x i64] zeroinitializer, align 8 +@MyTLSGDVar = thread_local global [800 x i64] zeroinitializer, align 8 +@mySmallLocalExecTLS3 = thread_local(localexec) global [3000 x i64] zeroinitializer, align 8 +@mySmallLocalExecTLS4 = thread_local(localexec) global [3000 x i64] zeroinitializer, align 8 +@mySmallLocalExecTLS5 = thread_local(localexec) global [3000 x i64] zeroinitializer, align 8 +@mySmallLocalExecTLS = thread_local(localexec) local_unnamed_addr global [7800 x i64] zeroinitializer, align 8 +declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1 + +; All accesses use a "faster" local-exec sequence directly off the thread pointer. 
+define i64 @StoreLargeAccess1() { +; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: StoreLargeAccess1: +; SMALL-LOCAL-EXEC-SMALLCM64: # %bb.0: # %entry +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: mflr r0 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stdu r1, -48(r1) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 212 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 203 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r0, 64(r1) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r3, mySmallLocalExecTLS6[UL]@le+424(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r4, mySmallLocalExecTLS2[TL]@le+1200(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: ld r3, L..C0(r2) # target-flags(ppc-lo) @MyTLSGDVar +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: ld r4, L..C1(r2) # target-flags(ppc-tlsgd) @MyTLSGDVar +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: bla .__tls_get_addr[PR] +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 44 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r4, 440(r3) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 6 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 100 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r3, mySmallLocalExecTLS3[TL]@le+2000(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 882 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r4, mySmallLocalExecTLS4[TL]@le-58736(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r3, mySmallLocalExecTLS5[TL]@le-57136(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 1191 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: addi r1, r1, 48 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: ld r0, 16(r1) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: mtlr r0 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: blr +; +; SMALL-LOCAL-EXEC-LARGECM64-LABEL: StoreLargeAccess1: +; SMALL-LOCAL-EXEC-LARGECM64: # %bb.0: # %entry +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: mflr r0 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stdu r1, -48(r1) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 212 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: std r0, 64(r1) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: addis r4, L..C0@u(r2) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: ld r4, L..C0@l(r4) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: std r3, mySmallLocalExecTLS6[UL]@le+424(r13) 
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 203 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: std r3, mySmallLocalExecTLS2[TL]@le+1200(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: addis r3, L..C1@u(r2) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: ld r3, L..C1@l(r3) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: bla .__tls_get_addr[PR] +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 44 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: std r4, 440(r3) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 6 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 100 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: std r3, mySmallLocalExecTLS3[TL]@le+2000(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 882 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: std r4, mySmallLocalExecTLS4[TL]@le-58736(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: std r3, mySmallLocalExecTLS5[TL]@le-57136(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 1191 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: addi r1, r1, 48 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: ld r0, 16(r1) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: mtlr r0 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: blr +entry: + %0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS6) + %arrayidx = getelementptr inbounds [60 x i64], ptr %0, i64 0, i64 53 + store i64 212, ptr %arrayidx, align 8 + %1 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS2) + %arrayidx1 = getelementptr inbounds [3000 x i64], ptr %1, i64 0, i64 150 + store i64 203, ptr %arrayidx1, align 8 + %2 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @MyTLSGDVar) + %arrayidx2 = getelementptr inbounds [800 x i64], ptr %2, i64 0, i64 55 + store i64 44, ptr %arrayidx2, align 8 + %3 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS3) + %arrayidx3 = getelementptr inbounds [3000 x i64], ptr %3, i64 0, i64 250 + store i64 6, ptr %arrayidx3, align 8 + %4 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS4) + %arrayidx4 = getelementptr inbounds [3000 x i64], ptr %4, 
i64 0, i64 850 + store i64 100, ptr %arrayidx4, align 8 + %5 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallLocalExecTLS5) + %arrayidx5 = getelementptr inbounds [3000 x i64], ptr %5, i64 0, i64 1050 + store i64 882, ptr %arrayidx5, align 8 + %6 = load i64, ptr %arrayidx1, align 8 + %7 = load i64, ptr %arrayidx3, align 8 + %8 = load i64, ptr %arrayidx4, align 8 + %add = add i64 %6, 882 + %add9 = add i64 %add, %7 + %add11 = add i64 %add9, %8 + ret i64 %add11 +} + +; DIS: 0000000000000000 (idx: 7) .StoreLargeAccess1: +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} mflr 0 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stdu 1, -48(1) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 212 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} std 0, 64(1) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addis 4, 2, 0 +; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCU (idx: 13) MyTLSGDVar[TE] +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} ld 4, 0(4) +; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCL (idx: 13) MyTLSGDVar[TE] +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} std 3, 424(13) +; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: 1) mySmallLocalExecTLS6[UL] +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 203 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} std 3, 1200(13) +; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: 17) mySmallLocalExecTLS2[TL] +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addis 3, 2, 0 +; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCU (idx: 15) .MyTLSGDVar[TE] +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} ld 3, 8(3) +; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCL (idx: 15) .MyTLSGDVar[TE] +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} bla 0 +; DIS-NEXT: {{0*}}[[#ADDR]]: R_RBA (idx: 3) .__tls_get_addr[PR] +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 44 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} std 4, 440(3) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 6 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 100 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} std 3, 32400(13) +; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: 21) mySmallLocalExecTLS3[TL] +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 882 +; DIS-NEXT: [[#%x, ADDR:]]: 
{{.*}} std 4, -4336(13) +; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: 23) mySmallLocalExecTLS4[TL] +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} std 3, 21264(13) +; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: 25) mySmallLocalExecTLS5[TL] +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 1191 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addi 1, 1, 48 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} ld 0, 16(1) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} mtlr 0 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} blr + +; DIS: Disassembly of section .data: +; DIS: 0000000000000068 (idx: 9) StoreLargeAccess1[DS]: +; DIS-NEXT: 68: 00 00 00 00 +; DIS-NEXT: 0000000000000068: R_POS (idx: 7) .StoreLargeAccess1 +; DIS-NEXT: 6c: 00 00 00 00 +; DIS-NEXT: 70: 00 00 00 00 +; DIS-NEXT: 0000000000000070: R_POS (idx: 11) TOC[TC0] +; DIS-NEXT: 74: 00 00 00 80 + +; DIS: Disassembly of section .tdata: +; DIS: 0000000000000000 (idx: 17) mySmallLocalExecTLS2[TL]: +; DIS: 0000000000005dc0 (idx: 19) MyTLSGDVar[TL]: +; DIS: 00000000000076c0 (idx: 21) mySmallLocalExecTLS3[TL]: +; DIS: 000000000000d480 (idx: 23) mySmallLocalExecTLS4[TL]: +; DIS: 0000000000013240 (idx: 25) mySmallLocalExecTLS5[TL]: +; DIS: 0000000000019000 (idx: 27) mySmallLocalExecTLS[TL]: diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-short.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-short.ll index bf1b7fab30814..b172c2985e695 100644 --- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-short.ll +++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-short.ll @@ -16,14 +16,12 @@ declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) #1 define nonnull ptr @AddrTest1() local_unnamed_addr #0 { ; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: AddrTest1: ; SMALL-LOCAL-EXEC-SMALLCM64: # %bb.0: # %entry -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r3, b[TL]@le(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: addi r3, r3, 4 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: la r3, b[TL]@le+4(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: blr ; ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: AddrTest1: ; 
SMALL-LOCAL-EXEC-LARGECM64: # %bb.0: # %entry -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r3, b[TL]@le(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: addi r3, r3, 4 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: la r3, b[TL]@le+4(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: blr entry: %0 = tail call align 2 ptr @llvm.threadlocal.address.p0(ptr align 2 @b) From b4e7632776762783794a8354ee0cf0f6c793222c Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Mon, 6 Nov 2023 22:36:01 -0600 Subject: [PATCH 02/17] Apply formatting changes --- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 18 +++++++++--------- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 8 +++----- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 7859c231f32c3..288f8a672cee4 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -68,8 +68,8 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Process.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/Threading.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -1638,8 +1638,8 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO, "Only local-exec accesses are handled!"); MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_PPC_AIX_TLSLE; - const MCExpr *Expr = MCSymbolRefExpr::create(getSymbol(GValue), RefKind, - OutContext); + const MCExpr *Expr = + MCSymbolRefExpr::create(getSymbol(GValue), RefKind, OutContext); bool IsGlobalADeclaration = GValue->isDeclarationForLinker(); // Find the GlobalVariable that corresponds to the particular TLS variable @@ -1659,9 +1659,8 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO, // For when TLS variables are extern, this is safe to do because we can // 
assume that the address of extern TLS variables are zero. if ((FinalAddress < 32768) || IsGlobalADeclaration) - Expr = MCBinaryExpr::createAdd(Expr, - MCConstantExpr::create(Offset, OutContext), - OutContext); + Expr = MCBinaryExpr::createAdd( + Expr, MCConstantExpr::create(Offset, OutContext), OutContext); else { // Handle the written offset for cases where: // address of the TLS variable + the offset is greater than 32KB. @@ -1673,9 +1672,10 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO, // [-32768,32768). if (FinalAddress & 0x8000) FinalAddress = FinalAddress - 0x10000; - assert((FinalAddress < 32768) || (FinalAddress >= -32768) && - "Expecting the final address for local-exec TLS variables to be " - "between [-32768,32768)!"); + assert((FinalAddress < 32768) || + (FinalAddress >= -32768) && + "Expecting the final address for local-exec TLS variables to be " + "between [-32768,32768)!"); // Get the offset that is actually written out in assembly by adding back // the original address of the TLS variable. ptrdiff_t WrittenOffset = FinalAddress - TLSVarAddress; diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 6e251cada2888..3c4c798294694 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -7589,8 +7589,7 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) { // This transformation is only performed if the first operand of the // addi is the thread pointer. 
SDValue TPRegNode = InitialADDI.getOperand(0); - RegisterSDNode *TPReg = - dyn_cast_or_null<RegisterSDNode>(TPRegNode.getNode()); + RegisterSDNode *TPReg = dyn_cast_or_null<RegisterSDNode>(TPRegNode.getNode()); if (!TPReg) return; if (TPReg->getReg() != Subtarget.getThreadPointerRegister()) @@ -7798,9 +7797,8 @@ void PPCDAGToDAGISel::PeepholePPC64() { GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd); if (!GA) continue; - ImmOpnd = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA), - MVT::i64, Offset, - GA->getTargetFlags()); + ImmOpnd = CurDAG->getTargetGlobalAddress( + GA->getGlobal(), SDLoc(GA), MVT::i64, Offset, GA->getTargetFlags()); } } From 594cf6a488abc982c3e7db186c5570c8dd3caca9 Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Wed, 15 Nov 2023 23:27:01 -0600 Subject: [PATCH 03/17] Address review comments: fix comments, simplify offset generation, remove unnecessary breaks --- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 85 +++++++++++------------ 1 file changed, 39 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 288f8a672cee4..0efb4462a10da 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -761,6 +761,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { MCInst TmpInst; const bool IsPPC64 = Subtarget->isPPC64(); const bool IsAIX = Subtarget->isAIXABI(); + const bool HasAIXSmallLocalExecTLS = Subtarget->hasAIXSmallLocalExecTLS(); const Module *M = MF->getFunction().getParent(); PICLevel::Level PL = M->getPICLevel(); @@ -1511,11 +1512,15 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { case PPC::LWA: { // Verify alignment is legal, so we don't create relocations // that can't be supported. - unsigned OpNum; - if (Subtarget->hasAIXSmallLocalExecTLS()) - OpNum = 1; - else - OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1; + unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 
2 : 1; + // For non-TOC-based local-exec TLS accesses with non-zero offsets, the + // machine operand (which is a TargetGlobalTLSAddress) is expected to be + // the same operand for both loads and stores. + for (const MachineOperand &TempMO : MI->operands()) { + if (((TempMO.getTargetFlags() & PPCII::MO_TPREL_FLAG) != 0) && + TempMO.getOperandNo() == 1) + OpNum = 1; + } const MachineOperand &MO = MI->getOperand(OpNum); if (MO.isGlobal()) { const DataLayout &DL = MO.getGlobal()->getParent()->getDataLayout(); @@ -1528,20 +1533,14 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { // Such instructions do not otherwise arise. unsigned Flag = MO.getTargetFlags(); if (Flag == PPCII::MO_TPREL_FLAG) { - assert(Subtarget->hasAIXSmallLocalExecTLS() && + assert(HasAIXSmallLocalExecTLS && "lwa/ld/std with thread-pointer only expected with " "local-exec small TLS"); int64_t Offset = MO.getOffset(); - // Non-zero offsets for lwa/ld/std require special handling and are - // handled here. - if (!Offset) - break; - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); - if (Offset) { - const MCExpr *Expr = getAdjustedLocalExecExpr(MO, Offset); - TmpInst.getOperand(1) = MCOperand::createExpr(Expr); - } + const MCExpr *Expr = getAdjustedLocalExecExpr(MO, Offset); + if (Expr) + TmpInst.getOperand(OpNum) = MCOperand::createExpr(Expr); EmitToStreamer(*OutStreamer, TmpInst); return; } @@ -1590,23 +1589,16 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { if (Flag == PPCII::MO_TPREL_FLAG || Flag == PPCII::MO_GOT_TPREL_PCREL_FLAG || Flag == PPCII::MO_TPREL_PCREL_FLAG) { - assert( - Subtarget->hasAIXSmallLocalExecTLS() && - "addi, or load/stores with thread-pointer only expected with " - "local-exec small TLS"); + assert(HasAIXSmallLocalExecTLS && + "addi, or load/stores with thread-pointer only expected with " + "local-exec small TLS"); int64_t Offset = MO.getOffset(); - // Non-zero offsets for loads/stores require special handling and are - // handled here. 
For `addi`, all offsets are handled here. - if (!Offset && !IsMIADDI8) - break; - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); - if (Offset) { - const MCExpr *Expr = getAdjustedLocalExecExpr(MO, Offset); + const MCExpr *Expr = getAdjustedLocalExecExpr(MO, Offset); + if (Expr) TmpInst.getOperand(OpNum) = MCOperand::createExpr(Expr); - } // Change the opcode to load address if the original opcode is an `addi`. if (IsMIADDI8) @@ -1627,7 +1619,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { // we need to create a new MCExpr that adds the non-zero offset to the address // of the local-exec variable that will be used in either an addi, load or // store. However, the final displacement for these instructions must be -// between [-32768, 32768), so if the TLS address + it's non-zero offset is +// between [-32768, 32768), so if the TLS address + its non-zero offset is // greater than 32KB, a new MCExpr is produced to accommodate this situation. const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO, int64_t Offset) { @@ -1638,6 +1630,10 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO, "Only local-exec accesses are handled!"); MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_PPC_AIX_TLSLE; + // Non-zero offsets (for loads, stores or `addi`) require additional handling. + // When the offset is zero, there is no need to create an adjusted MCExpr. + if (!Offset) + return nullptr; const MCExpr *Expr = MCSymbolRefExpr::create(getSymbol(GValue), RefKind, OutContext); @@ -1651,7 +1647,8 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO, "Only expecting to find extern TLS variables not present in the TLS " "variables to address map!"); - unsigned TLSVarAddress = TLSVarsMapEntryIter->second; + unsigned TLSVarAddress = + IsGlobalADeclaration ? 
0 : TLSVarsMapEntryIter->second; ptrdiff_t FinalAddress = (TLSVarAddress + Offset); // If the address of the TLS variable + the offset is less than 32KB, // or if the TLS variable is extern, we simply produce an MCExpr to add the @@ -1663,24 +1660,20 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO, Expr, MCConstantExpr::create(Offset, OutContext), OutContext); else { // Handle the written offset for cases where: - // address of the TLS variable + the offset is greater than 32KB. - - // Get the address in the range of 0 to 64KB. - FinalAddress = FinalAddress & 0xFFFF; - // If the highest bit in the calculated address is set, subtract - // additional 64KB to ensure that the final address fits within - // [-32768,32768). - if (FinalAddress & 0x8000) - FinalAddress = FinalAddress - 0x10000; - assert((FinalAddress < 32768) || - (FinalAddress >= -32768) && - "Expecting the final address for local-exec TLS variables to be " - "between [-32768,32768)!"); - // Get the offset that is actually written out in assembly by adding back - // the original address of the TLS variable. - ptrdiff_t WrittenOffset = FinalAddress - TLSVarAddress; + // TLS variable address + Offset > 32KB. + + // The assembly that is printed is actually: + // TLSVar[storageMappingClass]@le + Offset - Delta + // where Delta is a multiple of 64KB: ((FinalAddress + 32768) & ~0xFFFF). + ptrdiff_t OffsetDelta = Offset - ((FinalAddress + 32768) & ~0xFFFF); + // Check that the total instruction displacement fits within [-32768,32768). 
+ ptrdiff_t InstDisp = TLSVarAddress + OffsetDelta; + assert((InstDisp < 32768) || + (InstDisp >= -32768) && + "Expecting the instruction displacement for local-exec TLS " + "variables to be between [-32768, 32768)!"); Expr = MCBinaryExpr::createAdd( - Expr, MCConstantExpr::create(WrittenOffset, OutContext), OutContext); + Expr, MCConstantExpr::create(OffsetDelta, OutContext), OutContext); } return Expr; From b5bcb2541b5380e08f1d0fd22fd066d93d27fa56 Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Wed, 22 Nov 2023 09:30:45 -0600 Subject: [PATCH 04/17] Update comments and combine conditions --- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 2 +- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 0efb4462a10da..41da22a5d9fb1 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -1663,7 +1663,7 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO, // TLS variable address + Offset > 32KB. // The assembly that is printed is actually: - // TLSVar[storageMappingClass]@le + Offset - Delta + // TLSVar@le + Offset - Delta // where Delta is a multiple of 64KB: ((FinalAddress + 32768) & ~0xFFFF). ptrdiff_t OffsetDelta = Offset - ((FinalAddress + 32768) & ~0xFFFF); // Check that the total instruction displacement fits within [-32768,32768). diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 3c4c798294694..fbdfee69acc1d 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -7580,22 +7580,19 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) { // InitialADDI is the addi feeding into N (also an addi), and the addi that // we want optimized out. 
SDValue InitialADDI = N->getOperand(0); - if (!InitialADDI.isMachineOpcode()) - return; - if (InitialADDI.getMachineOpcode() != PPC::ADDI8) + if (!InitialADDI.isMachineOpcode() || + (InitialADDI.getMachineOpcode() != PPC::ADDI8)) return; - // The first operand of the InitialADDI will be the thread pointer. + // The first operand of the InitialADDI should be the thread pointer. // This transformation is only performed if the first operand of the // addi is the thread pointer. SDValue TPRegNode = InitialADDI.getOperand(0); RegisterSDNode *TPReg = dyn_cast_or_null<RegisterSDNode>(TPRegNode.getNode()); - if (!TPReg) - return; - if (TPReg->getReg() != Subtarget.getThreadPointerRegister()) + if (!TPReg || (TPReg->getReg() != Subtarget.getThreadPointerRegister())) return; - // The second operand of the InitialADDI will be a TargetGlobalTLSAddress, + // The second operand of the InitialADDI should be the global TLS address // (the local-exec TLS variable). We only perform the folding if the TLS // variable is the second operand. SDValue TLSVarNode = InitialADDI.getOperand(1); @@ -7603,12 +7600,15 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) { if (!GA) return; + // The local-exec TLS variable should only have the MO_TPREL_FLAG target flag, + // so this optimization is not performed otherwise if the flag is not set. unsigned TargetFlags = GA->getTargetFlags(); if ((TargetFlags & PPCII::MO_TPREL_FLAG) == 0) return; + // The second operand of the addi that we want to preserve will be an - // immediate. We add this immediate together with the address of the TLS - // variable found in InitialADDI in order to preserve the correct TLS address + // immediate. We add this immediate, together with the address of the TLS + // variable found in InitialADDI, in order to preserve the correct TLS address // information during assembly printing. 
int Offset = N->getConstantOperandVal(1); TLSVarNode = DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA), MVT::i64, From c34760e19cc949df56d0608b36534d7f2153e25a Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Wed, 22 Nov 2023 12:33:12 -0600 Subject: [PATCH 05/17] Remove unnecessary whitespace --- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 41da22a5d9fb1..2fab49e8c6b94 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -1545,7 +1545,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { return; } } - // Now process the instruction normally. break; } From 504c5e0321267d78ffc62f3a3793607002e650bb Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Wed, 29 Nov 2023 11:20:12 -0600 Subject: [PATCH 06/17] Address review comments by pulling out checks for if addi is eligible for folding --- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 6 +- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 84 ++++++++++++++------- 2 files changed, 58 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 2fab49e8c6b94..17cf23920c7c2 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -1528,13 +1528,13 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { llvm_unreachable("Global must be word-aligned for LD, STD, LWA!"); // A faster non-TOC-based local-exec sequence is represented by - // `lwa`/`ld`/`std` directingly loading or storing off of the thread - // pointer and with an immediate operand having the MO_TPREL_FLAG. + // directingly loading or storing off of the thread pointer and with + // an immediate operand having the MO_TPREL_FLAG. // Such instructions do not otherwise arise. 
unsigned Flag = MO.getTargetFlags(); if (Flag == PPCII::MO_TPREL_FLAG) { assert(HasAIXSmallLocalExecTLS && - "lwa/ld/std with thread-pointer only expected with " + "loads/stores with thread-pointer only expected with " "local-exec small TLS"); int64_t Offset = MO.getOffset(); LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index fbdfee69acc1d..cf4a081671a88 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -7565,51 +7565,76 @@ static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) { DAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), N->getOperand(0)); } -// For non-TOC-based local-exec access where an addi is feeding into another -// addi, fold this sequence into a single addi if possible. -static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) { +// Is an ADDI eligible for folding for non-TOC-based local-exec accesses? +static bool isEligibleToFoldADDIForLocalExecAccesses(SDNode *N, + SelectionDAG *DAG, + SDValue ADDIToFold) { const PPCSubtarget &Subtarget = DAG->getMachineFunction().getSubtarget<PPCSubtarget>(); // This optimization is only performed for non-TOC-based local-exec accesses. if (!Subtarget.hasAIXSmallLocalExecTLS()) - return; - - if (N->getMachineOpcode() != PPC::ADDI8) - return; + return false; - // InitialADDI is the addi feeding into N (also an addi), and the addi that - // we want optimized out. - SDValue InitialADDI = N->getOperand(0); - if (!InitialADDI.isMachineOpcode() || - (InitialADDI.getMachineOpcode() != PPC::ADDI8)) - return; + // Check if ADDIToFold (the ADDI that we want to fold into local-exec + // accesses), is truly an ADDI. + if (!ADDIToFold.isMachineOpcode() || + (ADDIToFold.getMachineOpcode() != PPC::ADDI8)) + return false; - // The first operand of the InitialADDI should be the thread pointer. + // The first operand of the ADDIToFold should be the thread pointer. 
// This transformation is only performed if the first operand of the // addi is the thread pointer. - SDValue TPRegNode = InitialADDI.getOperand(0); + SDValue TPRegNode = ADDIToFold.getOperand(0); RegisterSDNode *TPReg = dyn_cast_or_null(TPRegNode.getNode()); if (!TPReg || (TPReg->getReg() != Subtarget.getThreadPointerRegister())) - return; + return false; - // The second operand of the InitialADDI should be the global TLS address + // The second operand of the ADDIToFold should be the global TLS address // (the local-exec TLS variable). We only perform the folding if the TLS // variable is the second operand. - SDValue TLSVarNode = InitialADDI.getOperand(1); + SDValue TLSVarNode = ADDIToFold.getOperand(1); GlobalAddressSDNode *GA = dyn_cast(TLSVarNode); if (!GA) - return; + return false; // The local-exec TLS variable should only have the MO_TPREL_FLAG target flag, // so this optimization is not performed otherwise if the flag is not set. unsigned TargetFlags = GA->getTargetFlags(); if ((TargetFlags & PPCII::MO_TPREL_FLAG) == 0) + return false; + + // If all conditions are satisfied, the ADDI is valid for folding. + return true; +} + +// For non-TOC-based local-exec access where an addi is feeding into another +// addi, fold this sequence into a single addi if possible. +static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) { + if (N->getMachineOpcode() != PPC::ADDI8) + return; + + // InitialADDI is the addi feeding into N (also an addi), and the addi that + // we want optimized out. + SDValue InitialADDI = N->getOperand(0); + + if (!isEligibleToFoldADDIForLocalExecAccesses(N, DAG, InitialADDI)) return; + // At this point, InitialADDI can be folded into a non-TOC-based local-exec + // access. The first operand of InitialADDI should be the thread pointer. 
+ SDValue TPRegNode = InitialADDI.getOperand(0); + + // The second operand of the InitialADDI should be the global TLS address + // (the local-exec TLS variable), with the MO_TPREL_FLAG target flag. + SDValue TLSVarNode = InitialADDI.getOperand(1); + GlobalAddressSDNode *GA = dyn_cast(TLSVarNode); + unsigned TargetFlags = GA->getTargetFlags(); + // The second operand of the addi that we want to preserve will be an // immediate. We add this immediate, together with the address of the TLS // variable found in InitialADDI, in order to preserve the correct TLS address - // information during assembly printing. + // information during assembly printing. The offset is likely to be non-zero + // when we end up in this case. int Offset = N->getConstantOperandVal(1); TLSVarNode = DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA), MVT::i64, Offset, TargetFlags); @@ -7621,7 +7646,6 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) { void PPCDAGToDAGISel::PeepholePPC64() { SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); - bool HasAIXSmallLocalExecTLS = Subtarget->hasAIXSmallLocalExecTLS(); while (Position != CurDAG->allnodes_begin()) { SDNode *N = &*--Position; @@ -7790,15 +7814,17 @@ void PPCDAGToDAGISel::PeepholePPC64() { ImmOpnd = CurDAG->getTargetConstant(Offset, SDLoc(ImmOpnd), ImmOpnd.getValueType()); } else if (Offset != 0) { - if (!HasAIXSmallLocalExecTLS) - continue; - // Add the non-zero offset information into the load or store - // instruction to be used for non-TOC-based local-exec accesses. - GlobalAddressSDNode *GA = dyn_cast(ImmOpnd); - if (!GA) + if (isEligibleToFoldADDIForLocalExecAccesses(N, CurDAG, Base)) { + // Add the non-zero offset information into the load or store + // instruction to be used for non-TOC-based local-exec accesses. 
+ GlobalAddressSDNode *GA = dyn_cast(ImmOpnd); + if (!GA) + continue; + ImmOpnd = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA), + MVT::i64, Offset, + GA->getTargetFlags()); + } else continue; - ImmOpnd = CurDAG->getTargetGlobalAddress( - GA->getGlobal(), SDLoc(GA), MVT::i64, Offset, GA->getTargetFlags()); } } From 98f5fcf72f013af6bdd70993d2248dc682adbdea Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Wed, 29 Nov 2023 15:05:36 -0600 Subject: [PATCH 07/17] Remove comment and add asserts --- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 2 -- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 12 +++++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 17cf23920c7c2..66a8cb6e5087d 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -2891,8 +2891,6 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) { // For all TLS variables, calculate their corresponding addresses and store // them into TLSVarsToAddressMapping, which will be used to determine whether // or not local-exec TLS variables require special assembly printing. - // This address calculation follows the same method seen within - // assignAddressesAndIndices() in XCOFFObjectWriter.cpp. uint64_t Address = 0; uint64_t TLSVarAddress = 0; auto DL = M.getDataLayout(); diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index cf4a081671a88..9cb1ecd8db175 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -7621,13 +7621,23 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) { return; // At this point, InitialADDI can be folded into a non-TOC-based local-exec - // access. The first operand of InitialADDI should be the thread pointer. + // access. 
The first operand of InitialADDI should be the thread pointer, + // which has been checked in isEligibleToFoldADDIForLocalExecAccesses(). SDValue TPRegNode = InitialADDI.getOperand(0); + RegisterSDNode *TPReg = dyn_cast_or_null(TPRegNode.getNode()); + const PPCSubtarget &Subtarget = + DAG->getMachineFunction().getSubtarget(); + assert((TPReg && (TPReg->getReg() == Subtarget.getThreadPointerRegister())) && + "Expecting the first operand to be a thread pointer for folding addi " + "in local-exec accesses!"); // The second operand of the InitialADDI should be the global TLS address // (the local-exec TLS variable), with the MO_TPREL_FLAG target flag. + // This has been checked in isEligibleToFoldADDIForLocalExecAccesses(). SDValue TLSVarNode = InitialADDI.getOperand(1); GlobalAddressSDNode *GA = dyn_cast(TLSVarNode); + assert(GA && "Expecting a valid GlobalAddressSDNode when folding addi into " + "local-exec accesses!"); unsigned TargetFlags = GA->getTargetFlags(); // The second operand of the addi that we want to preserve will be an From f3f7ca935016c563be4665cf798799521a2987d4 Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Thu, 7 Dec 2023 12:39:57 -0600 Subject: [PATCH 08/17] Update comments and condition --- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 4 ++-- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 66a8cb6e5087d..3acd19eb18fec 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -1528,7 +1528,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { llvm_unreachable("Global must be word-aligned for LD, STD, LWA!"); // A faster non-TOC-based local-exec sequence is represented by - // directingly loading or storing off of the thread pointer and with + // directly loading or storing off of the thread pointer and with // an immediate operand 
having the MO_TPREL_FLAG. // Such instructions do not otherwise arise. unsigned Flag = MO.getTargetFlags(); @@ -1654,7 +1654,7 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO, // non-zero offset to the TLS variable address. // For when TLS variables are extern, this is safe to do because we can // assume that the address of extern TLS variables are zero. - if ((FinalAddress < 32768) || IsGlobalADeclaration) + if (FinalAddress < 32768) Expr = MCBinaryExpr::createAdd( Expr, MCConstantExpr::create(Offset, OutContext), OutContext); else { diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 9cb1ecd8db175..9dd9d32ab4854 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -7609,6 +7609,11 @@ static bool isEligibleToFoldADDIForLocalExecAccesses(SDNode *N, // For non-TOC-based local-exec access where an addi is feeding into another // addi, fold this sequence into a single addi if possible. 
+// Before this optimization, the sequence appears as: +// addi rN, r13, sym@le +// addi rM, rN, imm +// After this optimization, we can fold the two addi into a single one: +// addi rM, r13, sym@le + imm static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) { if (N->getMachineOpcode() != PPC::ADDI8) return; From 46c5079fe8460dc9deec8c4b25e2c7bf22dbecce Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Thu, 7 Dec 2023 15:34:30 -0600 Subject: [PATCH 09/17] Update target flags for TLSGD variable --- .../CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll index c87b7acb6211c..2f4b05ec7b016 100644 --- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll +++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll @@ -33,7 +33,7 @@ define i64 @StoreLargeAccess1() { ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r0, 64(r1) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r3, mySmallLocalExecTLS6[UL]@le+424(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r4, mySmallLocalExecTLS2[TL]@le+1200(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: ld r3, L..C0(r2) # target-flags(ppc-lo) @MyTLSGDVar +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: ld r3, L..C0(r2) # target-flags(ppc-tlsgdm) @MyTLSGDVar ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: ld r4, L..C1(r2) # target-flags(ppc-tlsgd) @MyTLSGDVar ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: bla .__tls_get_addr[PR] ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 44 From 211663a99dd40b0c07871d7bdaba6700c7f23c6f Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Thu, 7 Dec 2023 23:58:54 -0600 Subject: [PATCH 10/17] Print assembly in the tlsVar+Offset-Delta method instead --- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 15 ++++----- .../aix-small-local-exec-tls-largeaccess.ll | 32 +++++++++---------- .../aix-small-local-exec-tls-largeaccess2.ll | 8 ++--- 3 files 
changed, 27 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 3acd19eb18fec..632a254dbdcfa 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -1654,25 +1654,24 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO, // non-zero offset to the TLS variable address. // For when TLS variables are extern, this is safe to do because we can // assume that the address of extern TLS variables are zero. - if (FinalAddress < 32768) - Expr = MCBinaryExpr::createAdd( - Expr, MCConstantExpr::create(Offset, OutContext), OutContext); - else { + Expr = MCBinaryExpr::createAdd( + Expr, MCConstantExpr::create(Offset, OutContext), OutContext); + if (FinalAddress >= 32768) { // Handle the written offset for cases where: // TLS variable address + Offset > 32KB. - // The assembly that is printed is actually: + // The assembly that is printed will look like: // TLSVar@le + Offset - Delta // where Delta is a multiple of 64KB: ((FinalAddress + 32768) & ~0xFFFF). - ptrdiff_t OffsetDelta = Offset - ((FinalAddress + 32768) & ~0xFFFF); + ptrdiff_t Delta = ((FinalAddress + 32768) & ~0xFFFF); // Check that the total instruction displacement fits within [-32768,32768). 
- ptrdiff_t InstDisp = TLSVarAddress + OffsetDelta; + ptrdiff_t InstDisp = TLSVarAddress + Offset - Delta; assert((InstDisp < 32768) || (InstDisp >= -32768) && "Expecting the instruction displacement for local-exec TLS " "variables to be between [-32768, 32768)!"); Expr = MCBinaryExpr::createAdd( - Expr, MCConstantExpr::create(OffsetDelta, OutContext), OutContext); + Expr, MCConstantExpr::create(-Delta, OutContext), OutContext); } return Expr; diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll index 3aa3ecc9f2b0d..22b8503ef403c 100644 --- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll +++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll @@ -30,12 +30,12 @@ define signext i32 @StoreArrays1() { ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, mySmallLocalExecTLSv1[TL]@le(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 2 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, mySmallLocalExecTLSv1[TL]@le+24(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, mySmallLocalExecTLS2[TL]@le-65216(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 3 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, mySmallLocalExecTLS3[TL]@le-65212(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 88 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, mySmallLocalExecTLS4[TL]@le-65208(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, mySmallLocalExecTLS5[TL]@le-65204(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, (mySmallLocalExecTLS5[TL]@le+332)-65536(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 102 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: blr ; @@ -46,12 +46,12 @@ define signext i32 @StoreArrays1() { ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw 
r3, mySmallLocalExecTLSv1[TL]@le(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 2 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, mySmallLocalExecTLSv1[TL]@le+24(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLS2[TL]@le-65216(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 3 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLS3[TL]@le-65212(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 88 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, mySmallLocalExecTLS4[TL]@le-65208(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLS5[TL]@le-65204(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS5[TL]@le+332)-65536(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 102 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: blr entry: @@ -95,13 +95,13 @@ define signext i32 @StoreArrays2() { ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 4 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, 24(r4) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 2 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, mySmallLocalExecTLS2[TL]@le-65216(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 3 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, mySmallLocalExecTLS3[TL]@le-65212(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 88 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, mySmallLocalExecTLS4[TL]@le-65208(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 102 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, mySmallLocalExecTLS5[TL]@le-65204(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw 
r4, (mySmallLocalExecTLS5[TL]@le+332)-65536(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: blr ; ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: StoreArrays2: @@ -114,12 +114,12 @@ define signext i32 @StoreArrays2() { ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 4 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, 24(r3) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 2 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLS2[TL]@le-65216(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 3 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLS3[TL]@le-65212(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 88 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, mySmallLocalExecTLS4[TL]@le-65208(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLS5[TL]@le-65204(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS5[TL]@le+332)-65536(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 102 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll index 2f4b05ec7b016..725b680054926 100644 --- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll +++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll @@ -42,8 +42,8 @@ define i64 @StoreLargeAccess1() { ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 100 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r3, mySmallLocalExecTLS3[TL]@le+2000(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 882 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r4, mySmallLocalExecTLS4[TL]@le-58736(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r3, mySmallLocalExecTLS5[TL]@le-57136(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r4, 
(mySmallLocalExecTLS4[TL]@le+6800)-65536(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r3, (mySmallLocalExecTLS5[TL]@le+8400)-65536(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 1191 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: addi r1, r1, 48 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: ld r0, 16(r1) @@ -70,8 +70,8 @@ define i64 @StoreLargeAccess1() { ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 100 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: std r3, mySmallLocalExecTLS3[TL]@le+2000(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 882 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: std r4, mySmallLocalExecTLS4[TL]@le-58736(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: std r3, mySmallLocalExecTLS5[TL]@le-57136(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: std r4, (mySmallLocalExecTLS4[TL]@le+6800)-65536(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: std r3, (mySmallLocalExecTLS5[TL]@le+8400)-65536(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 1191 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: addi r1, r1, 48 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: ld r0, 16(r1) From ed6834a4b85862aa89ce827dcf61ba6ff99d9cf2 Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Tue, 2 Jan 2024 13:18:56 -0600 Subject: [PATCH 11/17] Update target flag checks --- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 2 +- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 632a254dbdcfa..a1d5ff7ee0989 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -1517,7 +1517,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { // machine operand (which is a TargetGlobalTLSAddress) is expected to be // the same operand for both loads and stores. 
for (const MachineOperand &TempMO : MI->operands()) { - if (((TempMO.getTargetFlags() & PPCII::MO_TPREL_FLAG) != 0) && + if (((TempMO.getTargetFlags() == PPCII::MO_TPREL_FLAG)) && TempMO.getOperandNo() == 1) OpNum = 1; } diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 9dd9d32ab4854..fb001288697dc 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -7600,7 +7600,7 @@ static bool isEligibleToFoldADDIForLocalExecAccesses(SDNode *N, // The local-exec TLS variable should only have the MO_TPREL_FLAG target flag, // so this optimization is not performed otherwise if the flag is not set. unsigned TargetFlags = GA->getTargetFlags(); - if ((TargetFlags & PPCII::MO_TPREL_FLAG) == 0) + if (TargetFlags != PPCII::MO_TPREL_FLAG) return false; // If all conditions are satisfied, the ADDI is valid for folding. From e023aef25565eafff2867919b79b353a95dd0ec2 Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Wed, 10 Jan 2024 08:53:43 -0600 Subject: [PATCH 12/17] Add hyphens to variable-to-address comment/assert --- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index a1d5ff7ee0989..c0e965a05c06a 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -1638,13 +1638,13 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO, bool IsGlobalADeclaration = GValue->isDeclarationForLinker(); // Find the GlobalVariable that corresponds to the particular TLS variable - // in the TLS variable to address mapping. All TLS variables should exist + // in the TLS variable-to-address mapping. All TLS variables should exist // within this map, with the exception of TLS variables marked as extern. 
const auto TLSVarsMapEntryIter = TLSVarsToAddressMapping.find(GValue); if (TLSVarsMapEntryIter == TLSVarsToAddressMapping.end()) assert(IsGlobalADeclaration && "Only expecting to find extern TLS variables not present in the TLS " - "variables to address map!"); + "variable-to-address map!"); unsigned TLSVarAddress = IsGlobalADeclaration ? 0 : TLSVarsMapEntryIter->second; From 7e5a882295a80db76fce02d60d5befffeeee0fd8 Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Thu, 18 Jan 2024 16:34:16 -0600 Subject: [PATCH 13/17] Address various comments: moving around variables, removing unnecessary variables, etc. --- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 25 +++++++++------------ llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 24 +++++++++++--------- 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index c0e965a05c06a..4c4d95ea14844 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -1592,10 +1592,9 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { "addi, or load/stores with thread-pointer only expected with " "local-exec small TLS"); - int64_t Offset = MO.getOffset(); LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); - const MCExpr *Expr = getAdjustedLocalExecExpr(MO, Offset); + const MCExpr *Expr = getAdjustedLocalExecExpr(MO, MO.getOffset()); if (Expr) TmpInst.getOperand(OpNum) = MCOperand::createExpr(Expr); @@ -1622,19 +1621,15 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { // greater than 32KB, a new MCExpr is produced to accommodate this situation. 
const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO, int64_t Offset) { - assert(MO.isGlobal() && "Only expecting a global MachineOperand here!"); - const GlobalValue *GValue = MO.getGlobal(); - TLSModel::Model Model = TM.getTLSModel(GValue); - assert(Model == TLSModel::LocalExec && - "Only local-exec accesses are handled!"); - MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_PPC_AIX_TLSLE; - // Non-zero offsets (for loads, stores or `addi`) require additional handling. // When the offset is zero, there is no need to create an adjusted MCExpr. if (!Offset) return nullptr; - const MCExpr *Expr = - MCSymbolRefExpr::create(getSymbol(GValue), RefKind, OutContext); + + assert(MO.isGlobal() && "Only expecting a global MachineOperand here!"); + const GlobalValue *GValue = MO.getGlobal(); + assert(TM.getTLSModel(GValue) == TLSModel::LocalExec && + "Only local-exec accesses are handled!"); bool IsGlobalADeclaration = GValue->isDeclarationForLinker(); // Find the GlobalVariable that corresponds to the particular TLS variable @@ -1654,6 +1649,8 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO, // non-zero offset to the TLS variable address. // For when TLS variables are extern, this is safe to do because we can // assume that the address of extern TLS variables are zero. + const MCExpr *Expr = MCSymbolRefExpr::create( + getSymbol(GValue), MCSymbolRefExpr::VK_PPC_AIX_TLSLE, OutContext); Expr = MCBinaryExpr::createAdd( Expr, MCConstantExpr::create(Offset, OutContext), OutContext); if (FinalAddress >= 32768) { @@ -2890,15 +2887,13 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) { // For all TLS variables, calculate their corresponding addresses and store // them into TLSVarsToAddressMapping, which will be used to determine whether // or not local-exec TLS variables require special assembly printing. 
- uint64_t Address = 0; uint64_t TLSVarAddress = 0; auto DL = M.getDataLayout(); for (const auto &G : M.globals()) { if (G.isThreadLocal() && !G.isDeclaration()) { - TLSVarAddress = alignTo(Address, getGVAlignment(&G, DL)); - unsigned GVSize = DL.getTypeAllocSize(G.getValueType()); - Address = TLSVarAddress + GVSize; + TLSVarAddress = alignTo(TLSVarAddress, getGVAlignment(&G, DL)); TLSVarsToAddressMapping[&G] = TLSVarAddress; + TLSVarAddress += DL.getTypeAllocSize(G.getValueType()); } } diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index fb001288697dc..93148e41e1ac2 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -7566,15 +7566,8 @@ static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) { } // Is an ADDI eligible for folding for non-TOC-based local-exec accesses? -static bool isEligibleToFoldADDIForLocalExecAccesses(SDNode *N, - SelectionDAG *DAG, +static bool isEligibleToFoldADDIForLocalExecAccesses(SelectionDAG *DAG, SDValue ADDIToFold) { - const PPCSubtarget &Subtarget = - DAG->getMachineFunction().getSubtarget(); - // This optimization is only performed for non-TOC-based local-exec accesses. - if (!Subtarget.hasAIXSmallLocalExecTLS()) - return false; - // Check if ADDIToFold (the ADDI that we want to fold into local-exec // accesses), is truly an ADDI. if (!ADDIToFold.isMachineOpcode() || @@ -7586,6 +7579,8 @@ static bool isEligibleToFoldADDIForLocalExecAccesses(SDNode *N, // addi is the thread pointer. SDValue TPRegNode = ADDIToFold.getOperand(0); RegisterSDNode *TPReg = dyn_cast_or_null(TPRegNode.getNode()); + const PPCSubtarget &Subtarget = + DAG->getMachineFunction().getSubtarget(); if (!TPReg || (TPReg->getReg() != Subtarget.getThreadPointerRegister())) return false; @@ -7622,7 +7617,7 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) { // we want optimized out. 
SDValue InitialADDI = N->getOperand(0); - if (!isEligibleToFoldADDIForLocalExecAccesses(N, DAG, InitialADDI)) + if (!isEligibleToFoldADDIForLocalExecAccesses(DAG, InitialADDI)) return; // At this point, InitialADDI can be folded into a non-TOC-based local-exec @@ -7661,6 +7656,9 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) { void PPCDAGToDAGISel::PeepholePPC64() { SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); + const PPCSubtarget &Subtarget = + CurDAG->getMachineFunction().getSubtarget(); + bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS(); while (Position != CurDAG->allnodes_begin()) { SDNode *N = &*--Position; @@ -7671,7 +7669,9 @@ void PPCDAGToDAGISel::PeepholePPC64() { if (isVSXSwap(SDValue(N, 0))) reduceVSXSwap(N, CurDAG); - foldADDIForLocalExecAccesses(N, CurDAG); + // This optimization is performed for non-TOC-based local-exec accesses. + if (HasAIXSmallLocalExecTLS) + foldADDIForLocalExecAccesses(N, CurDAG); unsigned FirstOp; unsigned StorageOpcode = N->getMachineOpcode(); @@ -7829,7 +7829,9 @@ void PPCDAGToDAGISel::PeepholePPC64() { ImmOpnd = CurDAG->getTargetConstant(Offset, SDLoc(ImmOpnd), ImmOpnd.getValueType()); } else if (Offset != 0) { - if (isEligibleToFoldADDIForLocalExecAccesses(N, CurDAG, Base)) { + // This optimization is performed for non-TOC-based local-exec accesses. + if (HasAIXSmallLocalExecTLS && + isEligibleToFoldADDIForLocalExecAccesses(CurDAG, Base)) { // Add the non-zero offset information into the load or store // instruction to be used for non-TOC-based local-exec accesses. GlobalAddressSDNode *GA = dyn_cast(ImmOpnd); From 58fef767566e91746ac7370f1fbed5771ceabcaf Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Thu, 18 Jan 2024 17:09:10 -0600 Subject: [PATCH 14/17] Remove unnecessary subtarget variable. 
--- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 93148e41e1ac2..21355e96babd1 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -7656,9 +7656,7 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) { void PPCDAGToDAGISel::PeepholePPC64() { SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); - const PPCSubtarget &Subtarget = - CurDAG->getMachineFunction().getSubtarget(); - bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS(); + bool HasAIXSmallLocalExecTLS = Subtarget->hasAIXSmallLocalExecTLS(); while (Position != CurDAG->allnodes_begin()) { SDNode *N = &*--Position; From 079869e765b8932625ad422f7a27eceddfcae140 Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Wed, 24 Jan 2024 14:10:11 -0600 Subject: [PATCH 15/17] Add an assert and common up code from load/stores --- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 45 +++++++-------------- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 4 +- 2 files changed, 17 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 4c4d95ea14844..52c698cfc3040 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -1526,37 +1526,11 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { const DataLayout &DL = MO.getGlobal()->getParent()->getDataLayout(); if (MO.getGlobal()->getPointerAlignment(DL) < 4) llvm_unreachable("Global must be word-aligned for LD, STD, LWA!"); - - // A faster non-TOC-based local-exec sequence is represented by - // directly loading or storing off of the thread pointer and with - // an immediate operand having the MO_TPREL_FLAG. - // Such instructions do not otherwise arise. 
- unsigned Flag = MO.getTargetFlags(); - if (Flag == PPCII::MO_TPREL_FLAG) { - assert(HasAIXSmallLocalExecTLS && - "loads/stores with thread-pointer only expected with " - "local-exec small TLS"); - int64_t Offset = MO.getOffset(); - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); - const MCExpr *Expr = getAdjustedLocalExecExpr(MO, Offset); - if (Expr) - TmpInst.getOperand(OpNum) = MCOperand::createExpr(Expr); - EmitToStreamer(*OutStreamer, TmpInst); - return; - } } - // Now process the instruction normally. - break; - } - case PPC::PseudoEIEIO: { - EmitToStreamer( - *OutStreamer, - MCInstBuilder(PPC::ORI).addReg(PPC::X2).addReg(PPC::X2).addImm(0)); - EmitToStreamer( - *OutStreamer, - MCInstBuilder(PPC::ORI).addReg(PPC::X2).addReg(PPC::X2).addImm(0)); - EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::EnforceIEIO)); - return; + // As these load/stores share common code with the following load/stores, + // fall through to the subsequent cases in order to either process the + // non-TOC-based local-exec sequence or to process the instruction normally. + [[fallthrough]]; } case PPC::LBZ: case PPC::LBZ8: @@ -1605,8 +1579,19 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); return; } + // Now process the instruction normally. 
break; } + case PPC::PseudoEIEIO: { + EmitToStreamer( + *OutStreamer, + MCInstBuilder(PPC::ORI).addReg(PPC::X2).addReg(PPC::X2).addImm(0)); + EmitToStreamer( + *OutStreamer, + MCInstBuilder(PPC::ORI).addReg(PPC::X2).addReg(PPC::X2).addImm(0)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::EnforceIEIO)); + return; + } } LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 21355e96babd1..66917c0e4ec02 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -7833,8 +7833,8 @@ void PPCDAGToDAGISel::PeepholePPC64() { // Add the non-zero offset information into the load or store // instruction to be used for non-TOC-based local-exec accesses. GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd); - if (!GA) - continue; + assert(GA && "Expecting a valid GlobalAddressSDNode when folding " + "addi into local-exec accesses!"); ImmOpnd = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA), MVT::i64, Offset, GA->getTargetFlags()); From 9acad7667ba57512e56195db6308ed2e586e9e91 Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Fri, 26 Jan 2024 22:59:31 -0600 Subject: [PATCH 16/17] Update assert to early exit --- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 52c698cfc3040..a5d19a13144df 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -1555,6 +1555,8 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { // or a load/store instruction (that directly loads or stores off of the // thread pointer) with an immediate operand having the MO_TPREL_FLAG. // Such instructions do not otherwise arise. 
+ if (!HasAIXSmallLocalExecTLS) + break; bool IsMIADDI8 = MI->getOpcode() == PPC::ADDI8; unsigned OpNum = IsMIADDI8 ? 2 : 1; const MachineOperand &MO = MI->getOperand(OpNum); @@ -1562,10 +1564,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { if (Flag == PPCII::MO_TPREL_FLAG || Flag == PPCII::MO_GOT_TPREL_PCREL_FLAG || Flag == PPCII::MO_TPREL_PCREL_FLAG) { - assert(HasAIXSmallLocalExecTLS && - "addi, or load/stores with thread-pointer only expected with " - "local-exec small TLS"); - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); const MCExpr *Expr = getAdjustedLocalExecExpr(MO, MO.getOffset()); From cf5df000a2d616e9b2ce13e9697e760e7d06a2da Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Thu, 1 Feb 2024 07:57:29 -0600 Subject: [PATCH 17/17] Update to use dyn_cast() calls --- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 66917c0e4ec02..76996fbe1980b 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -7578,7 +7578,7 @@ static bool isEligibleToFoldADDIForLocalExecAccesses(SelectionDAG *DAG, // This transformation is only performed if the first operand of the // addi is the thread pointer. SDValue TPRegNode = ADDIToFold.getOperand(0); - RegisterSDNode *TPReg = dyn_cast_or_null<RegisterSDNode>(TPRegNode.getNode()); + RegisterSDNode *TPReg = dyn_cast<RegisterSDNode>(TPRegNode.getNode()); const PPCSubtarget &Subtarget = DAG->getMachineFunction().getSubtarget<PPCSubtarget>(); if (!TPReg || (TPReg->getReg() != Subtarget.getThreadPointerRegister())) @@ -7624,7 +7624,7 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) { // access. The first operand of InitialADDI should be the thread pointer, // which has been checked in isEligibleToFoldADDIForLocalExecAccesses(). 
SDValue TPRegNode = InitialADDI.getOperand(0); - RegisterSDNode *TPReg = dyn_cast_or_null<RegisterSDNode>(TPRegNode.getNode()); + RegisterSDNode *TPReg = dyn_cast<RegisterSDNode>(TPRegNode.getNode()); const PPCSubtarget &Subtarget = DAG->getMachineFunction().getSubtarget<PPCSubtarget>(); assert((TPReg && (TPReg->getReg() == Subtarget.getThreadPointerRegister())) &&