-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[AIX][TLS] Optimize the small local-exec access sequence for non-zero offsets #71485
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9898fd1
b4e7632
594cf6a
b5bcb25
c34760e
504c5e0
98f5fcf
f3f7ca9
46c5079
211663a
ed6834a
e023aef
7e5a882
58fef76
079869e
9acad76
cf5df00
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -66,9 +66,10 @@ | |
#include "llvm/Support/Debug.h" | ||
#include "llvm/Support/Error.h" | ||
#include "llvm/Support/ErrorHandling.h" | ||
#include "llvm/Support/MathExtras.h" | ||
#include "llvm/Support/Process.h" | ||
#include "llvm/Support/raw_ostream.h" | ||
#include "llvm/Support/Threading.h" | ||
#include "llvm/Support/raw_ostream.h" | ||
#include "llvm/Target/TargetMachine.h" | ||
#include "llvm/TargetParser/Triple.h" | ||
#include "llvm/Transforms/Utils/ModuleUtils.h" | ||
|
@@ -155,6 +156,11 @@ class PPCAsmPrinter : public AsmPrinter { | |
TOC; | ||
const PPCSubtarget *Subtarget = nullptr; | ||
|
||
// Keep track of the number of TLS variables and their corresponding | ||
// addresses, which is then used for the assembly printing of | ||
// non-TOC-based local-exec variables. | ||
MapVector<const GlobalValue *, uint64_t> TLSVarsToAddressMapping; | ||
|
||
public: | ||
explicit PPCAsmPrinter(TargetMachine &TM, | ||
std::unique_ptr<MCStreamer> Streamer) | ||
|
@@ -199,6 +205,8 @@ class PPCAsmPrinter : public AsmPrinter { | |
void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI); | ||
void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK); | ||
void EmitAIXTlsCallHelper(const MachineInstr *MI); | ||
const MCExpr *getAdjustedLocalExecExpr(const MachineOperand &MO, | ||
int64_t Offset); | ||
bool runOnMachineFunction(MachineFunction &MF) override { | ||
Subtarget = &MF.getSubtarget<PPCSubtarget>(); | ||
bool Changed = AsmPrinter::runOnMachineFunction(MF); | ||
|
@@ -753,6 +761,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { | |
MCInst TmpInst; | ||
const bool IsPPC64 = Subtarget->isPPC64(); | ||
const bool IsAIX = Subtarget->isAIXABI(); | ||
const bool HasAIXSmallLocalExecTLS = Subtarget->hasAIXSmallLocalExecTLS(); | ||
const Module *M = MF->getFunction().getParent(); | ||
PICLevel::Level PL = M->getPICLevel(); | ||
|
||
|
@@ -1504,12 +1513,70 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { | |
// Verify alignment is legal, so we don't create relocations | ||
// that can't be supported. | ||
unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1; | ||
// For non-TOC-based local-exec TLS accesses with non-zero offsets, the | ||
// machine operand (which is a TargetGlobalTLSAddress) is expected to be | ||
// the same operand for both loads and stores. | ||
for (const MachineOperand &TempMO : MI->operands()) { | ||
if (((TempMO.getTargetFlags() == PPCII::MO_TPREL_FLAG)) && | ||
TempMO.getOperandNo() == 1) | ||
OpNum = 1; | ||
} | ||
const MachineOperand &MO = MI->getOperand(OpNum); | ||
if (MO.isGlobal()) { | ||
const DataLayout &DL = MO.getGlobal()->getParent()->getDataLayout(); | ||
if (MO.getGlobal()->getPointerAlignment(DL) < 4) | ||
llvm_unreachable("Global must be word-aligned for LD, STD, LWA!"); | ||
} | ||
// As these load/stores share common code with the following load/stores, | ||
// fall through to the subsequent cases in order to either process the | ||
// non-TOC-based local-exec sequence or to process the instruction normally. | ||
[[fallthrough]]; | ||
} | ||
case PPC::LBZ: | ||
case PPC::LBZ8: | ||
case PPC::LHA: | ||
case PPC::LHA8: | ||
case PPC::LHZ: | ||
case PPC::LHZ8: | ||
case PPC::LWZ: | ||
case PPC::LWZ8: | ||
case PPC::STB: | ||
case PPC::STB8: | ||
case PPC::STH: | ||
case PPC::STH8: | ||
case PPC::STW: | ||
case PPC::STW8: | ||
case PPC::LFS: | ||
case PPC::STFS: | ||
case PPC::LFD: | ||
case PPC::STFD: | ||
case PPC::ADDI8: { | ||
// A faster non-TOC-based local-exec sequence is represented by `addi` | ||
amy-kwan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// or a load/store instruction (that directly loads or stores off of the | ||
// thread pointer) with an immediate operand having the MO_TPREL_FLAG. | ||
// Such instructions do not otherwise arise. | ||
if (!HasAIXSmallLocalExecTLS) | ||
break; | ||
bool IsMIADDI8 = MI->getOpcode() == PPC::ADDI8; | ||
amy-kwan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
unsigned OpNum = IsMIADDI8 ? 2 : 1; | ||
const MachineOperand &MO = MI->getOperand(OpNum); | ||
unsigned Flag = MO.getTargetFlags(); | ||
if (Flag == PPCII::MO_TPREL_FLAG || | ||
Flag == PPCII::MO_GOT_TPREL_PCREL_FLAG || | ||
Flag == PPCII::MO_TPREL_PCREL_FLAG) { | ||
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); | ||
|
||
const MCExpr *Expr = getAdjustedLocalExecExpr(MO, MO.getOffset()); | ||
if (Expr) | ||
TmpInst.getOperand(OpNum) = MCOperand::createExpr(Expr); | ||
|
||
// Change the opcode to load address if the original opcode is an `addi`. | ||
if (IsMIADDI8) | ||
TmpInst.setOpcode(PPC::LA8); | ||
|
||
EmitToStreamer(*OutStreamer, TmpInst); | ||
return; | ||
} | ||
// Now process the instruction normally. | ||
break; | ||
} | ||
|
@@ -1523,30 +1590,73 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { | |
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::EnforceIEIO)); | ||
return; | ||
} | ||
case PPC::ADDI8: { | ||
// The faster non-TOC-based local-exec sequence is represented by `addi` | ||
// with an immediate operand having the MO_TPREL_FLAG. Such an instruction | ||
// does not otherwise arise. | ||
unsigned Flag = MI->getOperand(2).getTargetFlags(); | ||
if (Flag == PPCII::MO_TPREL_FLAG || | ||
Flag == PPCII::MO_GOT_TPREL_PCREL_FLAG || | ||
Flag == PPCII::MO_TPREL_PCREL_FLAG) { | ||
assert( | ||
Subtarget->hasAIXSmallLocalExecTLS() && | ||
"addi with thread-pointer only expected with local-exec small TLS"); | ||
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); | ||
TmpInst.setOpcode(PPC::LA8); | ||
EmitToStreamer(*OutStreamer, TmpInst); | ||
return; | ||
} | ||
break; | ||
} | ||
} | ||
|
||
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); | ||
EmitToStreamer(*OutStreamer, TmpInst); | ||
} | ||
|
||
// Build the displacement expression for a non-TOC-based (small) local-exec
// TLS access whose machine operand carries a non-zero offset. The expression
// is used as the immediate of an addi, load or store off the thread pointer.
//
// \param MO     The TargetGlobalTLSAddress operand naming the TLS variable.
// \param Offset The byte offset applied on top of the variable's address.
// \returns nullptr when Offset is zero (no adjustment is needed); otherwise
//          `TLSVar@le + Offset`, or `TLSVar@le + Offset - Delta` when the
//          variable's address plus Offset is at or above 32KB.
//
// The final displacement of these instructions must lie in [-32768, 32768).
// When (TLS variable address + Offset) reaches 32KB, a multiple of 64KB
// (Delta) is subtracted from the written expression to keep it in range.
//
// NOTE(review): per the review discussion on this change, the small
// local-exec sequence is only selected for TLS variables whose size is below
// a fixed limit (32751 bytes was quoted) - confirm against the lowering code.
const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
                                                      int64_t Offset) {
  // Non-zero offsets (for loads, stores or `addi`) require additional handling.
  // When the offset is zero, there is no need to create an adjusted MCExpr.
  if (!Offset)
    return nullptr;

  assert(MO.isGlobal() && "Only expecting a global MachineOperand here!");
  const GlobalValue *GValue = MO.getGlobal();
  assert(TM.getTLSModel(GValue) == TLSModel::LocalExec &&
         "Only local-exec accesses are handled!");

  const bool IsGlobalADeclaration = GValue->isDeclarationForLinker();
  // Find the GlobalVariable that corresponds to the particular TLS variable
  // in the TLS variable-to-address mapping. All TLS variables should exist
  // within this map, with the exception of TLS variables marked as extern.
  const auto TLSVarsMapEntryIter = TLSVarsToAddressMapping.find(GValue);
  const bool IsInTLSVarsMap =
      TLSVarsMapEntryIter != TLSVarsToAddressMapping.end();
  assert((IsInTLSVarsMap || IsGlobalADeclaration) &&
         "Only expecting to find extern TLS variables not present in the TLS "
         "variable-to-address map!");

  // Extern TLS variables are treated as living at address zero. The iterator
  // is only dereferenced when it is valid, so builds without assertions never
  // read through end(). The mapped value is a uint64_t; keep the full width
  // in a signed 64-bit value rather than truncating it.
  const int64_t TLSVarAddress =
      (IsGlobalADeclaration || !IsInTLSVarsMap)
          ? 0
          : static_cast<int64_t>(TLSVarsMapEntryIter->second);
  const int64_t FinalAddress = TLSVarAddress + Offset;

  // If the address of the TLS variable + the offset is less than 32KB,
  // or if the TLS variable is extern, we simply produce an MCExpr to add the
  // non-zero offset to the TLS variable address.
  const MCExpr *Expr = MCSymbolRefExpr::create(
      getSymbol(GValue), MCSymbolRefExpr::VK_PPC_AIX_TLSLE, OutContext);
  Expr = MCBinaryExpr::createAdd(
      Expr, MCConstantExpr::create(Offset, OutContext), OutContext);
  if (FinalAddress >= 32768) {
    // Handle the written offset for cases where:
    //   TLS variable address + Offset >= 32KB.
    //
    // The assembly that is printed will look like:
    //   TLSVar@le + Offset - Delta
    // where Delta is a multiple of 64KB: ((FinalAddress + 32768) & ~0xFFFF).
    const int64_t Delta =
        (FinalAddress + 32768) & ~static_cast<int64_t>(0xFFFF);
    // Check that the total instruction displacement fits within
    // [-32768, 32768). Both bounds must hold at once. With the rounding used
    // for Delta this is expected to always be true; the assert protects the
    // invariant should the Delta computation ever change.
    const int64_t InstDisp = FinalAddress - Delta;
    assert((InstDisp < 32768) && (InstDisp >= -32768) &&
           "Expecting the instruction displacement for local-exec TLS "
           "variables to be between [-32768, 32768)!");
    (void)InstDisp;
    Expr = MCBinaryExpr::createAdd(
        Expr, MCConstantExpr::create(-Delta, OutContext), OutContext);
  }

  return Expr;
}
|
||
void PPCLinuxAsmPrinter::emitGNUAttributes(Module &M) { | ||
// Emit float ABI into GNU attribute | ||
Metadata *MD = M.getModuleFlag("float-abi"); | ||
|
@@ -2757,6 +2867,19 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) { | |
Csect->ensureMinAlignment(GOAlign); | ||
}; | ||
|
||
// For all TLS variables, calculate their corresponding addresses and store | ||
// them into TLSVarsToAddressMapping, which will be used to determine whether | ||
// or not local-exec TLS variables require special assembly printing. | ||
uint64_t TLSVarAddress = 0; | ||
auto DL = M.getDataLayout(); | ||
for (const auto &G : M.globals()) { | ||
if (G.isThreadLocal() && !G.isDeclaration()) { | ||
TLSVarAddress = alignTo(TLSVarAddress, getGVAlignment(&G, DL)); | ||
TLSVarsToAddressMapping[&G] = TLSVarAddress; | ||
amy-kwan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
TLSVarAddress += DL.getTypeAllocSize(G.getValueType()); | ||
} | ||
} | ||
|
||
// We need to know, up front, the alignment of csects for the assembly path, | ||
// because once a .csect directive gets emitted, we could not change the | ||
// alignment value on it. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7565,8 +7565,98 @@ static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) { | |
DAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), N->getOperand(0)); | ||
} | ||
|
||
// Decide whether ADDIToFold can be folded into a non-TOC-based local-exec
// access. It qualifies only when all of the following hold:
//   1. it is an ADDI8 machine node,
//   2. its first operand is the subtarget's thread pointer register,
//   3. its second operand is a global address node whose target flags are
//      exactly MO_TPREL_FLAG.
static bool isEligibleToFoldADDIForLocalExecAccesses(SelectionDAG *DAG,
                                                     SDValue ADDIToFold) {
  // (1) Must truly be an ADDI8.
  const bool IsADDI8 = ADDIToFold.isMachineOpcode() &&
                       ADDIToFold.getMachineOpcode() == PPC::ADDI8;
  if (!IsADDI8)
    return false;

  // (2) The base register must be the thread pointer.
  SDValue BaseNode = ADDIToFold.getOperand(0);
  RegisterSDNode *BaseReg = dyn_cast<RegisterSDNode>(BaseNode.getNode());
  const PPCSubtarget &ST =
      DAG->getMachineFunction().getSubtarget<PPCSubtarget>();
  const bool BaseIsThreadPointer =
      BaseReg && BaseReg->getReg() == ST.getThreadPointerRegister();
  if (!BaseIsThreadPointer)
    return false;

  // (3) The displacement must be the local-exec TLS variable itself, carrying
  // only the MO_TPREL_FLAG target flag.
  SDValue DispNode = ADDIToFold.getOperand(1);
  GlobalAddressSDNode *TLSVar = dyn_cast<GlobalAddressSDNode>(DispNode);
  return TLSVar && TLSVar->getTargetFlags() == PPCII::MO_TPREL_FLAG;
}
|
||
// For a non-TOC-based local-exec access where an addi feeds another addi,
// fold the pair into a single addi when possible.
// Before this optimization, the sequence appears as:
//    addi rN, r13, sym@le
//    addi rM, rN, imm
// After this optimization, the two addi are folded into a single one:
//    addi rM, r13, sym@le + imm
//
// \param N   Candidate outer addi (the one that is kept and rewritten).
// \param DAG The selection DAG that owns N.
// N is left untouched unless it is an ADDI8 with a constant immediate whose
// first operand is an addi accepted by
// isEligibleToFoldADDIForLocalExecAccesses().
static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
  // getMachineOpcode() is only meaningful on machine nodes, so guard it the
  // same way isEligibleToFoldADDIForLocalExecAccesses() does.
  if (!N->isMachineOpcode() || N->getMachineOpcode() != PPC::ADDI8)
    return;

  // InitialADDI is the addi feeding into N (also an addi), and the addi that
  // we want optimized out.
  SDValue InitialADDI = N->getOperand(0);
  if (!isEligibleToFoldADDIForLocalExecAccesses(DAG, InitialADDI))
    return;

  // The offset being folded is read as a constant below; leave N alone if its
  // second operand is anything other than a constant node.
  if (!isa<ConstantSDNode>(N->getOperand(1)))
    return;

  // At this point, InitialADDI can be folded into a non-TOC-based local-exec
  // access. isEligibleToFoldADDIForLocalExecAccesses() has already verified
  // that its first operand is the thread pointer and that its second operand
  // is a global TLS address carrying MO_TPREL_FLAG, so those facts are not
  // re-derived here (which also avoids assert-only locals).
  SDValue TPRegNode = InitialADDI.getOperand(0);
  SDValue TLSVarNode = InitialADDI.getOperand(1);
  GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(TLSVarNode);
  assert(GA && "Expecting a valid GlobalAddressSDNode when folding addi into "
               "local-exec accesses!");
  unsigned TargetFlags = GA->getTargetFlags();

  // The second operand of the addi that we want to preserve is an immediate.
  // It is recorded as the offset of a new target global address for the same
  // TLS variable, so that the correct TLS address information is available
  // during assembly printing. The offset is likely to be non-zero when we end
  // up in this case.
  int Offset = N->getConstantOperandVal(1);
  TLSVarNode = DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA), MVT::i64,
                                           Offset, TargetFlags);

  // Rewrite N to use the thread pointer and the offset TLS address directly,
  // then drop InitialADDI if nothing else uses it.
  (void)DAG->UpdateNodeOperands(N, TPRegNode, TLSVarNode);
  if (InitialADDI.getNode()->use_empty())
    DAG->RemoveDeadNode(InitialADDI.getNode());
}
|
||
void PPCDAGToDAGISel::PeepholePPC64() { | ||
SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); | ||
bool HasAIXSmallLocalExecTLS = Subtarget->hasAIXSmallLocalExecTLS(); | ||
|
||
while (Position != CurDAG->allnodes_begin()) { | ||
SDNode *N = &*--Position; | ||
|
@@ -7577,6 +7667,10 @@ void PPCDAGToDAGISel::PeepholePPC64() { | |
if (isVSXSwap(SDValue(N, 0))) | ||
reduceVSXSwap(N, CurDAG); | ||
|
||
// This optimization is performed for non-TOC-based local-exec accesses. | ||
if (HasAIXSmallLocalExecTLS) | ||
foldADDIForLocalExecAccesses(N, CurDAG); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If foldADDIForLocalExecAccesses succeeds, it will be There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this is fine, since we can fold addi instructions first, and afterwards there may be load and store instructions that we can fold, which can only happen if we continue further into this loop. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. foldADDIForLocalExecAccesses succeeds only when N->getMachineOpcode() == PPC::ADDI8, and the opcode of N is not changed in foldADDIForLocalExecAccesses, so the opcode of N is still ADDI8 and it will be
Is that correct? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think I see what you mean. If we have successfully done the transformation in I think this is fine and is what I initially expected, because as you mentioned, it will hit the I think what I have done here would be similar to the check and transformation above my change,
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the suggestion, Digger. I was initially thinking of keeping the code as is. However, just so I understand your suggestion, are you suggesting the following or am I mistaken?
In the case where I have something like:
It will visit the Just wanted to double check if I understood your suggestion correctly, because I think currently as I understand it, it doesn't seem like it would work in this situation. |
||
|
||
unsigned FirstOp; | ||
unsigned StorageOpcode = N->getMachineOpcode(); | ||
bool RequiresMod4Offset = false; | ||
|
@@ -7733,7 +7827,19 @@ void PPCDAGToDAGISel::PeepholePPC64() { | |
ImmOpnd = CurDAG->getTargetConstant(Offset, SDLoc(ImmOpnd), | ||
ImmOpnd.getValueType()); | ||
} else if (Offset != 0) { | ||
continue; | ||
// This optimization is performed for non-TOC-based local-exec accesses. | ||
if (HasAIXSmallLocalExecTLS && | ||
isEligibleToFoldADDIForLocalExecAccesses(CurDAG, Base)) { | ||
// Add the non-zero offset information into the load or store | ||
// instruction to be used for non-TOC-based local-exec accesses. | ||
GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd); | ||
assert(GA && "Expecting a valid GlobalAddressSDNode when folding " | ||
"addi into local-exec accesses!"); | ||
ImmOpnd = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA), | ||
MVT::i64, Offset, | ||
GA->getTargetFlags()); | ||
} else | ||
continue; | ||
} | ||
} | ||
|
||
|
Uh oh!
There was an error while loading. Please reload this page.