Skip to content

[AIX][TLS] Produce a faster local-exec access sequence for the "aix-small-tls" global variable attribute #83053

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Mar 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 20 additions & 8 deletions llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7558,6 +7558,16 @@ static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) {
DAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), N->getOperand(0));
}

// Returns true when Val is a global-address node whose underlying global is a
// GlobalVariable carrying the 'aix-small-tls' attribute, and false otherwise.
static bool hasAIXSmallTLSAttr(SDValue Val) {
  // Anything that is not a global address cannot carry the attribute.
  GlobalAddressSDNode *GlobalAddr = dyn_cast<GlobalAddressSDNode>(Val);
  if (!GlobalAddr)
    return false;

  // The attribute is only queried on GlobalVariable; other kinds of global
  // values are rejected.
  const GlobalVariable *Var = dyn_cast<GlobalVariable>(GlobalAddr->getGlobal());
  if (!Var)
    return false;

  return Var->hasAttribute("aix-small-tls");
}

// Is an ADDI eligible for folding for non-TOC-based local-exec accesses?
static bool isEligibleToFoldADDIForLocalExecAccesses(SelectionDAG *DAG,
SDValue ADDIToFold) {
Expand All @@ -7567,20 +7577,25 @@ static bool isEligibleToFoldADDIForLocalExecAccesses(SelectionDAG *DAG,
(ADDIToFold.getMachineOpcode() != PPC::ADDI8))
return false;

// Folding is only allowed for the AIX small-local-exec TLS target attribute
// or when the 'aix-small-tls' global variable attribute is present.
const PPCSubtarget &Subtarget =
DAG->getMachineFunction().getSubtarget<PPCSubtarget>();
SDValue TLSVarNode = ADDIToFold.getOperand(1);
if (!(Subtarget.hasAIXSmallLocalExecTLS() || hasAIXSmallTLSAttr(TLSVarNode)))
return false;

// The first operand of the ADDIToFold should be the thread pointer.
// This transformation is only performed if the first operand of the
// addi is the thread pointer.
SDValue TPRegNode = ADDIToFold.getOperand(0);
RegisterSDNode *TPReg = dyn_cast<RegisterSDNode>(TPRegNode.getNode());
const PPCSubtarget &Subtarget =
DAG->getMachineFunction().getSubtarget<PPCSubtarget>();
if (!TPReg || (TPReg->getReg() != Subtarget.getThreadPointerRegister()))
return false;

// The second operand of the ADDIToFold should be the global TLS address
// (the local-exec TLS variable). We only perform the folding if the TLS
// variable is the second operand.
SDValue TLSVarNode = ADDIToFold.getOperand(1);
GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(TLSVarNode);
if (!GA)
return false;
Expand Down Expand Up @@ -7649,7 +7664,6 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {

void PPCDAGToDAGISel::PeepholePPC64() {
SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
bool HasAIXSmallLocalExecTLS = Subtarget->hasAIXSmallLocalExecTLS();

while (Position != CurDAG->allnodes_begin()) {
SDNode *N = &*--Position;
Expand All @@ -7661,8 +7675,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
reduceVSXSwap(N, CurDAG);

// This optimization is performed for non-TOC-based local-exec accesses.
if (HasAIXSmallLocalExecTLS)
foldADDIForLocalExecAccesses(N, CurDAG);
foldADDIForLocalExecAccesses(N, CurDAG);

unsigned FirstOp;
unsigned StorageOpcode = N->getMachineOpcode();
Expand Down Expand Up @@ -7821,8 +7834,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
ImmOpnd.getValueType());
} else if (Offset != 0) {
// This optimization is performed for non-TOC-based local-exec accesses.
if (HasAIXSmallLocalExecTLS &&
isEligibleToFoldADDIForLocalExecAccesses(CurDAG, Base)) {
if (isEligibleToFoldADDIForLocalExecAccesses(CurDAG, Base)) {
// Add the non-zero offset information into the load or store
// instruction to be used for non-TOC-based local-exec accesses.
GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd);
Expand Down
23 changes: 16 additions & 7 deletions llvm/lib/Target/PowerPC/PPCISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3367,15 +3367,21 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
const GlobalValue *GV = GA->getGlobal();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
bool Is64Bit = Subtarget.isPPC64();
bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS();
TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
bool IsTLSLocalExecModel = Model == TLSModel::LocalExec;

if (IsTLSLocalExecModel || Model == TLSModel::InitialExec) {
bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS();
bool HasAIXSmallTLSGlobalAttr = false;
SDValue VariableOffsetTGA =
DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_FLAG);
SDValue VariableOffset = getTOCEntry(DAG, dl, VariableOffsetTGA);
SDValue TLSReg;

if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
if (GVar->hasAttribute("aix-small-tls"))
HasAIXSmallTLSGlobalAttr = true;

if (Is64Bit) {
// For local-exec and initial-exec on AIX (64-bit), the sequence generated
// involves a load of the variable offset (from the TOC), followed by an
Expand All @@ -3385,14 +3391,16 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
// add reg2, reg1, r13 // r13 contains the thread pointer
TLSReg = DAG.getRegister(PPC::X13, MVT::i64);

// With the -maix-small-local-exec-tls option, produce a faster access
// sequence for local-exec TLS variables where the offset from the TLS
// base is encoded as an immediate operand.
// With the -maix-small-local-exec-tls option, or with the "aix-small-tls"
// global variable attribute, produce a faster access sequence for
// local-exec TLS variables where the offset from the TLS base is encoded
// as an immediate operand.
//
// We only utilize the faster local-exec access sequence when the TLS
// variable has a size within the policy limit. We treat types that are
// not sized or are empty as being over the policy size limit.
if (HasAIXSmallLocalExecTLS && IsTLSLocalExecModel) {
if ((HasAIXSmallLocalExecTLS || HasAIXSmallTLSGlobalAttr) &&
IsTLSLocalExecModel) {
Type *GVType = GV->getValueType();
if (GVType->isSized() && !GVType->isEmptyTy() &&
GV->getParent()->getDataLayout().getTypeAllocSize(GVType) <=
Expand All @@ -3410,8 +3418,9 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
TLSReg = DAG.getNode(PPCISD::GET_TPOINTER, dl, PtrVT);

// We do not implement the 32-bit version of the faster access sequence
// for local-exec that is controlled by -maix-small-local-exec-tls.
if (HasAIXSmallLocalExecTLS)
// for local-exec that is controlled by the -maix-small-local-exec-tls
// option, or the "aix-small-tls" global variable attribute.
if (HasAIXSmallLocalExecTLS || HasAIXSmallTLSGlobalAttr)
report_fatal_error("The small-local-exec TLS access sequence is "
"currently only supported on AIX (64-bit mode).");
}
Expand Down
105 changes: 105 additions & 0 deletions llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-funcattr.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
; RUN: -mtriple powerpc64-ibm-aix-xcoff < %s \
; RUN: | FileCheck %s --check-prefixes=COMMONCM,CHECK-SMALLCM64
; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
; RUN: -mtriple powerpc64-ibm-aix-xcoff --code-model=large \
; RUN: < %s | FileCheck %s --check-prefixes=COMMONCM,CHECK-LARGECM64

; Local-exec TLS array of 7800 x i64 (62400 bytes), tagged with attribute
; group #0.
@mySmallTLS = thread_local(localexec) global [7800 x i64] zeroinitializer, align 8 #0
; Local-exec TLS array of 3000 x i64 (24000 bytes), tagged with attribute
; group #0.
@mySmallTLS2 = thread_local(localexec) global [3000 x i64] zeroinitializer, align 8 #0
; Same type and alignment as mySmallTLS2, but without attribute group #0.
@mySmallTLS3 = thread_local(localexec) global [3000 x i64] zeroinitializer, align 8
; Intrinsic the test functions call to obtain the address of a TLS variable.
declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)

; All accesses use a "faster" local-exec sequence directly off the thread pointer,
; except for mySmallTLS, as this variable is over the 32KB size limit.
; This function is tagged with attribute group #1, and the checks below expect
; mySmallTLS3 (which has no attribute group of its own) to be addressed
; directly off r13 as well.
define i64 @StoreLargeAccess1() #1 {
; COMMONCM-LABEL: StoreLargeAccess1:
; COMMONCM-NEXT: # %bb.0: # %entry
; CHECK-SMALLCM64: ld r3, L..C0(r2) # target-flags(ppc-tprel) @mySmallTLS
; CHECK-SMALLCM64-NEXT: li r4, 0
; CHECK-SMALLCM64-NEXT: li r5, 23
; CHECK-LARGECM64: addis r3, L..C0@u(r2)
; CHECK-LARGECM64-NEXT: li r4, 0
; CHECK-LARGECM64-NEXT: li r5, 23
; CHECK-LARGECM64-NEXT: ld r3, L..C0@l(r3)
; COMMONCM: ori r4, r4, 53328
; COMMONCM-NEXT: add r3, r13, r3
; COMMONCM-NEXT: stdx r5, r3, r4
; COMMONCM-NEXT: li r3, 55
; COMMONCM-NEXT: li r4, 64
; COMMONCM-NEXT: std r3, (mySmallTLS2[TL]@le+696)-65536(r13)
; COMMONCM-NEXT: li r3, 142
; COMMONCM-NEXT: std r4, (mySmallTLS3[TL]@le+20000)-131072(r13)
; COMMONCM-NEXT: blr
entry:
; Store 23 at byte offset 53328 from the address of mySmallTLS.
%tls0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallTLS)
%arrayidx = getelementptr inbounds i8, ptr %tls0, i32 53328
store i64 23, ptr %arrayidx, align 8
; Store 55 at byte offset 696 from the address of mySmallTLS2.
%tls1 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallTLS2)
%arrayidx1 = getelementptr inbounds i8, ptr %tls1, i32 696
store i64 55, ptr %arrayidx1, align 8
; Store 64 at byte offset 20000 from the address of mySmallTLS3.
%tls2 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallTLS3)
%arrayidx2 = getelementptr inbounds i8, ptr %tls2, i32 20000
store i64 64, ptr %arrayidx2, align 8
; Reload the first two stored values and return load1 + 64 + load2; the checks
; above expect the result to be materialized as the constant 142 in r3.
%load1 = load i64, ptr %arrayidx, align 8
%load2 = load i64, ptr %arrayidx1, align 8
%add1 = add i64 %load1, 64
%add2 = add i64 %add1, %load2
ret i64 %add2
}

; Since this function does not have the 'aix-small-local-exec-tls' attribute,
; only some local-exec variables should have the small-local-exec TLS access
; sequence (as opposed to all of them).
; In the checks below, only mySmallTLS2 is addressed directly off r13;
; mySmallTLS and mySmallTLS3 each load an offset through r2 first and then add
; r13 to it.
define i64 @StoreLargeAccess2() {
; COMMONCM-LABEL: StoreLargeAccess2:
; COMMONCM-NEXT: # %bb.0: # %entry
; CHECK-SMALLCM64: ld r5, L..C0(r2) # target-flags(ppc-tprel) @mySmallTLS
; CHECK-SMALLCM64-NEXT: li r3, 0
; CHECK-SMALLCM64-NEXT: li r4, 23
; CHECK-SMALLCM64-NEXT: ori r3, r3, 53328
; CHECK-SMALLCM64-NEXT: add r5, r13, r5
; CHECK-SMALLCM64-NEXT: stdx r4, r5, r3
; CHECK-SMALLCM64-NEXT: ld r5, L..C1(r2) # target-flags(ppc-tprel) @mySmallTLS3
; CHECK-SMALLCM64-NEXT: li r3, 55
; CHECK-SMALLCM64-NEXT: li r4, 64
; CHECK-SMALLCM64-NEXT: std r3, mySmallTLS2[TL]@le+696(r13)
; CHECK-SMALLCM64-NEXT: li r3, 142
; CHECK-SMALLCM64-NEXT: add r5, r13, r5
; CHECK-SMALLCM64-NEXT: std r4, 20000(r5)
; CHECK-LARGECM64: addis r3, L..C0@u(r2)
; CHECK-LARGECM64-NEXT: li r4, 0
; CHECK-LARGECM64-NEXT: li r5, 23
; CHECK-LARGECM64-NEXT: ld r3, L..C0@l(r3)
; CHECK-LARGECM64-NEXT: ori r4, r4, 53328
; CHECK-LARGECM64-NEXT: add r3, r13, r3
; CHECK-LARGECM64-NEXT: stdx r5, r3, r4
; CHECK-LARGECM64-NEXT: addis r3, L..C1@u(r2)
; CHECK-LARGECM64-NEXT: li r4, 55
; CHECK-LARGECM64-NEXT: li r5, 64
; CHECK-LARGECM64-NEXT: ld r3, L..C1@l(r3)
; CHECK-LARGECM64-NEXT: std r4, mySmallTLS2[TL]@le+696(r13)
; CHECK-LARGECM64-NEXT: add r3, r13, r3
; CHECK-LARGECM64-NEXT: std r5, 20000(r3)
; CHECK-LARGECM64-NEXT: li r3, 142
; COMMONCM-NEXT: blr
;
entry:
; Store 23 at byte offset 53328 from the address of mySmallTLS.
%tls0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallTLS)
%arrayidx = getelementptr inbounds i8, ptr %tls0, i32 53328
store i64 23, ptr %arrayidx, align 8
; Store 55 at byte offset 696 from the address of mySmallTLS2.
%tls1 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallTLS2)
%arrayidx1 = getelementptr inbounds i8, ptr %tls1, i32 696
store i64 55, ptr %arrayidx1, align 8
; Store 64 at byte offset 20000 from the address of mySmallTLS3.
%tls2 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallTLS3)
%arrayidx2 = getelementptr inbounds i8, ptr %tls2, i32 20000
store i64 64, ptr %arrayidx2, align 8
; Reload the first two stored values and return load1 + 64 + load2; the checks
; above expect the result to be materialized as the constant 142 in r3.
%load1 = load i64, ptr %arrayidx, align 8
%load2 = load i64, ptr %arrayidx1, align 8
%add1 = add i64 %load1, 64
%add2 = add i64 %add1, %load2
ret i64 %add2
}

; Group #0 is attached to the globals mySmallTLS and mySmallTLS2.
attributes #0 = { "aix-small-tls" }
; Group #1 is attached to the function StoreLargeAccess1 only.
attributes #1 = { "target-features"="+aix-small-local-exec-tls" }
Loading