Skip to content

Commit 2baaf7c

Browse files
SC llvm teamSC llvm team
SC llvm team
authored and
SC llvm team
committed
Merged main:a6dabed3483c into amd-gfx:adee0826382f
Local branch amd-gfx adee082 Merged main:75b3c3d267bf into amd-gfx:d648e114f351 Remote branch main a6dabed [AMDGPU] Fix nondeterminism in SIFixSGPRCopies (llvm#70644)
2 parents adee082 + a6dabed commit 2baaf7c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+1394
-38
lines changed

clang/include/clang/Basic/TargetInfo.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1291,20 +1291,20 @@ class TargetInfo : public TransferrableTargetInfo,
12911291
fillValidCPUList(Values);
12921292
}
12931293

1294-
/// brief Determine whether this TargetInfo supports the given CPU name.
1294+
/// Determine whether this TargetInfo supports the given CPU name.
12951295
virtual bool isValidCPUName(StringRef Name) const {
12961296
return true;
12971297
}
12981298

1299-
/// brief Determine whether this TargetInfo supports the given CPU name for
1300-
// tuning.
1299+
/// Determine whether this TargetInfo supports the given CPU name for
1300+
/// tuning.
13011301
virtual bool isValidTuneCPUName(StringRef Name) const {
13021302
return isValidCPUName(Name);
13031303
}
13041304

13051305
virtual ParsedTargetAttr parseTargetAttr(StringRef Str) const;
13061306

1307-
/// brief Determine whether this TargetInfo supports tune in target attribute.
1307+
/// Determine whether this TargetInfo supports tune in target attribute.
13081308
virtual bool supportsTargetAttributeTune() const {
13091309
return false;
13101310
}

clang/include/clang/Parse/Parser.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -663,9 +663,9 @@ class Parser : public CodeCompletionHandler {
663663
return PrevTokLocation;
664664
}
665665

666-
///\ brief When we are consuming a code-completion token without having
667-
/// matched specific position in the grammar, provide code-completion results
668-
/// based on context.
666+
/// When we are consuming a code-completion token without having matched
667+
/// specific position in the grammar, provide code-completion results based
668+
/// on context.
669669
///
670670
/// \returns the source location of the code-completion token.
671671
SourceLocation handleUnexpectedCodeCompletionToken();

llvm/docs/LangRef.rst

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18497,6 +18497,45 @@ Arguments:
1849718497
Both arguments must be vectors of the same type whereby their logical
1849818498
concatenation matches the result type.
1849918499

18500+
'``llvm.experimental.cttz.elts``' Intrinsic
18501+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
18502+
18503+
Syntax:
18504+
"""""""
18505+
18506+
This is an overloaded intrinsic. You can use ``llvm.experimental.cttz.elts``
18507+
on any vector of integer elements, both fixed width and scalable.
18508+
18509+
::
18510+
18511+
declare i8 @llvm.experimental.cttz.elts.i8.v8i1(<8 x i1> <src>, i1 <is_zero_poison>)
18512+
18513+
Overview:
18514+
"""""""""
18515+
18516+
The '``llvm.experimental.cttz.elts``' intrinsic counts the number of trailing
18517+
zero elements of a vector.
18518+
18519+
Arguments:
18520+
""""""""""
18521+
18522+
The first argument is the vector to be counted. This argument must be a vector
18523+
with integer element type. The return type must also be an integer type which is
18524+
wide enough to hold the maximum number of elements of the source vector. The
18525+
behaviour of this intrinsic is undefined if the return type is not wide enough
18526+
for the number of elements in the input vector.
18527+
18528+
The second argument is a constant flag that indicates whether the intrinsic
18529+
returns a valid result if the first argument is all zero. If the first argument
18530+
is all zero and the second argument is true, the result is poison.
18531+
18532+
Semantics:
18533+
""""""""""
18534+
18535+
The '``llvm.experimental.cttz.elts``' intrinsic counts the trailing (least
18536+
significant) zero elements in a vector. If ``src == 0`` the result is the
18537+
number of elements in the input vector.
18538+
1850018539
'``llvm.experimental.vector.splice``' Intrinsic
1850118540
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1850218541

llvm/include/llvm/Analysis/InlineCost.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,8 @@ InlineParams getInlineParams(unsigned OptLevel, unsigned SizeOptLevel);
259259

260260
/// Return the cost associated with a callsite, including parameter passing
261261
/// and the call/return instruction.
262-
int getCallsiteCost(const CallBase &Call, const DataLayout &DL);
262+
int getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
263+
const DataLayout &DL);
263264

264265
/// Get an InlineCost object representing the cost of inlining this
265266
/// callsite.

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1517,6 +1517,15 @@ class TargetTransformInfo {
15171517
bool areInlineCompatible(const Function *Caller,
15181518
const Function *Callee) const;
15191519

1520+
/// Returns a penalty for invoking call \p Call in \p F.
1521+
/// For example, if a function F calls a function G, which in turn calls
1522+
/// function H, then getInlineCallPenalty(F, H()) would return the
1523+
/// penalty of calling H from F, e.g. after inlining G into F.
1524+
/// \p DefaultCallPenalty is passed to give a default penalty that
1525+
/// the target can amend or override.
1526+
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call,
1527+
unsigned DefaultCallPenalty) const;
1528+
15201529
/// \returns True if the caller and callee agree on how \p Types will be
15211530
/// passed to or returned from the callee.
15221531
/// to the callee.
@@ -2012,6 +2021,8 @@ class TargetTransformInfo::Concept {
20122021
std::optional<uint32_t> AtomicCpySize) const = 0;
20132022
virtual bool areInlineCompatible(const Function *Caller,
20142023
const Function *Callee) const = 0;
2024+
virtual unsigned getInlineCallPenalty(const Function *F, const CallBase &Call,
2025+
unsigned DefaultCallPenalty) const = 0;
20152026
virtual bool areTypesABICompatible(const Function *Caller,
20162027
const Function *Callee,
20172028
const ArrayRef<Type *> &Types) const = 0;
@@ -2673,6 +2684,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
26732684
const Function *Callee) const override {
26742685
return Impl.areInlineCompatible(Caller, Callee);
26752686
}
2687+
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call,
2688+
unsigned DefaultCallPenalty) const override {
2689+
return Impl.getInlineCallPenalty(F, Call, DefaultCallPenalty);
2690+
}
26762691
bool areTypesABICompatible(const Function *Caller, const Function *Callee,
26772692
const ArrayRef<Type *> &Types) const override {
26782693
return Impl.areTypesABICompatible(Caller, Callee, Types);

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -802,6 +802,11 @@ class TargetTransformInfoImplBase {
802802
Callee->getFnAttribute("target-features"));
803803
}
804804

805+
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call,
806+
unsigned DefaultCallPenalty) const {
807+
return DefaultCallPenalty;
808+
}
809+
805810
bool areTypesABICompatible(const Function *Caller, const Function *Callee,
806811
const ArrayRef<Type *> &Types) const {
807812
return (Caller->getFnAttribute("target-cpu") ==

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -465,6 +465,10 @@ class TargetLoweringBase {
465465
return true;
466466
}
467467

468+
/// Return true if the @llvm.experimental.cttz.elts intrinsic should be
469+
/// expanded using generic code in SelectionDAGBuilder.
470+
virtual bool shouldExpandCttzElements(EVT VT) const { return true; }
471+
468472
// Return true if op(vecreduce(x), vecreduce(y)) should be reassociated to
469473
// vecreduce(op(x, y)) for the reduction opcode RedOpc.
470474
virtual bool shouldReassociateReduction(unsigned RedOpc, EVT VT) const {

llvm/include/llvm/Config/llvm-config.h.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
/* Indicate that this is LLVM compiled from the amd-gfx branch. */
1818
#define LLVM_HAVE_BRANCH_AMD_GFX
19-
#define LLVM_MAIN_REVISION 478915
19+
#define LLVM_MAIN_REVISION 479203
2020

2121
/* Define if LLVM_ENABLE_DUMP is enabled */
2222
#cmakedefine LLVM_ENABLE_DUMP

llvm/include/llvm/IR/Intrinsics.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2182,6 +2182,11 @@ def int_experimental_get_vector_length:
21822182
[IntrNoMem, IntrNoSync, IntrWillReturn,
21832183
ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
21842184

2185+
def int_experimental_cttz_elts:
2186+
DefaultAttrsIntrinsic<[llvm_anyint_ty],
2187+
[llvm_anyvector_ty, llvm_i1_ty],
2188+
[IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
2189+
21852190
def int_experimental_vp_splice:
21862191
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
21872192
[LLVMMatchType<0>,

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1953,6 +1953,11 @@ def int_amdgcn_inverse_ballot :
19531953
Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
19541954
[IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
19551955

1956+
// Lowers to S_BITREPLICATE_B64_B32.
1957+
// The argument must be uniform; otherwise, the result is undefined.
1958+
def int_amdgcn_s_bitreplicate :
1959+
DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
1960+
19561961
class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
19571962
[data_ty],
19581963
[

llvm/lib/Analysis/InlineCost.cpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -695,7 +695,8 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
695695
}
696696
} else
697697
// Otherwise simply add the cost for merely making the call.
698-
addCost(CallPenalty);
698+
addCost(TTI.getInlineCallPenalty(CandidateCall.getCaller(), Call,
699+
CallPenalty));
699700
}
700701

701702
void onFinalizeSwitch(unsigned JumpTableSize,
@@ -918,7 +919,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
918919
// Compute the total savings for the call site.
919920
auto *CallerBB = CandidateCall.getParent();
920921
BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
921-
CycleSavings += getCallsiteCost(this->CandidateCall, DL);
922+
CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
922923
CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);
923924

924925
// Remove the cost of the cold basic blocks to model the runtime cost more
@@ -1076,7 +1077,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
10761077

10771078
// Give out bonuses for the callsite, as the instructions setting them up
10781079
// will be gone after inlining.
1079-
addCost(-getCallsiteCost(this->CandidateCall, DL));
1080+
addCost(-getCallsiteCost(TTI, this->CandidateCall, DL));
10801081

10811082
// If this function uses the coldcc calling convention, prefer not to inline
10821083
// it.
@@ -1315,7 +1316,7 @@ class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
13151316

13161317
InlineResult onAnalysisStart() override {
13171318
increment(InlineCostFeatureIndex::callsite_cost,
1318-
-1 * getCallsiteCost(this->CandidateCall, DL));
1319+
-1 * getCallsiteCost(TTI, this->CandidateCall, DL));
13191320

13201321
set(InlineCostFeatureIndex::cold_cc_penalty,
13211322
(F.getCallingConv() == CallingConv::Cold));
@@ -2887,7 +2888,8 @@ static bool functionsHaveCompatibleAttributes(
28872888
AttributeFuncs::areInlineCompatible(*Caller, *Callee);
28882889
}
28892890

2890-
int llvm::getCallsiteCost(const CallBase &Call, const DataLayout &DL) {
2891+
int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
2892+
const DataLayout &DL) {
28912893
int64_t Cost = 0;
28922894
for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
28932895
if (Call.isByValArgument(I)) {
@@ -2917,7 +2919,8 @@ int llvm::getCallsiteCost(const CallBase &Call, const DataLayout &DL) {
29172919
}
29182920
// The call instruction also disappears after inlining.
29192921
Cost += InstrCost;
2920-
Cost += CallPenalty;
2922+
Cost += TTI.getInlineCallPenalty(Call.getCaller(), Call, CallPenalty);
2923+
29212924
return std::min<int64_t>(Cost, INT_MAX);
29222925
}
29232926

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1133,6 +1133,13 @@ bool TargetTransformInfo::areInlineCompatible(const Function *Caller,
11331133
return TTIImpl->areInlineCompatible(Caller, Callee);
11341134
}
11351135

1136+
unsigned
1137+
TargetTransformInfo::getInlineCallPenalty(const Function *F,
1138+
const CallBase &Call,
1139+
unsigned DefaultCallPenalty) const {
1140+
return TTIImpl->getInlineCallPenalty(F, Call, DefaultCallPenalty);
1141+
}
1142+
11361143
bool TargetTransformInfo::areTypesABICompatible(
11371144
const Function *Caller, const Function *Callee,
11381145
const ArrayRef<Type *> &Types) const {

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7514,6 +7514,62 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
75147514
setValue(&I, Trunc);
75157515
return;
75167516
}
7517+
case Intrinsic::experimental_cttz_elts: {
7518+
auto DL = getCurSDLoc();
7519+
SDValue Op = getValue(I.getOperand(0));
7520+
EVT OpVT = Op.getValueType();
7521+
7522+
if (!TLI.shouldExpandCttzElements(OpVT)) {
7523+
visitTargetIntrinsic(I, Intrinsic);
7524+
return;
7525+
}
7526+
7527+
if (OpVT.getScalarType() != MVT::i1) {
7528+
// Compare the input vector elements to zero & use to count trailing zeros
7529+
SDValue AllZero = DAG.getConstant(0, DL, OpVT);
7530+
OpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
7531+
OpVT.getVectorElementCount());
7532+
Op = DAG.getSetCC(DL, OpVT, Op, AllZero, ISD::SETNE);
7533+
}
7534+
7535+
// Find the smallest "sensible" element type to use for the expansion.
7536+
ConstantRange CR(
7537+
APInt(64, OpVT.getVectorElementCount().getKnownMinValue()));
7538+
if (OpVT.isScalableVT())
7539+
CR = CR.umul_sat(getVScaleRange(I.getCaller(), 64));
7540+
7541+
// If the zero-is-poison flag is set, we can assume the upper limit
7542+
// of the result is VF-1.
7543+
if (!cast<ConstantSDNode>(getValue(I.getOperand(1)))->isZero())
7544+
CR = CR.subtract(APInt(64, 1));
7545+
7546+
unsigned EltWidth = I.getType()->getScalarSizeInBits();
7547+
EltWidth = std::min(EltWidth, (unsigned)CR.getActiveBits());
7548+
EltWidth = std::max(llvm::bit_ceil(EltWidth), (unsigned)8);
7549+
7550+
MVT NewEltTy = MVT::getIntegerVT(EltWidth);
7551+
7552+
// Create the new vector type & get the vector length
7553+
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), NewEltTy,
7554+
OpVT.getVectorElementCount());
7555+
7556+
SDValue VL =
7557+
DAG.getElementCount(DL, NewEltTy, OpVT.getVectorElementCount());
7558+
7559+
SDValue StepVec = DAG.getStepVector(DL, NewVT);
7560+
SDValue SplatVL = DAG.getSplat(NewVT, DL, VL);
7561+
SDValue StepVL = DAG.getNode(ISD::SUB, DL, NewVT, SplatVL, StepVec);
7562+
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, Op);
7563+
SDValue And = DAG.getNode(ISD::AND, DL, NewVT, StepVL, Ext);
7564+
SDValue Max = DAG.getNode(ISD::VECREDUCE_UMAX, DL, NewEltTy, And);
7565+
SDValue Sub = DAG.getNode(ISD::SUB, DL, NewEltTy, VL, Max);
7566+
7567+
EVT RetTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
7568+
SDValue Ret = DAG.getZExtOrTrunc(Sub, DL, RetTy);
7569+
7570+
setValue(&I, Ret);
7571+
return;
7572+
}
75177573
case Intrinsic::vector_insert: {
75187574
SDValue Vec = getValue(I.getOperand(0));
75197575
SDValue SubVec = getValue(I.getOperand(1));

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1791,6 +1791,10 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
17911791
return false;
17921792
}
17931793

1794+
bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
1795+
return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
1796+
}
1797+
17941798
void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT,
17951799
bool StreamingSVE) {
17961800
assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
@@ -2634,6 +2638,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
26342638
MAKE_CASE(AArch64ISD::MRRS)
26352639
MAKE_CASE(AArch64ISD::MSRR)
26362640
MAKE_CASE(AArch64ISD::RSHRNB_I)
2641+
MAKE_CASE(AArch64ISD::CTTZ_ELTS)
26372642
}
26382643
#undef MAKE_CASE
26392644
return nullptr;
@@ -5338,6 +5343,12 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
53385343
}
53395344
return SDValue();
53405345
}
5346+
case Intrinsic::experimental_cttz_elts: {
5347+
SDValue NewCttzElts =
5348+
DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
5349+
5350+
return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
5351+
}
53415352
}
53425353
}
53435354

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,8 @@ enum NodeType : unsigned {
335335
PTEST_ANY,
336336
PTRUE,
337337

338+
CTTZ_ELTS,
339+
338340
BITREVERSE_MERGE_PASSTHRU,
339341
BSWAP_MERGE_PASSTHRU,
340342
REVH_MERGE_PASSTHRU,
@@ -927,6 +929,8 @@ class AArch64TargetLowering : public TargetLowering {
927929

928930
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override;
929931

932+
bool shouldExpandCttzElements(EVT VT) const override;
933+
930934
/// If a change in streaming mode is required on entry to/return from a
931935
/// function call it emits and returns the corresponding SMSTART or SMSTOP node.
932936
/// \p Entry tells whether this is before/after the Call, which is necessary

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -842,6 +842,9 @@ def AArch64rshrnb_pf : PatFrags<(ops node:$rs, node:$i),
842842
[(AArch64rshrnb node:$rs, node:$i),
843843
(int_aarch64_sve_rshrnb node:$rs, node:$i)]>;
844844

845+
def AArch64CttzElts : SDNode<"AArch64ISD::CTTZ_ELTS", SDTypeProfile<1, 1,
846+
[SDTCisInt<0>, SDTCisVec<1>]>, []>;
847+
845848
// Match add node and also treat an 'or' node as an 'add' if the or'ed operands
846849
// have no common bits.
847850
def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs),

0 commit comments

Comments
 (0)