Skip to content

Commit 95c64b7

Browse files
authored
AMDGPU: Reduce readfirstlane for single demanded vector element (#128647)
If we are only extracting a single element, rewrite the intrinsic call to use the element type. We should extend this to arbitrary extract shuffles.
1 parent 3b38992 commit 95c64b7

File tree

3 files changed

+78
-31
lines changed

3 files changed

+78
-31
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1563,16 +1563,58 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
15631563
return NewCall;
15641564
}
15651565

1566+
/// Narrow a lane intrinsic call whose fixed-vector result has exactly one
/// demanded element so that it operates on that single element instead.
///
/// The demanded element is extracted from the call's first operand, the
/// intrinsic is re-declared for the scalar type and called on the extracted
/// value (carrying over the original operand bundles), and the scalar result
/// is inserted back into a poison vector of the original result type at the
/// same index. All uses of \p II are then redirected to that vector and \p II
/// is erased.
///
/// \param IC           Combiner that supplies the builder and performs the
///                     use replacement / erasure.
/// \param II           The intrinsic call to narrow.
/// \param DemandedElts Bit mask of the result elements that are demanded.
/// \param UndefElts    Not read or written by this function.
/// \returns the replacement vector value, or nullptr if the result type is
///          not a fixed vector, the demanded mask does not select exactly one
///          element, or the element type is not legal.
Value *GCNTTIImpl::simplifyAMDGCNLaneIntrinsicDemanded(
    InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
    APInt &UndefElts) const {
  auto *ResultVecTy = dyn_cast<FixedVectorType>(II.getType());
  if (!ResultVecTy)
    return nullptr;

  // Lowest and highest demanded element index. They coincide exactly when a
  // single element is demanded.
  const unsigned LowIdx = DemandedElts.countr_zero();
  const unsigned HighIdx = DemandedElts.getActiveBits() - 1;

  // TODO: Handle general subvector extract.
  if (LowIdx != HighIdx)
    return nullptr;

  if (!isTypeLegal(ResultVecTy->getElementType()))
    return nullptr;

  auto &Builder = IC.Builder;

  // Pull the one demanded element out of the source operand.
  Value *ScalarSrc =
      Builder.CreateExtractElement(II.getArgOperand(0), LowIdx);

  // Make sure convergence tokens are preserved.
  // TODO: CreateIntrinsic should allow directly copying bundles
  SmallVector<OperandBundleDef, 2> Bundles;
  II.getOperandBundlesAsDefs(Bundles);

  // Fetch (or create) the declaration of the same intrinsic overloaded on the
  // scalar type.
  Function *ScalarDecl = Intrinsic::getOrInsertDeclaration(
      Builder.GetInsertBlock()->getModule(), II.getIntrinsicID(),
      {ScalarSrc->getType()});

  // TODO: Preserve callsite attributes?
  CallInst *ScalarCall = Builder.CreateCall(ScalarDecl, {ScalarSrc}, Bundles);

  // Rebuild a vector of the original type with only the demanded lane filled.
  Value *Widened = Builder.CreateInsertElement(PoisonValue::get(II.getType()),
                                               ScalarCall, LowIdx);
  IC.replaceInstUsesWith(II, Widened);
  IC.eraseInstFromFunction(II);
  return Widened;
}
1608+
15661609
std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
15671610
InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
15681611
APInt &UndefElts2, APInt &UndefElts3,
15691612
std::function<void(Instruction *, unsigned, APInt, APInt &)>
15701613
SimplifyAndSetOp) const {
15711614
switch (II.getIntrinsicID()) {
15721615
case Intrinsic::amdgcn_readfirstlane:
1573-
// TODO: For a vector extract, should reduce the intrinsic call type.
15741616
SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1575-
return std::nullopt;
1617+
return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
15761618
case Intrinsic::amdgcn_raw_buffer_load:
15771619
case Intrinsic::amdgcn_raw_ptr_buffer_load:
15781620
case Intrinsic::amdgcn_raw_buffer_load_format:

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,12 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
226226

227227
std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
228228
IntrinsicInst &II) const;
229+
230+
Value *simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC,
231+
IntrinsicInst &II,
232+
const APInt &DemandedElts,
233+
APInt &UndefElts) const;
234+
229235
std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
230236
InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
231237
APInt &UndefElts2, APInt &UndefElts3,

llvm/test/Transforms/InstCombine/AMDGPU/simplify-demanded-vector-elts-lane-intrinsics.ll

Lines changed: 28 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
define i16 @extract_elt0_v2i16_readfirstlane(<2 x i16> %src) {
55
; CHECK-LABEL: define i16 @extract_elt0_v2i16_readfirstlane(
66
; CHECK-SAME: <2 x i16> [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
7-
; CHECK-NEXT: [[VEC:%.*]] = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> [[SRC]])
8-
; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i16> [[VEC]], i64 0
7+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[SRC]], i64 0
8+
; CHECK-NEXT: [[ELT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[TMP1]])
99
; CHECK-NEXT: ret i16 [[ELT]]
1010
;
1111
%vec = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> %src)
@@ -16,8 +16,8 @@ define i16 @extract_elt0_v2i16_readfirstlane(<2 x i16> %src) {
1616
define i16 @extract_elt0_v1i16_readfirstlane(<1 x i16> %src) {
1717
; CHECK-LABEL: define i16 @extract_elt0_v1i16_readfirstlane(
1818
; CHECK-SAME: <1 x i16> [[SRC:%.*]]) #[[ATTR0]] {
19-
; CHECK-NEXT: [[VEC:%.*]] = call <1 x i16> @llvm.amdgcn.readfirstlane.v1i16(<1 x i16> [[SRC]])
20-
; CHECK-NEXT: [[ELT:%.*]] = extractelement <1 x i16> [[VEC]], i64 0
19+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i16> [[SRC]], i64 0
20+
; CHECK-NEXT: [[ELT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[TMP1]])
2121
; CHECK-NEXT: ret i16 [[ELT]]
2222
;
2323
%vec = call <1 x i16> @llvm.amdgcn.readfirstlane.v1i16(<1 x i16> %src)
@@ -28,8 +28,8 @@ define i16 @extract_elt0_v1i16_readfirstlane(<1 x i16> %src) {
2828
define i16 @extract_elt1_v2i16_readfirstlane(<2 x i16> %src) {
2929
; CHECK-LABEL: define i16 @extract_elt1_v2i16_readfirstlane(
3030
; CHECK-SAME: <2 x i16> [[SRC:%.*]]) #[[ATTR0]] {
31-
; CHECK-NEXT: [[VEC:%.*]] = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> [[SRC]])
32-
; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i16> [[VEC]], i64 1
31+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[SRC]], i64 1
32+
; CHECK-NEXT: [[ELT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[TMP1]])
3333
; CHECK-NEXT: ret i16 [[ELT]]
3434
;
3535
%vec = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> %src)
@@ -40,8 +40,8 @@ define i16 @extract_elt1_v2i16_readfirstlane(<2 x i16> %src) {
4040
define i16 @extract_elt0_v4i16_readfirstlane(<4 x i16> %src) {
4141
; CHECK-LABEL: define i16 @extract_elt0_v4i16_readfirstlane(
4242
; CHECK-SAME: <4 x i16> [[SRC:%.*]]) #[[ATTR0]] {
43-
; CHECK-NEXT: [[VEC:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[SRC]])
44-
; CHECK-NEXT: [[ELT:%.*]] = extractelement <4 x i16> [[VEC]], i64 0
43+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SRC]], i64 0
44+
; CHECK-NEXT: [[ELT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[TMP1]])
4545
; CHECK-NEXT: ret i16 [[ELT]]
4646
;
4747
%vec = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> %src)
@@ -52,8 +52,8 @@ define i16 @extract_elt0_v4i16_readfirstlane(<4 x i16> %src) {
5252
define i16 @extract_elt2_v4i16_readfirstlane(<4 x i16> %src) {
5353
; CHECK-LABEL: define i16 @extract_elt2_v4i16_readfirstlane(
5454
; CHECK-SAME: <4 x i16> [[SRC:%.*]]) #[[ATTR0]] {
55-
; CHECK-NEXT: [[VEC:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[SRC]])
56-
; CHECK-NEXT: [[ELT:%.*]] = extractelement <4 x i16> [[VEC]], i64 2
55+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SRC]], i64 2
56+
; CHECK-NEXT: [[ELT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[TMP1]])
5757
; CHECK-NEXT: ret i16 [[ELT]]
5858
;
5959
%vec = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> %src)
@@ -136,8 +136,8 @@ define <2 x i16> @extract_elt30_v4i16_readfirstlane(<4 x i16> %src) {
136136
define half @extract_elt0_v2f16_readfirstlane(<2 x half> %src) {
137137
; CHECK-LABEL: define half @extract_elt0_v2f16_readfirstlane(
138138
; CHECK-SAME: <2 x half> [[SRC:%.*]]) #[[ATTR0]] {
139-
; CHECK-NEXT: [[VEC:%.*]] = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> [[SRC]])
140-
; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x half> [[VEC]], i64 0
139+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x half> [[SRC]], i64 0
140+
; CHECK-NEXT: [[ELT:%.*]] = call half @llvm.amdgcn.readfirstlane.f16(half [[TMP1]])
141141
; CHECK-NEXT: ret half [[ELT]]
142142
;
143143
%vec = call <2 x half> @llvm.amdgcn.readfirstlane.v2i16(<2 x half> %src)
@@ -148,8 +148,8 @@ define half @extract_elt0_v2f16_readfirstlane(<2 x half> %src) {
148148
define half @extract_elt1_v2f16_readfirstlane(<2 x half> %src) {
149149
; CHECK-LABEL: define half @extract_elt1_v2f16_readfirstlane(
150150
; CHECK-SAME: <2 x half> [[SRC:%.*]]) #[[ATTR0]] {
151-
; CHECK-NEXT: [[VEC:%.*]] = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> [[SRC]])
152-
; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x half> [[VEC]], i64 1
151+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x half> [[SRC]], i64 1
152+
; CHECK-NEXT: [[ELT:%.*]] = call half @llvm.amdgcn.readfirstlane.f16(half [[TMP1]])
153153
; CHECK-NEXT: ret half [[ELT]]
154154
;
155155
%vec = call <2 x half> @llvm.amdgcn.readfirstlane.v2i16(<2 x half> %src)
@@ -186,8 +186,8 @@ define i32 @extract_elt0_nxv4i32_readfirstlane(<vscale x 2 x i32> %src) {
186186
define i32 @extract_elt0_v2i32_readfirstlane(<2 x i32> %src) {
187187
; CHECK-LABEL: define i32 @extract_elt0_v2i32_readfirstlane(
188188
; CHECK-SAME: <2 x i32> [[SRC:%.*]]) #[[ATTR0]] {
189-
; CHECK-NEXT: [[VEC:%.*]] = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> [[SRC]])
190-
; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i32> [[VEC]], i64 0
189+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[SRC]], i64 0
190+
; CHECK-NEXT: [[ELT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP1]])
191191
; CHECK-NEXT: ret i32 [[ELT]]
192192
;
193193
%vec = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> %src)
@@ -198,8 +198,8 @@ define i32 @extract_elt0_v2i32_readfirstlane(<2 x i32> %src) {
198198
define ptr addrspace(3) @extract_elt0_v2p3_readfirstlane(<2 x ptr addrspace(3)> %src) {
199199
; CHECK-LABEL: define ptr addrspace(3) @extract_elt0_v2p3_readfirstlane(
200200
; CHECK-SAME: <2 x ptr addrspace(3)> [[SRC:%.*]]) #[[ATTR0]] {
201-
; CHECK-NEXT: [[VEC:%.*]] = call <2 x ptr addrspace(3)> @llvm.amdgcn.readfirstlane.v2p3(<2 x ptr addrspace(3)> [[SRC]])
202-
; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x ptr addrspace(3)> [[VEC]], i64 0
201+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x ptr addrspace(3)> [[SRC]], i64 0
202+
; CHECK-NEXT: [[ELT:%.*]] = call ptr addrspace(3) @llvm.amdgcn.readfirstlane.p3(ptr addrspace(3) [[TMP1]])
203203
; CHECK-NEXT: ret ptr addrspace(3) [[ELT]]
204204
;
205205
%vec = call <2 x ptr addrspace(3)> @llvm.amdgcn.readfirstlane.v2p3(<2 x ptr addrspace(3)> %src)
@@ -210,8 +210,8 @@ define ptr addrspace(3) @extract_elt0_v2p3_readfirstlane(<2 x ptr addrspace(3)>
210210
define i64 @extract_elt0_v2i64_readfirstlane(<2 x i64> %src) {
211211
; CHECK-LABEL: define i64 @extract_elt0_v2i64_readfirstlane(
212212
; CHECK-SAME: <2 x i64> [[SRC:%.*]]) #[[ATTR0]] {
213-
; CHECK-NEXT: [[VEC:%.*]] = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> [[SRC]])
214-
; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i64> [[VEC]], i64 0
213+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[SRC]], i64 0
214+
; CHECK-NEXT: [[ELT:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[TMP1]])
215215
; CHECK-NEXT: ret i64 [[ELT]]
216216
;
217217
%vec = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> %src)
@@ -222,8 +222,8 @@ define i64 @extract_elt0_v2i64_readfirstlane(<2 x i64> %src) {
222222
define i64 @extract_elt1_v2i64_readfirstlane(<2 x i64> %src) {
223223
; CHECK-LABEL: define i64 @extract_elt1_v2i64_readfirstlane(
224224
; CHECK-SAME: <2 x i64> [[SRC:%.*]]) #[[ATTR0]] {
225-
; CHECK-NEXT: [[VEC:%.*]] = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> [[SRC]])
226-
; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i64> [[VEC]], i64 1
225+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[SRC]], i64 1
226+
; CHECK-NEXT: [[ELT:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[TMP1]])
227227
; CHECK-NEXT: ret i64 [[ELT]]
228228
;
229229
%vec = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> %src)
@@ -306,9 +306,8 @@ define <2 x i16> @extract_elt13_v4i16readfirstlane(<4 x i16> %src) {
306306
define <2 x i32> @extract_elt13_v4i32_readfirstlane_source_simplify0(i32 %src0, i32 %src2) {
307307
; CHECK-LABEL: define <2 x i32> @extract_elt13_v4i32_readfirstlane_source_simplify0(
308308
; CHECK-SAME: i32 [[SRC0:%.*]], i32 [[SRC2:%.*]]) #[[ATTR0]] {
309-
; CHECK-NEXT: [[INS_1:%.*]] = insertelement <4 x i32> poison, i32 [[SRC0]], i64 1
310-
; CHECK-NEXT: [[VEC:%.*]] = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> [[INS_1]])
311-
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[VEC]], <4 x i32> poison, <2 x i32> <i32 1, i32 poison>
309+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
310+
; CHECK-NEXT: [[SHUFFLE:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i64 0
312311
; CHECK-NEXT: ret <2 x i32> [[SHUFFLE]]
313312
;
314313
%ins.0 = insertelement <4 x i32> poison, i32 %src0, i32 0
@@ -350,8 +349,8 @@ define i32 @extract_elt0_v2i32_readfirstlane_convergencetoken(<2 x i32> %src) co
350349
; CHECK-LABEL: define i32 @extract_elt0_v2i32_readfirstlane_convergencetoken(
351350
; CHECK-SAME: <2 x i32> [[SRC:%.*]]) #[[ATTR1:[0-9]+]] {
352351
; CHECK-NEXT: [[T:%.*]] = call token @llvm.experimental.convergence.entry()
353-
; CHECK-NEXT: [[VEC:%.*]] = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> [[SRC]]) [ "convergencectrl"(token [[T]]) ]
354-
; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i32> [[VEC]], i64 0
352+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[SRC]], i64 0
353+
; CHECK-NEXT: [[ELT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP1]]) [ "convergencectrl"(token [[T]]) ]
355354
; CHECK-NEXT: ret i32 [[ELT]]
356355
;
357356
%t = call token @llvm.experimental.convergence.entry()
@@ -381,8 +380,8 @@ define < 2 x i32> @extract_elt13_v4i32_readfirstlane_source_simplify1_convergenc
381380
define i1 @extract_elt0_v2i1_readfirstlane(<2 x i1> %src) {
382381
; CHECK-LABEL: define i1 @extract_elt0_v2i1_readfirstlane(
383382
; CHECK-SAME: <2 x i1> [[SRC:%.*]]) #[[ATTR0]] {
384-
; CHECK-NEXT: [[VEC:%.*]] = call <2 x i1> @llvm.amdgcn.readfirstlane.v2i1(<2 x i1> [[SRC]])
385-
; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x i1> [[VEC]], i64 0
383+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[SRC]], i64 0
384+
; CHECK-NEXT: [[ELT:%.*]] = call i1 @llvm.amdgcn.readfirstlane.i1(i1 [[TMP1]])
386385
; CHECK-NEXT: ret i1 [[ELT]]
387386
;
388387
%vec = call <2 x i1> @llvm.amdgcn.readfirstlane.v2i1(<2 x i1> %src)

0 commit comments

Comments
 (0)