Skip to content

Commit d8cd8d5

Browse files
authored
[SLP] getSpillCost - fully populate IntrinsicCostAttributes to improve cost analysis. (#124129)
We were only constructing the IntrinsicCostAttributes with the arg type info, and not the args themselves, preventing more detailed cost analysis (constant / uniform args, etc.). Just pass the whole IntrinsicInst to the constructor and let it resolve everything it can. Noticed while having yet another attempt at #63980.
1 parent 7ddeea3 commit d8cd8d5

File tree

5 files changed

+72
-73
lines changed

5 files changed

+72
-73
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

+4-10
Original file line numberDiff line numberDiff line change
@@ -12253,18 +12253,12 @@ InstructionCost BoUpSLP::getSpillCost() const {
1225312253
if (auto *II = dyn_cast<IntrinsicInst>(I)) {
1225412254
if (II->isAssumeLikeIntrinsic())
1225512255
return true;
12256-
FastMathFlags FMF;
12257-
SmallVector<Type *, 4> Tys;
12258-
for (auto &ArgOp : II->args())
12259-
Tys.push_back(ArgOp->getType());
12260-
if (auto *FPMO = dyn_cast<FPMathOperator>(II))
12261-
FMF = FPMO->getFastMathFlags();
12262-
IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
12263-
FMF);
12256+
IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
1226412257
InstructionCost IntrCost =
1226512258
TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12266-
InstructionCost CallCost = TTI->getCallInstrCost(
12267-
nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
12259+
InstructionCost CallCost =
12260+
TTI->getCallInstrCost(nullptr, II->getType(), ICA.getArgTypes(),
12261+
TTI::TCK_RecipThroughput);
1226812262
if (IntrCost < CallCost)
1226912263
return true;
1227012264
}

llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll

+15-18
Original file line numberDiff line numberDiff line change
@@ -684,27 +684,27 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
684684
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM5]]
685685
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1
686686
; CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64
687-
; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]]
688-
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4
689-
; CHECK-NEXT: [[ADD14:%.*]] = or disjoint i32 [[MUL]], 1
687+
; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]]
688+
; CHECK-NEXT: [[ADD14:%.*]] = add nsw i32 [[MUL]], 2
690689
; CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[ADD14]] to i64
691690
; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM15]]
691+
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4
692692
; CHECK-NEXT: [[MUL21:%.*]] = mul nsw i32 [[STRIDE]], 3
693693
; CHECK-NEXT: [[IDXPROM23:%.*]] = sext i32 [[MUL21]] to i64
694694
; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM23]]
695695
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX24]], align 4
696696
; CHECK-NEXT: [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1
697697
; CHECK-NEXT: [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64
698-
; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]]
698+
; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]]
699699
; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i8, ptr [[Y:%.*]], i64 8
700700
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4
701701
; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]]
702-
; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]]
702+
; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]]
703+
; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]]
703704
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4
704-
; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]]
705705
; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]]
706706
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4
707-
; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]]
707+
; CHECK-NEXT: [[ARRAYIDX65:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]]
708708
; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds nuw i8, ptr [[Z:%.*]], i64 4
709709
; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]]
710710
; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 24
@@ -715,25 +715,22 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
715715
; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]]
716716
; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]]
717717
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
718+
; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 28
718719
; CHECK-NEXT: [[MUL81:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]]
719-
; CHECK-NEXT: [[ARRAYIDX82:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 32
720-
; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX16]], align 4
721-
; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX52]], align 4
722-
; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP14]], [[TMP13]]
723-
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
724720
; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]]
725721
; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 44
726-
; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 36
727722
; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4
728723
; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4
724+
; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i32>, ptr [[ARRAYIDX49]], align 4
725+
; CHECK-NEXT: [[TMP16:%.*]] = load <2 x i32>, ptr [[ARRAYIDX65]], align 4
729726
; CHECK-NEXT: store i32 [[MUL73]], ptr [[Z]], align 4
730727
; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4
731-
; CHECK-NEXT: store i32 [[MUL81]], ptr [[ARRAYIDX82]], align 4
732-
; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX76]], align 4
728+
; CHECK-NEXT: store i32 [[MUL81]], ptr [[ARRAYIDX76]], align 4
733729
; CHECK-NEXT: store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4
734-
; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <2 x i32> [[TMP18]], [[TMP17]]
735-
; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
736-
; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[ARRAYIDX92]], align 4
730+
; CHECK-NEXT: [[TMP20:%.*]] = mul nsw <2 x i32> [[TMP15]], [[TMP17]]
731+
; CHECK-NEXT: [[TMP21:%.*]] = mul nsw <2 x i32> [[TMP16]], [[TMP18]]
732+
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP21]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
733+
; CHECK-NEXT: store <4 x i32> [[TMP19]], ptr [[ARRAYIDX84]], align 4
737734
; CHECK-NEXT: ret void
738735
;
739736
entry:

llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll

+14-14
Original file line numberDiff line numberDiff line change
@@ -716,29 +716,29 @@ define float @reduce_float_case3(ptr %a) {
716716
; CHECK-NEXT: [[GEP5:%.*]] = getelementptr inbounds float, ptr [[A]], i32 5
717717
; CHECK-NEXT: [[GEP6:%.*]] = getelementptr inbounds float, ptr [[A]], i32 6
718718
; CHECK-NEXT: [[GEP7:%.*]] = getelementptr inbounds float, ptr [[A]], i32 7
719-
; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[A]], align 4
720-
; CHECK-NEXT: [[LOAD1:%.*]] = load float, ptr [[GEP1]], align 4
721-
; CHECK-NEXT: [[LOAD2:%.*]] = load float, ptr [[GEP2]], align 4
722-
; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr [[GEP3]], align 4
723-
; CHECK-NEXT: [[LOAD4:%.*]] = load float, ptr [[GEP4]], align 4
724-
; CHECK-NEXT: [[LOAD5:%.*]] = load float, ptr [[GEP5]], align 4
725-
; CHECK-NEXT: [[LOAD6:%.*]] = load float, ptr [[GEP6]], align 4
726-
; CHECK-NEXT: [[LOAD7:%.*]] = load float, ptr [[GEP7]], align 4
727-
; CHECK-NEXT: [[LOG:%.*]] = call float @llvm.log.f32(float [[LOAD]])
728-
; CHECK-NEXT: [[LOG1:%.*]] = call float @llvm.log.f32(float [[LOAD1]])
719+
; CHECK-NEXT: [[LOAD2:%.*]] = load float, ptr [[A]], align 4
720+
; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr [[GEP1]], align 4
721+
; CHECK-NEXT: [[LOAD4:%.*]] = load float, ptr [[GEP2]], align 4
722+
; CHECK-NEXT: [[LOAD5:%.*]] = load float, ptr [[GEP3]], align 4
723+
; CHECK-NEXT: [[LOAD6:%.*]] = load float, ptr [[GEP4]], align 4
724+
; CHECK-NEXT: [[LOAD7:%.*]] = load float, ptr [[GEP5]], align 4
725+
; CHECK-NEXT: [[LOAD8:%.*]] = load float, ptr [[GEP6]], align 4
726+
; CHECK-NEXT: [[LOAD9:%.*]] = load float, ptr [[GEP7]], align 4
729727
; CHECK-NEXT: [[LOG2:%.*]] = call float @llvm.log.f32(float [[LOAD2]])
730728
; CHECK-NEXT: [[LOG3:%.*]] = call float @llvm.log.f32(float [[LOAD3]])
731729
; CHECK-NEXT: [[LOG4:%.*]] = call float @llvm.log.f32(float [[LOAD4]])
732730
; CHECK-NEXT: [[LOG5:%.*]] = call float @llvm.log.f32(float [[LOAD5]])
733731
; CHECK-NEXT: [[LOG6:%.*]] = call float @llvm.log.f32(float [[LOAD6]])
734732
; CHECK-NEXT: [[LOG7:%.*]] = call float @llvm.log.f32(float [[LOAD7]])
735-
; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[LOG]], [[LOG1]]
736-
; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[ADD1]], [[LOG2]]
737-
; CHECK-NEXT: [[ADD3:%.*]] = fadd float [[ADD2]], [[LOG3]]
733+
; CHECK-NEXT: [[LOG8:%.*]] = call float @llvm.log.f32(float [[LOAD8]])
734+
; CHECK-NEXT: [[LOG9:%.*]] = call float @llvm.log.f32(float [[LOAD9]])
735+
; CHECK-NEXT: [[ADD3:%.*]] = fadd float [[LOG2]], [[LOG3]]
738736
; CHECK-NEXT: [[ADD4:%.*]] = fadd float [[ADD3]], [[LOG4]]
739737
; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ADD4]], [[LOG5]]
740738
; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[ADD5]], [[LOG6]]
741-
; CHECK-NEXT: [[ADD7:%.*]] = fadd float [[ADD6]], [[LOG7]]
739+
; CHECK-NEXT: [[ADD8:%.*]] = fadd float [[ADD6]], [[LOG7]]
740+
; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[ADD8]], [[LOG8]]
741+
; CHECK-NEXT: [[ADD7:%.*]] = fadd float [[ADD9]], [[LOG9]]
742742
; CHECK-NEXT: ret float [[ADD7]]
743743
;
744744
entry:

llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll

+4-4
Original file line numberDiff line numberDiff line change
@@ -358,12 +358,12 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
358358
; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <4 x i16> [[ARG1]], i64 1
359359
; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
360360
; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
361-
; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]])
362-
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
361+
; GFX8-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]])
362+
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
363363
; GFX8-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0
364-
; GFX8-NEXT: [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
364+
; GFX8-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
365365
; GFX8-NEXT: [[TMP2:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
366-
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[INS_1]], <4 x i16> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
366+
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
367367
; GFX8-NEXT: ret <4 x i16> [[INS_31]]
368368
;
369369
; GFX9-LABEL: @uadd_sat_v4i16(

llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll

+35-27
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
2828
; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
2929
; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
3030
; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
31-
; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1
3231
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
33-
; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
34-
; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
3532
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
3633
; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
37-
; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP6]] to i32
3834
; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
3935
; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
4036
; CHECK-NEXT: [[TMP31:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
@@ -50,7 +46,6 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
5046
; CHECK-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]]
5147
; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
5248
; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
53-
; CHECK-NEXT: [[CONV9_2:%.*]] = zext i8 [[TMP7]] to i32
5449
; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
5550
; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
5651
; CHECK-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP51]], [[TMP57]]
@@ -64,8 +59,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
6459
; CHECK-NEXT: [[TMP34:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
6560
; CHECK-NEXT: [[TMP44:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
6661
; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[TMP34]], i32 0
67-
; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP34]], i32 1
68-
; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP45]], [[TMP43]]
62+
; CHECK-NEXT: [[CONV_2:%.*]] = extractelement <2 x i32> [[TMP34]], i32 1
63+
; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[CONV_2]], [[TMP43]]
6964
; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0
7065
; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1
7166
; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP47]], [[TMP46]]
@@ -120,15 +115,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
120115
; CHECK-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP78]], [[TMP80]]
121116
; CHECK-NEXT: [[ADD95:%.*]] = add i32 [[ADD94]], [[ADD48_2]]
122117
; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[ADD48_2]], [[ADD94]]
123-
; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP77]], 15
124-
; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
125-
; CHECK-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
126-
; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP45]], 15
127-
; CHECK-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537
128-
; CHECK-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
129-
; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
130-
; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]]
131-
; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[CONV9_2]], 15
118+
; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP77]], 15
132119
; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
133120
; CHECK-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
134121
; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[CONV_2]], 15
@@ -244,32 +231,53 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
244231
; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD95]]
245232
; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB86_3]], [[SUB86]]
246233
; CHECK-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB86_3]]
247-
; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
234+
; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I_1]], [[ADD103]]
248235
; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP77]]
249-
; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
250-
; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP45]]
236+
; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51_1]], [[ADD105]]
237+
; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV_2]]
251238
; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
252239
; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP160]]
253240
; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
254241
; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP127]]
255242
; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
256243
; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
257244
; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
258-
; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[ADD55_1]], [[ADD55]]
259-
; CHECK-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD55_1]]
260-
; CHECK-NEXT: [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]]
245+
; CHECK-NEXT: [[TMP169:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
246+
; CHECK-NEXT: [[TMP181:%.*]] = zext <2 x i8> [[TMP169]] to <2 x i32>
247+
; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_2]], i32 0
248+
; CHECK-NEXT: [[TMP182:%.*]] = shufflevector <2 x i32> [[TMP152]], <2 x i32> poison, <2 x i32> zeroinitializer
249+
; CHECK-NEXT: [[TMP183:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_3]], i32 0
250+
; CHECK-NEXT: [[TMP184:%.*]] = shufflevector <2 x i32> [[TMP183]], <2 x i32> poison, <2 x i32> zeroinitializer
251+
; CHECK-NEXT: [[TMP191:%.*]] = sub <2 x i32> [[TMP182]], [[TMP184]]
252+
; CHECK-NEXT: [[TMP192:%.*]] = add <2 x i32> [[TMP182]], [[TMP184]]
253+
; CHECK-NEXT: [[TMP194:%.*]] = shufflevector <2 x i32> [[TMP191]], <2 x i32> [[TMP192]], <2 x i32> <i32 0, i32 3>
254+
; CHECK-NEXT: [[TMP195:%.*]] = lshr <2 x i32> [[TMP181]], splat (i32 15)
255+
; CHECK-NEXT: [[TMP196:%.*]] = and <2 x i32> [[TMP195]], splat (i32 65537)
256+
; CHECK-NEXT: [[TMP198:%.*]] = mul <2 x i32> [[TMP196]], splat (i32 65535)
257+
; CHECK-NEXT: [[TMP202:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55]], i32 0
258+
; CHECK-NEXT: [[TMP203:%.*]] = shufflevector <2 x i32> [[TMP202]], <2 x i32> poison, <2 x i32> zeroinitializer
259+
; CHECK-NEXT: [[TMP205:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_1]], i32 0
260+
; CHECK-NEXT: [[TMP206:%.*]] = shufflevector <2 x i32> [[TMP205]], <2 x i32> poison, <2 x i32> zeroinitializer
261+
; CHECK-NEXT: [[TMP207:%.*]] = sub <2 x i32> [[TMP203]], [[TMP206]]
262+
; CHECK-NEXT: [[TMP210:%.*]] = add <2 x i32> [[TMP203]], [[TMP206]]
263+
; CHECK-NEXT: [[TMP168:%.*]] = shufflevector <2 x i32> [[TMP207]], <2 x i32> [[TMP210]], <2 x i32> <i32 0, i32 3>
264+
; CHECK-NEXT: [[ADD94_1:%.*]] = extractelement <2 x i32> [[TMP194]], i32 1
265+
; CHECK-NEXT: [[ADD78_1:%.*]] = extractelement <2 x i32> [[TMP168]], i32 1
261266
; CHECK-NEXT: [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]]
262-
; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]]
267+
; CHECK-NEXT: [[TMP220:%.*]] = add <2 x i32> [[TMP194]], [[TMP168]]
268+
; CHECK-NEXT: [[SUB102_1:%.*]] = extractelement <2 x i32> [[TMP194]], i32 0
269+
; CHECK-NEXT: [[SUB86_1:%.*]] = extractelement <2 x i32> [[TMP168]], i32 0
270+
; CHECK-NEXT: [[TMP174:%.*]] = shufflevector <2 x i32> [[TMP168]], <2 x i32> [[TMP194]], <2 x i32> <i32 0, i32 2>
263271
; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]]
264-
; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]]
265-
; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[CONV9_2]]
266-
; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]]
267-
; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[CONV_2]]
272+
; CHECK-NEXT: [[TMP175:%.*]] = add <2 x i32> [[TMP198]], [[TMP220]]
273+
; CHECK-NEXT: [[TMP221:%.*]] = xor <2 x i32> [[TMP175]], [[TMP181]]
268274
; CHECK-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]]
269275
; CHECK-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP162]]
270276
; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
271277
; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP129]]
278+
; CHECK-NEXT: [[XOR_I53_1:%.*]] = extractelement <2 x i32> [[TMP221]], i32 0
272279
; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD105_3]]
280+
; CHECK-NEXT: [[XOR_I_1:%.*]] = extractelement <2 x i32> [[TMP221]], i32 1
273281
; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
274282
; CHECK-NEXT: [[ADD112_5:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
275283
; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_5]], [[XOR_I63_1]]

0 commit comments

Comments
 (0)