Skip to content

Commit a612524

Browse files
[SLP] Fix the cost of casting the reduction result to the final type.
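The way the cost of the final cast is calculated needs to be fixed; otherwise, the wrong cast opcode can be selected, leading to an over-optimistic vector cost. In addition, the reduction type size (ReductionBitWidth, when it is non-zero) must be taken into account as the source size of the cast instead of the minimal bitwidth of the root node. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #87528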
1 parent 298f8f7 commit a612524

File tree

4 files changed

+35
-18
lines changed

4 files changed

+35
-18
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9824,11 +9824,13 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
98249824
if (BWIt != MinBWs.end()) {
98259825
Type *DstTy = Root.Scalars.front()->getType();
98269826
unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
9827-
if (OriginalSz != BWIt->second.first) {
9827+
unsigned SrcSz =
9828+
ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
9829+
if (OriginalSz != SrcSz) {
98289830
unsigned Opcode = Instruction::Trunc;
9829-
if (OriginalSz < BWIt->second.first)
9831+
if (OriginalSz > SrcSz)
98309832
Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
9831-
Type *SrcTy = IntegerType::get(DstTy->getContext(), BWIt->second.first);
9833+
Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
98329834
Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
98339835
TTI::CastContextHint::None,
98349836
TTI::TCK_RecipThroughput);

llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -801,10 +801,20 @@ entry:
801801
define i64 @red_zext_ld_4xi64(ptr %ptr) {
802802
; CHECK-LABEL: @red_zext_ld_4xi64(
803803
; CHECK-NEXT: entry:
804-
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[PTR:%.*]], align 1
805-
; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16>
806-
; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
807-
; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i64
804+
; CHECK-NEXT: [[LD0:%.*]] = load i8, ptr [[PTR:%.*]], align 1
805+
; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[LD0]] to i64
806+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1
807+
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[GEP]], align 1
808+
; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i8 [[LD1]] to i64
809+
; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i64 [[ZEXT]], [[ZEXT_1]]
810+
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 2
811+
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[GEP_1]], align 1
812+
; CHECK-NEXT: [[ZEXT_2:%.*]] = zext i8 [[LD2]] to i64
813+
; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[ADD_1]], [[ZEXT_2]]
814+
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 3
815+
; CHECK-NEXT: [[LD3:%.*]] = load i8, ptr [[GEP_2]], align 1
816+
; CHECK-NEXT: [[ZEXT_3:%.*]] = zext i8 [[LD3]] to i64
817+
; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[ADD_2]], [[ZEXT_3]]
808818
; CHECK-NEXT: ret i64 [[TMP3]]
809819
;
810820
entry:

llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,22 @@ define i32 @test() {
55
; CHECK-LABEL: define i32 @test() {
66
; CHECK-NEXT: entry:
77
; CHECK-NEXT: [[A_PROMOTED:%.*]] = load i8, ptr null, align 1
8-
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i8> poison, i8 [[A_PROMOTED]], i32 0
9-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <4 x i32> zeroinitializer
10-
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], zeroinitializer
11-
; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i8> [[TMP1]], zeroinitializer
12-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
13-
; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i16>
14-
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i16> [[TMP5]], <i16 -1, i16 0, i16 0, i16 0>
15-
; CHECK-NEXT: [[TMP7:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP6]])
16-
; CHECK-NEXT: [[TMP8:%.*]] = zext i16 [[TMP7]] to i32
8+
; CHECK-NEXT: [[DEC_4:%.*]] = add i8 [[A_PROMOTED]], 0
9+
; CHECK-NEXT: [[CONV_I_4:%.*]] = zext i8 [[DEC_4]] to i32
10+
; CHECK-NEXT: [[SUB_I_4:%.*]] = add nuw nsw i32 [[CONV_I_4]], 0
11+
; CHECK-NEXT: [[DEC_5:%.*]] = add i8 [[A_PROMOTED]], 0
12+
; CHECK-NEXT: [[CONV_I_5:%.*]] = zext i8 [[DEC_5]] to i32
13+
; CHECK-NEXT: [[SUB_I_5:%.*]] = add nuw nsw i32 [[CONV_I_5]], 65535
14+
; CHECK-NEXT: [[TMP0:%.*]] = or i32 [[SUB_I_4]], [[SUB_I_5]]
15+
; CHECK-NEXT: [[DEC_6:%.*]] = or i8 [[A_PROMOTED]], 0
16+
; CHECK-NEXT: [[CONV_I_6:%.*]] = zext i8 [[DEC_6]] to i32
17+
; CHECK-NEXT: [[SUB_I_6:%.*]] = add nuw nsw i32 [[CONV_I_6]], 0
18+
; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[TMP0]], [[SUB_I_6]]
19+
; CHECK-NEXT: [[TMP10:%.*]] = or i8 [[A_PROMOTED]], 0
20+
; CHECK-NEXT: [[CONV_I_7:%.*]] = zext i8 [[TMP10]] to i32
21+
; CHECK-NEXT: [[SUB_I_7:%.*]] = add nuw nsw i32 [[CONV_I_7]], 0
22+
; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[TMP1]], [[SUB_I_7]]
1723
; CHECK-NEXT: [[TMP9:%.*]] = and i32 [[TMP8]], 65535
18-
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
1924
; CHECK-NEXT: store i8 [[TMP10]], ptr null, align 1
2025
; CHECK-NEXT: [[CALL3:%.*]] = tail call i32 (ptr, ...) null(ptr null, i32 [[TMP9]])
2126
; CHECK-NEXT: ret i32 0

llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2-
; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-6 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
2+
; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-7 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
33

44
define void @test(i64 %d.promoted.i) {
55
; CHECK-LABEL: define void @test(

0 commit comments

Comments
 (0)