Commit bbfdb8c
[CostModel][X86] Add scalar rotate-by-immediate costs
As noted on #63980, rotating by an immediate amount is much cheaper than rotating by a variable amount. This still needs to be extended to the vector rotate cases, and we need to add reasonable funnel-shift costs as well (very tricky, as there's a huge range in CPU behaviour for these).
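
To make the distinction concrete, here is a minimal sketch of querying the two costs through TargetTransformInfo. It is not part of this commit: printRotateCosts is a hypothetical helper, and the poison values simply stand in for real SSA operands.

// Sketch only: assumes the in-tree TargetTransformInfo API; printRotateCosts
// is a hypothetical helper, not code from this commit.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

void printRotateCosts(const TargetTransformInfo &TTI, LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  // Poison placeholders stand in for arbitrary SSA operands.
  Value *X = PoisonValue::get(I32);
  Value *VarAmt = PoisonValue::get(I32);
  Value *ImmAmt = ConstantInt::get(I32, 7); // rotate-by-immediate amount

  // fshl(x, x, amt) == rotl(x, amt). Passing the argument list means
  // ICA.isTypeBasedOnly() is false, so the X86 cost model can inspect the
  // amount operand and select the cheaper X86ISD::VROTLI table entry.
  IntrinsicCostAttributes Imm(Intrinsic::fshl, I32, {X, X, ImmAmt});
  IntrinsicCostAttributes Var(Intrinsic::fshl, I32, {X, X, VarAmt});

  outs() << "rotl by immediate: "
         << TTI.getIntrinsicInstrCost(Imm, TargetTransformInfo::TCK_Latency)
         << ", rotl by variable: "
         << TTI.getIntrinsicInstrCost(Var, TargetTransformInfo::TCK_Latency)
         << "\n";
}

Per the i32 table entries in the diff below, at TCK_Latency this should report 1 for the immediate form (X86ISD::VROTLI) versus 3 for the variable form (ISD::ROTL).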

File tree: 9 files changed (+500, -468 lines)


llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 16 additions & 2 deletions
@@ -3945,6 +3945,7 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   { ISD::CTPOP,      MVT::i64, { 10,  6, 19, 19 } },
   { ISD::ROTL,       MVT::i64, {  2,  3,  1,  3 } },
   { ISD::ROTR,       MVT::i64, {  2,  3,  1,  3 } },
+  { X86ISD::VROTLI,  MVT::i64, {  1,  1,  1,  1 } },
   { ISD::FSHL,       MVT::i64, {  4,  4,  1,  4 } },
   { ISD::SMAX,       MVT::i64, {  1,  3,  2,  3 } },
   { ISD::SMIN,       MVT::i64, {  1,  3,  2,  3 } },
@@ -3984,6 +3985,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   { ISD::ROTR,       MVT::i32, {  2,  3,  1,  3 } },
   { ISD::ROTR,       MVT::i16, {  2,  3,  1,  3 } },
   { ISD::ROTR,       MVT::i8,  {  2,  3,  1,  3 } },
+  { X86ISD::VROTLI,  MVT::i32, {  1,  1,  1,  1 } },
+  { X86ISD::VROTLI,  MVT::i16, {  1,  1,  1,  1 } },
+  { X86ISD::VROTLI,  MVT::i8,  {  1,  1,  1,  1 } },
   { ISD::FSHL,       MVT::i32, {  4,  4,  1,  4 } },
   { ISD::FSHL,       MVT::i16, {  4,  4,  2,  5 } },
   { ISD::FSHL,       MVT::i8,  {  4,  4,  2,  5 } },
@@ -4039,17 +4043,27 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     ISD = ISD::FSHL;
     if (!ICA.isTypeBasedOnly()) {
       const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
-      if (Args[0] == Args[1])
+      if (Args[0] == Args[1]) {
         ISD = ISD::ROTL;
+        // Handle scalar constant rotation amounts.
+        // TODO: Handle vector + funnel-shift cases.
+        if (isa_and_nonnull<ConstantInt>(Args[2]))
+          ISD = X86ISD::VROTLI;
+      }
     }
     break;
   case Intrinsic::fshr:
     // FSHR has same costs so don't duplicate.
     ISD = ISD::FSHL;
     if (!ICA.isTypeBasedOnly()) {
       const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
-      if (Args[0] == Args[1])
+      if (Args[0] == Args[1]) {
+        // Handle scalar constant rotation amount.
+        // TODO: Handle vector + funnel-shift cases.
         ISD = ISD::ROTR;
+        if (isa_and_nonnull<ConstantInt>(Args[2]))
+          ISD = X86ISD::VROTLI;
+      }
     }
     break;
   case Intrinsic::maxnum:
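
Restated outside the diff: the cheap cost applies only when the funnel shift is a genuine rotate (both value operands are the same SSA value) and the amount is a scalar ConstantInt. A standalone predicate capturing the condition might look like the following sketch (isRotateByImmediate is hypothetical, not an existing LLVM helper):

#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

// Hypothetical restatement of the check added above, for illustration only.
static bool isRotateByImmediate(const IntrinsicInst &II) {
  Intrinsic::ID IID = II.getIntrinsicID();
  if (IID != Intrinsic::fshl && IID != Intrinsic::fshr)
    return false;
  // fsh[lr](x, x, amt) is rot[lr](x, amt): a rotate, not a funnel shift.
  if (II.getArgOperand(0) != II.getArgOperand(1))
    return false;
  // ConstantInt matches only scalar immediates; a splatted vector amount is a
  // ConstantVector/ConstantDataVector, hence the vector TODO above.
  return isa<ConstantInt>(II.getArgOperand(2));
}

Because only scalar immediates hit the new path, the SLPVectorizer tests below change behaviour: two scalar rotates by an immediate now cost no more than the vectorized fshl, so vectorization no longer looks profitable.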

llvm/test/Analysis/CostModel/X86/fshl-latency.ll
Lines changed: 71 additions & 71 deletions (large diff not rendered by default)

llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll
Lines changed: 83 additions & 83 deletions (large diff not rendered by default)

llvm/test/Analysis/CostModel/X86/fshl.ll
Lines changed: 70 additions & 70 deletions (large diff not rendered by default)

llvm/test/Analysis/CostModel/X86/fshr-latency.ll
Lines changed: 71 additions & 71 deletions (large diff not rendered by default)

llvm/test/Analysis/CostModel/X86/fshr-sizelatency.ll
Lines changed: 83 additions & 83 deletions (large diff not rendered by default)

llvm/test/Analysis/CostModel/X86/fshr.ll
Lines changed: 70 additions & 70 deletions (large diff not rendered by default)

llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll

Lines changed: 18 additions & 9 deletions
@@ -826,21 +826,30 @@ define void @fshl_v2i32() {
 ; PR63980
 define void @fshl_v2i32_uniformconst() {
 ; SSE-LABEL: @fshl_v2i32_uniformconst(
-; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
-; SSE-NEXT:    [[TMP2:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>)
-; SSE-NEXT:    store <2 x i32> [[TMP2]], ptr @d32, align 4
+; SSE-NEXT:    [[A0:%.*]] = load i32, ptr @a32, align 4
+; SSE-NEXT:    [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1)
+; SSE-NEXT:    [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1)
+; SSE-NEXT:    store i32 [[R0]], ptr @d32, align 4
+; SSE-NEXT:    store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @fshl_v2i32_uniformconst(
-; AVX-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
-; AVX-NEXT:    [[TMP2:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>)
-; AVX-NEXT:    store <2 x i32> [[TMP2]], ptr @d32, align 4
+; AVX-NEXT:    [[A0:%.*]] = load i32, ptr @a32, align 4
+; AVX-NEXT:    [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
+; AVX-NEXT:    [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1)
+; AVX-NEXT:    [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1)
+; AVX-NEXT:    store i32 [[R0]], ptr @d32, align 4
+; AVX-NEXT:    store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
 ; AVX-NEXT:    ret void
 ;
 ; AVX512-LABEL: @fshl_v2i32_uniformconst(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
-; AVX512-NEXT:    [[TMP2:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>)
-; AVX512-NEXT:    store <2 x i32> [[TMP2]], ptr @d32, align 4
+; AVX512-NEXT:    [[A0:%.*]] = load i32, ptr @a32, align 4
+; AVX512-NEXT:    [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
+; AVX512-NEXT:    [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1)
+; AVX512-NEXT:    [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1)
+; AVX512-NEXT:    store i32 [[R0]], ptr @d32, align 4
+; AVX512-NEXT:    store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
 ; AVX512-NEXT:    ret void
 ;
   %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4

llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll

Lines changed: 18 additions & 9 deletions
@@ -826,21 +826,30 @@ define void @fshr_v2i32() {
 ; PR63980
 define void @fshr_v2i32_uniformconst() {
 ; SSE-LABEL: @fshr_v2i32_uniformconst(
-; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
-; SSE-NEXT:    [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>)
-; SSE-NEXT:    store <2 x i32> [[TMP2]], ptr @d32, align 4
+; SSE-NEXT:    [[A0:%.*]] = load i32, ptr @a32, align 4
+; SSE-NEXT:    [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1)
+; SSE-NEXT:    [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1)
+; SSE-NEXT:    store i32 [[R0]], ptr @d32, align 4
+; SSE-NEXT:    store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @fshr_v2i32_uniformconst(
-; AVX-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
-; AVX-NEXT:    [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>)
-; AVX-NEXT:    store <2 x i32> [[TMP2]], ptr @d32, align 4
+; AVX-NEXT:    [[A0:%.*]] = load i32, ptr @a32, align 4
+; AVX-NEXT:    [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
+; AVX-NEXT:    [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1)
+; AVX-NEXT:    [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1)
+; AVX-NEXT:    store i32 [[R0]], ptr @d32, align 4
+; AVX-NEXT:    store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
 ; AVX-NEXT:    ret void
 ;
 ; AVX512-LABEL: @fshr_v2i32_uniformconst(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
-; AVX512-NEXT:    [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>)
-; AVX512-NEXT:    store <2 x i32> [[TMP2]], ptr @d32, align 4
+; AVX512-NEXT:    [[A0:%.*]] = load i32, ptr @a32, align 4
+; AVX512-NEXT:    [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
+; AVX512-NEXT:    [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1)
+; AVX512-NEXT:    [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1)
+; AVX512-NEXT:    store i32 [[R0]], ptr @d32, align 4
+; AVX512-NEXT:    store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
 ; AVX512-NEXT:    ret void
 ;
   %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
