From 57bed71bb190a41dd1a22017fa85a754f5b50273 Mon Sep 17 00:00:00 2001 From: hanbeom Date: Wed, 6 Nov 2024 21:55:46 +0900 Subject: [PATCH 01/12] Add tests for combine extract/insert between vectors of different lengths --- .../VectorCombine/X86/extract-fneg-insert.ll | 134 ++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll index df5fcdb7beb65..ec78e7bf9abbe 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll @@ -18,6 +18,19 @@ define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) { ret <4 x float> %r } +define <4 x float> @ext0_v2f32v4f32(<2 x float> %x, <4 x float> %y) { +; CHECK-LABEL: @ext0_v2f32v4f32( +; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0 +; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 0 +; CHECK-NEXT: ret <4 x float> [[R]] +; + %e = extractelement <2 x float> %x, i32 0 + %n = fneg float %e + %r = insertelement <4 x float> %y, float %n, i32 0 + ret <4 x float> %r +} + ; Eliminating extract/insert is profitable. define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) { @@ -32,6 +45,19 @@ define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) { ret <4 x float> %r } +define <4 x float> @ext2_v2f32v4f32(<2 x float> %x, <4 x float> %y) { +; CHECK-LABEL: @ext2_v2f32v4f32( +; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 2 +; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 2 +; CHECK-NEXT: ret <4 x float> [[R]] +; + %e = extractelement <2 x float> %x, i32 2 + %n = fneg float %e + %r = insertelement <4 x float> %y, float %n, i32 2 + ret <4 x float> %r +} + ; Eliminating extract/insert is still profitable. 
Flags propagate. define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) { @@ -46,6 +72,19 @@ define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) { ret <2 x double> %r } +define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) { +; CHECK-LABEL: @ext1_v2f64v4f64( +; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1 +; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]] +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1 +; CHECK-NEXT: ret <4 x double> [[R]] +; + %e = extractelement <2 x double> %x, i32 1 + %n = fneg nsz double %e + %r = insertelement <4 x double> %y, double %n, i32 1 + ret <4 x double> %r +} + ; The vector fneg would cost twice as much as the scalar op with SSE, ; so we don't transform there (the shuffle would also be more expensive). @@ -67,6 +106,19 @@ define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) { ret <8 x float> %r } +define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) { +; CHECK-LABEL: @ext7_v4f32v8f32( +; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 +; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7 +; CHECK-NEXT: ret <8 x float> [[R]] +; + %e = extractelement <4 x float> %x, i32 3 + %n = fneg float %e + %r = insertelement <8 x float> %y, float %n, i32 7 + ret <8 x float> %r +} + ; Same as above with an extra use of the extracted element. 
define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) { @@ -91,6 +143,21 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) { ret <8 x float> %r } +define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) { +; CHECK-LABEL: @ext7_v4f32v8f32_use1( +; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 +; CHECK-NEXT: call void @use(float [[E]]) +; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3 +; CHECK-NEXT: ret <8 x float> [[R]] +; + %e = extractelement <4 x float> %x, i32 3 + call void @use(float %e) + %n = fneg float %e + %r = insertelement <8 x float> %y, float %n, i32 3 + ret <8 x float> %r +} + ; Negative test - the transform is likely not profitable if the fneg has another use. define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) { @@ -108,6 +175,21 @@ define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) { ret <8 x float> %r } +define <8 x float> @ext7_v4f32v8f32_use2(<4 x float> %x, <8 x float> %y) { +; CHECK-LABEL: @ext7_v4f32v8f32_use2( +; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 +; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] +; CHECK-NEXT: call void @use(float [[N]]) +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3 +; CHECK-NEXT: ret <8 x float> [[R]] +; + %e = extractelement <4 x float> %x, i32 3 + %n = fneg float %e + call void @use(float %n) + %r = insertelement <8 x float> %y, float %n, i32 3 + ret <8 x float> %r +} + ; Negative test - can't convert variable index to a shuffle. 
define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %index) { @@ -123,6 +205,19 @@ define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 % ret <2 x double> %r } +define <4 x double> @ext_index_var_v2f64v4f64(<2 x double> %x, <4 x double> %y, i32 %index) { +; CHECK-LABEL: @ext_index_var_v2f64v4f64( +; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 [[INDEX:%.*]] +; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]] +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 [[INDEX]] +; CHECK-NEXT: ret <4 x double> [[R]] +; + %e = extractelement <2 x double> %x, i32 %index + %n = fneg nsz double %e + %r = insertelement <4 x double> %y, double %n, i32 %index + ret <4 x double> %r +} + ; Negative test - require same extract/insert index for simple shuffle. ; TODO: We could handle this by adjusting the cost calculation. @@ -139,6 +234,19 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) { ret <2 x double> %r } +define <4 x double> @ext1_v2f64v4f64_ins0(<2 x double> %x, <4 x double> %y) { +; CHECK-LABEL: @ext1_v2f64v4f64_ins0( +; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1 +; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]] +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0 +; CHECK-NEXT: ret <4 x double> [[R]] +; + %e = extractelement <2 x double> %x, i32 1 + %n = fneg nsz double %e + %r = insertelement <4 x double> %y, double %n, i32 0 + ret <4 x double> %r +} + ; Negative test - avoid changing poison ops define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) { @@ -154,6 +262,19 @@ define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) { ret <4 x float> %r } +define <4 x float> @ext12_v2f32v4f32(<2 x float> %x, <4 x float> %y) { +; CHECK-LABEL: @ext12_v2f32v4f32( +; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 6 +; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] +; 
CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 12 +; CHECK-NEXT: ret <4 x float> [[R]] +; + %e = extractelement <2 x float> %x, i32 6 + %n = fneg float %e + %r = insertelement <4 x float> %y, float %n, i32 12 + ret <4 x float> %r +} + ; This used to crash because we assumed matching a true, unary fneg instruction. define <2 x float> @ext1_v2f32_fsub(<2 x float> %x) { @@ -181,3 +302,16 @@ define <2 x float> @ext1_v2f32_fsub_fmf(<2 x float> %x, <2 x float> %y) { %r = insertelement <2 x float> %y, float %s, i32 1 ret <2 x float> %r } + +define <4 x float> @ext1_v2f32v4f32_fsub_fmf(<2 x float> %x, <4 x float> %y) { +; CHECK-LABEL: @ext1_v2f32v4f32_fsub_fmf( +; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1 +; CHECK-NEXT: [[S:%.*]] = fsub nnan nsz float 0.000000e+00, [[E]] +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[S]], i32 1 +; CHECK-NEXT: ret <4 x float> [[R]] +; + %e = extractelement <2 x float> %x, i32 1 + %s = fsub nsz nnan float 0.0, %e + %r = insertelement <4 x float> %y, float %s, i32 1 + ret <4 x float> %r +} From 8c69f209a8cb6af08da65b1d298f1906239484e4 Mon Sep 17 00:00:00 2001 From: hanbeom Date: Thu, 7 Nov 2024 02:32:23 +0900 Subject: [PATCH 02/12] [VectorCombine] Combine scalar fneg with insert/extract to vector fneg when length is different insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -> shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask The original combine left combining between vectors of different lengths as a TODO. This commit implements that. 
(see #[baab4aa]) --- .../Transforms/Vectorize/VectorCombine.cpp | 32 ++++++++++--- .../VectorCombine/X86/extract-fneg-insert.ll | 48 +++++++++++++------ 2 files changed, 58 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index ebbd05e6d47af..3abdf6c0844a5 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -665,9 +665,9 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index)))))) return false; - // TODO: We could handle this with a length-changing shuffle. auto *VecTy = cast(I.getType()); - if (SrcVec->getType() != VecTy) + auto *SrcVecTy = cast(SrcVec->getType()); + if (SrcVecTy->getScalarType() != VecTy->getScalarType()) return false; // Ignore bogus insert/extract index. @@ -682,7 +682,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { std::iota(Mask.begin(), Mask.end(), 0); Mask[Index] = Index + NumElts; - Type *ScalarTy = VecTy->getScalarType(); + Type *ScalarTy = SrcVecTy->getScalarType(); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost OldCost = TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) + TTI.getVectorInstrCost(I, VecTy, CostKind, Index); @@ -697,14 +698,31 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) + TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, CostKind); + bool NeedLenChg = SrcVecTy->getNumElements() != NumElts; + // If the lengths of the two vectors are not equal, + // we need to add a length-change vector. Add this cost. 
+ if (NeedLenChg) + NewCost += + TTI.getShuffleCost(TargetTransformInfo::SK_Select, SrcVecTy, Mask); + if (NewCost > OldCost) return false; - // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index --> - // shuffle DestVec, (fneg SrcVec), Mask + Value *NewShuf; + // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg); - Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask); - replaceValue(I, *Shuf); + if (NeedLenChg) { + // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask + SmallVector SrcMask(NumElts, PoisonMaskElem); + SrcMask[Index] = Index; + Value *LenChgShuf = Builder.CreateShuffleVector( + SrcVec, PoisonValue::get(SrcVecTy), SrcMask); + NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask); + } else + // shuffle DestVec, (fneg SrcVec), Mask + NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask); + + replaceValue(I, *NewShuf); return true; } diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll index ec78e7bf9abbe..05aad1b4ba79d 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll @@ -46,11 +46,17 @@ define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) { } define <4 x float> @ext2_v2f32v4f32(<2 x float> %x, <4 x float> %y) { -; CHECK-LABEL: @ext2_v2f32v4f32( -; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 2 -; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 2 -; CHECK-NEXT: ret <4 x float> [[R]] +; SSE-LABEL: @ext2_v2f32v4f32( +; SSE-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 2 +; SSE-NEXT: [[N:%.*]] = fneg float [[E]] +; SSE-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 2 +; SSE-NEXT: ret <4 x float> [[R]] +; +; AVX-LABEL: 
@ext2_v2f32v4f32( +; AVX-NEXT: [[TMP1:%.*]] = fneg <2 x float> [[X:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[X]], <2 x float> poison, <4 x i32> +; AVX-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> +; AVX-NEXT: ret <4 x float> [[R]] ; %e = extractelement <2 x float> %x, i32 2 %n = fneg float %e @@ -73,11 +79,17 @@ define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) { } define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) { -; CHECK-LABEL: @ext1_v2f64v4f64( -; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1 -; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]] -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1 -; CHECK-NEXT: ret <4 x double> [[R]] +; SSE-LABEL: @ext1_v2f64v4f64( +; SSE-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1 +; SSE-NEXT: [[N:%.*]] = fneg nsz double [[E]] +; SSE-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1 +; SSE-NEXT: ret <4 x double> [[R]] +; +; AVX-LABEL: @ext1_v2f64v4f64( +; AVX-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <4 x i32> +; AVX-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> +; AVX-NEXT: ret <4 x double> [[R]] ; %e = extractelement <2 x double> %x, i32 1 %n = fneg nsz double %e @@ -304,11 +316,17 @@ define <2 x float> @ext1_v2f32_fsub_fmf(<2 x float> %x, <2 x float> %y) { } define <4 x float> @ext1_v2f32v4f32_fsub_fmf(<2 x float> %x, <4 x float> %y) { -; CHECK-LABEL: @ext1_v2f32v4f32_fsub_fmf( -; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1 -; CHECK-NEXT: [[S:%.*]] = fsub nnan nsz float 0.000000e+00, [[E]] -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[S]], i32 1 -; CHECK-NEXT: ret <4 x float> [[R]] +; SSE-LABEL: @ext1_v2f32v4f32_fsub_fmf( +; SSE-NEXT: [[E:%.*]] = 
extractelement <2 x float> [[X:%.*]], i32 1 +; SSE-NEXT: [[S:%.*]] = fsub nnan nsz float 0.000000e+00, [[E]] +; SSE-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[S]], i32 1 +; SSE-NEXT: ret <4 x float> [[R]] +; +; AVX-LABEL: @ext1_v2f32v4f32_fsub_fmf( +; AVX-NEXT: [[TMP1:%.*]] = fneg nnan nsz <2 x float> [[X:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[X]], <2 x float> poison, <4 x i32> +; AVX-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> +; AVX-NEXT: ret <4 x float> [[R]] ; %e = extractelement <2 x float> %x, i32 1 %s = fsub nsz nnan float 0.0, %e From 33275721b0bcad27fe650b59636a4b19d83df77c Mon Sep 17 00:00:00 2001 From: hanbeom Date: Sat, 16 Nov 2024 15:46:08 +0900 Subject: [PATCH 03/12] fix mis-matched braces --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 3abdf6c0844a5..ac98d35c1224c 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -718,9 +718,10 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { Value *LenChgShuf = Builder.CreateShuffleVector( SrcVec, PoisonValue::get(SrcVecTy), SrcMask); NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask); - } else + } else { // shuffle DestVec, (fneg SrcVec), Mask NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask); + } replaceValue(I, *NewShuf); return true; From 66750f65018b00cdde8bc6fe842ba59d7c1af26b Mon Sep 17 00:00:00 2001 From: hanbeom Date: Wed, 11 Dec 2024 06:42:04 +0900 Subject: [PATCH 04/12] remove costkind --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index ac98d35c1224c..fc133cffd750f 100644 --- 
a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -683,7 +683,6 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { Mask[Index] = Index + NumElts; Type *ScalarTy = SrcVecTy->getScalarType(); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost OldCost = TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) + TTI.getVectorInstrCost(I, VecTy, CostKind, Index); From 5c694ff170a7242e81a0e17854fdbb0245766159 Mon Sep 17 00:00:00 2001 From: hanbeom Date: Wed, 11 Dec 2024 06:42:59 +0900 Subject: [PATCH 05/12] move ScalarTy to reduce duplicate --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index fc133cffd750f..d26ce6c85dc46 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -667,7 +667,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { auto *VecTy = cast(I.getType()); auto *SrcVecTy = cast(SrcVec->getType()); - if (SrcVecTy->getScalarType() != VecTy->getScalarType()) + auto *ScalarTy = SrcVecTy->getScalarType(); + if (ScalarTy != VecTy->getScalarType()) return false; // Ignore bogus insert/extract index. 
@@ -681,8 +682,6 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { SmallVector Mask(NumElts); std::iota(Mask.begin(), Mask.end(), 0); Mask[Index] = Index + NumElts; - - Type *ScalarTy = SrcVecTy->getScalarType(); InstructionCost OldCost = TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) + TTI.getVectorInstrCost(I, VecTy, CostKind, Index); From 78fd2c42b90fca704ef4df63f5645d0b7d7b6bca Mon Sep 17 00:00:00 2001 From: hanbeom Date: Wed, 11 Dec 2024 06:44:21 +0900 Subject: [PATCH 06/12] more accurate shufflecost with PermuteSingleSrc --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index d26ce6c85dc46..5bf382ab50bc5 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -699,9 +699,12 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { bool NeedLenChg = SrcVecTy->getNumElements() != NumElts; // If the lengths of the two vectors are not equal, // we need to add a length-change vector. Add this cost. 
- if (NeedLenChg) - NewCost += - TTI.getShuffleCost(TargetTransformInfo::SK_Select, SrcVecTy, Mask); + SmallVector SrcMask; + if (NeedLenChg) { + SrcMask.assign(NumElts, PoisonMaskElem); + NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + SrcVecTy, SrcMask, CostKind); + } if (NewCost > OldCost) return false; @@ -711,7 +714,6 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg); if (NeedLenChg) { // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask - SmallVector SrcMask(NumElts, PoisonMaskElem); SrcMask[Index] = Index; Value *LenChgShuf = Builder.CreateShuffleVector( SrcVec, PoisonValue::get(SrcVecTy), SrcMask); From f49084407e6080bfaf347a4d38945cdc75dc49e9 Mon Sep 17 00:00:00 2001 From: hanbeom Date: Wed, 11 Dec 2024 06:45:52 +0900 Subject: [PATCH 07/12] updated testcase --- .../VectorCombine/X86/extract-fneg-insert.ll | 32 ++++++------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll index 05aad1b4ba79d..65024af25ced5 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll @@ -46,17 +46,11 @@ define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) { } define <4 x float> @ext2_v2f32v4f32(<2 x float> %x, <4 x float> %y) { -; SSE-LABEL: @ext2_v2f32v4f32( -; SSE-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 2 -; SSE-NEXT: [[N:%.*]] = fneg float [[E]] -; SSE-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 2 -; SSE-NEXT: ret <4 x float> [[R]] -; -; AVX-LABEL: @ext2_v2f32v4f32( -; AVX-NEXT: [[TMP1:%.*]] = fneg <2 x float> [[X:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[X]], <2 x float> poison, <4 x i32> -; AVX-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], 
<4 x i32> -; AVX-NEXT: ret <4 x float> [[R]] +; CHECK-LABEL: @ext2_v2f32v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = fneg <2 x float> [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[X]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[R]] ; %e = extractelement <2 x float> %x, i32 2 %n = fneg float %e @@ -316,17 +310,11 @@ define <2 x float> @ext1_v2f32_fsub_fmf(<2 x float> %x, <2 x float> %y) { } define <4 x float> @ext1_v2f32v4f32_fsub_fmf(<2 x float> %x, <4 x float> %y) { -; SSE-LABEL: @ext1_v2f32v4f32_fsub_fmf( -; SSE-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1 -; SSE-NEXT: [[S:%.*]] = fsub nnan nsz float 0.000000e+00, [[E]] -; SSE-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[S]], i32 1 -; SSE-NEXT: ret <4 x float> [[R]] -; -; AVX-LABEL: @ext1_v2f32v4f32_fsub_fmf( -; AVX-NEXT: [[TMP1:%.*]] = fneg nnan nsz <2 x float> [[X:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[X]], <2 x float> poison, <4 x i32> -; AVX-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> -; AVX-NEXT: ret <4 x float> [[R]] +; CHECK-LABEL: @ext1_v2f32v4f32_fsub_fmf( +; CHECK-NEXT: [[TMP1:%.*]] = fneg nnan nsz <2 x float> [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[X]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[R]] ; %e = extractelement <2 x float> %x, i32 1 %s = fsub nsz nnan float 0.0, %e From d1d1af23e51a5c55e5b9acc8a10e15caddf1bd1f Mon Sep 17 00:00:00 2001 From: hanbeom Date: Thu, 12 Dec 2024 04:40:38 +0900 Subject: [PATCH 08/12] move indexing of SrcMask --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp 
b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 5bf382ab50bc5..7d4121ab5d25d 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -702,6 +702,7 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { SmallVector SrcMask; if (NeedLenChg) { SrcMask.assign(NumElts, PoisonMaskElem); + SrcMask[Index] = Index; NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcVecTy, SrcMask, CostKind); } @@ -714,7 +715,6 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg); if (NeedLenChg) { // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask - SrcMask[Index] = Index; Value *LenChgShuf = Builder.CreateShuffleVector( SrcVec, PoisonValue::get(SrcVecTy), SrcMask); NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask); From 288355865119c92a1ef2f4602d5424656e17a087 Mon Sep 17 00:00:00 2001 From: hanbeom Date: Sat, 14 Dec 2024 01:46:41 +0900 Subject: [PATCH 09/12] add checking SrcVec is FixedVectorType --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 7d4121ab5d25d..eec8a0ccc529e 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -666,9 +666,9 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { return false; auto *VecTy = cast(I.getType()); - auto *SrcVecTy = cast(SrcVec->getType()); + auto *SrcVecTy = dyn_cast(SrcVec->getType()); auto *ScalarTy = SrcVecTy->getScalarType(); - if (ScalarTy != VecTy->getScalarType()) + if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType()) return false; // Ignore bogus insert/extract index. 
From dba637d06c0a9790935b95e42a866e88fb74fc2b Mon Sep 17 00:00:00 2001 From: hanbeom Date: Sat, 14 Dec 2024 02:47:43 +0900 Subject: [PATCH 10/12] add test for extract from an index greater than the vector width of the destination --- .../VectorCombine/X86/extract-fneg-insert.ll | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll index 65024af25ced5..83f94ba46a072 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll @@ -240,6 +240,20 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) { ret <2 x double> %r } +; Negative test - extract from an index greater than the vector width of the destination +define <2 x double> @ext3_v4f64v2f64(<4 x double> %x, <2 x double> %y) { +; CHECK-LABEL: @ext3_v4f64v2f64( +; CHECK-NEXT: [[E:%.*]] = extractelement <4 x double> [[X:%.*]], i32 3 +; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]] +; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 1 +; CHECK-NEXT: ret <2 x double> [[R]] +; + %e = extractelement <4 x double> %x, i32 3 + %n = fneg nsz double %e + %r = insertelement <2 x double> %y, double %n, i32 1 + ret <2 x double> %r +} + define <4 x double> @ext1_v2f64v4f64_ins0(<2 x double> %x, <4 x double> %y) { ; CHECK-LABEL: @ext1_v2f64v4f64_ins0( ; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1 From 014b186f617cd7ba67efc3444ebf48e0f0b6b185 Mon Sep 17 00:00:00 2001 From: hanbeom Date: Mon, 16 Dec 2024 01:51:46 +0900 Subject: [PATCH 11/12] Fix misuse of SrcVecTy instead of VecTy --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index eec8a0ccc529e..16ee97e1c119b 100644 --- 
a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -666,8 +666,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { return false; auto *VecTy = cast(I.getType()); + auto *ScalarTy = VecTy->getScalarType(); auto *SrcVecTy = dyn_cast(SrcVec->getType()); - auto *ScalarTy = SrcVecTy->getScalarType(); if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType()) return false; From 7a58cb0cdc3f3f91af30d5addd8b51f0f9146cbb Mon Sep 17 00:00:00 2001 From: hanbeom Date: Mon, 16 Dec 2024 01:56:04 +0900 Subject: [PATCH 12/12] Call CreateShuffleVector without the poison If we call CreateShuffleVector with only one value as an argument, it will create a poison vector internally and shuffle it. --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 16ee97e1c119b..a88b9895ebb7d 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -715,8 +715,7 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg); if (NeedLenChg) { // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask - Value *LenChgShuf = Builder.CreateShuffleVector( - SrcVec, PoisonValue::get(SrcVecTy), SrcMask); + Value *LenChgShuf = Builder.CreateShuffleVector(SrcVec, SrcMask); NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask); } else { // shuffle DestVec, (fneg SrcVec), Mask