From 701ec749fba0eea7a71a3d614a7bc14a6c44f8cd Mon Sep 17 00:00:00 2001
From: Rajveer
Date: Sun, 22 Jun 2025 17:39:34 +0530
Subject: [PATCH 1/3] [VectorCombine] New folding pattern for extract/binop/shuffle chains

Resolves #144654
Part of #143088

This adds a new `foldShuffleChainsToReduce` fold that performs horizontal
reduction of patterns like:

```llvm
define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr #0 {
  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
  %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
  %7 = extractelement <8 x i16> %6, i64 0
  ret i16 %7
}
```

...which can be reduced to a single call to the
`llvm.vector.reduce.umin.v8i16(%a0)` intrinsic. The same transformation is
applied for the other supported ops whenever the cost model shows it to be
profitable.
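Concretely, the chain above folds to the form the new tests CHECK for:

```llvm
define i16 @test_reduce_v8i16(<8 x i16> %a0) {
  %1 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a0)
  ret i16 %1
}
```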
---
 .../Transforms/Vectorize/VectorCombine.cpp         | 177 ++++++++++++++++
 .../X86/shuffle-chain-reduction-umin.ll            | 200 ++++++++++++++++++
 .../fold-shuffle-chains-to-reduce.ll               | 127 +++++++++++
 3 files changed, 504 insertions(+)
 create mode 100644 llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll
 create mode 100644 llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 4e2a5c78e0ac8..e54f48e02f1da 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -135,6 +135,7 @@ class VectorCombine {
   bool foldShuffleOfIntrinsics(Instruction &I);
   bool foldShuffleToIdentity(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
+  bool foldShuffleChainsToReduce(Instruction &I);
   bool foldCastFromReductions(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
   bool foldInterleaveIntrinsics(Instruction &I);
@@ -3129,6 +3130,179 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
   return MadeChanges;
 }
 
+bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
+  auto *EEI = dyn_cast<ExtractElementInst>(&I);
+  if (!EEI)
+    return false;
+
+  std::queue<Value *> InstWorklist;
+  Value *InitEEV = nullptr;
+  Intrinsic::ID CommonOp = 0;
+
+  bool IsFirstCallInst = true;
+  bool ShouldBeCallInst = true;
+
+  SmallVector<Value *> PrevVecV(3, nullptr);
+  int64_t ShuffleMaskHalf = -1, ExpectedShuffleMaskHalf = 1;
+  int64_t VecSize = -1;
+
+  Value *VecOp;
+  if (!match(&I, m_ExtractElt(m_Value(VecOp), m_Zero())))
+    return false;
+
+  auto *FVT = dyn_cast<FixedVectorType>(VecOp->getType());
+  if (!FVT)
+    return false;
+
+  VecSize = FVT->getNumElements();
+  if (VecSize < 2 || (VecSize % 2) != 0)
+    return false;
+
+  ShuffleMaskHalf = 1;
+  PrevVecV[2] = VecOp;
+  InitEEV = EEI;
+
+  InstWorklist.push(PrevVecV[2]);
+
+  while (!InstWorklist.empty()) {
+    Value *V = InstWorklist.front();
+    InstWorklist.pop();
+
+    auto *CI = dyn_cast<Instruction>(V);
+    if (!CI)
+      return false;
+
+    if (auto *CallI = dyn_cast<CallInst>(CI)) {
+      if (!ShouldBeCallInst || !PrevVecV[2])
+        return false;
+
+      if (!IsFirstCallInst &&
+          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
+        return false;
+
+      if (CallI != (IsFirstCallInst ? PrevVecV[2] : PrevVecV[0]))
+        return false;
+      IsFirstCallInst = false;
+
+      auto *II = dyn_cast<IntrinsicInst>(CallI);
+      if (!II)
+        return false;
+
+      if (!CommonOp)
+        CommonOp = II->getIntrinsicID();
+      if (II->getIntrinsicID() != CommonOp)
+        return false;
+
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::umin:
+      case Intrinsic::umax:
+      case Intrinsic::smin:
+      case Intrinsic::smax: {
+        auto *Op0 = CallI->getOperand(0);
+        auto *Op1 = CallI->getOperand(1);
+        PrevVecV[0] = Op0;
+        PrevVecV[1] = Op1;
+        break;
+      }
+      default:
+        return false;
+      }
+      ShouldBeCallInst ^= 1;
+
+      if (!isa<ShuffleVectorInst>(PrevVecV[1]))
+        std::swap(PrevVecV[0], PrevVecV[1]);
+      InstWorklist.push(PrevVecV[1]);
+      InstWorklist.push(PrevVecV[0]);
+    } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
+      if (ShouldBeCallInst ||
+          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
+        return false;
+
+      if (SVInst != PrevVecV[1])
+        return false;
+
+      auto *ShuffleVec = SVInst->getOperand(0);
+      if (!ShuffleVec || ShuffleVec != PrevVecV[0])
+        return false;
+
+      SmallVector<int> CurMask;
+      SVInst->getShuffleMask(CurMask);
+
+      if (ShuffleMaskHalf != ExpectedShuffleMaskHalf)
+        return false;
+      ExpectedShuffleMaskHalf *= 2;
+
+      for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
+        if (Mask < ShuffleMaskHalf && CurMask[Mask] != ShuffleMaskHalf + Mask)
+          return false;
+        if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
+          return false;
+      }
+      ShuffleMaskHalf *= 2;
+      if (ExpectedShuffleMaskHalf == VecSize)
+        break;
+      ShouldBeCallInst ^= 1;
+    } else {
+      return false;
+    }
+  }
+
+  if (ShouldBeCallInst)
+    return false;
+
+  assert(VecSize != -1 && ExpectedShuffleMaskHalf == VecSize &&
+         "Expected Match for Vector Size and Mask Half");
+
+  Value *FinalVecV = PrevVecV[0];
+  auto *FinalVecVTy = dyn_cast<FixedVectorType>(FinalVecV->getType());
+
+  if (!InitEEV || !FinalVecV)
+    return false;
+
+  assert(FinalVecVTy && "Expected non-null value for Vector Type");
+
+  Intrinsic::ID ReducedOp = 0;
+  switch (CommonOp) {
+  case Intrinsic::umin:
+    ReducedOp = Intrinsic::vector_reduce_umin;
+    break;
+  case Intrinsic::umax:
+    ReducedOp = Intrinsic::vector_reduce_umax;
+    break;
+  case Intrinsic::smin:
+    ReducedOp = Intrinsic::vector_reduce_smin;
+    break;
+  case Intrinsic::smax:
+    ReducedOp = Intrinsic::vector_reduce_smax;
+    break;
+  default:
+    return false;
+  }
+
+  InstructionCost OrigCost = 0;
+  unsigned int NumLevels = Log2_64(VecSize);
+
+  for (unsigned int Level = 0; Level < NumLevels; ++Level) {
+    OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                   FinalVecVTy, FinalVecVTy);
+    OrigCost += TTI.getArithmeticInstrCost(Instruction::ICmp, FinalVecVTy);
+  }
+  OrigCost += TTI.getVectorInstrCost(Instruction::ExtractElement, FinalVecVTy,
+                                     CostKind, 0);
+
+  IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
+  InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
+
+  if (NewCost >= OrigCost)
+    return false;
+
+  auto *ReducedResult =
+      Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
+  replaceValue(*InitEEV, *ReducedResult);
+
+  return true;
+}
+
 /// Determine if its more efficient to fold:
 ///   reduce(trunc(x)) -> trunc(reduce(x)).
 ///   reduce(sext(x)) -> sext(reduce(x)).
@@ -4216,6 +4390,9 @@ bool VectorCombine::run() { if (foldCastFromReductions(I)) return true; break; + case Instruction::ExtractElement: + MadeChange |= foldShuffleChainsToReduce(I); + break; case Instruction::ICmp: case Instruction::FCmp: if (foldExtractExtract(I)) diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll new file mode 100644 index 0000000000000..82b20ccc5b8f5 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll @@ -0,0 +1,200 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes=vector-combine -S %s | FileCheck %s +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes=vector-combine -S %s | FileCheck %s +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -passes=vector-combine -S %s | FileCheck %s +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v4 -passes=vector-combine -S %s | FileCheck %s + +define i16 @test_reduce_v8i16(<8 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v8i16( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> + %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) + %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> + %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> + %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5) + %7 = extractelement <8 x i16> %6, i64 0 + ret i16 %7 +} + +define i8 @test_reduce_v16i8(<16 x i8> %a0) { +; +; CHECK-LABEL: define i8 @test_reduce_v16i8( +; CHECK-SAME: <16 x i8> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP8:%.*]] = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> [[A0]]) +; CHECK-NEXT: ret i8 [[TMP8]] +; + %1 = shufflevector <16 x i8> %a0, <16 x i8> poison, <16 x i32> + %2 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %a0, <16 x i8> %1) + %3 = shufflevector <16 x i8> %2, <16 x i8> poison, <16 x i32> + %4 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %2, <16 x i8> %3) + %5 = shufflevector <16 x i8> %4, <16 x i8> poison, <16 x i32> + %6 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %4, <16 x i8> %5) + %7 = shufflevector <16 x i8> %6, <16 x i8> poison, <16 x i32> + %8 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %6, <16 x i8> %7) + %9 = extractelement <16 x i8> %8, i64 0 + ret i8 %9 +} + +define i8 @test_reduce_v32i8(<32 x i8> %a0) { +; CHECK-LABEL: define i8 @test_reduce_v32i8( +; CHECK-SAME: <32 x i8> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> [[A0]]) +; CHECK-NEXT: ret i8 [[TMP1]] +; + %1 = shufflevector <32 x i8> %a0, <32 x i8> poison, <32 x i32> + %2 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %a0, <32 x i8> %1) + %3 = shufflevector <32 x i8> %2, <32 x i8> poison, <32 x i32> + %4 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %2, <32 x i8> %3) + %5 = shufflevector <32 x i8> %4, <32 x i8> poison, <32 x i32> + %6 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %4, <32 x i8> %5) + %7 = shufflevector <32 x i8> %6, <32 x i8> poison, <32 x i32> + %8 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %6, <32 x i8> %7) + %9 = shufflevector <32 x i8> %8, <32 x i8> poison, <32 x i32> + %10 = tail call <32 
x i8> @llvm.umin.v32i8(<32 x i8> %8, <32 x i8> %9) + %11 = extractelement <32 x i8> %10, i64 0 + ret i8 %11 +} + +define i16 @test_reduce_v16i16(<16 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v16i16( +; CHECK-SAME: <16 x i16> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <16 x i16> %a0, <16 x i16> poison, <16 x i32> + %2 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %a0, <16 x i16> %1) + %3 = shufflevector <16 x i16> %2, <16 x i16> poison, <16 x i32> + %4 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %2, <16 x i16> %3) + %5 = shufflevector <16 x i16> %4, <16 x i16> poison, <16 x i32> + %6 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %4, <16 x i16> %5) + %7 = shufflevector <16 x i16> %6, <16 x i16> poison, <16 x i32> + %8 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %6, <16 x i16> %7) + %9 = extractelement <16 x i16> %8, i64 0 + ret i16 %9 +} + +define i8 @test_reduce_v64i8(<64 x i8> %a0) { +; CHECK-LABEL: define i8 @test_reduce_v64i8( +; CHECK-SAME: <64 x i8> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> [[A0]]) +; CHECK-NEXT: ret i8 [[TMP1]] +; + %1 = shufflevector <64 x i8> %a0, <64 x i8> poison, <64 x i32> + %2 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %a0, <64 x i8> %1) + %3 = shufflevector <64 x i8> %2, <64 x i8> poison, <64 x i32> + %4 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %2, <64 x i8> %3) + %5 = shufflevector <64 x i8> %4, <64 x i8> poison, <64 x i32> + %6 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %4, <64 x i8> %5) + %7 = shufflevector <64 x i8> %6, <64 x i8> poison, <64 x i32> + %8 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %6, <64 x i8> %7) + %9 = shufflevector <64 x i8> %8, <64 x i8> poison, <64 x i32> + %10 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %8, <64 x i8> %9) + %11 = shufflevector <64 x i8> %10, <64 x i8> poison, <64 x i32> + %12 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %10, <64 x i8> %11) + %13 = extractelement <64 x i8> %12, i64 0 + ret i8 %13 +} + +define i16 @test_reduce_v32i16(<32 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v32i16( +; CHECK-SAME: <32 x i16> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <32 x i16> %a0, <32 x i16> poison, <32 x i32> + %2 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %a0, <32 x i16> %1) + %3 = shufflevector <32 x i16> %2, <32 x i16> poison, <32 x i32> + %4 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %2, <32 x i16> %3) + %5 = shufflevector <32 x i16> %4, <32 x i16> poison, <32 x i32> + %6 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %4, <32 x i16> %5) + %7 = shufflevector <32 x i16> %6, <32 x i16> poison, <32 x i32> + %8 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %6, <32 x i16> %7) + %9 = shufflevector <32 x i16> %8, <32 x i16> poison, <32 x i32> + %10 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %8, <32 x i16> %9) + %11 = extractelement <32 x i16> %10, i64 0 + ret i16 %11 +} diff --git a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll new file mode 100644 index 0000000000000..3cb25ba4ecce6 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have 
been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=vector-combine -S | FileCheck %s + +define i16 @test_reduce_v8i16(<8 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v8i16( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> + %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) + %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> + %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> + %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5) + %7 = extractelement <8 x i16> %6, i64 0 + ret i16 %7 +} + +define i16 @test_reduce_v8i16_2(<8 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v8i16_2( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]]) +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP9]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i16> [[TMP16]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = tail call i16 @llvm.umin.i16(i16 [[TMP13]], i16 [[TMP14]]) +; CHECK-NEXT: ret i16 [[TMP15]] +; + %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> + %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) + %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> + %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> + %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5) + %7 = extractelement <8 x i16> %6, i64 0 + + %8 = shufflevector <8 x i16> %6, <8 x i16> poison, <8 x i32> + %9 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %8) + %10 = shufflevector <8 x i16> %9, <8 x i16> poison, <8 x i32> + %11 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %9, <8 x i16> %10) + %12 = shufflevector <8 x i16> %11, <8 x i16> poison, <8 x i32> + %13 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %11, <8 x i16> %12) + %14 = extractelement <8 x i16> %13, i64 0 + + %15 = tail call i16 @llvm.umin.i16(i16 %7, i16 %14) + + ret i16 %15 +} + +define i16 @test_reduce_v8i16_neg1(<8 x i16> %a0) { +; CHECK-LABEL: define i16 
@test_reduce_v8i16_neg1( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP6]], i64 0 +; CHECK-NEXT: ret i16 [[TMP7]] +; + %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> + %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) + %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> + %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> + %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5) + %7 = extractelement <8 x i16> %6, i64 0 + ret i16 %7 +} + +define i16 @test_reduce_v8i16_neg2(<8 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v8i16_neg2( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP6]], i64 0 +; CHECK-NEXT: ret i16 [[TMP7]] +; + %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> + %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) + %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> + %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> + %6 = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> %4, <8 x i16> %5) + %7 = extractelement <8 x i16> %6, i64 0 + ret i16 %7 +} + +define i16 @test_reduce_v8i16_neg3(<8 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v8i16_neg3( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP7]], i64 0 +; CHECK-NEXT: ret i16 [[TMP8]] +; + %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> 
+ %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) + %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> + %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %5 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %6 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> + %7 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %5, <8 x i16> %6) + %8 = extractelement <8 x i16> %7, i64 0 + ret i16 %8 +} From a6acccdb18b69b4c2f513894a6b27e932ac41c28 Mon Sep 17 00:00:00 2001 From: Rajveer Date: Sat, 28 Jun 2025 16:31:51 +0530 Subject: [PATCH 2/3] Include support for Add/Mul/Or/And/Xor Binary Operations --- .../include/llvm/Transforms/Utils/LoopUtils.h | 3 + llvm/lib/Transforms/Utils/LoopUtils.cpp | 15 + .../Transforms/Vectorize/VectorCombine.cpp | 258 ++++++++++++------ .../fold-shuffle-chains-to-reduce.ll | 68 +++++ 4 files changed, 260 insertions(+), 84 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 723f6aea1b76f..96e3d3d47f2d0 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -371,6 +371,9 @@ LLVM_ABI bool canSinkOrHoistInst(Instruction &I, AAResults *AA, /// Returns the llvm.vector.reduce intrinsic that corresponds to the recurrence /// kind. LLVM_ABI constexpr Intrinsic::ID getReductionIntrinsicID(RecurKind RK); +/// Returns the llvm.vector.reduce min/max intrinsic that corresponds to the +/// intrinsic op. +LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicID(Intrinsic::ID IID); /// Returns the arithmetic instruction opcode used when expanding a reduction. LLVM_ABI unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID); diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 2d830f3b6f952..843364eb34f83 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -956,6 +956,21 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) { } } +Intrinsic::ID llvm::getMinMaxReductionIntrinsicID(Intrinsic::ID IID) { + switch (IID) { + default: + llvm_unreachable("Unexpected intrinsic id"); + case Intrinsic::umin: + return Intrinsic::vector_reduce_umin; + case Intrinsic::umax: + return Intrinsic::vector_reduce_umax; + case Intrinsic::smin: + return Intrinsic::vector_reduce_smin; + case Intrinsic::smax: + return Intrinsic::vector_reduce_smax; + } +} + // This is the inverse to getReductionForBinop unsigned llvm::getArithmeticReductionInstruction(Intrinsic::ID RdxID) { switch (RdxID) { diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index e54f48e02f1da..17ae97a6f9faf 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3130,21 +3130,66 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) { return MadeChanges; } +/// For a given chain of patterns of the following form: +/// +/// ``` +/// %1 = shufflevector %0, poison mask +/// +/// %2 = tail call llvm.( %0, %1) +/// OR +/// %2 = add/mul/or/and/xor %0, %1 +/// +/// %3 = shufflevector %2, poison mask +/// ... +/// ... 
+/// %(i - 1) = tail call llvm.( %(i - +/// 3), %(i - 2) +/// OR +/// %(i - 1) = add/mul/or/and/xor %(i - 3), %(i - 2) +/// +/// %(i) = extractelement %(i - 1), 0 +/// ``` +/// +/// Where: +/// `mask` follows a partition pattern: +/// +/// Ex: +/// [n = 8, p = poison] +/// +/// 4 5 6 7 | p p p p +/// 2 3 | p p p p p p +/// 1 | p p p p p p p +/// +/// For powers of 2, there's a consistent pattern, but for other cases +/// the parity of the current half value at each step decides the +/// next partition half (see `ExpectedParityMask` for more logical details +/// in generalising this). +/// +/// Ex: +/// [n = 6] +/// +/// 3 4 5 | p p p +/// 1 2 | p p p p +/// 1 | p p p p p bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) { - auto *EEI = dyn_cast(&I); - if (!EEI) - return false; - + // Going bottom-up for the pattern. std::queue InstWorklist; - Value *InitEEV = nullptr; - Intrinsic::ID CommonOp = 0; + InstructionCost OrigCost = 0; + + // Common instruction operation after each shuffle op. + std::optional CommonCallOp = std::nullopt; + std::optional CommonBinOp = std::nullopt; - bool IsFirstCallInst = true; - bool ShouldBeCallInst = true; + bool IsFirstCallOrBinInst = true; + bool ShouldBeCallOrBinInst = true; + // This stores the last used instructions for shuffle/common op. + // + // PrevVecV[2] stores the first vector from extract element instruction, + // while PrevVecV[0] / PrevVecV[1] store the last two simultaneous + // instructions from either shuffle/common op. SmallVector PrevVecV(3, nullptr); - int64_t ShuffleMaskHalf = -1, ExpectedShuffleMaskHalf = 1; - int64_t VecSize = -1; Value *VecOp; if (!match(&I, m_ExtractElt(m_Value(VecOp), m_Zero()))) @@ -3154,43 +3199,53 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) { if (!FVT) return false; - VecSize = FVT->getNumElements(); - if (VecSize < 2 || (VecSize % 2) != 0) + int64_t VecSize = FVT->getNumElements(); + if (VecSize < 2) return false; - ShuffleMaskHalf = 1; - PrevVecV[2] = VecOp; - InitEEV = EEI; + // Number of levels would be ~log2(n), considering we always partition + // by half for this fold pattern. + unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0; + int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0; + + // This is how we generalise for all element sizes. + // At each step, if vector size is odd, we need non-poison + // values to cover the dominant half so we don't miss out on any element. + // + // This mask will help us retrieve this as we go from bottom to top: + // + // Mask Set -> N = N * 2 - 1 + // Mask Unset -> N = N * 2 + for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1; + Cur = (Cur + 1) / 2, --Mask) { + if (Cur & 1) + ExpectedParityMask |= (1ll << Mask); + } + PrevVecV[2] = VecOp; InstWorklist.push(PrevVecV[2]); while (!InstWorklist.empty()) { - Value *V = InstWorklist.front(); + Value *CI = InstWorklist.front(); InstWorklist.pop(); - auto *CI = dyn_cast(V); - if (!CI) - return false; - - if (auto *CallI = dyn_cast(CI)) { - if (!ShouldBeCallInst || !PrevVecV[2]) + if (auto *II = dyn_cast(CI)) { + if (!ShouldBeCallOrBinInst) return false; - if (!IsFirstCallInst && + if (!IsFirstCallOrBinInst && any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; })) return false; - if (CallI != (IsFirstCallInst ? PrevVecV[2] : PrevVecV[0])) + // For the first found call/bin op, the vector has to come from the + // extract element op. + if (II != (IsFirstCallOrBinInst ? 
PrevVecV[2] : PrevVecV[0])) return false; - IsFirstCallInst = false; + IsFirstCallOrBinInst = false; - auto *II = dyn_cast(CallI); - if (!II) - return false; - - if (!CommonOp) - CommonOp = II->getIntrinsicID(); - if (II->getIntrinsicID() != CommonOp) + if (!CommonCallOp) + CommonCallOp = II->getIntrinsicID(); + if (II->getIntrinsicID() != *CommonCallOp) return false; switch (II->getIntrinsicID()) { @@ -3198,8 +3253,8 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) { case Intrinsic::umax: case Intrinsic::smin: case Intrinsic::smax: { - auto *Op0 = CallI->getOperand(0); - auto *Op1 = CallI->getOperand(1); + auto *Op0 = II->getOperand(0); + auto *Op1 = II->getOperand(1); PrevVecV[0] = Op0; PrevVecV[1] = Op1; break; @@ -3207,88 +3262,123 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) { default: return false; } - ShouldBeCallInst ^= 1; + ShouldBeCallOrBinInst ^= 1; + + IntrinsicCostAttributes ICA( + *CommonCallOp, II->getType(), + {PrevVecV[0]->getType(), PrevVecV[1]->getType()}); + OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind); + // We may need a swap here since it can be (a, b) or (b, a) + // and accordingly change as we go up. if (!isa(PrevVecV[1])) std::swap(PrevVecV[0], PrevVecV[1]); InstWorklist.push(PrevVecV[1]); InstWorklist.push(PrevVecV[0]); - } else if (auto *SVInst = dyn_cast(CI)) { - if (ShouldBeCallInst || + } else if (auto *BinOp = dyn_cast(CI)) { + // Similar logic for bin ops. + + if (!ShouldBeCallOrBinInst) + return false; + + if (!IsFirstCallOrBinInst && any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; })) return false; - if (SVInst != PrevVecV[1]) + if (BinOp != (IsFirstCallOrBinInst ? PrevVecV[2] : PrevVecV[0])) + return false; + IsFirstCallOrBinInst = false; + + if (!CommonBinOp) + CommonBinOp = BinOp->getOpcode(); + + if (BinOp->getOpcode() != *CommonBinOp) return false; - auto *ShuffleVec = SVInst->getOperand(0); - if (!ShuffleVec || ShuffleVec != PrevVecV[0]) + switch (*CommonBinOp) { + case BinaryOperator::Add: + case BinaryOperator::Mul: + case BinaryOperator::Or: + case BinaryOperator::And: + case BinaryOperator::Xor: { + auto *Op0 = BinOp->getOperand(0); + auto *Op1 = BinOp->getOperand(1); + PrevVecV[0] = Op0; + PrevVecV[1] = Op1; + break; + } + default: return false; + } + ShouldBeCallOrBinInst ^= 1; - SmallVector CurMask; - SVInst->getShuffleMask(CurMask); + OrigCost += + TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind); - if (ShuffleMaskHalf != ExpectedShuffleMaskHalf) + if (!isa(PrevVecV[1])) + std::swap(PrevVecV[0], PrevVecV[1]); + InstWorklist.push(PrevVecV[1]); + InstWorklist.push(PrevVecV[0]); + } else if (auto *SVInst = dyn_cast(CI)) { + // We shouldn't have any null values in the previous vectors, + // is so, there was a mismatch in pattern. + if (ShouldBeCallOrBinInst || + any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; })) + return false; + + if (SVInst != PrevVecV[1]) + return false; + + ArrayRef CurMask; + if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(), + m_Mask(CurMask)))) return false; - ExpectedShuffleMaskHalf *= 2; + // Subtract the parity mask when checking the condition. for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) { - if (Mask < ShuffleMaskHalf && CurMask[Mask] != ShuffleMaskHalf + Mask) + if (Mask < ShuffleMaskHalf && + CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1)) return false; if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1) return false; } + + // Update mask values. 
ShuffleMaskHalf *= 2; - if (ExpectedShuffleMaskHalf == VecSize) + ShuffleMaskHalf -= (ExpectedParityMask & 1); + ExpectedParityMask >>= 1; + + OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + SVInst->getType(), SVInst->getType(), + CurMask, CostKind); + + VisitedCnt += 1; + if (!ExpectedParityMask && VisitedCnt == NumLevels) break; - ShouldBeCallInst ^= 1; + + ShouldBeCallOrBinInst ^= 1; } else { return false; } } - if (ShouldBeCallInst) + // Pattern should end with a shuffle op. + if (ShouldBeCallOrBinInst) return false; - assert(VecSize != -1 && ExpectedShuffleMaskHalf == VecSize && - "Expected Match for Vector Size and Mask Half"); + assert(VecSize != -1 && "Expected Match for Vector Size"); Value *FinalVecV = PrevVecV[0]; - auto *FinalVecVTy = dyn_cast(FinalVecV->getType()); - - if (!InitEEV || !FinalVecV) + if (!FinalVecV) return false; - assert(FinalVecVTy && "Expected non-null value for Vector Type"); + auto *FinalVecVTy = cast(FinalVecV->getType()); - Intrinsic::ID ReducedOp = 0; - switch (CommonOp) { - case Intrinsic::umin: - ReducedOp = Intrinsic::vector_reduce_umin; - break; - case Intrinsic::umax: - ReducedOp = Intrinsic::vector_reduce_umax; - break; - case Intrinsic::smin: - ReducedOp = Intrinsic::vector_reduce_smin; - break; - case Intrinsic::smax: - ReducedOp = Intrinsic::vector_reduce_smax; - break; - default: + Intrinsic::ID ReducedOp = + (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp) + : getReductionForBinop(*CommonBinOp)); + if (!ReducedOp) return false; - } - - InstructionCost OrigCost = 0; - unsigned int NumLevels = Log2_64(VecSize); - - for (unsigned int Level = 0; Level < NumLevels; ++Level) { - OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - FinalVecVTy, FinalVecVTy); - OrigCost += TTI.getArithmeticInstrCost(Instruction::ICmp, FinalVecVTy); - } - OrigCost += TTI.getVectorInstrCost(Instruction::ExtractElement, FinalVecVTy, - CostKind, 0); IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV}); InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind); @@ -3298,7 +3388,7 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) { auto *ReducedResult = Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV}); - replaceValue(*InitEEV, *ReducedResult); + replaceValue(I, *ReducedResult); return true; } @@ -4391,8 +4481,8 @@ bool VectorCombine::run() { return true; break; case Instruction::ExtractElement: - MadeChange |= foldShuffleChainsToReduce(I); - break; + if (foldShuffleChainsToReduce(I)) + return true; case Instruction::ICmp: case Instruction::FCmp: if (foldExtractExtract(I)) diff --git a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll index 3cb25ba4ecce6..403ce33b5344e 100644 --- a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll +++ b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll @@ -17,6 +17,52 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ret i16 %7 } +define i16 @test_reduce_v7i16_or(<7 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v7i16_or( +; CHECK-SAME: <7 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v7i16(<7 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <7 x i16> %a0, <7 x i16> poison, <7 x i32> + %2 = or <7 x i16> %a0, %1 + %3 = shufflevector <7 x i16> %2, <7 x i16> poison, <7 x i32> + %4 = or <7 x i16> %2, %3 + %5 = shufflevector <7 x i16> %4, <7 
x i16> poison, <7 x i32> + %6 = or <7 x i16> %4, %5 + %7 = extractelement <7 x i16> %6, i64 0 + ret i16 %7 +} + +define i16 @test_reduce_v3i16_and(<3 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v3i16_and( +; CHECK-SAME: <3 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.and.v3i16(<3 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <3 x i16> %a0, <3 x i16> poison, <3 x i32> + %2 = and <3 x i16> %a0, %1 + %3 = shufflevector <3 x i16> %2, <3 x i16> poison, <3 x i32> + %4 = and <3 x i16> %2, %3 + %5 = extractelement <3 x i16> %4, i64 0 + ret i16 %5 +} + +define i16 @test_reduce_v6i16_xor(<6 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v6i16_xor( +; CHECK-SAME: <6 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.xor.v6i16(<6 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <6 x i16> %a0, <6 x i16> poison, <6 x i32> + %2 = xor <6 x i16> %a0, %1 + %3 = shufflevector <6 x i16> %2, <6 x i16> poison, <6 x i32> + %4 = xor <6 x i16> %2, %3 + %5 = shufflevector <6 x i16> %4, <6 x i16> poison, <6 x i32> + %6 = xor <6 x i16> %4, %5 + %7 = extractelement <6 x i16> %6, i64 0 + ret i16 %7 +} + define i16 @test_reduce_v8i16_2(<8 x i16> %a0) { ; CHECK-LABEL: define i16 @test_reduce_v8i16_2( ; CHECK-SAME: <8 x i16> [[A0:%.*]]) { @@ -125,3 +171,25 @@ define i16 @test_reduce_v8i16_neg3(<8 x i16> %a0) { %8 = extractelement <8 x i16> %7, i64 0 ret i16 %8 } + +define i16 @test_reduce_v6i16_xor_neg(<6 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v6i16_xor_neg( +; CHECK-SAME: <6 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <6 x i16> [[A0]], <6 x i16> poison, <6 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = xor <6 x i16> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <6 x i16> [[TMP2]], <6 x i16> poison, <6 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = xor <6 x i16> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <6 x i16> [[TMP4]], <6 x i16> poison, <6 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = xor <6 x i16> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <6 x i16> [[TMP6]], i64 0 +; CHECK-NEXT: ret i16 [[TMP7]] +; + %1 = shufflevector <6 x i16> %a0, <6 x i16> poison, <6 x i32> + %2 = xor <6 x i16> %a0, %1 + %3 = shufflevector <6 x i16> %2, <6 x i16> poison, <6 x i32> + %4 = xor <6 x i16> %2, %3 + %5 = shufflevector <6 x i16> %4, <6 x i16> poison, <6 x i32> + %6 = xor <6 x i16> %4, %5 + %7 = extractelement <6 x i16> %6, i64 0 + ret i16 %7 +} From 84a428047914ce4d7b8a0984b1359f1774dd8e9e Mon Sep 17 00:00:00 2001 From: Rajveer Singh Bharadwaj Date: Sun, 24 Aug 2025 13:42:18 +0530 Subject: [PATCH 3/3] Minor improvements --- .../lib/Transforms/Vectorize/VectorCombine.cpp | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 17ae97a6f9faf..b2fbb4298ee23 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3186,16 +3186,15 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) { // This stores the last used instructions for shuffle/common op. // - // PrevVecV[2] stores the first vector from extract element instruction, - // while PrevVecV[0] / PrevVecV[1] store the last two simultaneous + // PrevVecV[0] / PrevVecV[1] store the last two simultaneous // instructions from either shuffle/common op. 
-  SmallVector<Value *> PrevVecV(3, nullptr);
+  SmallVector<Value *> PrevVecV(2, nullptr);
 
-  Value *VecOp;
-  if (!match(&I, m_ExtractElt(m_Value(VecOp), m_Zero())))
+  Value *VecOpEE;
+  if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
     return false;
 
-  auto *FVT = dyn_cast<FixedVectorType>(VecOp->getType());
+  auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
   if (!FVT)
     return false;
@@ -3222,8 +3221,7 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
       ExpectedParityMask |= (1ll << Mask);
   }
 
-  PrevVecV[2] = VecOp;
-  InstWorklist.push(PrevVecV[2]);
+  InstWorklist.push(VecOpEE);
 
   while (!InstWorklist.empty()) {
     Value *CI = InstWorklist.front();
@@ -3239,7 +3237,7 @@
       // For the first found call/bin op, the vector has to come from the
       // extract element op.
-      if (II != (IsFirstCallOrBinInst ? PrevVecV[2] : PrevVecV[0]))
+      if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
         return false;
       IsFirstCallOrBinInst = false;
@@ -3285,7 +3283,7 @@
           any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
         return false;
 
-      if (BinOp != (IsFirstCallOrBinInst ? PrevVecV[2] : PrevVecV[0]))
+      if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
         return false;
       IsFirstCallOrBinInst = false;