48 changes: 39 additions & 9 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6935,6 +6935,23 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
cast<VPRecipeWithIRFlags>(R).getPredicate() !=
cast<CmpInst>(UI)->getPredicate())
return true;

if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(&R)) {
if (MemR->isReverse()) {
// If the stored value of a reverse store is invariant, LICM will
// hoist the reverse operation to the preheader. In this case, the
// result of the VPlan-based cost model will diverge from that of
// the legacy model.
if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemR))
if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
return true;

if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(MemR))
if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
return true;
}
}

SeenInstrs.insert(UI);
}
}
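To make the divergence described in the comment above concrete, here is a hypothetical IR sketch (not taken from this patch; it assumes a fixed VF of 4, no masking, and a trip count that is a multiple of 4). Because the stored value is loop-invariant, the reverse of its splat uses only preheader-defined values, so LICM hoists it out of the vector loop, while the legacy cost model still charges a reverse shuffle to the store on every iteration:

```llvm
define void @invariant_reverse_store_sketch(ptr %a, i32 %c, i64 %n) {
vector.ph:
  %splat.ins = insertelement <4 x i32> poison, i32 %c, i64 0
  %splat = shufflevector <4 x i32> %splat.ins, <4 x i32> poison, <4 x i32> zeroinitializer
  ; The reverse depends only on the invariant splat, so LICM can place it here.
  %reverse = shufflevector <4 x i32> %splat, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  br label %vector.body

vector.body:
  %iv = phi i64 [ 0, %vector.ph ], [ %iv.next, %vector.body ]
  ; Reverse store: each iteration writes the 4 elements a[n-4-iv .. n-1-iv].
  %top = sub i64 %n, %iv
  %start = sub i64 %top, 4
  %gep = getelementptr inbounds i32, ptr %a, i64 %start
  store <4 x i32> %reverse, ptr %gep, align 4
  %iv.next = add i64 %iv, 4
  %done = icmp eq i64 %iv.next, %n
  br i1 %done, label %exit, label %vector.body

exit:
  ret void
}
```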
@@ -7504,9 +7521,9 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
});
}

VPWidenMemoryRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
VFRange &Range) {
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
ArrayRef<VPValue *> Operands,
VFRange &Range) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
"Must be called with either a load or store");

@@ -7563,14 +7580,27 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
Builder.insert(VectorPtr);
Ptr = VectorPtr;
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
VPIRMetadata(*Load, LVer), I->getDebugLoc());

StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
if (auto *Load = dyn_cast<LoadInst>(I)) {
auto *LoadR =
new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
Contributor:
Now that the load/store doesn't handle reversing, it should not need the flag to indicate it is reversing.

Contributor Author:
There was a similar discussion earlier: #146525 (comment)
I think it would be good to continue the discussion in the same comment thread.

VPIRMetadata(*Load, LVer), Load->getDebugLoc());
if (Reverse) {
Builder.insert(LoadR);
return new VPInstruction(VPInstruction::Reverse, LoadR,
LoadR->getDebugLoc());
}
return LoadR;
}

auto *Store = cast<StoreInst>(I);
VPValue *StoredVal = Operands[0];
if (Reverse)
StoredVal = Builder.createNaryOp(VPInstruction::Reverse, StoredVal,
Store->getDebugLoc());
return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive,
Reverse, VPIRMetadata(*Store, LVer),
I->getDebugLoc());
Store->getDebugLoc());
}

/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
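As a rough illustration of the lowering shape the tryToWidenMemory change above aims for (a hypothetical hand-written sketch, fixed VF of 4, no masking, invented names): the wide memory operation stays an ordinary consecutive access of the end-adjusted address, and the reversal becomes a separate operation that follows a reverse load and precedes a reverse store. In a straight copy like this the two explicit reverses cancel, which is the kind of simplification that making the reverse a standalone operation exposes (see the TODO about eliminating or sinking reverses further down).

```llvm
define void @reverse_copy_sketch(ptr %src.end, ptr %dst.end) {
entry:
  ; Reverse load: a plain wide load of the end-adjusted address, followed by
  ; an explicit reverse (previously folded into the load recipe itself).
  %wide.load = load <4 x i32>, ptr %src.end, align 4
  %val = shufflevector <4 x i32> %wide.load, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ; Reverse store: the stored value is reversed first, then written with a
  ; plain wide store.
  %rev.val = shufflevector <4 x i32> %val, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  store <4 x i32> %rev.val, ptr %dst.end, align 4
  ret void
}
```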
5 changes: 2 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -96,9 +96,8 @@ class VPRecipeBuilder {
/// Check if the load or store instruction \p I should widened for \p
/// Range.Start and potentially masked. Such instructions are handled by a
/// recipe that takes an additional VPInstruction for the mask.
VPWidenMemoryRecipe *tryToWidenMemory(Instruction *I,
ArrayRef<VPValue *> Operands,
VFRange &Range);
VPRecipeBase *tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
VFRange &Range);

/// Check if an induction recipe should be constructed for \p Phi. If so build
/// and return it. If not, return null.
11 changes: 11 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1038,6 +1038,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
// It produces the lane index across all unrolled iterations. Unrolling will
// add all copies of its original operand as additional operands.
FirstActiveLane,
// Returns a reversed vector for the operand.
Reverse,

// The opcodes below are used for VPInstructionWithType.
//
Expand Down Expand Up @@ -3333,6 +3335,15 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
setMask(Mask);
}

VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue *Addr,
VPValue *StoredVal, VPValue &EVL, VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
{Addr, StoredVal, &EVL}, S.isConsecutive(),
S.isReverse(), S, S.getDebugLoc()) {
assert(isReverse() && "Only reverse access need to set new stored value");
setMask(Mask);
}

VP_CLASSOF_IMPL(VPDef::VPWidenStoreEVLSC)

/// Return the address accessed by this recipe.
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -129,6 +129,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::Broadcast:
case VPInstruction::PtrAdd:
case VPInstruction::WidePtrAdd:
case VPInstruction::Reverse:
// Return the type based on first operand.
return inferScalarType(R->getOperand(0));
case VPInstruction::BranchOnCond:
14 changes: 13 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -255,7 +255,8 @@ struct Recipe_match {
if constexpr (std::is_same<RecipeTy, VPScalarIVStepsRecipe>::value ||
std::is_same<RecipeTy, VPCanonicalIVPHIRecipe>::value ||
std::is_same<RecipeTy, VPDerivedIVRecipe>::value ||
std::is_same<RecipeTy, VPWidenGEPRecipe>::value)
std::is_same<RecipeTy, VPWidenGEPRecipe>::value ||
std::is_same<RecipeTy, VPVectorEndPointerRecipe>::value)
return DefR;
else
return DefR && DefR->getOpcode() == Opcode;
Expand Down Expand Up @@ -567,6 +568,17 @@ m_LogicalOr(const Op0_t &Op0, const Op1_t &Op1) {
return m_Select(Op0, m_True(), Op1);
}

template <typename Op0_t, typename Op1_t>
using VPVectorEndPointer_match =
Recipe_match<std::tuple<Op0_t, Op1_t>, 0, /*Commutative=*/false,
VPVectorEndPointerRecipe>;

template <typename Op0_t, typename Op1_t>
inline VPVectorEndPointer_match<Op0_t, Op1_t>
m_VectorEndPointer(const Op0_t &Op0, const Op1_t &Op1) {
return VPVectorEndPointer_match<Op0_t, Op1_t>({Op0, Op1});
}

template <typename Op0_t, typename Op1_t, typename Op2_t>
using VPScalarIVSteps_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, 0,
false, VPScalarIVStepsRecipe>;
74 changes: 39 additions & 35 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -506,6 +506,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
case VPInstruction::Not:
case VPInstruction::Reverse:
return 1;
case Instruction::ICmp:
case Instruction::FCmp:
@@ -983,6 +984,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
case VPInstruction::ResumeForEpilogue:
return State.get(getOperand(0), true);
case VPInstruction::Reverse:
return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
default:
llvm_unreachable("Unsupported opcode for instruction");
}
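For reference, the IRBuilder::CreateVectorReverse call used above emits different IR depending on the vector kind (as of current LLVM, to the best of my understanding): a shuffle with a reversed constant mask for fixed-width vectors, and the llvm.vector.reverse intrinsic for scalable vectors. A minimal sketch with hypothetical function names:

```llvm
declare <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float>)

define <4 x float> @reverse_fixed(<4 x float> %v) {
  ; Fixed-width vectors: a shuffle with a reversed constant mask.
  %rev = shufflevector <4 x float> %v, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %rev
}

define <vscale x 4 x float> @reverse_scalable(<vscale x 4 x float> %v) {
  ; Scalable vectors: the llvm.vector.reverse intrinsic.
  %rev = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %v)
  ret <vscale x 4 x float> %rev
}
```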
@@ -1147,6 +1150,13 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
I32Ty, {Arg0Ty, I32Ty, I1Ty});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
case VPInstruction::Reverse: {
assert(VF.isVector() && "Reverse operation must be vector type");
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getShuffleCost(
TargetTransformInfo::SK_Reverse, cast<VectorType>(VectorTy),
cast<VectorType>(VectorTy), {}, Ctx.CostKind, 0);
Comment on lines +1155 to +1158

Contributor:
Suggested change:
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getShuffleCost(
TargetTransformInfo::SK_Reverse, cast<VectorType>(VectorTy),
cast<VectorType>(VectorTy), {}, Ctx.CostKind, 0);
Type *VectorTy = cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
return Ctx.TTI.getShuffleCost(
TargetTransformInfo::SK_Reverse, VectorTy,
VectorTy, {}, Ctx.CostKind, 0);

Could you also add `/*Arg=*/` to the arguments passing {} and 0?

}
case VPInstruction::ExtractLastElement: {
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
@@ -1251,6 +1261,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::WidePtrAdd:
case VPInstruction::StepVector:
case VPInstruction::ReductionStartVector:
case VPInstruction::Reverse:
case VPInstruction::VScale:
return false;
default:
@@ -1414,6 +1425,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ResumeForEpilogue:
O << "resume-for-epilogue";
break;
case VPInstruction::Reverse:
O << "reverse";
break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -2281,21 +2295,37 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
return TTI::CastContextHint::Normal;
};

using namespace llvm::VPlanPatternMatch;
VPValue *Operand = getOperand(0);
TTI::CastContextHint CCH = TTI::CastContextHint::None;
// For Trunc/FPTrunc, get the context from the only user.
if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
!hasMoreThanOneUniqueUser() && getNumUsers() > 0) {
if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
CCH = ComputeCCH(StoreRecipe);
if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
return nullptr;
return dyn_cast<VPRecipeBase>(*R->user_begin());
};

if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
if (match(Recipe, m_VPInstruction<VPInstruction::Reverse>(m_VPValue())))
Recipe = GetOnlyUser(cast<VPInstruction>(Recipe));
if (Recipe)
CCH = ComputeCCH(Recipe);
Comment on lines +2309 to +2313

Contributor:
Hmm, if we have a shuffle in between the load and a cast, for example, can the cast still be folded into the load in most cases? Curious if this may have surfaced an inaccuracy of the current cost modeling.

}
}
// For Z/Sext, get the context from the operand.
else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
Opcode == Instruction::FPExt) {
Comment on lines 2317 to 2318

Contributor:
Not sure I understand why the code is guarded by Trunc, FPTrunc, FPExt, ZExt, or SExt opcodes?

Contributor Author:
This code is used to determine TTI::CastContextHint. Perhaps for other cast opcodes, there isn't currently a situation that requires this hint. But I think that's a separate issue. The change here is just to ensure that CastContextHint::Reversed is still correctly propagated after the reverse operation is separated out.
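To illustrate what the look-through compensates for, here is a hypothetical IR shape for a widened reverse load feeding an extend (names invented, not from this patch): the explicit reverse now sits between the load and the cast, so without peeking through it the cost model would no longer see the load when picking the CastContextHint.

```llvm
define <4 x i32> @zext_of_reverse_load_sketch(ptr %src.end) {
entry:
  %wide.load = load <4 x i16>, ptr %src.end, align 2
  ; The explicit reverse now separates the load from the extend.
  %reverse = shufflevector <4 x i16> %wide.load, <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %ext = zext <4 x i16> %reverse to <4 x i32>
  ret <4 x i32> %ext
}
```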

if (Operand->isLiveIn())
CCH = TTI::CastContextHint::Normal;
else if (Operand->getDefiningRecipe())
CCH = ComputeCCH(Operand->getDefiningRecipe());
else if (auto *Recipe = Operand->getDefiningRecipe()) {
VPValue *ReverseOp;
if (match(Recipe,
m_VPInstruction<VPInstruction::Reverse>(m_VPValue(ReverseOp))))
Recipe = ReverseOp->getDefiningRecipe();
if (Recipe)
CCH = ComputeCCH(Recipe);
}
}

auto *SrcTy =
@@ -3371,12 +3401,7 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
OpInfo, &Ingredient);
}
if (!Reverse)
return Cost;

return Cost += Ctx.TTI.getShuffleCost(
TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
return Cost;
}

void VPWidenLoadRecipe::execute(VPTransformState &State) {
@@ -3408,8 +3433,6 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
}
applyMetadata(*cast<Instruction>(NewLI));
if (Reverse)
NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
State.set(this, NewLI);
}

@@ -3465,8 +3488,6 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
applyMetadata(*NewLI);
Instruction *Res = NewLI;
if (isReverse())
Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
State.set(this, Res);
}

@@ -3485,12 +3506,8 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
unsigned AS = getLoadStoreAddressSpace(&Ingredient);
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
Instruction::Load, Ty, Alignment, AS, Ctx.CostKind);
if (!Reverse)
return Cost;

return Cost + Ctx.TTI.getShuffleCost(
TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
return Cost;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3520,13 +3537,6 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
}

Value *StoredVal = State.get(StoredVPValue);
if (isReverse()) {
// If we store to reverse consecutive memory locations, then we need
// to reverse the order of elements in the stored value.
StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
// We don't want to update the value in the map as it might be used in
// another expression. So don't call resetVectorValue(StoredVal).
}
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
Instruction *NewSI = nullptr;
if (CreateScatter)
@@ -3556,8 +3566,6 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
CallInst *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue);
Value *EVL = State.get(getEVL(), VPLane(0));
if (isReverse())
StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
Value *Mask = nullptr;
if (VPValue *VPMask = getMask()) {
Mask = State.get(VPMask);
@@ -3596,12 +3604,8 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
unsigned AS = getLoadStoreAddressSpace(&Ingredient);
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
Instruction::Store, Ty, Alignment, AS, Ctx.CostKind);
if (!Reverse)
return Cost;

return Cost + Ctx.TTI.getShuffleCost(
TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
return Cost;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
51 changes: 51 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2482,6 +2482,29 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
.Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
VPValue *NewMask = GetNewMask(S->getMask());
VPValue *NewAddr = GetNewAddr(S->getAddr());
// Convert general reverse operations on stored value into vp.reverse,
Contributor:
Suggested change:
// Convert general reverse operations on stored value into vp.reverse,
// Convert general reverse operations on stored values into vp.reverse,

// when the VPVectorEndPointerRecipe adjusting the access address uses
// EVL instead of VF.
if (match(NewAddr, m_VectorEndPointer(m_VPValue(), m_Specific(&EVL)))) {
VPValue *StoredVal = S->getStoredValue();
// Skip if the stored value is not defined in the loop region.
if (!StoredVal->isDefinedOutsideLoopRegions()) {
Comment on lines +2490 to +2491

Contributor:
Hmm, is this correct even if the value outside the region is a vector other than a broadcast?

VPValue *ReversedVal;
bool IsReverse =
match(StoredVal, m_VPInstruction<VPInstruction::Reverse>(
m_VPValue(ReversedVal)));
assert(IsReverse && "The stored value of reverse store must be "
"defined by a reverse operation");
auto *Reverse = cast<VPInstruction>(StoredVal);
auto *NewReverse = new VPWidenIntrinsicRecipe(
Intrinsic::experimental_vp_reverse,
{ReversedVal, &AllOneMask, &EVL},
TypeInfo.inferScalarType(Reverse), Reverse->getDebugLoc());
NewReverse->insertBefore(Reverse);
return new VPWidenStoreEVLRecipe(*S, NewAddr, NewReverse, EVL,
NewMask);
}
}
return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
})
.Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) {
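A rough sketch of the IR the store-side conversion above is aiming for under EVL tail folding (hypothetical types and value names; per the recipe operands built above, the vp.reverse takes an all-true mask and the same EVL that the vector-end pointer was adjusted with):

```llvm
declare <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, <vscale x 4 x i1>, i32)

define void @evl_reverse_store_sketch(<vscale x 4 x i32> %val, ptr %end.ptr, <vscale x 4 x i1> %mask, i32 %evl) {
entry:
  ; The generic reverse of the stored value becomes a vp.reverse with an
  ; all-true mask and the loop's EVL ...
  %vp.rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %val, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  ; ... feeding the EVL store of the end-adjusted address.
  call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %vp.rev, ptr %end.ptr, <vscale x 4 x i1> %mask, i32 %evl)
  ret void
}
```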
@@ -2623,6 +2646,34 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
}
}
ToErase.push_back(CurRecipe);

// Convert general reverse operations on loaded result into vp.reverse, when
Contributor:
Suggested change:
// Convert general reverse operations on loaded result into vp.reverse, when
// Convert general reverse operations on loaded results into vp.reverse, when

// the VPVectorEndPointerRecipe adjusting the access address uses EVL
// instead of VF.
if (auto *LoadR = dyn_cast<VPWidenLoadEVLRecipe>(EVLRecipe)) {
Contributor:
Is there a reason we handle the load/store cases separately, instead of just converting all reverse operations? Could we mis-compile in the future if some other transform decides to create new reverse operations?

if (!match(LoadR->getAddr(),
m_VectorEndPointer(m_VPValue(), m_Specific(&EVL))))
continue;
assert(LoadR->isReverse() &&
"Only reverse access uses VPVectorEndPointerRecipe as address");
// TODO: Extend conversion along the use-def chain, as reverse operations
// may be eliminated or sunk in the future.
assert(LoadR->getNumUsers() == 1 &&
"Unexpected user number of reverse load");
auto *UserR = cast<VPRecipeBase>(*LoadR->user_begin());
VPValue *ReversedVal;
bool IsReverse = match(UserR, m_VPInstruction<VPInstruction::Reverse>(
m_VPValue(ReversedVal)));
assert(IsReverse && "The defined value of reverse load must be used by a "
"reverse operation");
auto *Reverse = cast<VPInstruction>(UserR);
auto *NewReverse = new VPWidenIntrinsicRecipe(
Intrinsic::experimental_vp_reverse, {ReversedVal, AllOneMask, &EVL},
TypeInfo.inferScalarType(Reverse), Reverse->getDebugLoc());
NewReverse->insertBefore(Reverse);
Reverse->replaceAllUsesWith(NewReverse);
ToErase.push_back(Reverse);
}
}
// Remove dead EVL mask.
if (EVLMask->getNumUsers() == 0)
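Analogously, a minimal sketch of the load-side result of the transformRecipestoEVLRecipes change above (hypothetical names; again the vp.reverse uses an all-true mask and the same EVL as the vector-end pointer):

```llvm
declare <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr, <vscale x 4 x i1>, i32)
declare <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)

define <vscale x 4 x i32> @evl_reverse_load_sketch(ptr %end.ptr, <vscale x 4 x i1> %mask, i32 %evl) {
entry:
  ; EVL load of the end-adjusted address ...
  %vp.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %end.ptr, <vscale x 4 x i1> %mask, i32 %evl)
  ; ... with the generic reverse of the loaded result rewritten to vp.reverse.
  %vp.rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %vp.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  ret <vscale x 4 x i32> %vp.rev
}
```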
@@ -22,8 +22,8 @@ define void @vector_reverse_mask_nxv4i1(ptr %a, ptr %cond, i64 %N) #0 {
; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison)
; CHECK: %[[REVERSE7:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[WIDEMSKLOAD]])
; CHECK: %[[FADD:.*]] = fadd <vscale x 4 x double> %[[REVERSE7]]
; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
; CHECK: %[[REVERSE8:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[FADD]])
; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
; CHECK: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> %[[REVERSE8]], ptr %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE9]]

entry: