diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index a98e46c587273..5096e0bd70e6e 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -2479,8 +2479,7 @@ class SelectionDAG { /// Check if a value \op N is a constant using the target's BooleanContent for /// its type. - LLVM_ABI std::optional - isBoolConstant(SDValue N, bool AllowTruncation = false) const; + LLVM_ABI std::optional isBoolConstant(SDValue N) const; /// Set CallSiteInfo to be associated with Node. void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo) { diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 727526055e592..34968afdfe836 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -4375,6 +4375,11 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase { Op.getOpcode() == ISD::SPLAT_VECTOR_PARTS; } + /// Return true if the given select/vselect should be considered canonical and + /// not be transformed. Currently only used for "vselect (not Cond), N1, N2 -> + /// vselect Cond, N2, N1". + virtual bool isTargetCanonicalSelect(SDNode *N) const { return false; } + struct DAGCombinerInfo { void *DC; // The DAG Combiner object. 
CombineLevel Level; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6191e61791678..c9a493cdd7a89 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12967,8 +12967,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { return V; // vselect (not Cond), N1, N2 -> vselect Cond, N2, N1 - if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) - return DAG.getSelect(DL, VT, F, N2, N1); + if (!TLI.isTargetCanonicalSelect(N)) + if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) + return DAG.getSelect(DL, VT, F, N2, N1); // select (sext m), (add X, C), X --> (add X, (and C, (sext m)))) if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N2 && N1->hasOneUse() && diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 30ee6a99b9dfc..3cdd2ac3a18d6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -10349,7 +10349,7 @@ SDValue SelectionDAG::simplifySelect(SDValue Cond, SDValue T, SDValue F) { // select true, T, F --> T // select false, T, F --> F - if (auto C = isBoolConstant(Cond, /*AllowTruncation=*/true)) + if (auto C = isBoolConstant(Cond)) return *C ? 
T : F; // select ?, T, T --> T @@ -13562,13 +13562,14 @@ bool SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) const { return false; } -std::optional SelectionDAG::isBoolConstant(SDValue N, - bool AllowTruncation) const { - ConstantSDNode *Const = isConstOrConstSplat(N, false, AllowTruncation); +std::optional SelectionDAG::isBoolConstant(SDValue N) const { + ConstantSDNode *Const = + isConstOrConstSplat(N, false, /*AllowTruncation=*/true); if (!Const) return std::nullopt; - const APInt &CVal = Const->getAPIntValue(); + EVT VT = N->getValueType(0); + const APInt CVal = Const->getAPIntValue().trunc(VT.getScalarSizeInBits()); switch (TLI->getBooleanContents(N.getValueType())) { case TargetLowering::ZeroOrOneBooleanContent: if (CVal.isOne()) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2541182de1208..88960496d11dd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4975,6 +4975,16 @@ X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const { return getTargetConstantFromNode(LD); } +bool X86TargetLowering::isTargetCanonicalSelect(SDNode *N) const { + // Do not fold (vselect not(C), X, 0s) to (vselect C, 0s, X) + SDValue Cond = N->getOperand(0); + SDValue RHS = N->getOperand(2); + EVT CondVT = Cond.getValueType(); + return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() && + CondVT.getVectorElementType() == MVT::i1 && + ISD::isBuildVectorAllZeros(RHS.getNode()); +} + // Extract raw constant bits from constant pools. 
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 5cb6b3e493a32..20c90ebf9a5e2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1356,6 +1356,8 @@ namespace llvm { TargetLowering::isTargetCanonicalConstantNode(Op); } + bool isTargetCanonicalSelect(SDNode *N) const override; + const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override; SDValue unwrapAddress(SDValue N) const override; diff --git a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll index 584c29ebcfc04..dfd0a05372b9b 100644 --- a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll +++ b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll @@ -16,16 +16,15 @@ define @test_signed_v2f32_v2i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z2.d, #0xffffffff80000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.s ; CHECK-NEXT: mov z3.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.s +; CHECK-NEXT: mov z1.d, #0xffffffff80000000 ; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z3.s ; CHECK-NEXT: mov z3.d, #0x7fffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -40,16 +39,15 @@ define @test_signed_v4f32_v4i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z2.s, #0x80000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.s, p0/m, z0.s ; CHECK-NEXT: mov z3.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s 
-; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.s, p0/m, z0.s +; CHECK-NEXT: mov z1.s, #0x80000000 ; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z3.s ; CHECK-NEXT: mov z3.s, #0x7fffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s ; CHECK-NEXT: mov z1.s, p1/m, z2.s ; CHECK-NEXT: sel z0.s, p2, z3.s, z1.s @@ -69,27 +67,25 @@ define @test_signed_v8f32_v8i32( %f) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z6.s, #0x7fffffff +; CHECK-NEXT: mov z3.s, #0x80000000 ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff -; CHECK-NEXT: mov z3.s, w8 ; CHECK-NEXT: movprfx z4, z0 ; CHECK-NEXT: fcvtzs z4.s, p0/m, z0.s -; CHECK-NEXT: movprfx z5, z1 -; CHECK-NEXT: fcvtzs z5.s, p0/m, z1.s +; CHECK-NEXT: mov z5.s, w8 +; CHECK-NEXT: movprfx z6, z1 +; CHECK-NEXT: fcvtzs z6.s, p0/m, z1.s ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z2.s ; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z2.s -; CHECK-NEXT: mov z2.s, #0x80000000 -; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z3.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z1.s, z3.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.s, p1, z2.s, z4.s +; CHECK-NEXT: mov z2.s, #0x7fffffff +; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z5.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z1.s, z5.s +; CHECK-NEXT: sel z4.s, p1, z4.s, z3.s +; CHECK-NEXT: mov z3.s, p2/m, z6.s ; CHECK-NEXT: fcmuo p1.s, p0/z, z0.s, z0.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z1.s, z1.s -; CHECK-NEXT: sel z2.s, p2, z2.s, z5.s -; CHECK-NEXT: sel z0.s, p3, z6.s, z3.s -; CHECK-NEXT: sel z1.s, p4, z6.s, z2.s +; CHECK-NEXT: sel z0.s, p3, z2.s, z4.s +; CHECK-NEXT: sel z1.s, p4, z2.s, z3.s ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.s, p0/m, #0 // =0x0 @@ -105,19 +101,19 @@ define @test_signed_v4f32_v4i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-956301312 // 
=0xc7000000 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z3.s, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #65024 // =0xfe00 ; CHECK-NEXT: movk w8, #18175, lsl #16 -; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.s, p0/m, z0.s ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.s, p0/m, z0.s -; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z2.s -; CHECK-NEXT: mov z2.s, #32767 // =0x7fff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z1.s, #32767 // =0x7fff ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: mov z1.s, p1/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: sel z0.s, p2, z2.s, z1.s +; CHECK-NEXT: sel z2.s, p1, z2.s, z3.s +; CHECK-NEXT: sel z0.s, p2, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv4f32.nxv4i16( %f) @@ -134,27 +130,26 @@ define @test_signed_v8f32_v8i16( %f) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov w8, #-956301312 // =0xc7000000 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z5.s, #32767 // =0x7fff +; CHECK-NEXT: mov z6.s, #32767 // =0x7fff ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: mov w8, #65024 // =0xfe00 ; CHECK-NEXT: movk w8, #18175, lsl #16 ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: fcvtzs z3.s, p0/m, z1.s -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzs z4.s, p0/m, z0.s +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: fcvtzs z5.s, p0/m, z0.s +; CHECK-NEXT: mov z4.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s ; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s -; CHECK-NEXT: mov z2.s, w8 -; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z2.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z2.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z3.s, p1/m, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: mov z2.s, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: fcmgt p3.s, p0/z, 
z1.s, z4.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z4.s +; CHECK-NEXT: sel z3.s, p1, z3.s, z2.s +; CHECK-NEXT: mov z2.s, p2/m, z5.s ; CHECK-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: mov z4.s, p2/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: sel z0.s, p3, z5.s, z3.s -; CHECK-NEXT: sel z1.s, p4, z5.s, z4.s +; CHECK-NEXT: sel z0.s, p3, z6.s, z3.s +; CHECK-NEXT: sel z1.s, p4, z6.s, z2.s ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.s, p0/m, #0 // =0x0 @@ -171,16 +166,15 @@ define @test_signed_v2f32_v2i64( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.s ; CHECK-NEXT: mov z3.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.s +; CHECK-NEXT: mov z1.d, #0x8000000000000000 ; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z3.s ; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -204,25 +198,23 @@ define @test_signed_v4f32_v4i64( %f) { ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff -; CHECK-NEXT: mov z3.s, w8 -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff -; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s -; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s -; CHECK-NEXT: mov z2.d, #0x8000000000000000 +; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: mov z5.s, w8 ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.s -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z3.s -; CHECK-NEXT: fcmgt p4.s, 
p0/z, z0.s, z3.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d +; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s +; CHECK-NEXT: movprfx z6, z0 +; CHECK-NEXT: fcvtzs z6.d, p0/m, z0.s +; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s +; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z5.s +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z5.s +; CHECK-NEXT: sel z4.d, p1, z4.d, z3.d ; CHECK-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d +; CHECK-NEXT: mov z3.d, p2/m, z6.d +; CHECK-NEXT: sel z0.d, p3, z2.d, z4.d +; CHECK-NEXT: sel z1.d, p4, z2.d, z3.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -248,17 +240,16 @@ define @test_signed_v2f64_v2i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, #-4476578029606273024 // =0xc1e0000000000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z2.d, #0xffffffff80000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000 ; CHECK-NEXT: movk x8, #16863, lsl #48 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.d ; CHECK-NEXT: mov z3.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.d +; CHECK-NEXT: mov z1.d, #0xffffffff80000000 ; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z3.d ; CHECK-NEXT: mov z3.d, #0x7fffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -278,28 +269,26 @@ define @test_signed_v4f64_v4i32( %f) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov x8, #-4476578029606273024 // =0xc1e0000000000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z6.d, #0x7fffffff +; CHECK-NEXT: mov 
z3.d, #0xffffffff80000000 ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000 ; CHECK-NEXT: movk x8, #16863, lsl #48 ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.d -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.d -; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: movprfx z6, z0 +; CHECK-NEXT: fcvtzs z6.d, p0/m, z0.d +; CHECK-NEXT: mov z5.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z2.d ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z2.d -; CHECK-NEXT: mov z2.d, #0xffffffff80000000 -; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z3.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z3.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d +; CHECK-NEXT: mov z2.d, #0x7fffffff +; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z5.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z5.d +; CHECK-NEXT: sel z4.d, p1, z4.d, z3.d +; CHECK-NEXT: mov z3.d, p2/m, z6.d ; CHECK-NEXT: fcmuo p1.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d +; CHECK-NEXT: sel z0.d, p3, z2.d, z4.d +; CHECK-NEXT: sel z1.d, p4, z2.d, z3.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -327,49 +316,45 @@ define @test_signed_v8f64_v8i32( %f) { ; CHECK-NEXT: mov z5.d, #0xffffffff80000000 ; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000 -; CHECK-NEXT: mov z26.d, #0x7fffffff ; CHECK-NEXT: movk x8, #16863, lsl #48 -; CHECK-NEXT: movprfx z7, z0 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z0.d -; CHECK-NEXT: movprfx z24, z3 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z3.d +; CHECK-NEXT: movprfx z7, z1 +; CHECK-NEXT: fcvtzs z7.d, p0/m, z1.d +; CHECK-NEXT: movprfx z24, z0 +; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.d ; CHECK-NEXT: mov z6.d, x8 -; CHECK-NEXT: movprfx z25, z2 -; CHECK-NEXT: 
fcvtzs z25.d, p0/m, z2.d +; CHECK-NEXT: movprfx z25, z3 +; CHECK-NEXT: fcvtzs z25.d, p0/m, z3.d +; CHECK-NEXT: movprfx z26, z2 +; CHECK-NEXT: fcvtzs z26.d, p0/m, z2.d ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z4.d ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z4.d ; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, z4.d ; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, z4.d -; CHECK-NEXT: movprfx z4, z1 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.d +; CHECK-NEXT: mov z4.d, #0x7fffffff ; CHECK-NEXT: fcmgt p5.d, p0/z, z1.d, z6.d ; CHECK-NEXT: fcmgt p6.d, p0/z, z0.d, z6.d ; CHECK-NEXT: fcmgt p7.d, p0/z, z3.d, z6.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z4.d, p1/m, z5.d +; CHECK-NEXT: sel z7.d, p1, z7.d, z5.d ; CHECK-NEXT: fcmgt p1.d, p0/z, z2.d, z6.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: sel z6.d, p2, z5.d, z7.d +; CHECK-NEXT: sel z6.d, p2, z24.d, z5.d +; CHECK-NEXT: sel z24.d, p3, z25.d, z5.d +; CHECK-NEXT: mov z5.d, p4/m, z26.d ; CHECK-NEXT: fcmuo p2.d, p0/z, z1.d, z1.d -; CHECK-NEXT: sel z7.d, p3, z5.d, z24.d ; CHECK-NEXT: fcmuo p3.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z5.d, p4, z5.d, z25.d ; CHECK-NEXT: fcmuo p4.d, p0/z, z3.d, z3.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z2.d, z2.d -; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d -; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d +; CHECK-NEXT: sel z0.d, p5, z4.d, z7.d +; CHECK-NEXT: sel z1.d, p6, z4.d, z6.d ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z2.d, p7, z26.d, z7.d +; CHECK-NEXT: sel z2.d, p7, z4.d, z24.d +; CHECK-NEXT: sel z3.d, p1, z4.d, z5.d ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z26.d, z5.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0 +; 
CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s ; CHECK-NEXT: uzp1 z1.s, z3.s, z2.s ; CHECK-NEXT: addvl sp, sp, #1 @@ -389,27 +374,26 @@ define @test_signed_v4f64_v4i16( %f) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov x8, #-4548635623644200960 // =0xc0e0000000000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z5.d, #32767 // =0x7fff +; CHECK-NEXT: mov z6.d, #32767 // =0x7fff ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #281200098803712 // =0xffc000000000 ; CHECK-NEXT: movk x8, #16607, lsl #48 ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: fcvtzs z3.d, p0/m, z1.d -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.d +; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z2.d ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z2.d -; CHECK-NEXT: mov z2.d, x8 -; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z2.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z2.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z3.d, p1/m, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: mov z2.d, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z4.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z4.d +; CHECK-NEXT: sel z3.d, p1, z3.d, z2.d +; CHECK-NEXT: mov z2.d, p2/m, z5.d ; CHECK-NEXT: fcmuo p1.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: mov z4.d, p2/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: sel z0.d, p3, z5.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z5.d, z4.d +; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d +; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -434,51 +418,48 @@ define @test_signed_v8f64_v8i16( %f) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov x8, #-4548635623644200960 // =0xc0e0000000000000 ; CHECK-NEXT: ptrue p0.d -; 
CHECK-NEXT: mov z25.d, #32767 // =0x7fff +; CHECK-NEXT: mov z5.d, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: mov x8, #281200098803712 // =0xffc000000000 ; CHECK-NEXT: movk x8, #16607, lsl #48 -; CHECK-NEXT: movprfx z6, z2 -; CHECK-NEXT: fcvtzs z6.d, p0/m, z2.d -; CHECK-NEXT: movprfx z7, z1 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z1.d -; CHECK-NEXT: mov z5.d, x8 -; CHECK-NEXT: movprfx z24, z0 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.d +; CHECK-NEXT: movprfx z7, z3 +; CHECK-NEXT: fcvtzs z7.d, p0/m, z3.d +; CHECK-NEXT: movprfx z24, z2 +; CHECK-NEXT: fcvtzs z24.d, p0/m, z2.d +; CHECK-NEXT: mov z6.d, x8 +; CHECK-NEXT: movprfx z25, z1 +; CHECK-NEXT: fcvtzs z25.d, p0/m, z1.d +; CHECK-NEXT: movprfx z26, z0 +; CHECK-NEXT: fcvtzs z26.d, p0/m, z0.d ; CHECK-NEXT: fcmge p1.d, p0/z, z3.d, z4.d ; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, z4.d ; CHECK-NEXT: fcmge p3.d, p0/z, z1.d, z4.d ; CHECK-NEXT: fcmge p4.d, p0/z, z0.d, z4.d -; CHECK-NEXT: movprfx z4, z3 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z3.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z3.d, z5.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z2.d, z5.d -; CHECK-NEXT: fcmgt p7.d, p0/z, z1.d, z5.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z4.d, p1/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z5.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z6.d, p2/m, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: mov z4.d, #32767 // =0x7fff +; CHECK-NEXT: fcmgt p5.d, p0/z, z3.d, z6.d +; CHECK-NEXT: fcmgt p6.d, p0/z, z2.d, z6.d +; CHECK-NEXT: fcmgt p7.d, p0/z, z1.d, z6.d +; CHECK-NEXT: sel z7.d, p1, z7.d, z5.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z6.d +; CHECK-NEXT: sel z6.d, p2, z24.d, z5.d +; CHECK-NEXT: sel z24.d, p3, z25.d, z5.d +; CHECK-NEXT: mov z5.d, p4/m, z26.d ; CHECK-NEXT: fcmuo p2.d, p0/z, z3.d, z3.d -; CHECK-NEXT: mov z7.d, p3/m, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: fcmuo p3.d, p0/z, z2.d, z2.d -; CHECK-NEXT: mov 
z24.d, p4/m, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: fcmuo p4.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z2.d, p5, z25.d, z4.d -; CHECK-NEXT: sel z0.d, p6, z25.d, z6.d +; CHECK-NEXT: sel z2.d, p5, z4.d, z7.d +; CHECK-NEXT: sel z0.d, p6, z4.d, z6.d ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z1.d, p7, z25.d, z7.d +; CHECK-NEXT: sel z1.d, p7, z4.d, z24.d +; CHECK-NEXT: sel z3.d, p1, z4.d, z5.d ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z25.d, z24.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z2.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p4/m, #0 // =0x0 -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0 +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s ; CHECK-NEXT: uzp1 z1.s, z3.s, z1.s ; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h @@ -494,16 +475,15 @@ define @test_signed_v2f64_v2i64( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.d ; CHECK-NEXT: mov z3.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.d +; CHECK-NEXT: mov z1.d, #0x8000000000000000 ; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z3.d ; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -523,27 +503,25 @@ define @test_signed_v4f64_v4i64( %f) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov x8, #-4332462841530417152 // 
=0xc3e0000000000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z3.d, #0x8000000000000000 ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: mov z3.d, x8 ; CHECK-NEXT: movprfx z4, z0 ; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d -; CHECK-NEXT: movprfx z5, z1 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z1.d +; CHECK-NEXT: mov z5.d, x8 +; CHECK-NEXT: movprfx z6, z1 +; CHECK-NEXT: fcvtzs z6.d, p0/m, z1.d ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z2.d ; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z2.d -; CHECK-NEXT: mov z2.d, #0x8000000000000000 -; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z3.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z1.d, z3.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z5.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z1.d, z5.d +; CHECK-NEXT: sel z4.d, p1, z4.d, z3.d +; CHECK-NEXT: mov z3.d, p2/m, z6.d ; CHECK-NEXT: fcmuo p1.d, p0/z, z0.d, z0.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z1.d, z1.d -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d +; CHECK-NEXT: sel z0.d, p3, z2.d, z4.d +; CHECK-NEXT: sel z1.d, p4, z2.d, z3.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -570,16 +548,15 @@ define @test_signed_v2f16_v2i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z2.d, #0xffffffff80000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.h ; CHECK-NEXT: mov z3.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.h +; CHECK-NEXT: mov z1.d, #0xffffffff80000000 ; CHECK-NEXT: fcmgt p2.h, 
p0/z, z0.h, z3.h ; CHECK-NEXT: mov z3.d, #0x7fffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -594,16 +571,15 @@ define @test_signed_v4f16_v4i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z2.s, #0x80000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.s, p0/m, z0.h ; CHECK-NEXT: mov z3.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.s, p0/m, z0.h +; CHECK-NEXT: mov z1.s, #0x80000000 ; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z3.h ; CHECK-NEXT: mov z3.s, #0x7fffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h ; CHECK-NEXT: mov z1.s, p1/m, z2.s ; CHECK-NEXT: sel z0.s, p2, z3.s, z1.s @@ -627,25 +603,23 @@ define @test_signed_v8f16_v8i32( %f) { ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z3.h, w8 -; CHECK-NEXT: mov z6.s, #0x7fffffff -; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h -; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z2.s, #0x80000000 +; CHECK-NEXT: mov z3.s, #0x80000000 +; CHECK-NEXT: mov z5.h, w8 ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: fcvtzs z4.s, p0/m, z1.h -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: fcvtzs z5.s, p0/m, z0.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z3.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z3.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.s, p1, z2.s, z4.s +; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h +; CHECK-NEXT: movprfx z6, z0 +; CHECK-NEXT: fcvtzs z6.s, p0/m, z0.h +; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z5.h +; CHECK-NEXT: mov z2.s, #0x7fffffff +; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z5.h +; CHECK-NEXT: sel z4.s, p1, z4.s, z3.s ; 
CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z1.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z2.s, p2, z2.s, z5.s -; CHECK-NEXT: sel z0.s, p3, z6.s, z3.s -; CHECK-NEXT: sel z1.s, p4, z6.s, z2.s +; CHECK-NEXT: mov z3.s, p2/m, z6.s +; CHECK-NEXT: sel z0.s, p3, z2.s, z4.s +; CHECK-NEXT: sel z1.s, p4, z2.s, z3.s ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.s, p0/m, #0 // =0x0 @@ -661,18 +635,18 @@ define @test_signed_v4f16_v4i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #63488 // =0xf800 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z3.s, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #30719 // =0x77ff -; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.s, p0/m, z0.h ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.s, p0/m, z0.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z2.s, #32767 // =0x7fff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z1.s, #32767 // =0x7fff ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: mov z1.s, p1/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: sel z0.s, p2, z2.s, z1.s +; CHECK-NEXT: sel z2.s, p1, z2.s, z3.s +; CHECK-NEXT: sel z0.s, p2, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv4f16.nxv4i16( %f) @@ -684,18 +658,18 @@ define @test_signed_v8f16_v8i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #63488 // =0xf800 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z3.h, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #30719 // =0x77ff -; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.h, p0/m, z0.h ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.h, p0/m, z0.h -; 
CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z2.h, #32767 // =0x7fff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z1.h, #32767 // =0x7fff ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: mov z1.h, p1/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: sel z0.h, p2, z2.h, z1.h +; CHECK-NEXT: sel z2.h, p1, z2.h, z3.h +; CHECK-NEXT: sel z0.h, p2, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv8f16.nxv8i16( %f) @@ -707,16 +681,15 @@ define @test_signed_v2f16_v2i64( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.h ; CHECK-NEXT: mov z3.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.h +; CHECK-NEXT: mov z1.d, #0x8000000000000000 ; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z3.h ; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -740,25 +713,23 @@ define @test_signed_v4f16_v4i64( %f) { ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z3.h, w8 -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff -; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h -; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z2.d, #0x8000000000000000 +; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: mov z5.h, w8 ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.h -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z3.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z3.h -; CHECK-NEXT: not p1.b, 
p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d +; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h +; CHECK-NEXT: movprfx z6, z0 +; CHECK-NEXT: fcvtzs z6.d, p0/m, z0.h +; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z5.h +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z5.h +; CHECK-NEXT: sel z4.d, p1, z4.d, z3.d ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z1.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d +; CHECK-NEXT: mov z3.d, p2/m, z6.d +; CHECK-NEXT: sel z0.d, p3, z2.d, z4.d +; CHECK-NEXT: sel z1.d, p4, z2.d, z3.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 diff --git a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll index ed352ffec339f..46fe910bd29a9 100644 --- a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll +++ b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll @@ -16,15 +16,15 @@ define @test_signed_v2f32_v2i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1333788671 // =0x4f7fffff -; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: fcvtzu z1.d, p0/m, z0.s ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzu z2.d, p0/m, z0.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z3.s ; CHECK-NEXT: mov z0.d, #0xffffffff -; CHECK-NEXT: mov z2.d, p1/m, #0 // =0x0 -; CHECK-NEXT: sel z0.d, p0, z0.d, z2.d +; CHECK-NEXT: sel z1.d, p1, z1.d, z2.d +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv2f32.nxv2i32( %f) ret %x @@ -35,15 +35,14 @@ define 
@test_signed_v4f32_v4i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #1333788671 // =0x4f7fffff -; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzu z2.s, p0/m, z0.s ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzu z1.s, p0/m, z0.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z2.s -; CHECK-NEXT: mov z1.s, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z1.s, p0/m, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: sel z0.s, p1, z2.s, z3.s +; CHECK-NEXT: mov z0.s, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv4f32.nxv4i32( %f) ret %x @@ -54,23 +53,20 @@ define @test_signed_v8f32_v8i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #1333788671 // =0x4f7fffff -; CHECK-NEXT: mov z4.s, w8 +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: movprfx z3, z0 +; CHECK-NEXT: fcvtzu z3.s, p0/m, z0.s ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: movprfx z5, z1 +; CHECK-NEXT: fcvtzu z5.s, p0/m, z1.s ; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, #0.0 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzu z2.s, p0/m, z0.s -; CHECK-NEXT: movprfx z3, z1 -; CHECK-NEXT: fcvtzu z3.s, p0/m, z1.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z4.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p0.s, p0/z, z1.s, z4.s -; CHECK-NEXT: mov z2.s, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z3.s, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z2.s, p3/m, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z3.s, p0/m, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z0.d, z2.d -; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z2.s +; CHECK-NEXT: fcmgt p0.s, p0/z, z1.s, z2.s +; CHECK-NEXT: sel z0.s, p1, z3.s, z4.s +; 
CHECK-NEXT: sel z1.s, p2, z5.s, z4.s +; CHECK-NEXT: mov z0.s, p3/m, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.s, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv8f32.nxv8i32( %f) ret %x @@ -81,16 +77,16 @@ define @test_signed_v4f32_v4i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #65280 // =0xff00 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: movk w8, #18303, lsl #16 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: fcvtzu z1.s, p0/m, z0.s ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzu z2.s, p0/m, z0.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z3.s ; CHECK-NEXT: mov z0.s, #65535 // =0xffff -; CHECK-NEXT: mov z2.s, p1/m, #0 // =0x0 -; CHECK-NEXT: sel z0.s, p0, z0.s, z2.s +; CHECK-NEXT: sel z1.s, p1, z1.s, z2.s +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv4f32.nxv4i16( %f) ret %x @@ -101,24 +97,23 @@ define @test_signed_v8f32_v8i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #65280 // =0xff00 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: movk w8, #18303, lsl #16 -; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, #0.0 -; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: fcvtzu z3.s, p0/m, z1.s -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzu z4.s, p0/m, z0.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z2.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z2.s -; CHECK-NEXT: mov z0.s, #65535 // =0xffff -; CHECK-NEXT: mov z3.s, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z4.s, p2/m, #0 // =0x0 -; CHECK-NEXT: sel z1.s, p3, z0.s, z3.s -; CHECK-NEXT: sel z0.s, p0, z0.s, z4.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: fcmge 
p1.s, p0/z, z1.s, #0.0 +; CHECK-NEXT: mov z4.s, w8 +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: fcvtzu z5.s, p0/m, z0.s +; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z4.s +; CHECK-NEXT: mov z1.s, #65535 // =0xffff +; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z4.s +; CHECK-NEXT: sel z0.s, p1, z3.s, z2.s +; CHECK-NEXT: mov z2.s, p2/m, z5.s +; CHECK-NEXT: mov z0.s, p3/m, z1.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z2.s +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv8f32.nxv8i16( %f) ret %x @@ -129,15 +124,14 @@ define @test_signed_v2f32_v2i64( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1602224127 // =0x5f7fffff -; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzu z2.d, p0/m, z0.s ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzu z1.d, p0/m, z0.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z2.s -; CHECK-NEXT: mov z1.d, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p0/m, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: sel z0.d, p1, z2.d, z3.d +; CHECK-NEXT: mov z0.d, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv2f32.nxv2i64( %f) ret %x @@ -146,23 +140,22 @@ define @test_signed_v2f32_v2i64( %f) { define @test_signed_v4f32_v4i64( %f) { ; CHECK-LABEL: test_signed_v4f32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpklo z2.d, z0.s -; CHECK-NEXT: uunpkhi z3.d, z0.s +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov w8, #1602224127 // =0x5f7fffff ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z4.s, w8 -; CHECK-NEXT: fcmge p1.s, p0/z, z2.s, #0.0 -; CHECK-NEXT: fcmge p2.s, p0/z, z3.s, #0.0 -; CHECK-NEXT: movprfx z0, z2 -; CHECK-NEXT: fcvtzu z0.d, p0/m, z2.s -; CHECK-NEXT: movprfx z1, 
z3 -; CHECK-NEXT: fcvtzu z1.d, p0/m, z3.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z2.s, z4.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p0.s, p0/z, z3.s, z4.s -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: fcvtzu z3.d, p0/m, z1.s +; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, #0.0 +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: fcvtzu z5.d, p0/m, z0.s +; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z2.s +; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z2.s +; CHECK-NEXT: sel z0.d, p1, z3.d, z4.d +; CHECK-NEXT: sel z1.d, p2, z5.d, z4.d ; CHECK-NEXT: mov z0.d, p3/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.d, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ret @@ -185,16 +178,16 @@ define @test_signed_v2f64_v2i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281474974613504 // =0xffffffe00000 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: movk x8, #16879, lsl #48 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: fcvtzu z1.d, p0/m, z0.d ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzu z2.d, p0/m, z0.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z3.d ; CHECK-NEXT: mov z0.d, #0xffffffff -; CHECK-NEXT: mov z2.d, p1/m, #0 // =0x0 -; CHECK-NEXT: sel z0.d, p0, z0.d, z2.d +; CHECK-NEXT: sel z1.d, p1, z1.d, z2.d +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv2f64.nxv2i32( %f) ret %x @@ -205,24 +198,23 @@ define @test_signed_v4f64_v4i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281474974613504 // =0xffffffe00000 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: movk x8, 
#16879, lsl #48 -; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0 -; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: fcvtzu z3.d, p0/m, z1.d -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzu z4.d, p0/m, z0.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z2.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z2.d -; CHECK-NEXT: mov z0.d, #0xffffffff -; CHECK-NEXT: mov z3.d, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z4.d, p2/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p3, z0.d, z3.d -; CHECK-NEXT: sel z0.d, p0, z0.d, z4.d -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0 +; CHECK-NEXT: mov z4.d, x8 +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: fcvtzu z5.d, p0/m, z0.d +; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z4.d +; CHECK-NEXT: mov z1.d, #0xffffffff +; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z4.d +; CHECK-NEXT: sel z0.d, p1, z3.d, z2.d +; CHECK-NEXT: mov z2.d, p2/m, z5.d +; CHECK-NEXT: mov z0.d, p3/m, z1.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z2.d +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv4f64.nxv4i32( %f) ret %x @@ -240,40 +232,37 @@ define @test_signed_v8f64_v8i32( %f) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281474974613504 // =0xffffffe00000 +; CHECK-NEXT: movi v4.2d, #0000000000000000 ; CHECK-NEXT: movk x8, #16879, lsl #48 +; CHECK-NEXT: movprfx z5, z1 +; CHECK-NEXT: fcvtzu z5.d, p0/m, z1.d ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0 +; CHECK-NEXT: mov z6.d, x8 +; CHECK-NEXT: movprfx z7, z0 +; CHECK-NEXT: fcvtzu z7.d, p0/m, z0.d ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: movprfx z24, z3 +; CHECK-NEXT: fcvtzu z24.d, p0/m, z3.d ; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, #0.0 +; CHECK-NEXT: movprfx z25, z2 +; CHECK-NEXT: fcvtzu z25.d, p0/m, z2.d ; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, #0.0 -; CHECK-NEXT: 
movprfx z5, z1 -; CHECK-NEXT: fcvtzu z5.d, p0/m, z1.d -; CHECK-NEXT: mov z4.d, x8 -; CHECK-NEXT: movprfx z6, z0 -; CHECK-NEXT: fcvtzu z6.d, p0/m, z0.d -; CHECK-NEXT: movprfx z7, z3 -; CHECK-NEXT: fcvtzu z7.d, p0/m, z3.d -; CHECK-NEXT: movprfx z24, z2 -; CHECK-NEXT: fcvtzu z24.d, p0/m, z2.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p5.d, p0/z, z1.d, z4.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z0.d, z4.d -; CHECK-NEXT: not p2.b, p0/z, p2.b +; CHECK-NEXT: fcmgt p5.d, p0/z, z1.d, z6.d +; CHECK-NEXT: fcmgt p6.d, p0/z, z0.d, z6.d ; CHECK-NEXT: mov z0.d, #0xffffffff -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z5.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z4.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: fcmgt p0.d, p0/z, z2.d, z4.d -; CHECK-NEXT: mov z6.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z7.d, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z24.d, p4/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p5, z0.d, z5.d +; CHECK-NEXT: sel z1.d, p1, z5.d, z4.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z6.d +; CHECK-NEXT: fcmgt p0.d, p0/z, z2.d, z6.d +; CHECK-NEXT: sel z2.d, p2, z7.d, z4.d +; CHECK-NEXT: sel z3.d, p3, z24.d, z4.d +; CHECK-NEXT: mov z4.d, p4/m, z25.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p5/m, z0.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z2.d, p6, z0.d, z6.d +; CHECK-NEXT: mov z2.d, p6/m, z0.d ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z0.d, z7.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z4.d, p0, z0.d, z24.d +; CHECK-NEXT: mov z3.d, p1/m, z0.d +; CHECK-NEXT: mov z4.d, p0/m, z0.d ; CHECK-NEXT: uzp1 z0.s, z2.s, z1.s ; CHECK-NEXT: uzp1 z1.s, z4.s, z3.s ; CHECK-NEXT: addvl sp, sp, #1 @@ -288,24 +277,23 @@ define @test_signed_v4f64_v4i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281337537757184 // =0xffe000000000 +; CHECK-NEXT: movi 
v2.2d, #0000000000000000 ; CHECK-NEXT: movk x8, #16623, lsl #48 -; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0 -; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: fcvtzu z3.d, p0/m, z1.d -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzu z4.d, p0/m, z0.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z2.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z2.d -; CHECK-NEXT: mov z0.d, #65535 // =0xffff -; CHECK-NEXT: mov z3.d, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z4.d, p2/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p3, z0.d, z3.d -; CHECK-NEXT: sel z0.d, p0, z0.d, z4.d -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0 +; CHECK-NEXT: mov z4.d, x8 +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: fcvtzu z5.d, p0/m, z0.d +; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z4.d +; CHECK-NEXT: mov z1.d, #65535 // =0xffff +; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z4.d +; CHECK-NEXT: sel z0.d, p1, z3.d, z2.d +; CHECK-NEXT: mov z2.d, p2/m, z5.d +; CHECK-NEXT: mov z0.d, p3/m, z1.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z2.d +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv4f64.nxv4i16( %f) ret %x @@ -323,42 +311,39 @@ define @test_signed_v8f64_v8i16( %f) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281337537757184 // =0xffe000000000 +; CHECK-NEXT: movi v4.2d, #0000000000000000 ; CHECK-NEXT: movk x8, #16623, lsl #48 +; CHECK-NEXT: movprfx z5, z3 +; CHECK-NEXT: fcvtzu z5.d, p0/m, z3.d ; CHECK-NEXT: fcmge p1.d, p0/z, z3.d, #0.0 +; CHECK-NEXT: mov z6.d, x8 +; CHECK-NEXT: movprfx z7, z2 +; CHECK-NEXT: fcvtzu z7.d, p0/m, z2.d ; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, #0.0 +; CHECK-NEXT: movprfx z24, z1 +; CHECK-NEXT: fcvtzu z24.d, p0/m, z1.d ; CHECK-NEXT: fcmge p3.d, p0/z, z1.d, #0.0 +; CHECK-NEXT: movprfx z25, z0 +; CHECK-NEXT: fcvtzu z25.d, p0/m, z0.d ; 
CHECK-NEXT: fcmge p4.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: movprfx z5, z3 -; CHECK-NEXT: fcvtzu z5.d, p0/m, z3.d -; CHECK-NEXT: mov z4.d, x8 -; CHECK-NEXT: movprfx z6, z2 -; CHECK-NEXT: fcvtzu z6.d, p0/m, z2.d -; CHECK-NEXT: movprfx z7, z1 -; CHECK-NEXT: fcvtzu z7.d, p0/m, z1.d -; CHECK-NEXT: movprfx z24, z0 -; CHECK-NEXT: fcvtzu z24.d, p0/m, z0.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p5.d, p0/z, z3.d, z4.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z2.d, z4.d -; CHECK-NEXT: not p2.b, p0/z, p2.b +; CHECK-NEXT: fcmgt p5.d, p0/z, z3.d, z6.d +; CHECK-NEXT: fcmgt p6.d, p0/z, z2.d, z6.d ; CHECK-NEXT: mov z2.d, #65535 // =0xffff -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z5.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z4.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z4.d -; CHECK-NEXT: mov z6.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z7.d, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z24.d, p4/m, #0 // =0x0 -; CHECK-NEXT: sel z0.d, p5, z2.d, z5.d +; CHECK-NEXT: sel z3.d, p1, z5.d, z4.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z6.d +; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z6.d +; CHECK-NEXT: sel z0.d, p2, z7.d, z4.d +; CHECK-NEXT: sel z1.d, p3, z24.d, z4.d +; CHECK-NEXT: mov z4.d, p4/m, z25.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z3.d, p5/m, z2.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z1.d, p6, z2.d, z6.d +; CHECK-NEXT: mov z0.d, p6/m, z2.d ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z2.d, z7.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z2.d, p0, z2.d, z24.d -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s -; CHECK-NEXT: uzp1 z1.s, z2.s, z3.s +; CHECK-NEXT: mov z1.d, p1/m, z2.d +; CHECK-NEXT: sel z2.d, p0, z2.d, z4.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z3.s +; CHECK-NEXT: uzp1 z1.s, z2.s, z1.s ; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h ; CHECK-NEXT: addvl sp, sp, #1 ; 
CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -372,15 +357,14 @@ define @test_signed_v2f64_v2i64( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #4895412794951729151 // =0x43efffffffffffff -; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzu z2.d, p0/m, z0.d ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzu z1.d, p0/m, z0.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z2.d -; CHECK-NEXT: mov z1.d, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p0/m, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z2.d, z3.d +; CHECK-NEXT: mov z0.d, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv2f64.nxv2i64( %f) ret %x @@ -391,23 +375,20 @@ define @test_signed_v4f64_v4i64( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #4895412794951729151 // =0x43efffffffffffff -; CHECK-NEXT: mov z4.d, x8 +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: movprfx z3, z0 +; CHECK-NEXT: fcvtzu z3.d, p0/m, z0.d ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: movprfx z5, z1 +; CHECK-NEXT: fcvtzu z5.d, p0/m, z1.d ; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, #0.0 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzu z2.d, p0/m, z0.d -; CHECK-NEXT: movprfx z3, z1 -; CHECK-NEXT: fcvtzu z3.d, p0/m, z1.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z4.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p0.d, p0/z, z1.d, z4.d -; CHECK-NEXT: mov z2.d, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z3.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z2.d, p3/m, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z3.d, p0/m, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z0.d, z2.d -; CHECK-NEXT: mov z1.d, z3.d +; 
CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z2.d +; CHECK-NEXT: fcmgt p0.d, p0/z, z1.d, z2.d +; CHECK-NEXT: sel z0.d, p1, z3.d, z4.d +; CHECK-NEXT: sel z1.d, p2, z5.d, z4.d +; CHECK-NEXT: mov z0.d, p3/m, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.d, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv4f64.nxv4i64( %f) ret %x @@ -429,15 +410,15 @@ define @test_signed_v2f16_v2i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: mov z3.h, w8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: fcvtzu z1.d, p0/m, z0.h ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzu z2.d, p0/m, z0.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z3.h ; CHECK-NEXT: mov z0.d, #0xffffffff -; CHECK-NEXT: mov z2.d, p1/m, #0 // =0x0 -; CHECK-NEXT: sel z0.d, p0, z0.d, z2.d +; CHECK-NEXT: sel z1.d, p1, z1.d, z2.d +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv2f16.nxv2i32( %f) ret %x @@ -448,15 +429,14 @@ define @test_signed_v4f16_v4i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzu z2.s, p0/m, z0.h ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzu z1.s, p0/m, z0.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z1.s, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z1.s, p0/m, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: sel z0.s, p1, z2.s, z3.s +; CHECK-NEXT: mov z0.s, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ret %x = call 
@llvm.fptoui.sat.nxv4f16.nxv4i32( %f) ret %x @@ -465,23 +445,22 @@ define @test_signed_v4f16_v4i32( %f) { define @test_signed_v8f16_v8i32( %f) { ; CHECK-LABEL: test_signed_v8f16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpklo z2.s, z0.h -; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z0.h +; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z4.h, w8 -; CHECK-NEXT: fcmge p1.h, p0/z, z2.h, #0.0 -; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, #0.0 -; CHECK-NEXT: movprfx z0, z2 -; CHECK-NEXT: fcvtzu z0.s, p0/m, z2.h -; CHECK-NEXT: movprfx z1, z3 -; CHECK-NEXT: fcvtzu z1.s, p0/m, z3.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z2.h, z4.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z4.h -; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z1.s, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: fcvtzu z3.s, p0/m, z1.h +; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, #0.0 +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: fcvtzu z5.s, p0/m, z0.h +; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z2.h +; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z2.h +; CHECK-NEXT: sel z0.s, p1, z3.s, z4.s +; CHECK-NEXT: sel z1.s, p2, z5.s, z4.s ; CHECK-NEXT: mov z0.s, p3/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.s, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ret @@ -494,15 +473,15 @@ define @test_signed_v4f16_v4i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: mov z3.h, w8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: fcvtzu z1.s, p0/m, z0.h ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzu z2.s, p0/m, z0.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p0.h, p0/z, 
z0.h, z1.h +; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z3.h ; CHECK-NEXT: mov z0.s, #65535 // =0xffff -; CHECK-NEXT: mov z2.s, p1/m, #0 // =0x0 -; CHECK-NEXT: sel z0.s, p0, z0.s, z2.s +; CHECK-NEXT: sel z1.s, p1, z1.s, z2.s +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv4f16.nxv4i16( %f) ret %x @@ -513,15 +492,14 @@ define @test_signed_v8f16_v8i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzu z2.h, p0/m, z0.h ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzu z1.h, p0/m, z0.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z1.h, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z1.h, p0/m, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: sel z0.h, p1, z2.h, z3.h +; CHECK-NEXT: mov z0.h, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv8f16.nxv8i16( %f) ret %x @@ -532,15 +510,14 @@ define @test_signed_v2f16_v2i64( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzu z2.d, p0/m, z0.h ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzu z1.d, p0/m, z0.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z1.d, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p0/m, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: sel z0.d, p1, z2.d, z3.d +; CHECK-NEXT: mov z0.d, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ret %x = call 
@llvm.fptoui.sat.nxv2f16.nxv2i64( %f) ret %x @@ -549,23 +526,22 @@ define @test_signed_v2f16_v2i64( %f) { define @test_signed_v4f16_v4i64( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpklo z2.d, z0.s -; CHECK-NEXT: uunpkhi z3.d, z0.s +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z4.h, w8 -; CHECK-NEXT: fcmge p1.h, p0/z, z2.h, #0.0 -; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, #0.0 -; CHECK-NEXT: movprfx z0, z2 -; CHECK-NEXT: fcvtzu z0.d, p0/m, z2.h -; CHECK-NEXT: movprfx z1, z3 -; CHECK-NEXT: fcvtzu z1.d, p0/m, z3.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z2.h, z4.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z4.h -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: fcvtzu z3.d, p0/m, z1.h +; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, #0.0 +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: fcvtzu z5.d, p0/m, z0.h +; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z2.h +; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z2.h +; CHECK-NEXT: sel z0.d, p1, z3.d, z4.d +; CHECK-NEXT: sel z1.d, p2, z5.d, z4.d ; CHECK-NEXT: mov z0.d, p3/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.d, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-llrint.ll b/llvm/test/CodeGen/AArch64/sve-llrint.ll index 16e0e0c4661b6..8eaab2e0af104 100644 --- a/llvm/test/CodeGen/AArch64/sve-llrint.ll +++ b/llvm/test/CodeGen/AArch64/sve-llrint.ll @@ -6,17 +6,16 @@ define @llrint_v1i64_v1f16( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: frintx 
z0.h, p0/m, z0.h ; CHECK-NEXT: mov z3.h, w8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.h ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.h +; CHECK-NEXT: mov z1.d, #0x8000000000000000 ; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z3.h ; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -32,17 +31,16 @@ define @llrint_v1i64_v2f16( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: mov z3.h, w8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.h ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.h +; CHECK-NEXT: mov z1.d, #0x8000000000000000 ; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z3.h ; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -67,27 +65,25 @@ define @llrint_v4i64_v4f16( %x) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z3.h, w8 -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: mov z5.h, w8 ; CHECK-NEXT: frintx z1.h, p0/m, z1.h ; CHECK-NEXT: frintx z0.h, p0/m, z0.h -; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h -; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.h -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z3.h -; CHECK-NEXT: fcmgt p4.h, 
p0/z, z0.h, z3.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d +; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h +; CHECK-NEXT: movprfx z6, z0 +; CHECK-NEXT: fcvtzs z6.d, p0/m, z0.h +; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z5.h +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z5.h +; CHECK-NEXT: sel z4.d, p1, z4.d, z3.d ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z1.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d +; CHECK-NEXT: mov z3.d, p2/m, z6.d +; CHECK-NEXT: sel z0.d, p3, z2.d, z4.d +; CHECK-NEXT: sel z1.d, p4, z2.d, z3.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -117,7 +113,6 @@ define @llrint_v8i64_v8f16( %x) { ; CHECK-NEXT: mov z4.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: mov z6.h, w8 -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff ; CHECK-NEXT: uunpklo z2.d, z1.s ; CHECK-NEXT: uunpkhi z1.d, z1.s ; CHECK-NEXT: uunpklo z3.d, z0.s @@ -128,45 +123,42 @@ define @llrint_v8i64_v8f16( %x) { ; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: frintx z5.h, p0/m, z0.h ; CHECK-NEXT: mov z0.d, #0x8000000000000000 +; CHECK-NEXT: movprfx z7, z2 +; CHECK-NEXT: fcvtzs z7.d, p0/m, z2.h ; CHECK-NEXT: fcmge p1.h, p0/z, z2.h, z4.h +; CHECK-NEXT: movprfx z24, z1 +; CHECK-NEXT: fcvtzs z24.d, p0/m, z1.h ; CHECK-NEXT: fcmge p2.h, p0/z, z1.h, z4.h +; CHECK-NEXT: movprfx z25, z3 +; CHECK-NEXT: fcvtzs z25.d, p0/m, z3.h ; CHECK-NEXT: fcmge p3.h, p0/z, z3.h, z4.h +; CHECK-NEXT: movprfx z26, z5 +; CHECK-NEXT: fcvtzs z26.d, p0/m, z5.h ; CHECK-NEXT: fcmge p4.h, p0/z, z5.h, z4.h -; CHECK-NEXT: movprfx z4, z2 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z2.h -; CHECK-NEXT: movprfx z7, z1 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z1.h -; CHECK-NEXT: 
movprfx z24, z3 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z3.h -; CHECK-NEXT: movprfx z25, z5 -; CHECK-NEXT: fcvtzs z25.d, p0/m, z5.h +; CHECK-NEXT: mov z4.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmgt p7.h, p0/z, z3.h, z6.h ; CHECK-NEXT: fcmgt p5.h, p0/z, z2.h, z6.h ; CHECK-NEXT: fcmgt p6.h, p0/z, z1.h, z6.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z4.d, p1/m, z0.d +; CHECK-NEXT: sel z7.d, p1, z7.d, z0.d ; CHECK-NEXT: fcmgt p1.h, p0/z, z5.h, z6.h -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: sel z6.d, p2, z0.d, z7.d +; CHECK-NEXT: sel z6.d, p2, z24.d, z0.d +; CHECK-NEXT: sel z24.d, p3, z25.d, z0.d ; CHECK-NEXT: fcmuo p2.h, p0/z, z2.h, z2.h -; CHECK-NEXT: sel z7.d, p3, z0.d, z24.d -; CHECK-NEXT: fcmuo p3.h, p0/z, z1.h, z1.h -; CHECK-NEXT: sel z24.d, p4, z0.d, z25.d +; CHECK-NEXT: sel z25.d, p4, z26.d, z0.d ; CHECK-NEXT: fcmuo p4.h, p0/z, z3.h, z3.h +; CHECK-NEXT: fcmuo p3.h, p0/z, z1.h, z1.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z5.h, z5.h -; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d -; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z2.d, p7, z26.d, z7.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z26.d, z24.d +; CHECK-NEXT: sel z0.d, p5, z4.d, z7.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel z2.d, p7, z4.d, z24.d +; CHECK-NEXT: sel z1.d, p6, z4.d, z6.d +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel z3.d, p1, z4.d, z25.d +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, 
[sp], #16 // 8-byte Folded Reload @@ -198,113 +190,105 @@ define @llrint_v16i64_v16f16( %x) { ; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: uunpklo z4.s, z1.h -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEXT: mov z5.h, w8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z24.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z25.d, #0x8000000000000000 +; CHECK-NEXT: mov z5.d, #0x8000000000000000 ; CHECK-NEXT: mov z27.h, w8 ; CHECK-NEXT: mov z7.d, #0x7fffffffffffffff ; CHECK-NEXT: uunpklo z3.d, z2.s ; CHECK-NEXT: uunpkhi z2.d, z2.s ; CHECK-NEXT: uunpklo z6.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: uunpklo z24.d, z4.s +; CHECK-NEXT: uunpklo z25.d, z4.s ; CHECK-NEXT: uunpkhi z4.d, z4.s ; CHECK-NEXT: uunpklo z26.d, z1.s ; CHECK-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEXT: frintx z2.h, p0/m, z2.h ; CHECK-NEXT: frintx z3.h, p0/m, z3.h +; CHECK-NEXT: frintx z2.h, p0/m, z2.h ; CHECK-NEXT: frintx z6.h, p0/m, z6.h ; CHECK-NEXT: movprfx z28, z0 ; CHECK-NEXT: frintx z28.h, p0/m, z0.h -; CHECK-NEXT: movprfx z29, z4 -; CHECK-NEXT: frintx z29.h, p0/m, z4.h -; CHECK-NEXT: frintx z24.h, p0/m, z24.h -; CHECK-NEXT: movprfx z30, z1 -; CHECK-NEXT: frintx z30.h, p0/m, z1.h +; CHECK-NEXT: movprfx z30, z4 +; CHECK-NEXT: frintx z30.h, p0/m, z4.h +; CHECK-NEXT: frintx z25.h, p0/m, z25.h ; CHECK-NEXT: frintx z26.h, p0/m, z26.h -; CHECK-NEXT: fcmge p5.h, p0/z, z2.h, z5.h -; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, z5.h -; CHECK-NEXT: movprfx z1, z2 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z2.h +; CHECK-NEXT: movprfx z31, z1 +; CHECK-NEXT: frintx z31.h, p0/m, z1.h ; CHECK-NEXT: movprfx z0, z3 ; CHECK-NEXT: fcvtzs z0.d, p0/m, z3.h -; CHECK-NEXT: fcmge p6.h, p0/z, z6.h, z5.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z3.h, z27.h +; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, z24.h +; CHECK-NEXT: movprfx z29, z2 +; CHECK-NEXT: fcvtzs z29.d, p0/m, z2.h +; CHECK-NEXT: fcmge p3.h, p0/z, z2.h, z24.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z3.h, z27.h ; 
CHECK-NEXT: fcmuo p1.h, p0/z, z3.h, z3.h -; CHECK-NEXT: fcmge p7.h, p0/z, z28.h, z5.h -; CHECK-NEXT: movprfx z3, z6 -; CHECK-NEXT: fcvtzs z3.d, p0/m, z6.h -; CHECK-NEXT: fcmge p8.h, p0/z, z24.h, z5.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z2.h, z27.h -; CHECK-NEXT: fcmge p9.h, p0/z, z26.h, z5.h -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: movprfx z4, z24 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z24.h -; CHECK-NEXT: fcmge p10.h, p0/z, z30.h, z5.h -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: movprfx z31, z26 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z26.h +; CHECK-NEXT: movprfx z1, z6 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z6.h +; CHECK-NEXT: fcmge p6.h, p0/z, z6.h, z24.h +; CHECK-NEXT: movprfx z4, z28 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z28.h ; CHECK-NEXT: movprfx z8, z30 ; CHECK-NEXT: fcvtzs z8.d, p0/m, z30.h -; CHECK-NEXT: mov z1.d, p5/m, z25.d -; CHECK-NEXT: fcmge p5.h, p0/z, z29.h, z5.h -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: mov z0.d, p2/m, z25.d -; CHECK-NEXT: fcmuo p2.h, p0/z, z2.h, z2.h -; CHECK-NEXT: movprfx z2, z28 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z28.h -; CHECK-NEXT: movprfx z5, z29 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z29.h -; CHECK-NEXT: not p7.b, p0/z, p7.b -; CHECK-NEXT: mov z3.d, p6/m, z25.d -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: fcmgt p8.h, p0/z, z6.h, z27.h -; CHECK-NEXT: mov z1.d, p4/m, z7.d -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: mov z0.d, p3/m, z7.d -; CHECK-NEXT: fcmgt p3.h, p0/z, z29.h, z27.h -; CHECK-NEXT: sel z9.d, p7, z25.d, z2.d -; CHECK-NEXT: not p7.b, p0/z, p9.b -; CHECK-NEXT: mov z4.d, p6/m, z25.d -; CHECK-NEXT: not p6.b, p0/z, p10.b -; CHECK-NEXT: fcmgt p10.h, p0/z, z28.h, z27.h -; CHECK-NEXT: mov z5.d, p5/m, z25.d -; CHECK-NEXT: fcmgt p5.h, p0/z, z24.h, z27.h -; CHECK-NEXT: fcmuo p9.h, p0/z, z6.h, z6.h -; CHECK-NEXT: sel z6.d, p7, z25.d, z31.d -; CHECK-NEXT: sel z25.d, p6, z25.d, z8.d -; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: fcmgt p6.h, p0/z, z26.h, z27.h -; CHECK-NEXT: fcmgt 
p7.h, p0/z, z30.h, z27.h -; CHECK-NEXT: fcmuo p4.h, p0/z, z28.h, z28.h -; CHECK-NEXT: sel z2.d, p8, z7.d, z3.d -; CHECK-NEXT: sel z3.d, p10, z7.d, z9.d +; CHECK-NEXT: fcmge p7.h, p0/z, z30.h, z24.h +; CHECK-NEXT: movprfx z9, z26 +; CHECK-NEXT: fcvtzs z9.d, p0/m, z26.h +; CHECK-NEXT: sel z0.d, p2, z0.d, z5.d +; CHECK-NEXT: fcmge p2.h, p0/z, z28.h, z24.h +; CHECK-NEXT: fcmge p8.h, p0/z, z26.h, z24.h +; CHECK-NEXT: sel z3.d, p3, z29.d, z5.d +; CHECK-NEXT: movprfx z29, z25 +; CHECK-NEXT: fcvtzs z29.d, p0/m, z25.h +; CHECK-NEXT: fcmge p3.h, p0/z, z25.h, z24.h +; CHECK-NEXT: fcmge p9.h, p0/z, z31.h, z24.h +; CHECK-NEXT: movprfx z24, z31 +; CHECK-NEXT: fcvtzs z24.d, p0/m, z31.h +; CHECK-NEXT: fcmgt p5.h, p0/z, z2.h, z27.h +; CHECK-NEXT: fcmuo p10.h, p0/z, z2.h, z2.h +; CHECK-NEXT: sel z2.d, p6, z1.d, z5.d +; CHECK-NEXT: fcmgt p6.h, p0/z, z6.h, z27.h +; CHECK-NEXT: sel z4.d, p2, z4.d, z5.d +; CHECK-NEXT: fcmuo p2.h, p0/z, z6.h, z6.h +; CHECK-NEXT: mov z0.d, p4/m, z7.d +; CHECK-NEXT: sel z6.d, p3, z29.d, z5.d +; CHECK-NEXT: fcmgt p3.h, p0/z, z28.h, z27.h +; CHECK-NEXT: sel z29.d, p7, z8.d, z5.d +; CHECK-NEXT: fcmgt p7.h, p0/z, z25.h, z27.h +; CHECK-NEXT: sel z8.d, p8, z9.d, z5.d ; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: fcmuo p8.h, p0/z, z29.h, z29.h -; CHECK-NEXT: mov z4.d, p5/m, z7.d -; CHECK-NEXT: fcmuo p5.h, p0/z, z24.h, z24.h -; CHECK-NEXT: fcmuo p10.h, p0/z, z26.h, z26.h -; CHECK-NEXT: mov z5.d, p3/m, z7.d -; CHECK-NEXT: mov z6.d, p6/m, z7.d -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: fcmuo p0.h, p0/z, z30.h, z30.h -; CHECK-NEXT: sel z7.d, p7, z7.d, z25.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z2.d, p9/m, #0 // =0x0 -; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z3.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p4.h, p0/z, z30.h, z27.h +; CHECK-NEXT: fcmgt p8.h, p0/z, z26.h, z27.h +; CHECK-NEXT: sel z24.d, p9, z24.d, z5.d +; 
CHECK-NEXT: fcmgt p9.h, p0/z, z31.h, z27.h +; CHECK-NEXT: sel z1.d, p5, z7.d, z3.d +; CHECK-NEXT: fcmuo p5.h, p0/z, z28.h, z28.h +; CHECK-NEXT: mov z2.d, p6/m, z7.d +; CHECK-NEXT: sel z3.d, p3, z7.d, z4.d +; CHECK-NEXT: fcmuo p6.h, p0/z, z30.h, z30.h +; CHECK-NEXT: sel z4.d, p7, z7.d, z6.d +; CHECK-NEXT: fcmuo p7.h, p0/z, z26.h, z26.h +; CHECK-NEXT: fcmuo p3.h, p0/z, z25.h, z25.h +; CHECK-NEXT: sel z5.d, p4, z7.d, z29.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z4.d, p5/m, #0 // =0x0 -; CHECK-NEXT: mov z5.d, p8/m, #0 // =0x0 +; CHECK-NEXT: sel z6.d, p8, z7.d, z8.d +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z6.d, p10/m, #0 // =0x0 -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p0.h, p0/z, z31.h, z31.h +; CHECK-NEXT: sel z7.d, p9, z7.d, z24.d +; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p10/m, #0 // =0x0 ; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z7.d, p0/m, #0 // =0x0 +; CHECK-NEXT: mov z3.d, p5/m, #0 // =0x0 +; CHECK-NEXT: mov z5.d, p6/m, #0 // =0x0 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z6.d, p7/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z2.d, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z4.d, p3/m, #0 // =0x0 ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z7.d, p0/m, #0 // =0x0 ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -340,8 +324,8 @@ define @llrint_v32i64_v32f16( %x) { ; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded 
Spill -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 168 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG @@ -352,230 +336,219 @@ define @llrint_v32i64_v32f16( %x) { ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; CHECK-NEXT: uunpklo z4.s, z0.h -; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: mov w9, #64511 // =0xfbff ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uunpklo z6.s, z1.h -; CHECK-NEXT: mov z26.h, w9 -; CHECK-NEXT: uunpkhi z25.s, z1.h +; CHECK-NEXT: mov z28.h, w9 ; CHECK-NEXT: mov w9, #31743 // =0x7bff +; CHECK-NEXT: uunpklo z24.s, z1.h +; CHECK-NEXT: uunpkhi z25.s, z1.h ; CHECK-NEXT: mov z27.d, #0x8000000000000000 -; CHECK-NEXT: uunpklo z31.s, z2.h -; CHECK-NEXT: uunpkhi z12.s, z2.h -; CHECK-NEXT: mov z17.d, z3.d -; CHECK-NEXT: uunpklo z0.d, z4.s +; CHECK-NEXT: uunpklo z8.s, z2.h +; CHECK-NEXT: uunpkhi z15.s, z3.h +; CHECK-NEXT: uunpklo z13.s, z3.h +; CHECK-NEXT: uunpklo z5.d, z4.s +; CHECK-NEXT: uunpkhi z7.d, z0.s +; CHECK-NEXT: uunpklo z6.d, z0.s ; CHECK-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEXT: uunpklo z7.d, z5.s -; CHECK-NEXT: uunpkhi z24.d, z5.s -; CHECK-NEXT: uunpklo z28.d, z6.s -; CHECK-NEXT: uunpkhi z29.d, z6.s -; CHECK-NEXT: uunpklo z8.d, z25.s -; CHECK-NEXT: uunpkhi z9.d, z25.s 
-; CHECK-NEXT: uunpklo z16.s, z17.h -; CHECK-NEXT: uunpklo z11.d, z31.s -; CHECK-NEXT: uunpkhi z14.d, z31.s -; CHECK-NEXT: uunpkhi z17.s, z17.h -; CHECK-NEXT: movprfx z30, z4 -; CHECK-NEXT: frintx z30.h, p0/m, z4.h -; CHECK-NEXT: movprfx z4, z7 -; CHECK-NEXT: frintx z4.h, p0/m, z7.h -; CHECK-NEXT: frintx z0.h, p0/m, z0.h -; CHECK-NEXT: movprfx z6, z24 -; CHECK-NEXT: frintx z6.h, p0/m, z24.h -; CHECK-NEXT: movprfx z7, z28 -; CHECK-NEXT: frintx z7.h, p0/m, z28.h -; CHECK-NEXT: movprfx z25, z29 -; CHECK-NEXT: frintx z25.h, p0/m, z29.h -; CHECK-NEXT: movprfx z3, z9 -; CHECK-NEXT: frintx z3.h, p0/m, z9.h -; CHECK-NEXT: mov z5.h, w9 -; CHECK-NEXT: movprfx z31, z11 -; CHECK-NEXT: frintx z31.h, p0/m, z11.h -; CHECK-NEXT: movprfx z9, z14 -; CHECK-NEXT: frintx z9.h, p0/m, z14.h -; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z26.h -; CHECK-NEXT: fcmge p4.h, p0/z, z4.h, z26.h -; CHECK-NEXT: movprfx z24, z0 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.h -; CHECK-NEXT: fcmge p2.h, p0/z, z30.h, z26.h -; CHECK-NEXT: movprfx z29, z4 -; CHECK-NEXT: fcvtzs z29.d, p0/m, z4.h -; CHECK-NEXT: fcmge p6.h, p0/z, z6.h, z26.h -; CHECK-NEXT: movprfx z28, z30 -; CHECK-NEXT: fcvtzs z28.d, p0/m, z30.h -; CHECK-NEXT: movprfx z10, z6 -; CHECK-NEXT: fcvtzs z10.d, p0/m, z6.h -; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: fcmge p3.h, p0/z, z7.h, z26.h -; CHECK-NEXT: movprfx z13, z7 -; CHECK-NEXT: fcvtzs z13.d, p0/m, z7.h -; CHECK-NEXT: movprfx z15, z25 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z25.h -; CHECK-NEXT: not p5.b, p0/z, p1.b -; CHECK-NEXT: movprfx z18, z3 -; CHECK-NEXT: fcvtzs z18.d, p0/m, z3.h -; CHECK-NEXT: movprfx z20, z31 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z31.h -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: movprfx z21, z9 -; CHECK-NEXT: fcvtzs z21.d, p0/m, z9.h -; CHECK-NEXT: fcmgt p1.h, p0/z, z30.h, z5.h -; CHECK-NEXT: sel z0.d, p5, z27.d, z24.d -; CHECK-NEXT: not p7.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p2.h, p0/z, z4.h, z5.h -; CHECK-NEXT: mov z29.d, p4/m, z27.d -; 
CHECK-NEXT: fcmge p4.h, p0/z, z25.h, z26.h -; CHECK-NEXT: not p5.b, p0/z, p6.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: fcmge p6.h, p0/z, z9.h, z26.h -; CHECK-NEXT: fcmgt p9.h, p0/z, z6.h, z5.h -; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: sel z0.d, p7, z27.d, z28.d -; CHECK-NEXT: movprfx z28, z8 -; CHECK-NEXT: frintx z28.h, p0/m, z8.h -; CHECK-NEXT: sel z8.d, p5, z27.d, z10.d -; CHECK-NEXT: uunpklo z10.d, z12.s -; CHECK-NEXT: uunpkhi z12.d, z12.s -; CHECK-NEXT: not p5.b, p0/z, p4.b -; CHECK-NEXT: sel z11.d, p3, z27.d, z13.d -; CHECK-NEXT: uunpklo z13.d, z16.s -; CHECK-NEXT: fcmge p3.h, p0/z, z3.h, z26.h -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: sel z24.d, p5, z27.d, z15.d -; CHECK-NEXT: uunpkhi z15.d, z16.s -; CHECK-NEXT: movprfx z14, z28 -; CHECK-NEXT: fcvtzs z14.d, p0/m, z28.h -; CHECK-NEXT: frintx z10.h, p0/m, z10.h -; CHECK-NEXT: uunpklo z16.d, z17.s -; CHECK-NEXT: frintx z12.h, p0/m, z12.h -; CHECK-NEXT: uunpkhi z17.d, z17.s -; CHECK-NEXT: movprfx z19, z13 -; CHECK-NEXT: frintx z19.h, p0/m, z13.h -; CHECK-NEXT: fcmge p4.h, p0/z, z28.h, z26.h -; CHECK-NEXT: fcmge p5.h, p0/z, z31.h, z26.h -; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: uunpkhi z26.d, z24.s +; CHECK-NEXT: uunpkhi z11.d, z25.s +; CHECK-NEXT: uunpklo z29.d, z25.s +; CHECK-NEXT: uunpkhi z16.d, z8.s +; CHECK-NEXT: uunpklo z21.d, z15.s +; CHECK-NEXT: uunpklo z20.d, z13.s +; CHECK-NEXT: movprfx z0, z5 +; CHECK-NEXT: frintx z0.h, p0/m, z5.h +; CHECK-NEXT: movprfx z5, z7 +; CHECK-NEXT: frintx z5.h, p0/m, z7.h +; CHECK-NEXT: uunpklo z7.d, z24.s +; CHECK-NEXT: movprfx z1, z4 +; CHECK-NEXT: frintx z1.h, p0/m, z4.h +; CHECK-NEXT: movprfx z4, z6 +; CHECK-NEXT: frintx z4.h, p0/m, z6.h +; CHECK-NEXT: uunpkhi z13.d, z13.s +; CHECK-NEXT: movprfx z25, z26 +; CHECK-NEXT: frintx z25.h, p0/m, z26.h +; CHECK-NEXT: movprfx z26, z29 +; CHECK-NEXT: frintx z26.h, p0/m, z29.h +; CHECK-NEXT: uunpkhi z15.d, z15.s +; 
CHECK-NEXT: mov z6.h, w9 +; CHECK-NEXT: movprfx z30, z0 +; CHECK-NEXT: fcvtzs z30.d, p0/m, z0.h +; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z28.h +; CHECK-NEXT: movprfx z10, z5 +; CHECK-NEXT: fcvtzs z10.d, p0/m, z5.h +; CHECK-NEXT: fcmge p5.h, p0/z, z5.h, z28.h +; CHECK-NEXT: str z0, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: movprfx z31, z1 +; CHECK-NEXT: fcvtzs z31.d, p0/m, z1.h +; CHECK-NEXT: fcmge p3.h, p0/z, z1.h, z28.h +; CHECK-NEXT: str z4, [sp] // 16-byte Folded Spill +; CHECK-NEXT: movprfx z9, z4 +; CHECK-NEXT: fcvtzs z9.d, p0/m, z4.h +; CHECK-NEXT: fcmge p4.h, p0/z, z4.h, z28.h +; CHECK-NEXT: movprfx z24, z7 +; CHECK-NEXT: frintx z24.h, p0/m, z7.h +; CHECK-NEXT: movprfx z4, z11 +; CHECK-NEXT: frintx z4.h, p0/m, z11.h +; CHECK-NEXT: sel z0.d, p2, z30.d, z27.d +; CHECK-NEXT: uunpklo z11.d, z8.s +; CHECK-NEXT: movprfx z14, z26 +; CHECK-NEXT: fcvtzs z14.d, p0/m, z26.h +; CHECK-NEXT: sel z30.d, p5, z10.d, z27.d +; CHECK-NEXT: uunpkhi z10.s, z2.h +; CHECK-NEXT: frintx z13.h, p0/m, z13.h +; CHECK-NEXT: sel z31.d, p3, z31.d, z27.d +; CHECK-NEXT: fcmge p3.h, p0/z, z25.h, z28.h ; CHECK-NEXT: frintx z15.h, p0/m, z15.h -; CHECK-NEXT: fcmge p7.h, p0/z, z10.h, z26.h -; CHECK-NEXT: frintx z16.h, p0/m, z16.h -; CHECK-NEXT: fcmge p8.h, p0/z, z12.h, z26.h -; CHECK-NEXT: frintx z17.h, p0/m, z17.h -; CHECK-NEXT: movprfx z23, z19 -; CHECK-NEXT: fcvtzs z23.d, p0/m, z19.h -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: sel z13.d, p3, z27.d, z18.d -; CHECK-NEXT: fcmge p3.h, p0/z, z19.h, z26.h -; CHECK-NEXT: movprfx z0, z15 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z15.h -; CHECK-NEXT: sel z22.d, p4, z27.d, z14.d -; CHECK-NEXT: sel z18.d, p6, z27.d, z21.d -; CHECK-NEXT: movprfx z21, z12 -; CHECK-NEXT: fcvtzs z21.d, p0/m, z12.h -; CHECK-NEXT: movprfx z1, z16 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z16.h -; CHECK-NEXT: sel z14.d, p5, z27.d, z20.d -; CHECK-NEXT: fcmge p4.h, p0/z, z15.h, z26.h +; CHECK-NEXT: sel z29.d, p4, z9.d, z27.d +; CHECK-NEXT: 
movprfx z12, z24 +; CHECK-NEXT: fcvtzs z12.d, p0/m, z24.h +; CHECK-NEXT: fcmge p2.h, p0/z, z24.h, z28.h +; CHECK-NEXT: movprfx z9, z25 +; CHECK-NEXT: fcvtzs z9.d, p0/m, z25.h +; CHECK-NEXT: movprfx z18, z4 +; CHECK-NEXT: fcvtzs z18.d, p0/m, z4.h +; CHECK-NEXT: fcmge p5.h, p0/z, z4.h, z28.h +; CHECK-NEXT: uunpkhi z19.d, z10.s +; CHECK-NEXT: uunpklo z17.d, z10.s +; CHECK-NEXT: movprfx z10, z16 +; CHECK-NEXT: frintx z10.h, p0/m, z16.h +; CHECK-NEXT: movprfx z3, z11 +; CHECK-NEXT: frintx z3.h, p0/m, z11.h +; CHECK-NEXT: fcmge p4.h, p0/z, z26.h, z28.h +; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sel z11.d, p2, z12.d, z27.d +; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z6.h +; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sel z8.d, p3, z9.d, z27.d +; CHECK-NEXT: sel z9.d, p5, z18.d, z27.d +; CHECK-NEXT: movprfx z18, z20 +; CHECK-NEXT: frintx z18.h, p0/m, z20.h +; CHECK-NEXT: movprfx z16, z19 +; CHECK-NEXT: frintx z16.h, p0/m, z19.h +; CHECK-NEXT: movprfx z19, z21 +; CHECK-NEXT: frintx z19.h, p0/m, z21.h +; CHECK-NEXT: movprfx z12, z17 +; CHECK-NEXT: frintx z12.h, p0/m, z17.h +; CHECK-NEXT: movprfx z17, z3 +; CHECK-NEXT: fcvtzs z17.d, p0/m, z3.h +; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, z28.h +; CHECK-NEXT: sel z14.d, p4, z14.d, z27.d ; CHECK-NEXT: movprfx z20, z10 ; CHECK-NEXT: fcvtzs z20.d, p0/m, z10.h -; CHECK-NEXT: movprfx z2, z17 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z17.h -; CHECK-NEXT: not p5.b, p0/z, p7.b -; CHECK-NEXT: fcmge p6.h, p0/z, z16.h, z26.h -; CHECK-NEXT: not p7.b, p0/z, p8.b -; CHECK-NEXT: fcmge p8.h, p0/z, z17.h, z26.h -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z20.d, p5/m, z27.d -; CHECK-NEXT: mov z21.d, p7/m, z27.d -; CHECK-NEXT: not p5.b, p0/z, p6.b -; CHECK-NEXT: mov z23.d, p3/m, z27.d -; CHECK-NEXT: fcmgt p3.h, p0/z, z17.h, z5.h -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: mov z0.d, p4/m, z27.d -; 
CHECK-NEXT: fcmgt p4.h, p0/z, z16.h, z5.h -; CHECK-NEXT: mov z1.d, p5/m, z27.d -; CHECK-NEXT: fcmuo p5.h, p0/z, z16.h, z16.h -; CHECK-NEXT: mov z29.d, p2/m, z26.d -; CHECK-NEXT: mov z2.d, p6/m, z27.d -; CHECK-NEXT: ldr z27, [sp] // 16-byte Folded Reload -; CHECK-NEXT: fcmgt p6.h, p0/z, z7.h, z5.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z12.h, z5.h -; CHECK-NEXT: fcmuo p8.h, p0/z, z17.h, z17.h -; CHECK-NEXT: fcmgt p7.h, p0/z, z28.h, z5.h -; CHECK-NEXT: mov z1.d, p4/m, z26.d -; CHECK-NEXT: fcmuo p4.h, p0/z, z15.h, z15.h -; CHECK-NEXT: mov z8.d, p9/m, z26.d -; CHECK-NEXT: mov z27.d, p1/m, z26.d -; CHECK-NEXT: fcmgt p1.h, p0/z, z15.h, z5.h -; CHECK-NEXT: mov z2.d, p3/m, z26.d -; CHECK-NEXT: fcmgt p3.h, p0/z, z19.h, z5.h -; CHECK-NEXT: mov z11.d, p6/m, z26.d +; CHECK-NEXT: fcmge p3.h, p0/z, z10.h, z28.h +; CHECK-NEXT: fcmge p7.h, p0/z, z13.h, z28.h +; CHECK-NEXT: movprfx z23, z18 +; CHECK-NEXT: fcvtzs z23.d, p0/m, z18.h +; CHECK-NEXT: fcmge p6.h, p0/z, z18.h, z28.h +; CHECK-NEXT: fcmge p9.h, p0/z, z15.h, z28.h +; CHECK-NEXT: movprfx z22, z16 +; CHECK-NEXT: fcvtzs z22.d, p0/m, z16.h +; CHECK-NEXT: fcmge p5.h, p0/z, z16.h, z28.h +; CHECK-NEXT: fcmge p8.h, p0/z, z19.h, z28.h +; CHECK-NEXT: movprfx z1, z19 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z19.h +; CHECK-NEXT: movprfx z21, z12 +; CHECK-NEXT: fcvtzs z21.d, p0/m, z12.h +; CHECK-NEXT: fcmge p4.h, p0/z, z12.h, z28.h +; CHECK-NEXT: movprfx z0, z13 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z13.h +; CHECK-NEXT: movprfx z2, z15 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z15.h +; CHECK-NEXT: ldr z7, [sp] // 16-byte Folded Reload +; CHECK-NEXT: mov z28.d, #0x7fffffffffffffff +; CHECK-NEXT: sel z17.d, p2, z17.d, z27.d +; CHECK-NEXT: sel z20.d, p3, z20.d, z27.d +; CHECK-NEXT: sel z22.d, p5, z22.d, z27.d +; CHECK-NEXT: fcmgt p5.h, p0/z, z19.h, z6.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z5.h, z6.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z7.h, z6.h +; CHECK-NEXT: sel z1.d, p8, z1.d, z27.d +; CHECK-NEXT: sel z21.d, p4, z21.d, z27.d +; CHECK-NEXT: sel z23.d, p6, z23.d, z27.d 
+; CHECK-NEXT: sel z0.d, p7, z0.d, z27.d +; CHECK-NEXT: sel z2.d, p9, z2.d, z27.d +; CHECK-NEXT: sel z27.d, p1, z28.d, z31.d +; CHECK-NEXT: fcmgt p1.h, p0/z, z13.h, z6.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z15.h, z6.h ; CHECK-NEXT: fcmuo p6.h, p0/z, z19.h, z19.h -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p5.h, p0/z, z9.h, z5.h -; CHECK-NEXT: sel z15.d, p2, z26.d, z21.d -; CHECK-NEXT: fcmuo p2.h, p0/z, z12.h, z12.h -; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 -; CHECK-NEXT: sel z16.d, p7, z26.d, z22.d -; CHECK-NEXT: mov z0.d, p1/m, z26.d -; CHECK-NEXT: fcmgt p1.h, p0/z, z10.h, z5.h +; CHECK-NEXT: mov z1.d, p5/m, z28.d +; CHECK-NEXT: fcmgt p7.h, p0/z, z24.h, z6.h +; CHECK-NEXT: mov z29.d, p2/m, z28.d +; CHECK-NEXT: fcmgt p2.h, p0/z, z16.h, z6.h +; CHECK-NEXT: fcmgt p8.h, p0/z, z26.h, z6.h +; CHECK-NEXT: mov z30.d, p3/m, z28.d +; CHECK-NEXT: fcmgt p3.h, p0/z, z18.h, z6.h +; CHECK-NEXT: fcmuo p9.h, p0/z, z15.h, z15.h +; CHECK-NEXT: mov z0.d, p1/m, z28.d +; CHECK-NEXT: fcmgt p1.h, p0/z, z12.h, z6.h +; CHECK-NEXT: mov z2.d, p4/m, z28.d +; CHECK-NEXT: fcmuo p4.h, p0/z, z13.h, z13.h +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.h, p0/z, z18.h, z18.h +; CHECK-NEXT: sel z13.d, p2, z28.d, z22.d +; CHECK-NEXT: fcmuo p2.h, p0/z, z16.h, z16.h +; CHECK-NEXT: sel z31.d, p7, z28.d, z11.d +; CHECK-NEXT: sel z11.d, p8, z28.d, z14.d +; CHECK-NEXT: fcmgt p6.h, p0/z, z10.h, z6.h +; CHECK-NEXT: sel z14.d, p3, z28.d, z23.d ; CHECK-NEXT: str z1, [x8, #14, mul vl] -; CHECK-NEXT: sel z17.d, p3, z26.d, z23.d -; CHECK-NEXT: fcmuo p3.h, p0/z, z10.h, z10.h -; CHECK-NEXT: str z2, [x8, #15, mul vl] -; CHECK-NEXT: sel z2.d, p5, z26.d, z18.d -; CHECK-NEXT: fcmuo p5.h, p0/z, z9.h, z9.h +; CHECK-NEXT: sel z1.d, p1, z28.d, z21.d +; CHECK-NEXT: fcmgt p1.h, p0/z, z3.h, z6.h +; CHECK-NEXT: mov z2.d, p9/m, #0 // =0x0 ; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p4.h, p0/z, z3.h, z5.h -; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p1, 
z26.d, z20.d -; CHECK-NEXT: fcmgt p1.h, p0/z, z31.h, z5.h -; CHECK-NEXT: mov z17.d, p6/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.h, p0/z, z31.h, z31.h +; CHECK-NEXT: fcmuo p3.h, p0/z, z12.h, z12.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z4.h, z6.h +; CHECK-NEXT: mov z14.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.h, p0/z, z10.h, z10.h +; CHECK-NEXT: mov z13.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p2.h, p0/z, z3.h, z3.h +; CHECK-NEXT: str z2, [x8, #15, mul vl] +; CHECK-NEXT: sel z2.d, p6, z28.d, z20.d ; CHECK-NEXT: str z0, [x8, #13, mul vl] -; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.h, p0/z, z25.h, z25.h -; CHECK-NEXT: str z17, [x8, #12, mul vl] +; CHECK-NEXT: sel z0.d, p1, z28.d, z17.d ; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p3.h, p0/z, z25.h, z5.h -; CHECK-NEXT: str z15, [x8, #11, mul vl] -; CHECK-NEXT: sel z0.d, p1, z26.d, z14.d -; CHECK-NEXT: fcmuo p1.h, p0/z, z3.h, z3.h -; CHECK-NEXT: sel z3.d, p4, z26.d, z13.d -; CHECK-NEXT: fcmuo p4.h, p0/z, z28.h, z28.h +; CHECK-NEXT: str z14, [x8, #12, mul vl] +; CHECK-NEXT: fcmuo p1.h, p0/z, z4.h, z4.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z25.h, z6.h +; CHECK-NEXT: str z13, [x8, #11, mul vl] +; CHECK-NEXT: sel z3.d, p4, z28.d, z9.d +; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 ; CHECK-NEXT: str z1, [x8, #10, mul vl] -; CHECK-NEXT: sel z1.d, p3, z26.d, z24.d -; CHECK-NEXT: fcmuo p3.h, p0/z, z7.h, z7.h -; CHECK-NEXT: ldr z7, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: str z2, [x8, #9, mul vl] ; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.h, p0/z, z25.h, z25.h +; CHECK-NEXT: ldr z4, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: fcmuo p4.h, p0/z, z26.h, z26.h +; CHECK-NEXT: str z2, [x8, #9, mul vl] ; CHECK-NEXT: mov z3.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p1.h, p0/z, z6.h, z6.h -; CHECK-NEXT: mov z16.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.h, p0/z, z4.h, z4.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z7.h, z5.h -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; 
CHECK-NEXT: fcmuo p5.h, p0/z, z30.h, z30.h +; CHECK-NEXT: sel z1.d, p3, z28.d, z8.d ; CHECK-NEXT: str z0, [x8, #8, mul vl] -; CHECK-NEXT: fcmuo p0.h, p0/z, z7.h, z7.h -; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p3.h, p0/z, z24.h, z24.h +; CHECK-NEXT: fcmuo p1.h, p0/z, z5.h, z5.h +; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: fcmgt p2.h, p0/z, z4.h, z6.h ; CHECK-NEXT: str z3, [x8, #7, mul vl] +; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: mov z11.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.h, p0/z, z7.h, z7.h +; CHECK-NEXT: fcmuo p5.h, p0/z, z0.h, z0.h ; CHECK-NEXT: ldr z0, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: str z16, [x8, #6, mul vl] -; CHECK-NEXT: mov z8.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p0.h, p0/z, z4.h, z4.h +; CHECK-NEXT: mov z31.d, p3/m, #0 // =0x0 +; CHECK-NEXT: str z11, [x8, #6, mul vl] +; CHECK-NEXT: mov z30.d, p1/m, #0 // =0x0 ; CHECK-NEXT: str z1, [x8, #5, mul vl] +; CHECK-NEXT: mov z0.d, p2/m, z28.d ; CHECK-NEXT: mov z29.d, p4/m, #0 // =0x0 ; CHECK-NEXT: mov z27.d, p5/m, #0 // =0x0 -; CHECK-NEXT: str z11, [x8, #4, mul vl] -; CHECK-NEXT: str z8, [x8, #3, mul vl] -; CHECK-NEXT: mov z0.d, p2/m, z26.d +; CHECK-NEXT: str z31, [x8, #4, mul vl] +; CHECK-NEXT: str z30, [x8, #3, mul vl] +; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: str z29, [x8, #2, mul vl] ; CHECK-NEXT: str z27, [x8, #1, mul vl] -; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: str z0, [x8] -; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -611,17 +584,16 @@ define @llrint_v1i64_v1f32( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, 
#1593835519 // =0x5effffff ; CHECK-NEXT: frintx z0.s, p0/m, z0.s ; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.s ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.s +; CHECK-NEXT: mov z1.d, #0x8000000000000000 ; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z3.s ; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -637,17 +609,16 @@ define @llrint_v2i64_v2f32( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff ; CHECK-NEXT: frintx z0.s, p0/m, z0.s ; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.s ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.s +; CHECK-NEXT: mov z1.d, #0x8000000000000000 ; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z3.s ; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -672,27 +643,25 @@ define @llrint_v4i64_v4f32( %x) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff -; CHECK-NEXT: mov z3.s, w8 -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: mov z5.s, w8 ; CHECK-NEXT: frintx z1.s, p0/m, z1.s ; CHECK-NEXT: frintx z0.s, p0/m, z0.s -; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s -; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.s -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: fcvtzs z5.d, 
p0/m, z0.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z3.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z3.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d +; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s +; CHECK-NEXT: movprfx z6, z0 +; CHECK-NEXT: fcvtzs z6.d, p0/m, z0.s +; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s +; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z5.s +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z5.s +; CHECK-NEXT: sel z4.d, p1, z4.d, z3.d ; CHECK-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d +; CHECK-NEXT: mov z3.d, p2/m, z6.d +; CHECK-NEXT: sel z0.d, p3, z2.d, z4.d +; CHECK-NEXT: sel z1.d, p4, z2.d, z3.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -725,50 +694,46 @@ define @llrint_v8i64_v8f32( %x) { ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff ; CHECK-NEXT: mov z5.d, #0x8000000000000000 ; CHECK-NEXT: mov z6.s, w8 -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff ; CHECK-NEXT: frintx z2.s, p0/m, z2.s ; CHECK-NEXT: frintx z0.s, p0/m, z0.s ; CHECK-NEXT: frintx z3.s, p0/m, z3.s ; CHECK-NEXT: frintx z1.s, p0/m, z1.s +; CHECK-NEXT: movprfx z7, z2 +; CHECK-NEXT: fcvtzs z7.d, p0/m, z2.s ; CHECK-NEXT: fcmge p1.s, p0/z, z2.s, z4.s +; CHECK-NEXT: movprfx z24, z0 +; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.s ; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z4.s -; CHECK-NEXT: movprfx z7, z0 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z0.s +; CHECK-NEXT: movprfx z25, z3 +; CHECK-NEXT: fcvtzs z25.d, p0/m, z3.s ; CHECK-NEXT: fcmge p3.s, p0/z, z3.s, z4.s +; CHECK-NEXT: movprfx z26, z1 +; CHECK-NEXT: fcvtzs z26.d, p0/m, z1.s ; CHECK-NEXT: fcmge p4.s, p0/z, z1.s, z4.s -; CHECK-NEXT: movprfx z4, z2 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z2.s -; CHECK-NEXT: movprfx 
z24, z3 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z3.s -; CHECK-NEXT: movprfx z25, z1 -; CHECK-NEXT: fcvtzs z25.d, p0/m, z1.s +; CHECK-NEXT: mov z4.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmgt p7.s, p0/z, z3.s, z6.s ; CHECK-NEXT: fcmgt p5.s, p0/z, z2.s, z6.s ; CHECK-NEXT: fcmgt p6.s, p0/z, z0.s, z6.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z4.d, p1/m, z5.d +; CHECK-NEXT: sel z7.d, p1, z7.d, z5.d ; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z6.s -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: sel z6.d, p2, z5.d, z7.d +; CHECK-NEXT: sel z6.d, p2, z24.d, z5.d +; CHECK-NEXT: sel z24.d, p3, z25.d, z5.d ; CHECK-NEXT: fcmuo p2.s, p0/z, z2.s, z2.s -; CHECK-NEXT: sel z7.d, p3, z5.d, z24.d -; CHECK-NEXT: fcmuo p3.s, p0/z, z0.s, z0.s -; CHECK-NEXT: sel z5.d, p4, z5.d, z25.d +; CHECK-NEXT: mov z5.d, p4/m, z26.d ; CHECK-NEXT: fcmuo p4.s, p0/z, z3.s, z3.s +; CHECK-NEXT: fcmuo p3.s, p0/z, z0.s, z0.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z1.s, z1.s -; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d -; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z2.d, p7, z26.d, z7.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z26.d, z5.d +; CHECK-NEXT: sel z0.d, p5, z4.d, z7.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel z2.d, p7, z4.d, z24.d +; CHECK-NEXT: sel z1.d, p6, z4.d, z6.d +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel z3.d, p1, z4.d, z5.d +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 
8-byte Folded Reload @@ -782,7 +747,7 @@ define @llrint_v16i64_v16f32( %x) { ; CHECK-LABEL: llrint_v16i64_v16f32: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill @@ -790,119 +755,114 @@ define @llrint_v16i64_v16f32( %x) { ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG ; CHECK-NEXT: uunpklo z4.d, z0.s -; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: uunpkhi z7.d, z0.s ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uunpklo z7.d, z1.s -; CHECK-NEXT: uunpkhi z1.d, z1.s ; CHECK-NEXT: uunpklo z24.d, z2.s ; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: mov z5.s, w8 +; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff ; CHECK-NEXT: uunpklo z25.d, z3.s ; CHECK-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff -; CHECK-NEXT: movprfx z5, z4 -; CHECK-NEXT: frintx z5.s, 
p0/m, z4.s -; CHECK-NEXT: movprfx z6, z0 -; CHECK-NEXT: frintx z6.s, p0/m, z0.s -; CHECK-NEXT: mov z4.s, w8 -; CHECK-NEXT: frintx z7.s, p0/m, z7.s -; CHECK-NEXT: movprfx z28, z1 -; CHECK-NEXT: frintx z28.s, p0/m, z1.s -; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff +; CHECK-NEXT: mov z26.s, w8 ; CHECK-NEXT: mov z0.d, #0x8000000000000000 +; CHECK-NEXT: movprfx z6, z4 +; CHECK-NEXT: frintx z6.s, p0/m, z4.s +; CHECK-NEXT: movprfx z4, z7 +; CHECK-NEXT: frintx z4.s, p0/m, z7.s +; CHECK-NEXT: uunpklo z7.d, z1.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: movprfx z31, z2 +; CHECK-NEXT: frintx z31.s, p0/m, z2.s ; CHECK-NEXT: frintx z24.s, p0/m, z24.s -; CHECK-NEXT: movprfx z29, z2 -; CHECK-NEXT: frintx z29.s, p0/m, z2.s ; CHECK-NEXT: frintx z25.s, p0/m, z25.s -; CHECK-NEXT: movprfx z30, z3 -; CHECK-NEXT: frintx z30.s, p0/m, z3.s -; CHECK-NEXT: mov z27.s, w8 -; CHECK-NEXT: fcmge p1.s, p0/z, z5.s, z4.s -; CHECK-NEXT: fcmge p2.s, p0/z, z6.s, z4.s -; CHECK-NEXT: movprfx z1, z5 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z5.s -; CHECK-NEXT: movprfx z2, z6 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z6.s -; CHECK-NEXT: fcmge p5.s, p0/z, z7.s, z4.s -; CHECK-NEXT: fcmge p6.s, p0/z, z28.s, z4.s +; CHECK-NEXT: mov z29.d, #0x7fffffffffffffff +; CHECK-NEXT: movprfx z27, z6 +; CHECK-NEXT: fcvtzs z27.d, p0/m, z6.s +; CHECK-NEXT: fcmge p2.s, p0/z, z6.s, z5.s +; CHECK-NEXT: movprfx z28, z4 +; CHECK-NEXT: fcvtzs z28.d, p0/m, z4.s +; CHECK-NEXT: fcmge p3.s, p0/z, z4.s, z5.s +; CHECK-NEXT: frintx z7.s, p0/m, z7.s +; CHECK-NEXT: movprfx z30, z1 +; CHECK-NEXT: frintx z30.s, p0/m, z1.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z6.s, z26.s +; CHECK-NEXT: fcmuo p1.s, p0/z, z6.s, z6.s +; CHECK-NEXT: movprfx z6, z3 +; CHECK-NEXT: frintx z6.s, p0/m, z3.s +; CHECK-NEXT: fcmge p6.s, p0/z, z24.s, z5.s +; CHECK-NEXT: movprfx z8, z31 +; CHECK-NEXT: fcvtzs z8.d, p0/m, z31.s +; CHECK-NEXT: fcmge p7.s, p0/z, z31.s, z5.s +; CHECK-NEXT: sel z1.d, p2, z27.d, z0.d +; CHECK-NEXT: movprfx z9, z25 +; CHECK-NEXT: fcvtzs z9.d, p0/m, 
z25.s +; CHECK-NEXT: fcmge p8.s, p0/z, z25.s, z5.s +; CHECK-NEXT: sel z2.d, p3, z28.d, z0.d ; CHECK-NEXT: movprfx z3, z7 ; CHECK-NEXT: fcvtzs z3.d, p0/m, z7.s -; CHECK-NEXT: fcmge p8.s, p0/z, z29.s, z4.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z5.s, z27.s -; CHECK-NEXT: fcmgt p7.s, p0/z, z6.s, z27.s -; CHECK-NEXT: fcmge p9.s, p0/z, z25.s, z4.s -; CHECK-NEXT: movprfx z31, z25 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z25.s -; CHECK-NEXT: not p4.b, p0/z, p1.b -; CHECK-NEXT: fcmuo p1.s, p0/z, z5.s, z5.s -; CHECK-NEXT: movprfx z5, z28 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z28.s -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmge p10.s, p0/z, z30.s, z4.s -; CHECK-NEXT: movprfx z8, z30 -; CHECK-NEXT: fcvtzs z8.d, p0/m, z30.s -; CHECK-NEXT: mov z1.d, p4/m, z0.d -; CHECK-NEXT: fcmge p4.s, p0/z, z24.s, z4.s -; CHECK-NEXT: movprfx z4, z29 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z29.s -; CHECK-NEXT: mov z2.d, p2/m, z0.d -; CHECK-NEXT: fcmuo p2.s, p0/z, z6.s, z6.s -; CHECK-NEXT: movprfx z6, z24 -; CHECK-NEXT: fcvtzs z6.d, p0/m, z24.s -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z3.d, p5/m, z0.d -; CHECK-NEXT: not p5.b, p0/z, p8.b -; CHECK-NEXT: mov z5.d, p6/m, z0.d -; CHECK-NEXT: fcmgt p8.s, p0/z, z7.s, z27.s -; CHECK-NEXT: not p6.b, p0/z, p9.b -; CHECK-NEXT: mov z6.d, p4/m, z0.d -; CHECK-NEXT: fcmuo p9.s, p0/z, z7.s, z7.s -; CHECK-NEXT: not p4.b, p0/z, p10.b -; CHECK-NEXT: fcmgt p10.s, p0/z, z28.s, z27.s -; CHECK-NEXT: sel z7.d, p5, z0.d, z4.d -; CHECK-NEXT: fcmgt p5.s, p0/z, z24.s, z27.s -; CHECK-NEXT: mov z31.d, p6/m, z0.d -; CHECK-NEXT: fcmgt p6.s, p0/z, z30.s, z27.s -; CHECK-NEXT: mov z8.d, p4/m, z0.d -; CHECK-NEXT: sel z0.d, p3, z26.d, z1.d -; CHECK-NEXT: fcmgt p3.s, p0/z, z29.s, z27.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z25.s, z27.s -; CHECK-NEXT: sel z1.d, p7, z26.d, z2.d -; CHECK-NEXT: fcmuo p7.s, p0/z, z28.s, z28.s -; CHECK-NEXT: sel z2.d, p8, z26.d, z3.d -; CHECK-NEXT: sel z3.d, p10, z26.d, z5.d -; 
CHECK-NEXT: fcmuo p8.s, p0/z, z29.s, z29.s -; CHECK-NEXT: sel z4.d, p5, z26.d, z6.d -; CHECK-NEXT: fcmuo p5.s, p0/z, z24.s, z24.s -; CHECK-NEXT: fcmuo p10.s, p0/z, z25.s, z25.s -; CHECK-NEXT: sel z5.d, p3, z26.d, z7.d -; CHECK-NEXT: fcmuo p0.s, p0/z, z30.s, z30.s -; CHECK-NEXT: sel z7.d, p6, z26.d, z8.d -; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z6.d, p4, z26.d, z31.d +; CHECK-NEXT: fcmge p3.s, p0/z, z7.s, z5.s +; CHECK-NEXT: movprfx z27, z30 +; CHECK-NEXT: fcvtzs z27.d, p0/m, z30.s +; CHECK-NEXT: fcmge p5.s, p0/z, z30.s, z5.s +; CHECK-NEXT: movprfx z28, z24 +; CHECK-NEXT: fcvtzs z28.d, p0/m, z24.s +; CHECK-NEXT: fcmge p9.s, p0/z, z6.s, z5.s +; CHECK-NEXT: movprfx z5, z6 +; CHECK-NEXT: fcvtzs z5.d, p0/m, z6.s +; CHECK-NEXT: fcmgt p2.s, p0/z, z4.s, z26.s +; CHECK-NEXT: fcmuo p10.s, p0/z, z4.s, z4.s +; CHECK-NEXT: sel z3.d, p3, z3.d, z0.d +; CHECK-NEXT: fcmgt p3.s, p0/z, z7.s, z26.s +; CHECK-NEXT: sel z4.d, p5, z27.d, z0.d +; CHECK-NEXT: fcmuo p5.s, p0/z, z7.s, z7.s +; CHECK-NEXT: sel z7.d, p6, z28.d, z0.d +; CHECK-NEXT: fcmgt p6.s, p0/z, z30.s, z26.s +; CHECK-NEXT: sel z27.d, p7, z8.d, z0.d +; CHECK-NEXT: fcmgt p7.s, p0/z, z24.s, z26.s +; CHECK-NEXT: sel z28.d, p8, z9.d, z0.d +; CHECK-NEXT: sel z8.d, p9, z5.d, z0.d +; CHECK-NEXT: sel z0.d, p4, z29.d, z1.d +; CHECK-NEXT: fcmgt p4.s, p0/z, z31.s, z26.s +; CHECK-NEXT: fcmgt p8.s, p0/z, z25.s, z26.s +; CHECK-NEXT: fcmgt p9.s, p0/z, z6.s, z26.s +; CHECK-NEXT: sel z1.d, p2, z29.d, z2.d +; CHECK-NEXT: sel z2.d, p3, z29.d, z3.d +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sel z3.d, p6, z29.d, z4.d +; CHECK-NEXT: sel z4.d, p7, z29.d, z7.d +; CHECK-NEXT: fcmuo p6.s, p0/z, z31.s, z31.s +; CHECK-NEXT: fcmuo p7.s, p0/z, z25.s, z25.s +; CHECK-NEXT: fcmuo p2.s, p0/z, z30.s, z30.s +; CHECK-NEXT: fcmuo p3.s, p0/z, z24.s, z24.s +; CHECK-NEXT: fcmuo p0.s, p0/z, z6.s, z6.s +; CHECK-NEXT: sel z5.d, 
p4, z29.d, z27.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z2.d, p9/m, #0 // =0x0 -; CHECK-NEXT: mov z3.d, p7/m, #0 // =0x0 -; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z4.d, p5/m, #0 // =0x0 -; CHECK-NEXT: mov z5.d, p8/m, #0 // =0x0 +; CHECK-NEXT: sel z6.d, p8, z29.d, z28.d ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z6.d, p10/m, #0 // =0x0 -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: sel z7.d, p9, z29.d, z8.d +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p10/m, #0 // =0x0 +; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 ; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z7.d, p0/m, #0 // =0x0 +; CHECK-NEXT: mov z5.d, p6/m, #0 // =0x0 +; CHECK-NEXT: mov z6.d, p7/m, #0 // =0x0 ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z3.d, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z4.d, p3/m, #0 // =0x0 ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: mov z7.d, p0/m, #0 // =0x0 +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.llrint.nxv16i64.nxv16f32( %x) @@ -951,220 +911,206 @@ define @llrint_v32i64_v32f32( %x) { ; CHECK-NEXT: uunpklo z24.d, z0.s ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w9, #-553648128 // =0xdf000000 -; CHECK-NEXT: uunpklo z26.d, z1.s ; CHECK-NEXT: uunpkhi z25.d, z0.s +; CHECK-NEXT: uunpklo z26.d, z1.s ; CHECK-NEXT: uunpkhi z28.d, z1.s -; CHECK-NEXT: mov z29.s, w9 +; CHECK-NEXT: mov z30.s, w9 ; CHECK-NEXT: mov w9, #1593835519 // =0x5effffff -; CHECK-NEXT: mov z17.d, z5.d +; CHECK-NEXT: mov 
z18.d, z5.d ; CHECK-NEXT: mov z27.d, #0x8000000000000000 -; CHECK-NEXT: uunpkhi z30.d, z2.s -; CHECK-NEXT: uunpklo z8.d, z3.s +; CHECK-NEXT: uunpkhi z11.d, z3.s +; CHECK-NEXT: uunpklo z29.d, z3.s ; CHECK-NEXT: movprfx z0, z24 ; CHECK-NEXT: frintx z0.s, p0/m, z24.s -; CHECK-NEXT: uunpkhi z9.d, z3.s +; CHECK-NEXT: uunpklo z15.d, z4.s ; CHECK-NEXT: uunpkhi z14.d, z4.s +; CHECK-NEXT: movprfx z5, z25 +; CHECK-NEXT: frintx z5.s, p0/m, z25.s ; CHECK-NEXT: movprfx z24, z26 ; CHECK-NEXT: frintx z24.s, p0/m, z26.s -; CHECK-NEXT: movprfx z1, z25 -; CHECK-NEXT: frintx z1.s, p0/m, z25.s -; CHECK-NEXT: movprfx z5, z28 -; CHECK-NEXT: frintx z5.s, p0/m, z28.s -; CHECK-NEXT: uunpklo z26.d, z2.s -; CHECK-NEXT: uunpklo z16.d, z17.s -; CHECK-NEXT: mov z25.s, w9 -; CHECK-NEXT: movprfx z28, z30 -; CHECK-NEXT: frintx z28.s, p0/m, z30.s -; CHECK-NEXT: movprfx z30, z8 -; CHECK-NEXT: frintx z30.s, p0/m, z8.s -; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z29.s +; CHECK-NEXT: movprfx z25, z28 +; CHECK-NEXT: frintx z25.s, p0/m, z28.s +; CHECK-NEXT: uunpklo z28.d, z2.s +; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: uunpklo z16.d, z18.s +; CHECK-NEXT: uunpkhi z18.d, z18.s +; CHECK-NEXT: uunpklo z20.d, z7.s +; CHECK-NEXT: frintx z29.s, p0/m, z29.s ; CHECK-NEXT: movprfx z31, z0 ; CHECK-NEXT: fcvtzs z31.d, p0/m, z0.s ; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z29.s -; CHECK-NEXT: fcmge p3.s, p0/z, z24.s, z29.s -; CHECK-NEXT: fcmge p5.s, p0/z, z5.s, z29.s -; CHECK-NEXT: frintx z26.s, p0/m, z26.s -; CHECK-NEXT: movprfx z10, z1 -; CHECK-NEXT: fcvtzs z10.d, p0/m, z1.s -; CHECK-NEXT: movprfx z11, z24 -; CHECK-NEXT: fcvtzs z11.d, p0/m, z24.s -; CHECK-NEXT: movprfx z12, z5 -; CHECK-NEXT: fcvtzs z12.d, p0/m, z5.s -; CHECK-NEXT: movprfx z15, z28 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z28.s -; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: not p4.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z25.s -; CHECK-NEXT: fcmgt p9.s, 
p0/z, z5.s, z25.s -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z0.d, p4, z27.d, z31.d -; CHECK-NEXT: fcmge p4.s, p0/z, z26.s, z29.s -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: movprfx z13, z26 -; CHECK-NEXT: fcvtzs z13.d, p0/m, z26.s -; CHECK-NEXT: sel z31.d, p2, z27.d, z10.d -; CHECK-NEXT: uunpklo z10.d, z4.s -; CHECK-NEXT: sel z8.d, p3, z27.d, z11.d -; CHECK-NEXT: fcmge p3.s, p0/z, z28.s, z29.s -; CHECK-NEXT: sel z11.d, p5, z27.d, z12.d -; CHECK-NEXT: movprfx z4, z9 -; CHECK-NEXT: frintx z4.s, p0/m, z9.s +; CHECK-NEXT: movprfx z4, z15 +; CHECK-NEXT: frintx z4.s, p0/m, z15.s +; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z30.s +; CHECK-NEXT: movprfx z9, z24 +; CHECK-NEXT: fcvtzs z9.d, p0/m, z24.s +; CHECK-NEXT: fcmge p3.s, p0/z, z24.s, z30.s +; CHECK-NEXT: movprfx z3, z28 +; CHECK-NEXT: frintx z3.s, p0/m, z28.s +; CHECK-NEXT: movprfx z28, z2 +; CHECK-NEXT: frintx z28.s, p0/m, z2.s +; CHECK-NEXT: movprfx z8, z5 +; CHECK-NEXT: fcvtzs z8.d, p0/m, z5.s +; CHECK-NEXT: fcmge p2.s, p0/z, z5.s, z30.s +; CHECK-NEXT: movprfx z10, z25 +; CHECK-NEXT: fcvtzs z10.d, p0/m, z25.s +; CHECK-NEXT: fcmge p4.s, p0/z, z25.s, z30.s +; CHECK-NEXT: uunpklo z15.d, z6.s +; CHECK-NEXT: uunpkhi z19.d, z6.s +; CHECK-NEXT: uunpkhi z21.d, z7.s +; CHECK-NEXT: sel z0.d, p1, z31.d, z27.d +; CHECK-NEXT: movprfx z31, z11 +; CHECK-NEXT: frintx z31.s, p0/m, z11.s +; CHECK-NEXT: movprfx z7, z14 +; CHECK-NEXT: frintx z7.s, p0/m, z14.s +; CHECK-NEXT: movprfx z12, z28 +; CHECK-NEXT: fcvtzs z12.d, p0/m, z28.s +; CHECK-NEXT: movprfx z14, z18 +; CHECK-NEXT: frintx z14.s, p0/m, z18.s +; CHECK-NEXT: movprfx z18, z20 +; CHECK-NEXT: frintx z18.s, p0/m, z20.s +; CHECK-NEXT: sel z8.d, p2, z8.d, z27.d +; CHECK-NEXT: sel z10.d, p4, z10.d, z27.d +; CHECK-NEXT: movprfx z13, z3 +; CHECK-NEXT: fcvtzs z13.d, p0/m, z3.s +; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sel z0.d, p3, z9.d, z27.d +; CHECK-NEXT: fcmge p3.s, p0/z, z28.s, z30.s +; 
CHECK-NEXT: movprfx z17, z31 +; CHECK-NEXT: fcvtzs z17.d, p0/m, z31.s +; CHECK-NEXT: fcmge p5.s, p0/z, z31.s, z30.s +; CHECK-NEXT: fcmge p4.s, p0/z, z3.s, z30.s +; CHECK-NEXT: movprfx z11, z29 +; CHECK-NEXT: fcvtzs z11.d, p0/m, z29.s +; CHECK-NEXT: fcmge p2.s, p0/z, z29.s, z30.s +; CHECK-NEXT: frintx z15.s, p0/m, z15.s +; CHECK-NEXT: mov z26.s, w9 +; CHECK-NEXT: movprfx z22, z14 +; CHECK-NEXT: fcvtzs z22.d, p0/m, z14.s +; CHECK-NEXT: fcmge p8.s, p0/z, z18.s, z30.s +; CHECK-NEXT: sel z9.d, p3, z12.d, z27.d +; CHECK-NEXT: movprfx z12, z16 +; CHECK-NEXT: frintx z12.s, p0/m, z16.s +; CHECK-NEXT: movprfx z1, z18 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z18.s +; CHECK-NEXT: sel z6.d, p5, z17.d, z27.d +; CHECK-NEXT: movprfx z17, z19 +; CHECK-NEXT: frintx z17.s, p0/m, z19.s +; CHECK-NEXT: movprfx z19, z21 +; CHECK-NEXT: frintx z19.s, p0/m, z21.s +; CHECK-NEXT: fcmge p5.s, p0/z, z14.s, z30.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z5.s, z26.s ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: not p5.b, p0/z, p4.b -; CHECK-NEXT: fcmge p4.s, p0/z, z30.s, z29.s -; CHECK-NEXT: fcmgt p2.s, p0/z, z24.s, z25.s -; CHECK-NEXT: sel z12.d, p5, z27.d, z13.d -; CHECK-NEXT: uunpkhi z13.d, z17.s -; CHECK-NEXT: movprfx z9, z10 -; CHECK-NEXT: frintx z9.s, p0/m, z10.s -; CHECK-NEXT: movprfx z10, z14 -; CHECK-NEXT: frintx z10.s, p0/m, z14.s -; CHECK-NEXT: uunpkhi z17.d, z6.s -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: uunpklo z14.d, z6.s -; CHECK-NEXT: movprfx z6, z16 -; CHECK-NEXT: frintx z6.s, p0/m, z16.s -; CHECK-NEXT: uunpklo z16.d, z7.s -; CHECK-NEXT: uunpkhi z7.d, z7.s -; CHECK-NEXT: sel z3.d, p3, z27.d, z15.d -; CHECK-NEXT: fcmge p3.s, p0/z, z4.s, z29.s -; CHECK-NEXT: frintx z13.s, p0/m, z13.s -; CHECK-NEXT: movprfx z15, z30 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z30.s -; CHECK-NEXT: fcmge p5.s, p0/z, z9.s, z29.s -; CHECK-NEXT: fcmge p6.s, p0/z, z10.s, z29.s -; CHECK-NEXT: frintx z17.s, p0/m, z17.s -; CHECK-NEXT: movprfx z18, z4 -; CHECK-NEXT: fcvtzs z18.d, p0/m, z4.s -; 
CHECK-NEXT: movprfx z20, z10 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z10.s -; CHECK-NEXT: frintx z16.s, p0/m, z16.s -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: movprfx z19, z14 -; CHECK-NEXT: frintx z19.s, p0/m, z14.s -; CHECK-NEXT: movprfx z14, z9 -; CHECK-NEXT: fcvtzs z14.d, p0/m, z9.s -; CHECK-NEXT: fcmge p7.s, p0/z, z6.s, z29.s -; CHECK-NEXT: fcmge p8.s, p0/z, z13.s, z29.s -; CHECK-NEXT: movprfx z21, z7 -; CHECK-NEXT: frintx z21.s, p0/m, z7.s -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: mov z15.d, p4/m, z27.d -; CHECK-NEXT: fcmge p4.s, p0/z, z17.s, z29.s -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: sel z7.d, p3, z27.d, z18.d +; CHECK-NEXT: sel z13.d, p4, z13.d, z27.d +; CHECK-NEXT: sel z11.d, p2, z11.d, z27.d +; CHECK-NEXT: movprfx z16, z4 +; CHECK-NEXT: fcvtzs z16.d, p0/m, z4.s +; CHECK-NEXT: fcmge p2.s, p0/z, z4.s, z30.s +; CHECK-NEXT: movprfx z20, z7 +; CHECK-NEXT: fcvtzs z20.d, p0/m, z7.s +; CHECK-NEXT: fcmge p3.s, p0/z, z7.s, z30.s +; CHECK-NEXT: movprfx z21, z12 +; CHECK-NEXT: fcvtzs z21.d, p0/m, z12.s +; CHECK-NEXT: fcmge p4.s, p0/z, z12.s, z30.s +; CHECK-NEXT: movprfx z23, z15 +; CHECK-NEXT: fcvtzs z23.d, p0/m, z15.s +; CHECK-NEXT: fcmge p6.s, p0/z, z15.s, z30.s +; CHECK-NEXT: fcmge p7.s, p0/z, z17.s, z30.s +; CHECK-NEXT: fcmge p9.s, p0/z, z19.s, z30.s ; CHECK-NEXT: movprfx z0, z17 ; CHECK-NEXT: fcvtzs z0.d, p0/m, z17.s -; CHECK-NEXT: sel z18.d, p6, z27.d, z20.d -; CHECK-NEXT: movprfx z20, z6 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z6.s -; CHECK-NEXT: fcmge p6.s, p0/z, z16.s, z29.s -; CHECK-NEXT: fcmge p3.s, p0/z, z19.s, z29.s -; CHECK-NEXT: mov z14.d, p5/m, z27.d -; CHECK-NEXT: not p5.b, p0/z, p7.b -; CHECK-NEXT: not p7.b, p0/z, p8.b -; CHECK-NEXT: fcmge p8.s, p0/z, z21.s, z29.s -; CHECK-NEXT: movprfx z1, z16 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z16.s -; CHECK-NEXT: movprfx z22, z13 -; CHECK-NEXT: fcvtzs z22.d, p0/m, z13.s -; CHECK-NEXT: movprfx z23, z19 -; CHECK-NEXT: fcvtzs z23.d, p0/m, z19.s -; CHECK-NEXT: 
not p4.b, p0/z, p4.b -; CHECK-NEXT: movprfx z2, z21 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z21.s -; CHECK-NEXT: mov z29.d, #0x7fffffffffffffff -; CHECK-NEXT: mov z20.d, p5/m, z27.d -; CHECK-NEXT: not p5.b, p0/z, p6.b -; CHECK-NEXT: mov z0.d, p4/m, z27.d -; CHECK-NEXT: fcmgt p4.s, p0/z, z16.s, z25.s -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: mov z1.d, p5/m, z27.d -; CHECK-NEXT: mov z22.d, p7/m, z27.d -; CHECK-NEXT: mov z23.d, p3/m, z27.d -; CHECK-NEXT: fcmgt p3.s, p0/z, z21.s, z25.s -; CHECK-NEXT: fcmuo p5.s, p0/z, z16.s, z16.s -; CHECK-NEXT: mov z2.d, p6/m, z27.d -; CHECK-NEXT: sel z27.d, p1, z29.d, z31.d -; CHECK-NEXT: fcmgt p1.s, p0/z, z17.s, z25.s -; CHECK-NEXT: mov z1.d, p4/m, z29.d -; CHECK-NEXT: fcmgt p6.s, p0/z, z26.s, z25.s -; CHECK-NEXT: fcmgt p7.s, p0/z, z30.s, z25.s -; CHECK-NEXT: sel z31.d, p2, z29.d, z8.d -; CHECK-NEXT: fcmgt p2.s, p0/z, z13.s, z25.s -; CHECK-NEXT: fcmuo p8.s, p0/z, z21.s, z21.s -; CHECK-NEXT: mov z2.d, p3/m, z29.d +; CHECK-NEXT: movprfx z2, z19 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z19.s +; CHECK-NEXT: mov z30.d, #0x7fffffffffffffff +; CHECK-NEXT: sel z22.d, p5, z22.d, z27.d +; CHECK-NEXT: fcmgt p5.s, p0/z, z18.s, z26.s +; CHECK-NEXT: sel z1.d, p8, z1.d, z27.d +; CHECK-NEXT: sel z16.d, p2, z16.d, z27.d +; CHECK-NEXT: fcmgt p2.s, p0/z, z24.s, z26.s +; CHECK-NEXT: sel z20.d, p3, z20.d, z27.d +; CHECK-NEXT: fcmgt p3.s, p0/z, z25.s, z26.s +; CHECK-NEXT: sel z21.d, p4, z21.d, z27.d +; CHECK-NEXT: sel z23.d, p6, z23.d, z27.d +; CHECK-NEXT: sel z0.d, p7, z0.d, z27.d +; CHECK-NEXT: sel z2.d, p9, z2.d, z27.d +; CHECK-NEXT: sel z27.d, p1, z30.d, z8.d +; CHECK-NEXT: fcmgt p1.s, p0/z, z17.s, z26.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z19.s, z26.s +; CHECK-NEXT: fcmuo p6.s, p0/z, z18.s, z18.s +; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, p5/m, z30.d +; CHECK-NEXT: fcmuo p9.s, p0/z, z19.s, z19.s +; CHECK-NEXT: mov z10.d, p3/m, z30.d +; CHECK-NEXT: fcmgt p3.s, p0/z, z15.s, z26.s +; 
CHECK-NEXT: fcmuo p5.s, p0/z, z15.s, z15.s +; CHECK-NEXT: fcmgt p8.s, p0/z, z29.s, z26.s +; CHECK-NEXT: fcmgt p7.s, p0/z, z3.s, z26.s +; CHECK-NEXT: mov z8.d, p2/m, z30.d +; CHECK-NEXT: fcmgt p2.s, p0/z, z14.s, z26.s +; CHECK-NEXT: mov z0.d, p1/m, z30.d +; CHECK-NEXT: fcmgt p1.s, p0/z, z12.s, z26.s +; CHECK-NEXT: mov z2.d, p4/m, z30.d ; CHECK-NEXT: fcmuo p4.s, p0/z, z17.s, z17.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z19.s, z25.s -; CHECK-NEXT: mov z0.d, p1/m, z29.d -; CHECK-NEXT: fcmgt p1.s, p0/z, z6.s, z25.s -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: sel z8.d, p9, z29.d, z11.d -; CHECK-NEXT: sel z11.d, p6, z29.d, z12.d -; CHECK-NEXT: sel z12.d, p7, z29.d, z15.d -; CHECK-NEXT: fcmgt p5.s, p0/z, z10.s, z25.s -; CHECK-NEXT: sel z15.d, p2, z29.d, z22.d -; CHECK-NEXT: fcmuo p2.s, p0/z, z13.s, z13.s +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 +; CHECK-NEXT: sel z17.d, p3, z30.d, z23.d +; CHECK-NEXT: fcmuo p3.s, p0/z, z12.s, z12.s +; CHECK-NEXT: fcmgt p6.s, p0/z, z7.s, z26.s +; CHECK-NEXT: mov z11.d, p8/m, z30.d +; CHECK-NEXT: mov z13.d, p7/m, z30.d +; CHECK-NEXT: sel z15.d, p2, z30.d, z22.d +; CHECK-NEXT: fcmuo p2.s, p0/z, z14.s, z14.s +; CHECK-NEXT: mov z2.d, p9/m, #0 // =0x0 ; CHECK-NEXT: str z1, [x8, #14, mul vl] -; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 +; CHECK-NEXT: sel z1.d, p1, z30.d, z21.d +; CHECK-NEXT: fcmgt p1.s, p0/z, z4.s, z26.s ; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p1, z29.d, z20.d -; CHECK-NEXT: fcmgt p1.s, p0/z, z9.s, z25.s -; CHECK-NEXT: fcmuo p6.s, p0/z, z19.s, z19.s -; CHECK-NEXT: sel z16.d, p3, z29.d, z23.d -; CHECK-NEXT: fcmuo p3.s, p0/z, z6.s, z6.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z4.s, z25.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z31.s, z26.s +; CHECK-NEXT: mov z17.d, p5/m, #0 // =0x0 ; CHECK-NEXT: str z2, [x8, #15, mul vl] -; CHECK-NEXT: sel z2.d, p5, z29.d, z18.d -; CHECK-NEXT: fcmuo p5.s, p0/z, z10.s, z10.s -; CHECK-NEXT: str z0, [x8, #13, mul vl] +; CHECK-NEXT: fcmuo p5.s, p0/z, z7.s, z7.s +; CHECK-NEXT: sel z2.d, 
p6, z30.d, z20.d ; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.s, p0/z, z9.s, z9.s -; CHECK-NEXT: sel z0.d, p1, z29.d, z14.d -; CHECK-NEXT: mov z16.d, p6/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p1.s, p0/z, z4.s, z4.s +; CHECK-NEXT: fcmuo p2.s, p0/z, z4.s, z4.s ; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p3.s, p0/z, z28.s, z25.s -; CHECK-NEXT: sel z4.d, p4, z29.d, z7.d +; CHECK-NEXT: str z0, [x8, #13, mul vl] +; CHECK-NEXT: fcmgt p3.s, p0/z, z28.s, z26.s +; CHECK-NEXT: sel z0.d, p1, z30.d, z16.d +; CHECK-NEXT: str z17, [x8, #12, mul vl] +; CHECK-NEXT: fcmuo p1.s, p0/z, z31.s, z31.s +; CHECK-NEXT: sel z4.d, p4, z30.d, z6.d ; CHECK-NEXT: str z15, [x8, #11, mul vl] ; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.s, p0/z, z28.s, z28.s -; CHECK-NEXT: str z16, [x8, #12, mul vl] -; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.s, p0/z, z30.s, z30.s +; CHECK-NEXT: fcmuo p4.s, p0/z, z29.s, z29.s ; CHECK-NEXT: str z1, [x8, #10, mul vl] -; CHECK-NEXT: mov z4.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p1.s, p0/z, z5.s, z5.s -; CHECK-NEXT: sel z1.d, p3, z29.d, z3.d +; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.s, p0/z, z28.s, z28.s +; CHECK-NEXT: sel z1.d, p3, z30.d, z9.d +; CHECK-NEXT: fcmuo p3.s, p0/z, z3.s, z3.s ; CHECK-NEXT: ldr z3, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: str z2, [x8, #9, mul vl] +; CHECK-NEXT: mov z4.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p1.s, p0/z, z25.s, z25.s ; CHECK-NEXT: str z0, [x8, #8, mul vl] -; CHECK-NEXT: fcmuo p3.s, p0/z, z26.s, z26.s -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: str z4, [x8, #7, mul vl] -; CHECK-NEXT: mov z12.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p2.s, p0/z, z3.s, z25.s -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: mov z11.d, p4/m, #0 // =0x0 ; CHECK-NEXT: fcmuo p4.s, p0/z, z24.s, z24.s -; CHECK-NEXT: mov z8.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.s, p0/z, z0.s, z0.s -; 
CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: str z12, [x8, #6, mul vl] -; CHECK-NEXT: str z1, [x8, #5, mul vl] +; CHECK-NEXT: fcmgt p2.s, p0/z, z3.s, z26.s +; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.s, p0/z, z5.s, z5.s +; CHECK-NEXT: str z4, [x8, #7, mul vl] ; CHECK-NEXT: fcmuo p0.s, p0/z, z3.s, z3.s -; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 -; CHECK-NEXT: str z8, [x8, #3, mul vl] -; CHECK-NEXT: mov z31.d, p4/m, #0 // =0x0 -; CHECK-NEXT: mov z0.d, p2/m, z29.d -; CHECK-NEXT: str z11, [x8, #4, mul vl] +; CHECK-NEXT: mov z13.d, p3/m, #0 // =0x0 +; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z11, [x8, #6, mul vl] +; CHECK-NEXT: mov z10.d, p1/m, #0 // =0x0 +; CHECK-NEXT: str z1, [x8, #5, mul vl] +; CHECK-NEXT: mov z8.d, p4/m, #0 // =0x0 +; CHECK-NEXT: str z13, [x8, #4, mul vl] ; CHECK-NEXT: mov z27.d, p5/m, #0 // =0x0 -; CHECK-NEXT: str z31, [x8, #2, mul vl] -; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 +; CHECK-NEXT: str z10, [x8, #3, mul vl] +; CHECK-NEXT: mov z0.d, p2/m, z30.d +; CHECK-NEXT: str z8, [x8, #2, mul vl] ; CHECK-NEXT: str z27, [x8, #1, mul vl] +; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: str z0, [x8] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload @@ -1202,17 +1148,16 @@ define @llrint_v1i64_v1f64( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff ; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.d ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.d +; CHECK-NEXT: mov z1.d, #0x8000000000000000 ; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z3.d ; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; 
CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -1228,17 +1173,16 @@ define @llrint_v2i64_v2f64( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff ; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.d ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.d +; CHECK-NEXT: mov z1.d, #0x8000000000000000 ; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z3.d ; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -1259,29 +1203,27 @@ define @llrint_v4i64_v4f64( %x) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z3.d, #0x8000000000000000 ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff ; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: frintx z1.d, p0/m, z1.d -; CHECK-NEXT: mov z3.d, x8 -; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z2.d -; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z2.d -; CHECK-NEXT: mov z2.d, #0x8000000000000000 +; CHECK-NEXT: mov z5.d, x8 ; CHECK-NEXT: movprfx z4, z0 ; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d -; CHECK-NEXT: movprfx z5, z1 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z1.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z3.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z1.d, z3.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d +; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, 
z2.d +; CHECK-NEXT: movprfx z6, z1 +; CHECK-NEXT: fcvtzs z6.d, p0/m, z1.d +; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z2.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z5.d +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p4.d, p0/z, z1.d, z5.d +; CHECK-NEXT: sel z4.d, p1, z4.d, z3.d ; CHECK-NEXT: fcmuo p1.d, p0/z, z0.d, z0.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z1.d, z1.d -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d +; CHECK-NEXT: mov z3.d, p2/m, z6.d +; CHECK-NEXT: sel z0.d, p3, z2.d, z4.d +; CHECK-NEXT: sel z1.d, p4, z2.d, z3.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -1309,51 +1251,47 @@ define @llrint_v8i64_v8f64( %x) { ; CHECK-NEXT: mov z5.d, #0x8000000000000000 ; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff ; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: frintx z1.d, p0/m, z1.d ; CHECK-NEXT: frintx z2.d, p0/m, z2.d ; CHECK-NEXT: frintx z3.d, p0/m, z3.d ; CHECK-NEXT: mov z6.d, x8 +; CHECK-NEXT: movprfx z7, z0 +; CHECK-NEXT: fcvtzs z7.d, p0/m, z0.d ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z4.d +; CHECK-NEXT: movprfx z24, z1 +; CHECK-NEXT: fcvtzs z24.d, p0/m, z1.d ; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z4.d +; CHECK-NEXT: movprfx z25, z2 +; CHECK-NEXT: fcvtzs z25.d, p0/m, z2.d ; CHECK-NEXT: fcmge p3.d, p0/z, z2.d, z4.d +; CHECK-NEXT: movprfx z26, z3 +; CHECK-NEXT: fcvtzs z26.d, p0/m, z3.d ; CHECK-NEXT: fcmge p4.d, p0/z, z3.d, z4.d -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d -; CHECK-NEXT: movprfx z7, z1 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z1.d -; CHECK-NEXT: movprfx z24, z2 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z2.d -; CHECK-NEXT: movprfx z25, z3 -; CHECK-NEXT: fcvtzs z25.d, p0/m, z3.d +; CHECK-NEXT: mov z4.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmgt p7.d, p0/z, z2.d, z6.d 
; CHECK-NEXT: fcmgt p5.d, p0/z, z0.d, z6.d ; CHECK-NEXT: fcmgt p6.d, p0/z, z1.d, z6.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z4.d, p1/m, z5.d +; CHECK-NEXT: sel z7.d, p1, z7.d, z5.d ; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z6.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: sel z6.d, p2, z5.d, z7.d +; CHECK-NEXT: sel z6.d, p2, z24.d, z5.d +; CHECK-NEXT: sel z24.d, p3, z25.d, z5.d ; CHECK-NEXT: fcmuo p2.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z7.d, p3, z5.d, z24.d -; CHECK-NEXT: fcmuo p3.d, p0/z, z1.d, z1.d -; CHECK-NEXT: sel z5.d, p4, z5.d, z25.d +; CHECK-NEXT: mov z5.d, p4/m, z26.d ; CHECK-NEXT: fcmuo p4.d, p0/z, z2.d, z2.d +; CHECK-NEXT: fcmuo p3.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z3.d, z3.d -; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d -; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z2.d, p7, z26.d, z7.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z26.d, z5.d +; CHECK-NEXT: sel z0.d, p5, z4.d, z7.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel z2.d, p7, z4.d, z24.d +; CHECK-NEXT: sel z1.d, p6, z4.d, z6.d +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel z3.d, p1, z4.d, z5.d +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1367,7 +1305,7 @@ define @llrint_v16f64( %x) { ; CHECK-LABEL: llrint_v16f64: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill @@ -1375,109 +1313,106 @@ define @llrint_v16f64( %x) { ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 ; CHECK-NEXT: mov z24.d, #0x7fffffffffffffff -; CHECK-NEXT: mov z25.d, x8 +; CHECK-NEXT: mov z27.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff ; CHECK-NEXT: movprfx z26, z0 ; CHECK-NEXT: frintx z26.d, p0/m, z0.d -; CHECK-NEXT: movprfx z27, z1 -; CHECK-NEXT: frintx z27.d, p0/m, z1.d -; CHECK-NEXT: frintx z2.d, p0/m, z2.d +; CHECK-NEXT: movprfx z25, z1 +; CHECK-NEXT: frintx z25.d, p0/m, z1.d ; CHECK-NEXT: mov z0.d, #0x8000000000000000 -; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: frintx z3.d, p0/m, z3.d -; CHECK-NEXT: movprfx z28, z4 -; CHECK-NEXT: frintx z28.d, p0/m, z4.d +; CHECK-NEXT: movprfx z28, z2 +; 
CHECK-NEXT: frintx z28.d, p0/m, z2.d +; CHECK-NEXT: movprfx z2, z3 +; CHECK-NEXT: frintx z2.d, p0/m, z3.d +; CHECK-NEXT: movprfx z29, z4 +; CHECK-NEXT: frintx z29.d, p0/m, z4.d ; CHECK-NEXT: frintx z5.d, p0/m, z5.d +; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: frintx z6.d, p0/m, z6.d ; CHECK-NEXT: frintx z7.d, p0/m, z7.d -; CHECK-NEXT: fcmge p1.d, p0/z, z26.d, z25.d -; CHECK-NEXT: fcmge p2.d, p0/z, z27.d, z25.d -; CHECK-NEXT: movprfx z4, z26 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z26.d -; CHECK-NEXT: fcmge p5.d, p0/z, z2.d, z25.d -; CHECK-NEXT: movprfx z29, z27 -; CHECK-NEXT: fcvtzs z29.d, p0/m, z27.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z26.d, z1.d -; CHECK-NEXT: fcmge p6.d, p0/z, z3.d, z25.d -; CHECK-NEXT: fcmge p8.d, p0/z, z5.d, z25.d -; CHECK-NEXT: fcmgt p7.d, p0/z, z27.d, z1.d -; CHECK-NEXT: fcmge p9.d, p0/z, z6.d, z25.d -; CHECK-NEXT: movprfx z30, z28 -; CHECK-NEXT: fcvtzs z30.d, p0/m, z28.d -; CHECK-NEXT: fcmge p10.d, p0/z, z7.d, z25.d -; CHECK-NEXT: not p4.b, p0/z, p1.b -; CHECK-NEXT: fcmuo p1.d, p0/z, z26.d, z26.d -; CHECK-NEXT: movprfx z26, z2 -; CHECK-NEXT: fcvtzs z26.d, p0/m, z2.d -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: movprfx z31, z6 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z6.d -; CHECK-NEXT: movprfx z8, z7 -; CHECK-NEXT: fcvtzs z8.d, p0/m, z7.d -; CHECK-NEXT: mov z4.d, p4/m, z0.d -; CHECK-NEXT: fcmge p4.d, p0/z, z28.d, z25.d -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: mov z29.d, p2/m, z0.d -; CHECK-NEXT: fcmuo p2.d, p0/z, z27.d, z27.d -; CHECK-NEXT: movprfx z27, z3 -; CHECK-NEXT: fcvtzs z27.d, p0/m, z3.d -; CHECK-NEXT: sel z25.d, p5, z0.d, z26.d -; CHECK-NEXT: movprfx z26, z5 -; CHECK-NEXT: fcvtzs z26.d, p0/m, z5.d -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: not p5.b, p0/z, p8.b -; CHECK-NEXT: fcmgt p8.d, p0/z, z2.d, z1.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z27.d, p6/m, z0.d -; CHECK-NEXT: not p6.b, p0/z, p9.b -; CHECK-NEXT: fcmuo p9.d, p0/z, z2.d, z2.d -; CHECK-NEXT: mov z30.d, p4/m, z0.d -; CHECK-NEXT: not p4.b, p0/z, 
p10.b -; CHECK-NEXT: fcmgt p10.d, p0/z, z3.d, z1.d -; CHECK-NEXT: mov z26.d, p5/m, z0.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z28.d, z1.d -; CHECK-NEXT: mov z31.d, p6/m, z0.d -; CHECK-NEXT: mov z8.d, p4/m, z0.d -; CHECK-NEXT: sel z0.d, p3, z24.d, z4.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z5.d, z1.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z6.d, z1.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z7.d, z1.d -; CHECK-NEXT: sel z1.d, p7, z24.d, z29.d -; CHECK-NEXT: fcmuo p7.d, p0/z, z3.d, z3.d -; CHECK-NEXT: sel z2.d, p8, z24.d, z25.d -; CHECK-NEXT: sel z3.d, p10, z24.d, z27.d -; CHECK-NEXT: sel z4.d, p5, z24.d, z30.d -; CHECK-NEXT: fcmuo p5.d, p0/z, z28.d, z28.d -; CHECK-NEXT: fcmuo p8.d, p0/z, z5.d, z5.d -; CHECK-NEXT: fcmuo p10.d, p0/z, z6.d, z6.d -; CHECK-NEXT: sel z5.d, p3, z24.d, z26.d +; CHECK-NEXT: movprfx z30, z26 +; CHECK-NEXT: fcvtzs z30.d, p0/m, z26.d +; CHECK-NEXT: fcmge p1.d, p0/z, z26.d, z27.d +; CHECK-NEXT: movprfx z31, z25 +; CHECK-NEXT: fcvtzs z31.d, p0/m, z25.d +; CHECK-NEXT: fcmge p2.d, p0/z, z25.d, z27.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z26.d, z1.d +; CHECK-NEXT: fcmuo p3.d, p0/z, z26.d, z26.d +; CHECK-NEXT: movprfx z26, z28 +; CHECK-NEXT: fcvtzs z26.d, p0/m, z28.d +; CHECK-NEXT: fcmge p6.d, p0/z, z29.d, z27.d +; CHECK-NEXT: movprfx z8, z5 +; CHECK-NEXT: fcvtzs z8.d, p0/m, z5.d +; CHECK-NEXT: fcmge p7.d, p0/z, z5.d, z27.d +; CHECK-NEXT: movprfx z9, z6 +; CHECK-NEXT: fcvtzs z9.d, p0/m, z6.d +; CHECK-NEXT: fcmge p8.d, p0/z, z6.d, z27.d +; CHECK-NEXT: sel z3.d, p1, z30.d, z0.d +; CHECK-NEXT: fcmge p1.d, p0/z, z28.d, z27.d +; CHECK-NEXT: movprfx z30, z2 +; CHECK-NEXT: fcvtzs z30.d, p0/m, z2.d +; CHECK-NEXT: sel z4.d, p2, z31.d, z0.d +; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, z27.d +; CHECK-NEXT: movprfx z31, z29 +; CHECK-NEXT: fcvtzs z31.d, p0/m, z29.d +; CHECK-NEXT: fcmge p9.d, p0/z, z7.d, z27.d +; CHECK-NEXT: movprfx z27, z7 +; CHECK-NEXT: fcvtzs z27.d, p0/m, z7.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z25.d, z1.d +; CHECK-NEXT: fcmuo p10.d, p0/z, z25.d, z25.d +; CHECK-NEXT: sel z25.d, p1, 
z26.d, z0.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z28.d, z1.d +; CHECK-NEXT: sel z26.d, p2, z30.d, z0.d +; CHECK-NEXT: fcmuo p2.d, p0/z, z28.d, z28.d +; CHECK-NEXT: sel z28.d, p6, z31.d, z0.d +; CHECK-NEXT: fcmgt p6.d, p0/z, z2.d, z1.d +; CHECK-NEXT: sel z30.d, p7, z8.d, z0.d +; CHECK-NEXT: fcmgt p7.d, p0/z, z29.d, z1.d +; CHECK-NEXT: sel z31.d, p8, z9.d, z0.d +; CHECK-NEXT: sel z27.d, p9, z27.d, z0.d +; CHECK-NEXT: sel z0.d, p4, z24.d, z3.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z5.d, z1.d +; CHECK-NEXT: fcmgt p8.d, p0/z, z6.d, z1.d +; CHECK-NEXT: fcmgt p9.d, p0/z, z7.d, z1.d +; CHECK-NEXT: sel z1.d, p5, z24.d, z4.d +; CHECK-NEXT: fcmuo p5.d, p0/z, z2.d, z2.d +; CHECK-NEXT: sel z2.d, p1, z24.d, z25.d +; CHECK-NEXT: sel z3.d, p6, z24.d, z26.d +; CHECK-NEXT: sel z4.d, p7, z24.d, z28.d +; CHECK-NEXT: fcmuo p6.d, p0/z, z5.d, z5.d +; CHECK-NEXT: fcmuo p7.d, p0/z, z6.d, z6.d +; CHECK-NEXT: fcmuo p1.d, p0/z, z29.d, z29.d +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sel z5.d, p4, z24.d, z30.d +; CHECK-NEXT: sel z6.d, p8, z24.d, z31.d +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: fcmuo p0.d, p0/z, z7.d, z7.d -; CHECK-NEXT: sel z6.d, p4, z24.d, z31.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z7.d, p6, z24.d, z8.d -; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z2.d, p9/m, #0 // =0x0 +; CHECK-NEXT: sel z7.d, p9, z24.d, z27.d ; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z3.d, p7/m, #0 // =0x0 -; CHECK-NEXT: mov z4.d, p5/m, #0 // =0x0 -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z5.d, p8/m, #0 // =0x0 -; CHECK-NEXT: mov z6.d, p10/m, #0 // =0x0 +; CHECK-NEXT: mov z1.d, p10/m, #0 // =0x0 +; CHECK-NEXT: mov z3.d, p5/m, #0 // =0x0 ; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov 
z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z5.d, p6/m, #0 // =0x0 +; CHECK-NEXT: mov z6.d, p7/m, #0 // =0x0 ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z2.d, p2/m, #0 // =0x0 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z4.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z7.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.llrint.nxv16i64.nxv16f64( %x) @@ -1525,218 +1460,199 @@ define @llrint_v32f64( %x) { ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ldr z2, [x0, #2, mul vl] +; CHECK-NEXT: ldr z3, [x0, #2, mul vl] ; CHECK-NEXT: mov x9, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: ldr z24, [x0, #6, mul vl] ; CHECK-NEXT: ldr z1, [x0, #1, mul vl] -; CHECK-NEXT: mov z7.d, x9 -; CHECK-NEXT: mov z26.d, #0x8000000000000000 -; CHECK-NEXT: ldr z3, [x0, #3, mul vl] -; CHECK-NEXT: frintx z0.d, p0/m, z0.d -; CHECK-NEXT: movprfx z30, z2 -; CHECK-NEXT: frintx z30.d, p0/m, z2.d -; CHECK-NEXT: ldr z6, [x0, #5, mul vl] -; CHECK-NEXT: movprfx z25, z24 -; CHECK-NEXT: frintx z25.d, p0/m, z24.d -; CHECK-NEXT: movprfx z12, z1 -; CHECK-NEXT: frintx z12.d, p0/m, z1.d +; CHECK-NEXT: mov z25.d, #0x8000000000000000 +; CHECK-NEXT: mov z26.d, x9 ; CHECK-NEXT: ldr z5, [x0, #4, mul vl] -; CHECK-NEXT: frintx z3.d, p0/m, z3.d +; CHECK-NEXT: ldr z4, [x0, #3, mul vl] +; CHECK-NEXT: frintx z0.d, p0/m, z0.d +; CHECK-NEXT: movprfx z8, z3 +; CHECK-NEXT: frintx z8.d, p0/m, z3.d ; 
CHECK-NEXT: mov x9, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: frintx z6.d, p0/m, z6.d -; CHECK-NEXT: mov z4.d, x9 -; CHECK-NEXT: fcmge p3.d, p0/z, z0.d, z7.d -; CHECK-NEXT: movprfx z24, z0 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.d -; CHECK-NEXT: fcmge p5.d, p0/z, z30.d, z7.d -; CHECK-NEXT: movprfx z28, z30 -; CHECK-NEXT: fcvtzs z28.d, p0/m, z30.d -; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: movprfx z10, z1 +; CHECK-NEXT: frintx z10.d, p0/m, z1.d +; CHECK-NEXT: mov z2.d, x9 +; CHECK-NEXT: frintx z4.d, p0/m, z4.d ; CHECK-NEXT: frintx z5.d, p0/m, z5.d -; CHECK-NEXT: fcmge p4.d, p0/z, z12.d, z7.d -; CHECK-NEXT: ldr z8, [x0, #7, mul vl] -; CHECK-NEXT: ldr z9, [x0, #15, mul vl] -; CHECK-NEXT: movprfx z27, z12 -; CHECK-NEXT: fcvtzs z27.d, p0/m, z12.d -; CHECK-NEXT: fcmge p6.d, p0/z, z3.d, z7.d -; CHECK-NEXT: fcmge p9.d, p0/z, z6.d, z7.d -; CHECK-NEXT: not p7.b, p0/z, p3.b -; CHECK-NEXT: movprfx z31, z3 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z3.d -; CHECK-NEXT: movprfx z15, z6 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z6.d -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: fcmge p8.d, p0/z, z5.d, z7.d -; CHECK-NEXT: movprfx z13, z5 -; CHECK-NEXT: fcvtzs z13.d, p0/m, z5.d -; CHECK-NEXT: sel z0.d, p7, z26.d, z24.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: movprfx z17, z25 -; CHECK-NEXT: fcvtzs z17.d, p0/m, z25.d -; CHECK-NEXT: not p3.b, p0/z, p6.b -; CHECK-NEXT: fcmge p6.d, p0/z, z25.d, z7.d -; CHECK-NEXT: movprfx z22, z9 -; CHECK-NEXT: frintx z22.d, p0/m, z9.d -; CHECK-NEXT: sel z29.d, p4, z26.d, z27.d -; CHECK-NEXT: movprfx z27, z8 -; CHECK-NEXT: frintx z27.d, p0/m, z8.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z12.d, z4.d +; CHECK-NEXT: movprfx z7, z0 +; CHECK-NEXT: fcvtzs z7.d, p0/m, z0.d +; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z26.d +; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ldr z24, [x0, #6, mul vl] +; CHECK-NEXT: movprfx z29, z8 +; CHECK-NEXT: fcvtzs z29.d, p0/m, z8.d +; CHECK-NEXT: fcmge p4.d, 
p0/z, z8.d, z26.d +; CHECK-NEXT: ldr z27, [x0, #7, mul vl] +; CHECK-NEXT: movprfx z28, z10 +; CHECK-NEXT: fcvtzs z28.d, p0/m, z10.d +; CHECK-NEXT: fcmge p3.d, p0/z, z10.d, z26.d +; CHECK-NEXT: ldr z6, [x0, #5, mul vl] +; CHECK-NEXT: movprfx z31, z4 +; CHECK-NEXT: fcvtzs z31.d, p0/m, z4.d +; CHECK-NEXT: fcmge p5.d, p0/z, z4.d, z26.d +; CHECK-NEXT: sel z0.d, p2, z7.d, z25.d +; CHECK-NEXT: frintx z24.d, p0/m, z24.d +; CHECK-NEXT: movprfx z9, z5 +; CHECK-NEXT: fcvtzs z9.d, p0/m, z5.d +; CHECK-NEXT: frintx z27.d, p0/m, z27.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z10.d, z2.d +; CHECK-NEXT: frintx z6.d, p0/m, z6.d +; CHECK-NEXT: sel z30.d, p3, z28.d, z25.d ; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: sel z0.d, p5, z26.d, z28.d -; CHECK-NEXT: not p4.b, p0/z, p8.b -; CHECK-NEXT: ldr z10, [x0, #8, mul vl] -; CHECK-NEXT: not p5.b, p0/z, p9.b -; CHECK-NEXT: sel z24.d, p3, z26.d, z31.d -; CHECK-NEXT: not p3.b, p0/z, p6.b -; CHECK-NEXT: movprfx z2, z22 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z22.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z30.d, z4.d +; CHECK-NEXT: sel z0.d, p4, z29.d, z25.d +; CHECK-NEXT: fcmge p4.d, p0/z, z5.d, z26.d +; CHECK-NEXT: movprfx z11, z24 +; CHECK-NEXT: fcvtzs z11.d, p0/m, z24.d +; CHECK-NEXT: fcmge p2.d, p0/z, z24.d, z26.d +; CHECK-NEXT: sel z31.d, p5, z31.d, z25.d +; CHECK-NEXT: movprfx z13, z27 +; CHECK-NEXT: fcvtzs z13.d, p0/m, z27.d +; CHECK-NEXT: fcmge p3.d, p0/z, z27.d, z26.d ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: fcmge p7.d, p0/z, z27.d, z7.d -; CHECK-NEXT: sel z31.d, p5, z26.d, z15.d -; CHECK-NEXT: ldr z11, [x0, #9, mul vl] -; CHECK-NEXT: movprfx z28, z10 -; CHECK-NEXT: frintx z28.d, p0/m, z10.d -; CHECK-NEXT: ldr z10, [x0, #10, mul vl] -; CHECK-NEXT: ldr z18, [x0, #11, mul vl] +; CHECK-NEXT: movprfx z28, z6 +; CHECK-NEXT: fcvtzs z28.d, p0/m, z6.d +; CHECK-NEXT: fcmge p5.d, p0/z, z6.d, z26.d +; CHECK-NEXT: ldr z12, [x0, #9, mul vl] +; CHECK-NEXT: ldr z14, [x0, #10, mul vl] +; CHECK-NEXT: ldr z15, [x0, 
#11, mul vl] +; CHECK-NEXT: ldr z17, [x0, #12, mul vl] +; CHECK-NEXT: ldr z19, [x0, #14, mul vl] ; CHECK-NEXT: ldr z16, [x0, #13, mul vl] -; CHECK-NEXT: ldr z14, [x0, #14, mul vl] -; CHECK-NEXT: ldr z19, [x0, #12, mul vl] -; CHECK-NEXT: mov z17.d, p3/m, z26.d -; CHECK-NEXT: fcmgt p9.d, p0/z, z3.d, z4.d -; CHECK-NEXT: movprfx z8, z11 -; CHECK-NEXT: frintx z8.d, p0/m, z11.d -; CHECK-NEXT: sel z11.d, p4, z26.d, z13.d -; CHECK-NEXT: frintx z10.d, p0/m, z10.d -; CHECK-NEXT: movprfx z13, z18 -; CHECK-NEXT: frintx z13.d, p0/m, z18.d -; CHECK-NEXT: fcmge p5.d, p0/z, z28.d, z7.d -; CHECK-NEXT: movprfx z18, z27 -; CHECK-NEXT: fcvtzs z18.d, p0/m, z27.d +; CHECK-NEXT: ldr z29, [x0, #8, mul vl] +; CHECK-NEXT: ldr z18, [x0, #15, mul vl] +; CHECK-NEXT: sel z7.d, p4, z9.d, z25.d +; CHECK-NEXT: movprfx z9, z12 +; CHECK-NEXT: frintx z9.d, p0/m, z12.d +; CHECK-NEXT: movprfx z12, z14 +; CHECK-NEXT: frintx z12.d, p0/m, z14.d +; CHECK-NEXT: sel z14.d, p2, z11.d, z25.d +; CHECK-NEXT: sel z11.d, p3, z13.d, z25.d +; CHECK-NEXT: movprfx z13, z15 +; CHECK-NEXT: frintx z13.d, p0/m, z15.d +; CHECK-NEXT: movprfx z15, z17 +; CHECK-NEXT: frintx z15.d, p0/m, z17.d +; CHECK-NEXT: movprfx z17, z19 +; CHECK-NEXT: frintx z17.d, p0/m, z19.d +; CHECK-NEXT: frintx z29.d, p0/m, z29.d ; CHECK-NEXT: frintx z16.d, p0/m, z16.d -; CHECK-NEXT: movprfx z15, z19 -; CHECK-NEXT: frintx z15.d, p0/m, z19.d -; CHECK-NEXT: movprfx z19, z28 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z28.d -; CHECK-NEXT: movprfx z21, z14 -; CHECK-NEXT: frintx z21.d, p0/m, z14.d -; CHECK-NEXT: not p4.b, p0/z, p7.b -; CHECK-NEXT: fcmge p6.d, p0/z, z8.d, z7.d -; CHECK-NEXT: movprfx z20, z8 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z8.d -; CHECK-NEXT: fcmge p7.d, p0/z, z10.d, z7.d -; CHECK-NEXT: fcmge p8.d, p0/z, z13.d, z7.d -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: sel z9.d, p4, z26.d, z18.d -; CHECK-NEXT: fcmge p4.d, p0/z, z16.d, z7.d -; CHECK-NEXT: fcmge p3.d, p0/z, z15.d, z7.d -; CHECK-NEXT: movprfx z0, z16 -; CHECK-NEXT: fcvtzs z0.d, p0/m, 
z16.d -; CHECK-NEXT: sel z14.d, p5, z26.d, z19.d -; CHECK-NEXT: movprfx z19, z10 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z10.d -; CHECK-NEXT: movprfx z1, z21 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z21.d -; CHECK-NEXT: not p6.b, p0/z, p6.b +; CHECK-NEXT: frintx z18.d, p0/m, z18.d +; CHECK-NEXT: sel z28.d, p5, z28.d, z25.d +; CHECK-NEXT: movprfx z19, z9 +; CHECK-NEXT: fcvtzs z19.d, p0/m, z9.d +; CHECK-NEXT: fcmge p3.d, p0/z, z9.d, z26.d +; CHECK-NEXT: movprfx z21, z12 +; CHECK-NEXT: fcvtzs z21.d, p0/m, z12.d +; CHECK-NEXT: movprfx z22, z13 +; CHECK-NEXT: fcvtzs z22.d, p0/m, z13.d +; CHECK-NEXT: fcmge p5.d, p0/z, z13.d, z26.d +; CHECK-NEXT: fcmge p4.d, p0/z, z12.d, z26.d +; CHECK-NEXT: fcmge p8.d, p0/z, z17.d, z26.d +; CHECK-NEXT: movprfx z1, z17 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z17.d +; CHECK-NEXT: movprfx z20, z29 +; CHECK-NEXT: fcvtzs z20.d, p0/m, z29.d +; CHECK-NEXT: fcmge p2.d, p0/z, z29.d, z26.d ; CHECK-NEXT: movprfx z23, z15 ; CHECK-NEXT: fcvtzs z23.d, p0/m, z15.d -; CHECK-NEXT: not p5.b, p0/z, p7.b -; CHECK-NEXT: sel z18.d, p6, z26.d, z20.d -; CHECK-NEXT: fcmge p6.d, p0/z, z21.d, z7.d -; CHECK-NEXT: not p7.b, p0/z, p8.b -; CHECK-NEXT: fcmge p8.d, p0/z, z22.d, z7.d -; CHECK-NEXT: movprfx z20, z13 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z13.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z7.d, #0x7fffffffffffffff -; CHECK-NEXT: mov z19.d, p5/m, z26.d -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z0.d, p4/m, z26.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z21.d, z4.d -; CHECK-NEXT: not p5.b, p0/z, p6.b -; CHECK-NEXT: mov z23.d, p3/m, z26.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z22.d, z4.d -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: mov z20.d, p7/m, z26.d -; CHECK-NEXT: fcmuo p8.d, p0/z, z22.d, z22.d +; CHECK-NEXT: fcmge p6.d, p0/z, z15.d, z26.d +; CHECK-NEXT: fcmge p7.d, p0/z, z16.d, z26.d +; CHECK-NEXT: fcmge p9.d, p0/z, z18.d, z26.d +; CHECK-NEXT: movprfx z0, z16 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z16.d +; CHECK-NEXT: movprfx z3, z18 +; CHECK-NEXT: fcvtzs z3.d, 
p0/m, z18.d +; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff +; CHECK-NEXT: sel z22.d, p5, z22.d, z25.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z17.d, z2.d +; CHECK-NEXT: sel z1.d, p8, z1.d, z25.d +; CHECK-NEXT: sel z19.d, p3, z19.d, z25.d +; CHECK-NEXT: sel z20.d, p2, z20.d, z25.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z8.d, z2.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z4.d, z2.d +; CHECK-NEXT: sel z21.d, p4, z21.d, z25.d +; CHECK-NEXT: sel z23.d, p6, z23.d, z25.d +; CHECK-NEXT: sel z0.d, p7, z0.d, z25.d +; CHECK-NEXT: sel z3.d, p9, z3.d, z25.d +; CHECK-NEXT: sel z25.d, p1, z26.d, z30.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z16.d, z2.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z18.d, z2.d +; CHECK-NEXT: fcmuo p6.d, p0/z, z17.d, z17.d +; CHECK-NEXT: ldr z30, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov z1.d, p5/m, z26.d -; CHECK-NEXT: fcmuo p5.d, p0/z, z21.d, z21.d -; CHECK-NEXT: fcmgt p7.d, p0/z, z25.d, z4.d -; CHECK-NEXT: mov z2.d, p6/m, z26.d -; CHECK-NEXT: sel z26.d, p1, z7.d, z29.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z16.d, z4.d -; CHECK-NEXT: ldr z29, [sp] // 16-byte Folded Reload -; CHECK-NEXT: fcmgt p6.d, p0/z, z5.d, z4.d -; CHECK-NEXT: mov z24.d, p9/m, z7.d -; CHECK-NEXT: mov z1.d, p4/m, z7.d +; CHECK-NEXT: mov z31.d, p3/m, z26.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z15.d, z2.d +; CHECK-NEXT: fcmuo p9.d, p0/z, z18.d, z18.d +; CHECK-NEXT: fcmuo p5.d, p0/z, z15.d, z15.d +; CHECK-NEXT: fcmgt p8.d, p0/z, z24.d, z2.d +; CHECK-NEXT: mov z30.d, p2/m, z26.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z13.d, z2.d +; CHECK-NEXT: mov z0.d, p1/m, z26.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z12.d, z2.d +; CHECK-NEXT: mov z3.d, p4/m, z26.d ; CHECK-NEXT: fcmuo p4.d, p0/z, z16.d, z16.d -; CHECK-NEXT: mov z2.d, p3/m, z7.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z15.d, z4.d -; CHECK-NEXT: mov z17.d, p7/m, z7.d -; CHECK-NEXT: mov z29.d, p2/m, z7.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z13.d, z4.d -; CHECK-NEXT: mov z0.d, p1/m, z7.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z10.d, z4.d -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: mov z11.d, 
p6/m, z7.d -; CHECK-NEXT: fcmuo p6.d, p0/z, z15.d, z15.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z8.d, z4.d -; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 -; CHECK-NEXT: sel z16.d, p3, z7.d, z23.d -; CHECK-NEXT: fcmuo p3.d, p0/z, z10.d, z10.d -; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 -; CHECK-NEXT: sel z15.d, p2, z7.d, z20.d +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 +; CHECK-NEXT: sel z16.d, p3, z26.d, z23.d +; CHECK-NEXT: fcmuo p3.d, p0/z, z12.d, z12.d +; CHECK-NEXT: fcmgt p6.d, p0/z, z9.d, z2.d +; CHECK-NEXT: fcmgt p7.d, p0/z, z5.d, z2.d +; CHECK-NEXT: mov z14.d, p8/m, z26.d +; CHECK-NEXT: sel z15.d, p2, z26.d, z22.d ; CHECK-NEXT: fcmuo p2.d, p0/z, z13.d, z13.d +; CHECK-NEXT: mov z3.d, p9/m, #0 // =0x0 ; CHECK-NEXT: str z1, [x8, #14, mul vl] -; CHECK-NEXT: sel z1.d, p1, z7.d, z19.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z28.d, z4.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z27.d, z4.d -; CHECK-NEXT: str z2, [x8, #15, mul vl] -; CHECK-NEXT: sel z2.d, p5, z7.d, z18.d -; CHECK-NEXT: mov z16.d, p6/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.d, p0/z, z8.d, z8.d -; CHECK-NEXT: str z0, [x8, #13, mul vl] +; CHECK-NEXT: sel z1.d, p1, z26.d, z21.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z29.d, z2.d +; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p4.d, p0/z, z27.d, z2.d +; CHECK-NEXT: mov z16.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.d, p0/z, z9.d, z9.d +; CHECK-NEXT: str z3, [x8, #15, mul vl] +; CHECK-NEXT: sel z3.d, p6, z26.d, z19.d ; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.d, p0/z, z28.d, z28.d +; CHECK-NEXT: fcmuo p2.d, p0/z, z29.d, z29.d ; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p3.d, p0/z, z6.d, z4.d -; CHECK-NEXT: sel z0.d, p1, z7.d, z14.d +; CHECK-NEXT: str z0, [x8, #13, mul vl] +; CHECK-NEXT: fcmgt p3.d, p0/z, z6.d, z2.d +; CHECK-NEXT: sel z0.d, p1, z26.d, z20.d ; CHECK-NEXT: fcmuo p1.d, p0/z, z27.d, z27.d -; CHECK-NEXT: sel z27.d, p4, z7.d, z9.d +; CHECK-NEXT: sel z27.d, p4, z26.d, z11.d ; CHECK-NEXT: str z16, [x8, #12, mul vl] -; CHECK-NEXT: 
fcmuo p4.d, p0/z, z25.d, z25.d ; CHECK-NEXT: str z15, [x8, #11, mul vl] -; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.d, p0/z, z6.d, z6.d +; CHECK-NEXT: mov z3.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.d, p0/z, z24.d, z24.d ; CHECK-NEXT: str z1, [x8, #10, mul vl] ; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p3, z7.d, z31.d +; CHECK-NEXT: fcmuo p5.d, p0/z, z6.d, z6.d +; CHECK-NEXT: sel z1.d, p3, z26.d, z28.d ; CHECK-NEXT: fcmuo p3.d, p0/z, z5.d, z5.d ; CHECK-NEXT: ldr z5, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: mov z27.d, p1/m, #0 // =0x0 -; CHECK-NEXT: str z2, [x8, #9, mul vl] -; CHECK-NEXT: fcmuo p1.d, p0/z, z3.d, z3.d +; CHECK-NEXT: str z3, [x8, #9, mul vl] +; CHECK-NEXT: fcmuo p1.d, p0/z, z4.d, z4.d ; CHECK-NEXT: str z0, [x8, #8, mul vl] -; CHECK-NEXT: mov z17.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.d, p0/z, z30.d, z30.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z5.d, z4.d +; CHECK-NEXT: mov z14.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.d, p0/z, z8.d, z8.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z5.d, z2.d ; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.d, p0/z, z12.d, z12.d +; CHECK-NEXT: fcmuo p5.d, p0/z, z10.d, z10.d ; CHECK-NEXT: str z27, [x8, #7, mul vl] ; CHECK-NEXT: fcmuo p0.d, p0/z, z5.d, z5.d -; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z7.d, p7/m, z26.d ; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: mov z24.d, p1/m, #0 // =0x0 -; CHECK-NEXT: str z17, [x8, #6, mul vl] -; CHECK-NEXT: mov z29.d, p4/m, #0 // =0x0 +; CHECK-NEXT: mov z31.d, p1/m, #0 // =0x0 +; CHECK-NEXT: str z14, [x8, #6, mul vl] +; CHECK-NEXT: mov z30.d, p4/m, #0 // =0x0 ; CHECK-NEXT: str z1, [x8, #5, mul vl] -; CHECK-NEXT: mov z26.d, p5/m, #0 // =0x0 -; CHECK-NEXT: str z11, [x8, #4, mul vl] -; CHECK-NEXT: mov z0.d, p2/m, z7.d -; CHECK-NEXT: str z24, [x8, #3, mul vl] -; CHECK-NEXT: str z29, [x8, #2, mul vl] -; CHECK-NEXT: str z26, [x8, #1, mul vl] +; CHECK-NEXT: mov z7.d, 
p3/m, #0 // =0x0 +; CHECK-NEXT: mov z25.d, p5/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p2/m, z26.d +; CHECK-NEXT: str z31, [x8, #3, mul vl] +; CHECK-NEXT: str z30, [x8, #2, mul vl] +; CHECK-NEXT: str z7, [x8, #4, mul vl] ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 +; CHECK-NEXT: str z25, [x8, #1, mul vl] ; CHECK-NEXT: str z0, [x8] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-lrint.ll b/llvm/test/CodeGen/AArch64/sve-lrint.ll index 908ba2392a437..fd010e7ef0d61 100644 --- a/llvm/test/CodeGen/AArch64/sve-lrint.ll +++ b/llvm/test/CodeGen/AArch64/sve-lrint.ll @@ -7,17 +7,16 @@ define @lrint_v1f16( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: mov z3.h, w8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.h ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.h +; CHECK-NEXT: mov z1.d, #0x8000000000000000 ; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z3.h ; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -33,17 +32,16 @@ define @lrint_v2f16( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: mov z3.h, w8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.h ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.h +; CHECK-NEXT: mov z1.d, #0x8000000000000000 ; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z3.h ; 
CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -68,27 +66,25 @@ define @lrint_v4f16( %x) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z3.h, w8 -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: mov z5.h, w8 ; CHECK-NEXT: frintx z1.h, p0/m, z1.h ; CHECK-NEXT: frintx z0.h, p0/m, z0.h -; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h -; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.h -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z3.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z3.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d +; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h +; CHECK-NEXT: movprfx z6, z0 +; CHECK-NEXT: fcvtzs z6.d, p0/m, z0.h +; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z5.h +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z5.h +; CHECK-NEXT: sel z4.d, p1, z4.d, z3.d ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z1.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d +; CHECK-NEXT: mov z3.d, p2/m, z6.d +; CHECK-NEXT: sel z0.d, p3, z2.d, z4.d +; CHECK-NEXT: sel z1.d, p4, z2.d, z3.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -118,7 +114,6 @@ define @lrint_v8f16( %x) { ; CHECK-NEXT: mov z4.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: mov z6.h, w8 -; CHECK-NEXT: mov z26.d, 
#0x7fffffffffffffff ; CHECK-NEXT: uunpklo z2.d, z1.s ; CHECK-NEXT: uunpkhi z1.d, z1.s ; CHECK-NEXT: uunpklo z3.d, z0.s @@ -129,45 +124,42 @@ define @lrint_v8f16( %x) { ; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: frintx z5.h, p0/m, z0.h ; CHECK-NEXT: mov z0.d, #0x8000000000000000 +; CHECK-NEXT: movprfx z7, z2 +; CHECK-NEXT: fcvtzs z7.d, p0/m, z2.h ; CHECK-NEXT: fcmge p1.h, p0/z, z2.h, z4.h +; CHECK-NEXT: movprfx z24, z1 +; CHECK-NEXT: fcvtzs z24.d, p0/m, z1.h ; CHECK-NEXT: fcmge p2.h, p0/z, z1.h, z4.h +; CHECK-NEXT: movprfx z25, z3 +; CHECK-NEXT: fcvtzs z25.d, p0/m, z3.h ; CHECK-NEXT: fcmge p3.h, p0/z, z3.h, z4.h +; CHECK-NEXT: movprfx z26, z5 +; CHECK-NEXT: fcvtzs z26.d, p0/m, z5.h ; CHECK-NEXT: fcmge p4.h, p0/z, z5.h, z4.h -; CHECK-NEXT: movprfx z4, z2 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z2.h -; CHECK-NEXT: movprfx z7, z1 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z1.h -; CHECK-NEXT: movprfx z24, z3 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z3.h -; CHECK-NEXT: movprfx z25, z5 -; CHECK-NEXT: fcvtzs z25.d, p0/m, z5.h +; CHECK-NEXT: mov z4.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmgt p7.h, p0/z, z3.h, z6.h ; CHECK-NEXT: fcmgt p5.h, p0/z, z2.h, z6.h ; CHECK-NEXT: fcmgt p6.h, p0/z, z1.h, z6.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z4.d, p1/m, z0.d +; CHECK-NEXT: sel z7.d, p1, z7.d, z0.d ; CHECK-NEXT: fcmgt p1.h, p0/z, z5.h, z6.h -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: sel z6.d, p2, z0.d, z7.d +; CHECK-NEXT: sel z6.d, p2, z24.d, z0.d +; CHECK-NEXT: sel z24.d, p3, z25.d, z0.d ; CHECK-NEXT: fcmuo p2.h, p0/z, z2.h, z2.h -; CHECK-NEXT: sel z7.d, p3, z0.d, z24.d -; CHECK-NEXT: fcmuo p3.h, p0/z, z1.h, z1.h -; CHECK-NEXT: sel z24.d, p4, z0.d, z25.d +; CHECK-NEXT: sel z25.d, p4, z26.d, z0.d ; CHECK-NEXT: fcmuo p4.h, p0/z, z3.h, z3.h +; CHECK-NEXT: fcmuo p3.h, p0/z, z1.h, z1.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z5.h, z5.h -; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d -; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d -; 
CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z2.d, p7, z26.d, z7.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z26.d, z24.d +; CHECK-NEXT: sel z0.d, p5, z4.d, z7.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel z2.d, p7, z4.d, z24.d +; CHECK-NEXT: sel z1.d, p6, z4.d, z6.d +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel z3.d, p1, z4.d, z25.d +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -199,113 +191,105 @@ define @lrint_v16f16( %x) { ; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: uunpklo z4.s, z1.h -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEXT: mov z5.h, w8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z24.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z25.d, #0x8000000000000000 +; CHECK-NEXT: mov z5.d, #0x8000000000000000 ; CHECK-NEXT: mov z27.h, w8 ; CHECK-NEXT: mov z7.d, #0x7fffffffffffffff ; CHECK-NEXT: uunpklo z3.d, z2.s ; CHECK-NEXT: uunpkhi z2.d, z2.s ; CHECK-NEXT: uunpklo z6.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: uunpklo z24.d, z4.s +; CHECK-NEXT: uunpklo z25.d, z4.s ; CHECK-NEXT: uunpkhi z4.d, z4.s ; CHECK-NEXT: uunpklo z26.d, z1.s ; CHECK-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEXT: frintx z2.h, p0/m, z2.h ; CHECK-NEXT: frintx z3.h, p0/m, z3.h +; CHECK-NEXT: frintx z2.h, p0/m, z2.h ; CHECK-NEXT: frintx z6.h, p0/m, z6.h ; CHECK-NEXT: movprfx z28, z0 ; CHECK-NEXT: frintx z28.h, p0/m, z0.h -; CHECK-NEXT: movprfx z29, z4 -; CHECK-NEXT: frintx 
z29.h, p0/m, z4.h -; CHECK-NEXT: frintx z24.h, p0/m, z24.h -; CHECK-NEXT: movprfx z30, z1 -; CHECK-NEXT: frintx z30.h, p0/m, z1.h +; CHECK-NEXT: movprfx z30, z4 +; CHECK-NEXT: frintx z30.h, p0/m, z4.h +; CHECK-NEXT: frintx z25.h, p0/m, z25.h ; CHECK-NEXT: frintx z26.h, p0/m, z26.h -; CHECK-NEXT: fcmge p5.h, p0/z, z2.h, z5.h -; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, z5.h -; CHECK-NEXT: movprfx z1, z2 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z2.h +; CHECK-NEXT: movprfx z31, z1 +; CHECK-NEXT: frintx z31.h, p0/m, z1.h ; CHECK-NEXT: movprfx z0, z3 ; CHECK-NEXT: fcvtzs z0.d, p0/m, z3.h -; CHECK-NEXT: fcmge p6.h, p0/z, z6.h, z5.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z3.h, z27.h +; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, z24.h +; CHECK-NEXT: movprfx z29, z2 +; CHECK-NEXT: fcvtzs z29.d, p0/m, z2.h +; CHECK-NEXT: fcmge p3.h, p0/z, z2.h, z24.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z3.h, z27.h ; CHECK-NEXT: fcmuo p1.h, p0/z, z3.h, z3.h -; CHECK-NEXT: fcmge p7.h, p0/z, z28.h, z5.h -; CHECK-NEXT: movprfx z3, z6 -; CHECK-NEXT: fcvtzs z3.d, p0/m, z6.h -; CHECK-NEXT: fcmge p8.h, p0/z, z24.h, z5.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z2.h, z27.h -; CHECK-NEXT: fcmge p9.h, p0/z, z26.h, z5.h -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: movprfx z4, z24 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z24.h -; CHECK-NEXT: fcmge p10.h, p0/z, z30.h, z5.h -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: movprfx z31, z26 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z26.h +; CHECK-NEXT: movprfx z1, z6 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z6.h +; CHECK-NEXT: fcmge p6.h, p0/z, z6.h, z24.h +; CHECK-NEXT: movprfx z4, z28 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z28.h ; CHECK-NEXT: movprfx z8, z30 ; CHECK-NEXT: fcvtzs z8.d, p0/m, z30.h -; CHECK-NEXT: mov z1.d, p5/m, z25.d -; CHECK-NEXT: fcmge p5.h, p0/z, z29.h, z5.h -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: mov z0.d, p2/m, z25.d -; CHECK-NEXT: fcmuo p2.h, p0/z, z2.h, z2.h -; CHECK-NEXT: movprfx z2, z28 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z28.h -; CHECK-NEXT: movprfx z5, z29 -; CHECK-NEXT: 
fcvtzs z5.d, p0/m, z29.h -; CHECK-NEXT: not p7.b, p0/z, p7.b -; CHECK-NEXT: mov z3.d, p6/m, z25.d -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: fcmgt p8.h, p0/z, z6.h, z27.h -; CHECK-NEXT: mov z1.d, p4/m, z7.d -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: mov z0.d, p3/m, z7.d -; CHECK-NEXT: fcmgt p3.h, p0/z, z29.h, z27.h -; CHECK-NEXT: sel z9.d, p7, z25.d, z2.d -; CHECK-NEXT: not p7.b, p0/z, p9.b -; CHECK-NEXT: mov z4.d, p6/m, z25.d -; CHECK-NEXT: not p6.b, p0/z, p10.b -; CHECK-NEXT: fcmgt p10.h, p0/z, z28.h, z27.h -; CHECK-NEXT: mov z5.d, p5/m, z25.d -; CHECK-NEXT: fcmgt p5.h, p0/z, z24.h, z27.h -; CHECK-NEXT: fcmuo p9.h, p0/z, z6.h, z6.h -; CHECK-NEXT: sel z6.d, p7, z25.d, z31.d -; CHECK-NEXT: sel z25.d, p6, z25.d, z8.d -; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: fcmgt p6.h, p0/z, z26.h, z27.h -; CHECK-NEXT: fcmgt p7.h, p0/z, z30.h, z27.h -; CHECK-NEXT: fcmuo p4.h, p0/z, z28.h, z28.h -; CHECK-NEXT: sel z2.d, p8, z7.d, z3.d -; CHECK-NEXT: sel z3.d, p10, z7.d, z9.d +; CHECK-NEXT: fcmge p7.h, p0/z, z30.h, z24.h +; CHECK-NEXT: movprfx z9, z26 +; CHECK-NEXT: fcvtzs z9.d, p0/m, z26.h +; CHECK-NEXT: sel z0.d, p2, z0.d, z5.d +; CHECK-NEXT: fcmge p2.h, p0/z, z28.h, z24.h +; CHECK-NEXT: fcmge p8.h, p0/z, z26.h, z24.h +; CHECK-NEXT: sel z3.d, p3, z29.d, z5.d +; CHECK-NEXT: movprfx z29, z25 +; CHECK-NEXT: fcvtzs z29.d, p0/m, z25.h +; CHECK-NEXT: fcmge p3.h, p0/z, z25.h, z24.h +; CHECK-NEXT: fcmge p9.h, p0/z, z31.h, z24.h +; CHECK-NEXT: movprfx z24, z31 +; CHECK-NEXT: fcvtzs z24.d, p0/m, z31.h +; CHECK-NEXT: fcmgt p5.h, p0/z, z2.h, z27.h +; CHECK-NEXT: fcmuo p10.h, p0/z, z2.h, z2.h +; CHECK-NEXT: sel z2.d, p6, z1.d, z5.d +; CHECK-NEXT: fcmgt p6.h, p0/z, z6.h, z27.h +; CHECK-NEXT: sel z4.d, p2, z4.d, z5.d +; CHECK-NEXT: fcmuo p2.h, p0/z, z6.h, z6.h +; CHECK-NEXT: mov z0.d, p4/m, z7.d +; CHECK-NEXT: sel z6.d, p3, z29.d, z5.d +; CHECK-NEXT: fcmgt p3.h, p0/z, z28.h, z27.h +; CHECK-NEXT: sel z29.d, p7, z8.d, z5.d +; CHECK-NEXT: fcmgt 
p7.h, p0/z, z25.h, z27.h +; CHECK-NEXT: sel z8.d, p8, z9.d, z5.d ; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: fcmuo p8.h, p0/z, z29.h, z29.h -; CHECK-NEXT: mov z4.d, p5/m, z7.d -; CHECK-NEXT: fcmuo p5.h, p0/z, z24.h, z24.h -; CHECK-NEXT: fcmuo p10.h, p0/z, z26.h, z26.h -; CHECK-NEXT: mov z5.d, p3/m, z7.d -; CHECK-NEXT: mov z6.d, p6/m, z7.d -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: fcmuo p0.h, p0/z, z30.h, z30.h -; CHECK-NEXT: sel z7.d, p7, z7.d, z25.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z2.d, p9/m, #0 // =0x0 -; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z3.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p4.h, p0/z, z30.h, z27.h +; CHECK-NEXT: fcmgt p8.h, p0/z, z26.h, z27.h +; CHECK-NEXT: sel z24.d, p9, z24.d, z5.d +; CHECK-NEXT: fcmgt p9.h, p0/z, z31.h, z27.h +; CHECK-NEXT: sel z1.d, p5, z7.d, z3.d +; CHECK-NEXT: fcmuo p5.h, p0/z, z28.h, z28.h +; CHECK-NEXT: mov z2.d, p6/m, z7.d +; CHECK-NEXT: sel z3.d, p3, z7.d, z4.d +; CHECK-NEXT: fcmuo p6.h, p0/z, z30.h, z30.h +; CHECK-NEXT: sel z4.d, p7, z7.d, z6.d +; CHECK-NEXT: fcmuo p7.h, p0/z, z26.h, z26.h +; CHECK-NEXT: fcmuo p3.h, p0/z, z25.h, z25.h +; CHECK-NEXT: sel z5.d, p4, z7.d, z29.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z4.d, p5/m, #0 // =0x0 -; CHECK-NEXT: mov z5.d, p8/m, #0 // =0x0 +; CHECK-NEXT: sel z6.d, p8, z7.d, z8.d +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z6.d, p10/m, #0 // =0x0 -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p0.h, p0/z, z31.h, z31.h +; CHECK-NEXT: sel z7.d, p9, z7.d, z24.d +; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p10/m, #0 // =0x0 ; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 
-; CHECK-NEXT: mov z7.d, p0/m, #0 // =0x0 +; CHECK-NEXT: mov z3.d, p5/m, #0 // =0x0 +; CHECK-NEXT: mov z5.d, p6/m, #0 // =0x0 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z6.d, p7/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z2.d, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z4.d, p3/m, #0 // =0x0 ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z7.d, p0/m, #0 // =0x0 ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -341,8 +325,8 @@ define @lrint_v32f16( %x) { ; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 168 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG @@ -353,230 +337,219 @@ define @lrint_v32f16( %x) { ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; CHECK-NEXT: uunpklo z4.s, z0.h -; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: mov w9, #64511 // =0xfbff ; CHECK-NEXT: ptrue p0.d -; 
CHECK-NEXT: uunpklo z6.s, z1.h -; CHECK-NEXT: mov z26.h, w9 -; CHECK-NEXT: uunpkhi z25.s, z1.h +; CHECK-NEXT: mov z28.h, w9 ; CHECK-NEXT: mov w9, #31743 // =0x7bff +; CHECK-NEXT: uunpklo z24.s, z1.h +; CHECK-NEXT: uunpkhi z25.s, z1.h ; CHECK-NEXT: mov z27.d, #0x8000000000000000 -; CHECK-NEXT: uunpklo z31.s, z2.h -; CHECK-NEXT: uunpkhi z12.s, z2.h -; CHECK-NEXT: mov z17.d, z3.d -; CHECK-NEXT: uunpklo z0.d, z4.s +; CHECK-NEXT: uunpklo z8.s, z2.h +; CHECK-NEXT: uunpkhi z15.s, z3.h +; CHECK-NEXT: uunpklo z13.s, z3.h +; CHECK-NEXT: uunpklo z5.d, z4.s +; CHECK-NEXT: uunpkhi z7.d, z0.s +; CHECK-NEXT: uunpklo z6.d, z0.s ; CHECK-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEXT: uunpklo z7.d, z5.s -; CHECK-NEXT: uunpkhi z24.d, z5.s -; CHECK-NEXT: uunpklo z28.d, z6.s -; CHECK-NEXT: uunpkhi z29.d, z6.s -; CHECK-NEXT: uunpklo z8.d, z25.s -; CHECK-NEXT: uunpkhi z9.d, z25.s -; CHECK-NEXT: uunpklo z16.s, z17.h -; CHECK-NEXT: uunpklo z11.d, z31.s -; CHECK-NEXT: uunpkhi z14.d, z31.s -; CHECK-NEXT: uunpkhi z17.s, z17.h -; CHECK-NEXT: movprfx z30, z4 -; CHECK-NEXT: frintx z30.h, p0/m, z4.h -; CHECK-NEXT: movprfx z4, z7 -; CHECK-NEXT: frintx z4.h, p0/m, z7.h -; CHECK-NEXT: frintx z0.h, p0/m, z0.h -; CHECK-NEXT: movprfx z6, z24 -; CHECK-NEXT: frintx z6.h, p0/m, z24.h -; CHECK-NEXT: movprfx z7, z28 -; CHECK-NEXT: frintx z7.h, p0/m, z28.h -; CHECK-NEXT: movprfx z25, z29 -; CHECK-NEXT: frintx z25.h, p0/m, z29.h -; CHECK-NEXT: movprfx z3, z9 -; CHECK-NEXT: frintx z3.h, p0/m, z9.h -; CHECK-NEXT: mov z5.h, w9 -; CHECK-NEXT: movprfx z31, z11 -; CHECK-NEXT: frintx z31.h, p0/m, z11.h -; CHECK-NEXT: movprfx z9, z14 -; CHECK-NEXT: frintx z9.h, p0/m, z14.h -; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z26.h -; CHECK-NEXT: fcmge p4.h, p0/z, z4.h, z26.h -; CHECK-NEXT: movprfx z24, z0 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.h -; CHECK-NEXT: fcmge p2.h, p0/z, z30.h, z26.h -; CHECK-NEXT: movprfx z29, z4 -; CHECK-NEXT: fcvtzs z29.d, p0/m, z4.h -; CHECK-NEXT: fcmge p6.h, p0/z, z6.h, z26.h -; CHECK-NEXT: movprfx z28, z30 -; 
CHECK-NEXT: fcvtzs z28.d, p0/m, z30.h -; CHECK-NEXT: movprfx z10, z6 -; CHECK-NEXT: fcvtzs z10.d, p0/m, z6.h -; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: fcmge p3.h, p0/z, z7.h, z26.h -; CHECK-NEXT: movprfx z13, z7 -; CHECK-NEXT: fcvtzs z13.d, p0/m, z7.h -; CHECK-NEXT: movprfx z15, z25 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z25.h -; CHECK-NEXT: not p5.b, p0/z, p1.b -; CHECK-NEXT: movprfx z18, z3 -; CHECK-NEXT: fcvtzs z18.d, p0/m, z3.h -; CHECK-NEXT: movprfx z20, z31 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z31.h -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: movprfx z21, z9 -; CHECK-NEXT: fcvtzs z21.d, p0/m, z9.h -; CHECK-NEXT: fcmgt p1.h, p0/z, z30.h, z5.h -; CHECK-NEXT: sel z0.d, p5, z27.d, z24.d -; CHECK-NEXT: not p7.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p2.h, p0/z, z4.h, z5.h -; CHECK-NEXT: mov z29.d, p4/m, z27.d -; CHECK-NEXT: fcmge p4.h, p0/z, z25.h, z26.h -; CHECK-NEXT: not p5.b, p0/z, p6.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: fcmge p6.h, p0/z, z9.h, z26.h -; CHECK-NEXT: fcmgt p9.h, p0/z, z6.h, z5.h -; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: sel z0.d, p7, z27.d, z28.d -; CHECK-NEXT: movprfx z28, z8 -; CHECK-NEXT: frintx z28.h, p0/m, z8.h -; CHECK-NEXT: sel z8.d, p5, z27.d, z10.d -; CHECK-NEXT: uunpklo z10.d, z12.s -; CHECK-NEXT: uunpkhi z12.d, z12.s -; CHECK-NEXT: not p5.b, p0/z, p4.b -; CHECK-NEXT: sel z11.d, p3, z27.d, z13.d -; CHECK-NEXT: uunpklo z13.d, z16.s -; CHECK-NEXT: fcmge p3.h, p0/z, z3.h, z26.h -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: sel z24.d, p5, z27.d, z15.d -; CHECK-NEXT: uunpkhi z15.d, z16.s -; CHECK-NEXT: movprfx z14, z28 -; CHECK-NEXT: fcvtzs z14.d, p0/m, z28.h -; CHECK-NEXT: frintx z10.h, p0/m, z10.h -; CHECK-NEXT: uunpklo z16.d, z17.s -; CHECK-NEXT: frintx z12.h, p0/m, z12.h -; CHECK-NEXT: uunpkhi z17.d, z17.s -; CHECK-NEXT: movprfx z19, z13 -; CHECK-NEXT: frintx z19.h, p0/m, z13.h -; CHECK-NEXT: fcmge 
p4.h, p0/z, z28.h, z26.h -; CHECK-NEXT: fcmge p5.h, p0/z, z31.h, z26.h -; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: uunpkhi z26.d, z24.s +; CHECK-NEXT: uunpkhi z11.d, z25.s +; CHECK-NEXT: uunpklo z29.d, z25.s +; CHECK-NEXT: uunpkhi z16.d, z8.s +; CHECK-NEXT: uunpklo z21.d, z15.s +; CHECK-NEXT: uunpklo z20.d, z13.s +; CHECK-NEXT: movprfx z0, z5 +; CHECK-NEXT: frintx z0.h, p0/m, z5.h +; CHECK-NEXT: movprfx z5, z7 +; CHECK-NEXT: frintx z5.h, p0/m, z7.h +; CHECK-NEXT: uunpklo z7.d, z24.s +; CHECK-NEXT: movprfx z1, z4 +; CHECK-NEXT: frintx z1.h, p0/m, z4.h +; CHECK-NEXT: movprfx z4, z6 +; CHECK-NEXT: frintx z4.h, p0/m, z6.h +; CHECK-NEXT: uunpkhi z13.d, z13.s +; CHECK-NEXT: movprfx z25, z26 +; CHECK-NEXT: frintx z25.h, p0/m, z26.h +; CHECK-NEXT: movprfx z26, z29 +; CHECK-NEXT: frintx z26.h, p0/m, z29.h +; CHECK-NEXT: uunpkhi z15.d, z15.s +; CHECK-NEXT: mov z6.h, w9 +; CHECK-NEXT: movprfx z30, z0 +; CHECK-NEXT: fcvtzs z30.d, p0/m, z0.h +; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z28.h +; CHECK-NEXT: movprfx z10, z5 +; CHECK-NEXT: fcvtzs z10.d, p0/m, z5.h +; CHECK-NEXT: fcmge p5.h, p0/z, z5.h, z28.h +; CHECK-NEXT: str z0, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: movprfx z31, z1 +; CHECK-NEXT: fcvtzs z31.d, p0/m, z1.h +; CHECK-NEXT: fcmge p3.h, p0/z, z1.h, z28.h +; CHECK-NEXT: str z4, [sp] // 16-byte Folded Spill +; CHECK-NEXT: movprfx z9, z4 +; CHECK-NEXT: fcvtzs z9.d, p0/m, z4.h +; CHECK-NEXT: fcmge p4.h, p0/z, z4.h, z28.h +; CHECK-NEXT: movprfx z24, z7 +; CHECK-NEXT: frintx z24.h, p0/m, z7.h +; CHECK-NEXT: movprfx z4, z11 +; CHECK-NEXT: frintx z4.h, p0/m, z11.h +; CHECK-NEXT: sel z0.d, p2, z30.d, z27.d +; CHECK-NEXT: uunpklo z11.d, z8.s +; CHECK-NEXT: movprfx z14, z26 +; CHECK-NEXT: fcvtzs z14.d, p0/m, z26.h +; CHECK-NEXT: sel z30.d, p5, z10.d, z27.d +; CHECK-NEXT: uunpkhi z10.s, z2.h +; CHECK-NEXT: frintx z13.h, p0/m, z13.h +; CHECK-NEXT: sel z31.d, p3, z31.d, z27.d +; CHECK-NEXT: fcmge p3.h, p0/z, z25.h, z28.h ; CHECK-NEXT: frintx z15.h, p0/m, z15.h -; 
CHECK-NEXT: fcmge p7.h, p0/z, z10.h, z26.h -; CHECK-NEXT: frintx z16.h, p0/m, z16.h -; CHECK-NEXT: fcmge p8.h, p0/z, z12.h, z26.h -; CHECK-NEXT: frintx z17.h, p0/m, z17.h -; CHECK-NEXT: movprfx z23, z19 -; CHECK-NEXT: fcvtzs z23.d, p0/m, z19.h -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: sel z13.d, p3, z27.d, z18.d -; CHECK-NEXT: fcmge p3.h, p0/z, z19.h, z26.h -; CHECK-NEXT: movprfx z0, z15 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z15.h -; CHECK-NEXT: sel z22.d, p4, z27.d, z14.d -; CHECK-NEXT: sel z18.d, p6, z27.d, z21.d -; CHECK-NEXT: movprfx z21, z12 -; CHECK-NEXT: fcvtzs z21.d, p0/m, z12.h -; CHECK-NEXT: movprfx z1, z16 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z16.h -; CHECK-NEXT: sel z14.d, p5, z27.d, z20.d -; CHECK-NEXT: fcmge p4.h, p0/z, z15.h, z26.h +; CHECK-NEXT: sel z29.d, p4, z9.d, z27.d +; CHECK-NEXT: movprfx z12, z24 +; CHECK-NEXT: fcvtzs z12.d, p0/m, z24.h +; CHECK-NEXT: fcmge p2.h, p0/z, z24.h, z28.h +; CHECK-NEXT: movprfx z9, z25 +; CHECK-NEXT: fcvtzs z9.d, p0/m, z25.h +; CHECK-NEXT: movprfx z18, z4 +; CHECK-NEXT: fcvtzs z18.d, p0/m, z4.h +; CHECK-NEXT: fcmge p5.h, p0/z, z4.h, z28.h +; CHECK-NEXT: uunpkhi z19.d, z10.s +; CHECK-NEXT: uunpklo z17.d, z10.s +; CHECK-NEXT: movprfx z10, z16 +; CHECK-NEXT: frintx z10.h, p0/m, z16.h +; CHECK-NEXT: movprfx z3, z11 +; CHECK-NEXT: frintx z3.h, p0/m, z11.h +; CHECK-NEXT: fcmge p4.h, p0/z, z26.h, z28.h +; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sel z11.d, p2, z12.d, z27.d +; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z6.h +; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sel z8.d, p3, z9.d, z27.d +; CHECK-NEXT: sel z9.d, p5, z18.d, z27.d +; CHECK-NEXT: movprfx z18, z20 +; CHECK-NEXT: frintx z18.h, p0/m, z20.h +; CHECK-NEXT: movprfx z16, z19 +; CHECK-NEXT: frintx z16.h, p0/m, z19.h +; CHECK-NEXT: movprfx z19, z21 +; CHECK-NEXT: frintx z19.h, p0/m, z21.h +; CHECK-NEXT: movprfx z12, z17 +; CHECK-NEXT: frintx z12.h, p0/m, z17.h +; 
CHECK-NEXT: movprfx z17, z3 +; CHECK-NEXT: fcvtzs z17.d, p0/m, z3.h +; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, z28.h +; CHECK-NEXT: sel z14.d, p4, z14.d, z27.d ; CHECK-NEXT: movprfx z20, z10 ; CHECK-NEXT: fcvtzs z20.d, p0/m, z10.h -; CHECK-NEXT: movprfx z2, z17 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z17.h -; CHECK-NEXT: not p5.b, p0/z, p7.b -; CHECK-NEXT: fcmge p6.h, p0/z, z16.h, z26.h -; CHECK-NEXT: not p7.b, p0/z, p8.b -; CHECK-NEXT: fcmge p8.h, p0/z, z17.h, z26.h -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z20.d, p5/m, z27.d -; CHECK-NEXT: mov z21.d, p7/m, z27.d -; CHECK-NEXT: not p5.b, p0/z, p6.b -; CHECK-NEXT: mov z23.d, p3/m, z27.d -; CHECK-NEXT: fcmgt p3.h, p0/z, z17.h, z5.h -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: mov z0.d, p4/m, z27.d -; CHECK-NEXT: fcmgt p4.h, p0/z, z16.h, z5.h -; CHECK-NEXT: mov z1.d, p5/m, z27.d -; CHECK-NEXT: fcmuo p5.h, p0/z, z16.h, z16.h -; CHECK-NEXT: mov z29.d, p2/m, z26.d -; CHECK-NEXT: mov z2.d, p6/m, z27.d -; CHECK-NEXT: ldr z27, [sp] // 16-byte Folded Reload -; CHECK-NEXT: fcmgt p6.h, p0/z, z7.h, z5.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z12.h, z5.h -; CHECK-NEXT: fcmuo p8.h, p0/z, z17.h, z17.h -; CHECK-NEXT: fcmgt p7.h, p0/z, z28.h, z5.h -; CHECK-NEXT: mov z1.d, p4/m, z26.d -; CHECK-NEXT: fcmuo p4.h, p0/z, z15.h, z15.h -; CHECK-NEXT: mov z8.d, p9/m, z26.d -; CHECK-NEXT: mov z27.d, p1/m, z26.d -; CHECK-NEXT: fcmgt p1.h, p0/z, z15.h, z5.h -; CHECK-NEXT: mov z2.d, p3/m, z26.d -; CHECK-NEXT: fcmgt p3.h, p0/z, z19.h, z5.h -; CHECK-NEXT: mov z11.d, p6/m, z26.d +; CHECK-NEXT: fcmge p3.h, p0/z, z10.h, z28.h +; CHECK-NEXT: fcmge p7.h, p0/z, z13.h, z28.h +; CHECK-NEXT: movprfx z23, z18 +; CHECK-NEXT: fcvtzs z23.d, p0/m, z18.h +; CHECK-NEXT: fcmge p6.h, p0/z, z18.h, z28.h +; CHECK-NEXT: fcmge p9.h, p0/z, z15.h, z28.h +; CHECK-NEXT: movprfx z22, z16 +; CHECK-NEXT: fcvtzs z22.d, p0/m, z16.h +; CHECK-NEXT: fcmge p5.h, p0/z, z16.h, z28.h +; CHECK-NEXT: 
fcmge p8.h, p0/z, z19.h, z28.h +; CHECK-NEXT: movprfx z1, z19 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z19.h +; CHECK-NEXT: movprfx z21, z12 +; CHECK-NEXT: fcvtzs z21.d, p0/m, z12.h +; CHECK-NEXT: fcmge p4.h, p0/z, z12.h, z28.h +; CHECK-NEXT: movprfx z0, z13 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z13.h +; CHECK-NEXT: movprfx z2, z15 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z15.h +; CHECK-NEXT: ldr z7, [sp] // 16-byte Folded Reload +; CHECK-NEXT: mov z28.d, #0x7fffffffffffffff +; CHECK-NEXT: sel z17.d, p2, z17.d, z27.d +; CHECK-NEXT: sel z20.d, p3, z20.d, z27.d +; CHECK-NEXT: sel z22.d, p5, z22.d, z27.d +; CHECK-NEXT: fcmgt p5.h, p0/z, z19.h, z6.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z5.h, z6.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z7.h, z6.h +; CHECK-NEXT: sel z1.d, p8, z1.d, z27.d +; CHECK-NEXT: sel z21.d, p4, z21.d, z27.d +; CHECK-NEXT: sel z23.d, p6, z23.d, z27.d +; CHECK-NEXT: sel z0.d, p7, z0.d, z27.d +; CHECK-NEXT: sel z2.d, p9, z2.d, z27.d +; CHECK-NEXT: sel z27.d, p1, z28.d, z31.d +; CHECK-NEXT: fcmgt p1.h, p0/z, z13.h, z6.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z15.h, z6.h ; CHECK-NEXT: fcmuo p6.h, p0/z, z19.h, z19.h -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p5.h, p0/z, z9.h, z5.h -; CHECK-NEXT: sel z15.d, p2, z26.d, z21.d -; CHECK-NEXT: fcmuo p2.h, p0/z, z12.h, z12.h -; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 -; CHECK-NEXT: sel z16.d, p7, z26.d, z22.d -; CHECK-NEXT: mov z0.d, p1/m, z26.d -; CHECK-NEXT: fcmgt p1.h, p0/z, z10.h, z5.h +; CHECK-NEXT: mov z1.d, p5/m, z28.d +; CHECK-NEXT: fcmgt p7.h, p0/z, z24.h, z6.h +; CHECK-NEXT: mov z29.d, p2/m, z28.d +; CHECK-NEXT: fcmgt p2.h, p0/z, z16.h, z6.h +; CHECK-NEXT: fcmgt p8.h, p0/z, z26.h, z6.h +; CHECK-NEXT: mov z30.d, p3/m, z28.d +; CHECK-NEXT: fcmgt p3.h, p0/z, z18.h, z6.h +; CHECK-NEXT: fcmuo p9.h, p0/z, z15.h, z15.h +; CHECK-NEXT: mov z0.d, p1/m, z28.d +; CHECK-NEXT: fcmgt p1.h, p0/z, z12.h, z6.h +; CHECK-NEXT: mov z2.d, p4/m, z28.d +; CHECK-NEXT: fcmuo p4.h, p0/z, z13.h, z13.h +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 
+; CHECK-NEXT: fcmuo p5.h, p0/z, z18.h, z18.h +; CHECK-NEXT: sel z13.d, p2, z28.d, z22.d +; CHECK-NEXT: fcmuo p2.h, p0/z, z16.h, z16.h +; CHECK-NEXT: sel z31.d, p7, z28.d, z11.d +; CHECK-NEXT: sel z11.d, p8, z28.d, z14.d +; CHECK-NEXT: fcmgt p6.h, p0/z, z10.h, z6.h +; CHECK-NEXT: sel z14.d, p3, z28.d, z23.d ; CHECK-NEXT: str z1, [x8, #14, mul vl] -; CHECK-NEXT: sel z17.d, p3, z26.d, z23.d -; CHECK-NEXT: fcmuo p3.h, p0/z, z10.h, z10.h -; CHECK-NEXT: str z2, [x8, #15, mul vl] -; CHECK-NEXT: sel z2.d, p5, z26.d, z18.d -; CHECK-NEXT: fcmuo p5.h, p0/z, z9.h, z9.h +; CHECK-NEXT: sel z1.d, p1, z28.d, z21.d +; CHECK-NEXT: fcmgt p1.h, p0/z, z3.h, z6.h +; CHECK-NEXT: mov z2.d, p9/m, #0 // =0x0 ; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p4.h, p0/z, z3.h, z5.h -; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p1, z26.d, z20.d -; CHECK-NEXT: fcmgt p1.h, p0/z, z31.h, z5.h -; CHECK-NEXT: mov z17.d, p6/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.h, p0/z, z31.h, z31.h +; CHECK-NEXT: fcmuo p3.h, p0/z, z12.h, z12.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z4.h, z6.h +; CHECK-NEXT: mov z14.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.h, p0/z, z10.h, z10.h +; CHECK-NEXT: mov z13.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p2.h, p0/z, z3.h, z3.h +; CHECK-NEXT: str z2, [x8, #15, mul vl] +; CHECK-NEXT: sel z2.d, p6, z28.d, z20.d ; CHECK-NEXT: str z0, [x8, #13, mul vl] -; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.h, p0/z, z25.h, z25.h -; CHECK-NEXT: str z17, [x8, #12, mul vl] +; CHECK-NEXT: sel z0.d, p1, z28.d, z17.d ; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p3.h, p0/z, z25.h, z5.h -; CHECK-NEXT: str z15, [x8, #11, mul vl] -; CHECK-NEXT: sel z0.d, p1, z26.d, z14.d -; CHECK-NEXT: fcmuo p1.h, p0/z, z3.h, z3.h -; CHECK-NEXT: sel z3.d, p4, z26.d, z13.d -; CHECK-NEXT: fcmuo p4.h, p0/z, z28.h, z28.h +; CHECK-NEXT: str z14, [x8, #12, mul vl] +; CHECK-NEXT: fcmuo p1.h, p0/z, z4.h, z4.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z25.h, z6.h +; 
CHECK-NEXT: str z13, [x8, #11, mul vl] +; CHECK-NEXT: sel z3.d, p4, z28.d, z9.d +; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 ; CHECK-NEXT: str z1, [x8, #10, mul vl] -; CHECK-NEXT: sel z1.d, p3, z26.d, z24.d -; CHECK-NEXT: fcmuo p3.h, p0/z, z7.h, z7.h -; CHECK-NEXT: ldr z7, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: str z2, [x8, #9, mul vl] ; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.h, p0/z, z25.h, z25.h +; CHECK-NEXT: ldr z4, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: fcmuo p4.h, p0/z, z26.h, z26.h +; CHECK-NEXT: str z2, [x8, #9, mul vl] ; CHECK-NEXT: mov z3.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p1.h, p0/z, z6.h, z6.h -; CHECK-NEXT: mov z16.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.h, p0/z, z4.h, z4.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z7.h, z5.h -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.h, p0/z, z30.h, z30.h +; CHECK-NEXT: sel z1.d, p3, z28.d, z8.d ; CHECK-NEXT: str z0, [x8, #8, mul vl] -; CHECK-NEXT: fcmuo p0.h, p0/z, z7.h, z7.h -; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p3.h, p0/z, z24.h, z24.h +; CHECK-NEXT: fcmuo p1.h, p0/z, z5.h, z5.h +; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: fcmgt p2.h, p0/z, z4.h, z6.h ; CHECK-NEXT: str z3, [x8, #7, mul vl] +; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: mov z11.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.h, p0/z, z7.h, z7.h +; CHECK-NEXT: fcmuo p5.h, p0/z, z0.h, z0.h ; CHECK-NEXT: ldr z0, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: str z16, [x8, #6, mul vl] -; CHECK-NEXT: mov z8.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p0.h, p0/z, z4.h, z4.h +; CHECK-NEXT: mov z31.d, p3/m, #0 // =0x0 +; CHECK-NEXT: str z11, [x8, #6, mul vl] +; CHECK-NEXT: mov z30.d, p1/m, #0 // =0x0 ; CHECK-NEXT: str z1, [x8, #5, mul vl] +; CHECK-NEXT: mov z0.d, p2/m, z28.d ; CHECK-NEXT: mov z29.d, p4/m, #0 // =0x0 ; CHECK-NEXT: mov z27.d, p5/m, #0 // =0x0 -; CHECK-NEXT: str z11, [x8, #4, mul vl] -; 
CHECK-NEXT: str z8, [x8, #3, mul vl] -; CHECK-NEXT: mov z0.d, p2/m, z26.d +; CHECK-NEXT: str z31, [x8, #4, mul vl] +; CHECK-NEXT: str z30, [x8, #3, mul vl] +; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: str z29, [x8, #2, mul vl] ; CHECK-NEXT: str z27, [x8, #1, mul vl] -; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: str z0, [x8] -; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -612,17 +585,16 @@ define @lrint_v1f32( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff ; CHECK-NEXT: frintx z0.s, p0/m, z0.s ; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.s ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.s +; CHECK-NEXT: mov z1.d, #0x8000000000000000 ; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z3.s ; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -638,17 +610,16 @@ define @lrint_v2f32( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff ; CHECK-NEXT: frintx z0.s, p0/m, z0.s ; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.s ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.s +; CHECK-NEXT: mov z1.d, #0x8000000000000000 ; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z3.s ; 
CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -673,27 +644,25 @@ define @lrint_v4f32( %x) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff -; CHECK-NEXT: mov z3.s, w8 -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: mov z5.s, w8 ; CHECK-NEXT: frintx z1.s, p0/m, z1.s ; CHECK-NEXT: frintx z0.s, p0/m, z0.s -; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s -; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.s -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z3.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z3.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d +; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s +; CHECK-NEXT: movprfx z6, z0 +; CHECK-NEXT: fcvtzs z6.d, p0/m, z0.s +; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s +; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z5.s +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z5.s +; CHECK-NEXT: sel z4.d, p1, z4.d, z3.d ; CHECK-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d +; CHECK-NEXT: mov z3.d, p2/m, z6.d +; CHECK-NEXT: sel z0.d, p3, z2.d, z4.d +; CHECK-NEXT: sel z1.d, p4, z2.d, z3.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -726,50 +695,46 @@ define @lrint_v8f32( %x) { ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff ; CHECK-NEXT: mov z5.d, #0x8000000000000000 ; CHECK-NEXT: mov z6.s, 
w8 -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff ; CHECK-NEXT: frintx z2.s, p0/m, z2.s ; CHECK-NEXT: frintx z0.s, p0/m, z0.s ; CHECK-NEXT: frintx z3.s, p0/m, z3.s ; CHECK-NEXT: frintx z1.s, p0/m, z1.s +; CHECK-NEXT: movprfx z7, z2 +; CHECK-NEXT: fcvtzs z7.d, p0/m, z2.s ; CHECK-NEXT: fcmge p1.s, p0/z, z2.s, z4.s +; CHECK-NEXT: movprfx z24, z0 +; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.s ; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z4.s -; CHECK-NEXT: movprfx z7, z0 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z0.s +; CHECK-NEXT: movprfx z25, z3 +; CHECK-NEXT: fcvtzs z25.d, p0/m, z3.s ; CHECK-NEXT: fcmge p3.s, p0/z, z3.s, z4.s +; CHECK-NEXT: movprfx z26, z1 +; CHECK-NEXT: fcvtzs z26.d, p0/m, z1.s ; CHECK-NEXT: fcmge p4.s, p0/z, z1.s, z4.s -; CHECK-NEXT: movprfx z4, z2 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z2.s -; CHECK-NEXT: movprfx z24, z3 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z3.s -; CHECK-NEXT: movprfx z25, z1 -; CHECK-NEXT: fcvtzs z25.d, p0/m, z1.s +; CHECK-NEXT: mov z4.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmgt p7.s, p0/z, z3.s, z6.s ; CHECK-NEXT: fcmgt p5.s, p0/z, z2.s, z6.s ; CHECK-NEXT: fcmgt p6.s, p0/z, z0.s, z6.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z4.d, p1/m, z5.d +; CHECK-NEXT: sel z7.d, p1, z7.d, z5.d ; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z6.s -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: sel z6.d, p2, z5.d, z7.d +; CHECK-NEXT: sel z6.d, p2, z24.d, z5.d +; CHECK-NEXT: sel z24.d, p3, z25.d, z5.d ; CHECK-NEXT: fcmuo p2.s, p0/z, z2.s, z2.s -; CHECK-NEXT: sel z7.d, p3, z5.d, z24.d -; CHECK-NEXT: fcmuo p3.s, p0/z, z0.s, z0.s -; CHECK-NEXT: sel z5.d, p4, z5.d, z25.d +; CHECK-NEXT: mov z5.d, p4/m, z26.d ; CHECK-NEXT: fcmuo p4.s, p0/z, z3.s, z3.s +; CHECK-NEXT: fcmuo p3.s, p0/z, z0.s, z0.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z1.s, z1.s -; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d -; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel 
z2.d, p7, z26.d, z7.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z26.d, z5.d +; CHECK-NEXT: sel z0.d, p5, z4.d, z7.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel z2.d, p7, z4.d, z24.d +; CHECK-NEXT: sel z1.d, p6, z4.d, z6.d +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel z3.d, p1, z4.d, z5.d +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -783,7 +748,7 @@ define @lrint_v16f32( %x) { ; CHECK-LABEL: lrint_v16f32: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill @@ -791,119 +756,114 @@ define @lrint_v16f32( %x) { ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 
0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG ; CHECK-NEXT: uunpklo z4.d, z0.s -; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: uunpkhi z7.d, z0.s ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uunpklo z7.d, z1.s -; CHECK-NEXT: uunpkhi z1.d, z1.s ; CHECK-NEXT: uunpklo z24.d, z2.s ; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: mov z5.s, w8 +; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff ; CHECK-NEXT: uunpklo z25.d, z3.s ; CHECK-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff -; CHECK-NEXT: movprfx z5, z4 -; CHECK-NEXT: frintx z5.s, p0/m, z4.s -; CHECK-NEXT: movprfx z6, z0 -; CHECK-NEXT: frintx z6.s, p0/m, z0.s -; CHECK-NEXT: mov z4.s, w8 -; CHECK-NEXT: frintx z7.s, p0/m, z7.s -; CHECK-NEXT: movprfx z28, z1 -; CHECK-NEXT: frintx z28.s, p0/m, z1.s -; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff +; CHECK-NEXT: mov z26.s, w8 ; CHECK-NEXT: mov z0.d, #0x8000000000000000 +; CHECK-NEXT: movprfx z6, z4 +; CHECK-NEXT: frintx z6.s, p0/m, z4.s +; CHECK-NEXT: movprfx z4, z7 +; CHECK-NEXT: frintx z4.s, p0/m, z7.s +; CHECK-NEXT: uunpklo z7.d, z1.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: movprfx z31, z2 +; CHECK-NEXT: frintx z31.s, p0/m, z2.s ; CHECK-NEXT: frintx z24.s, p0/m, z24.s -; CHECK-NEXT: movprfx z29, z2 -; CHECK-NEXT: frintx z29.s, p0/m, z2.s ; CHECK-NEXT: frintx z25.s, p0/m, z25.s -; CHECK-NEXT: movprfx z30, z3 -; CHECK-NEXT: frintx z30.s, p0/m, z3.s -; CHECK-NEXT: mov z27.s, w8 -; CHECK-NEXT: fcmge p1.s, p0/z, z5.s, z4.s -; CHECK-NEXT: fcmge p2.s, p0/z, z6.s, z4.s -; CHECK-NEXT: movprfx z1, z5 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z5.s -; CHECK-NEXT: movprfx z2, z6 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z6.s -; CHECK-NEXT: fcmge p5.s, p0/z, z7.s, z4.s -; CHECK-NEXT: fcmge p6.s, p0/z, z28.s, z4.s +; CHECK-NEXT: mov 
z29.d, #0x7fffffffffffffff +; CHECK-NEXT: movprfx z27, z6 +; CHECK-NEXT: fcvtzs z27.d, p0/m, z6.s +; CHECK-NEXT: fcmge p2.s, p0/z, z6.s, z5.s +; CHECK-NEXT: movprfx z28, z4 +; CHECK-NEXT: fcvtzs z28.d, p0/m, z4.s +; CHECK-NEXT: fcmge p3.s, p0/z, z4.s, z5.s +; CHECK-NEXT: frintx z7.s, p0/m, z7.s +; CHECK-NEXT: movprfx z30, z1 +; CHECK-NEXT: frintx z30.s, p0/m, z1.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z6.s, z26.s +; CHECK-NEXT: fcmuo p1.s, p0/z, z6.s, z6.s +; CHECK-NEXT: movprfx z6, z3 +; CHECK-NEXT: frintx z6.s, p0/m, z3.s +; CHECK-NEXT: fcmge p6.s, p0/z, z24.s, z5.s +; CHECK-NEXT: movprfx z8, z31 +; CHECK-NEXT: fcvtzs z8.d, p0/m, z31.s +; CHECK-NEXT: fcmge p7.s, p0/z, z31.s, z5.s +; CHECK-NEXT: sel z1.d, p2, z27.d, z0.d +; CHECK-NEXT: movprfx z9, z25 +; CHECK-NEXT: fcvtzs z9.d, p0/m, z25.s +; CHECK-NEXT: fcmge p8.s, p0/z, z25.s, z5.s +; CHECK-NEXT: sel z2.d, p3, z28.d, z0.d ; CHECK-NEXT: movprfx z3, z7 ; CHECK-NEXT: fcvtzs z3.d, p0/m, z7.s -; CHECK-NEXT: fcmge p8.s, p0/z, z29.s, z4.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z5.s, z27.s -; CHECK-NEXT: fcmgt p7.s, p0/z, z6.s, z27.s -; CHECK-NEXT: fcmge p9.s, p0/z, z25.s, z4.s -; CHECK-NEXT: movprfx z31, z25 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z25.s -; CHECK-NEXT: not p4.b, p0/z, p1.b -; CHECK-NEXT: fcmuo p1.s, p0/z, z5.s, z5.s -; CHECK-NEXT: movprfx z5, z28 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z28.s -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmge p10.s, p0/z, z30.s, z4.s -; CHECK-NEXT: movprfx z8, z30 -; CHECK-NEXT: fcvtzs z8.d, p0/m, z30.s -; CHECK-NEXT: mov z1.d, p4/m, z0.d -; CHECK-NEXT: fcmge p4.s, p0/z, z24.s, z4.s -; CHECK-NEXT: movprfx z4, z29 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z29.s -; CHECK-NEXT: mov z2.d, p2/m, z0.d -; CHECK-NEXT: fcmuo p2.s, p0/z, z6.s, z6.s -; CHECK-NEXT: movprfx z6, z24 -; CHECK-NEXT: fcvtzs z6.d, p0/m, z24.s -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z3.d, p5/m, z0.d -; CHECK-NEXT: not p5.b, p0/z, p8.b -; 
CHECK-NEXT: mov z5.d, p6/m, z0.d -; CHECK-NEXT: fcmgt p8.s, p0/z, z7.s, z27.s -; CHECK-NEXT: not p6.b, p0/z, p9.b -; CHECK-NEXT: mov z6.d, p4/m, z0.d -; CHECK-NEXT: fcmuo p9.s, p0/z, z7.s, z7.s -; CHECK-NEXT: not p4.b, p0/z, p10.b -; CHECK-NEXT: fcmgt p10.s, p0/z, z28.s, z27.s -; CHECK-NEXT: sel z7.d, p5, z0.d, z4.d -; CHECK-NEXT: fcmgt p5.s, p0/z, z24.s, z27.s -; CHECK-NEXT: mov z31.d, p6/m, z0.d -; CHECK-NEXT: fcmgt p6.s, p0/z, z30.s, z27.s -; CHECK-NEXT: mov z8.d, p4/m, z0.d -; CHECK-NEXT: sel z0.d, p3, z26.d, z1.d -; CHECK-NEXT: fcmgt p3.s, p0/z, z29.s, z27.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z25.s, z27.s -; CHECK-NEXT: sel z1.d, p7, z26.d, z2.d -; CHECK-NEXT: fcmuo p7.s, p0/z, z28.s, z28.s -; CHECK-NEXT: sel z2.d, p8, z26.d, z3.d -; CHECK-NEXT: sel z3.d, p10, z26.d, z5.d -; CHECK-NEXT: fcmuo p8.s, p0/z, z29.s, z29.s -; CHECK-NEXT: sel z4.d, p5, z26.d, z6.d -; CHECK-NEXT: fcmuo p5.s, p0/z, z24.s, z24.s -; CHECK-NEXT: fcmuo p10.s, p0/z, z25.s, z25.s -; CHECK-NEXT: sel z5.d, p3, z26.d, z7.d -; CHECK-NEXT: fcmuo p0.s, p0/z, z30.s, z30.s -; CHECK-NEXT: sel z7.d, p6, z26.d, z8.d -; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z6.d, p4, z26.d, z31.d +; CHECK-NEXT: fcmge p3.s, p0/z, z7.s, z5.s +; CHECK-NEXT: movprfx z27, z30 +; CHECK-NEXT: fcvtzs z27.d, p0/m, z30.s +; CHECK-NEXT: fcmge p5.s, p0/z, z30.s, z5.s +; CHECK-NEXT: movprfx z28, z24 +; CHECK-NEXT: fcvtzs z28.d, p0/m, z24.s +; CHECK-NEXT: fcmge p9.s, p0/z, z6.s, z5.s +; CHECK-NEXT: movprfx z5, z6 +; CHECK-NEXT: fcvtzs z5.d, p0/m, z6.s +; CHECK-NEXT: fcmgt p2.s, p0/z, z4.s, z26.s +; CHECK-NEXT: fcmuo p10.s, p0/z, z4.s, z4.s +; CHECK-NEXT: sel z3.d, p3, z3.d, z0.d +; CHECK-NEXT: fcmgt p3.s, p0/z, z7.s, z26.s +; CHECK-NEXT: sel z4.d, p5, z27.d, z0.d +; CHECK-NEXT: fcmuo p5.s, p0/z, z7.s, z7.s +; CHECK-NEXT: sel z7.d, p6, z28.d, z0.d +; CHECK-NEXT: fcmgt p6.s, p0/z, z30.s, z26.s +; CHECK-NEXT: sel z27.d, p7, z8.d, 
z0.d +; CHECK-NEXT: fcmgt p7.s, p0/z, z24.s, z26.s +; CHECK-NEXT: sel z28.d, p8, z9.d, z0.d +; CHECK-NEXT: sel z8.d, p9, z5.d, z0.d +; CHECK-NEXT: sel z0.d, p4, z29.d, z1.d +; CHECK-NEXT: fcmgt p4.s, p0/z, z31.s, z26.s +; CHECK-NEXT: fcmgt p8.s, p0/z, z25.s, z26.s +; CHECK-NEXT: fcmgt p9.s, p0/z, z6.s, z26.s +; CHECK-NEXT: sel z1.d, p2, z29.d, z2.d +; CHECK-NEXT: sel z2.d, p3, z29.d, z3.d +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sel z3.d, p6, z29.d, z4.d +; CHECK-NEXT: sel z4.d, p7, z29.d, z7.d +; CHECK-NEXT: fcmuo p6.s, p0/z, z31.s, z31.s +; CHECK-NEXT: fcmuo p7.s, p0/z, z25.s, z25.s +; CHECK-NEXT: fcmuo p2.s, p0/z, z30.s, z30.s +; CHECK-NEXT: fcmuo p3.s, p0/z, z24.s, z24.s +; CHECK-NEXT: fcmuo p0.s, p0/z, z6.s, z6.s +; CHECK-NEXT: sel z5.d, p4, z29.d, z27.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z2.d, p9/m, #0 // =0x0 -; CHECK-NEXT: mov z3.d, p7/m, #0 // =0x0 -; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z4.d, p5/m, #0 // =0x0 -; CHECK-NEXT: mov z5.d, p8/m, #0 // =0x0 +; CHECK-NEXT: sel z6.d, p8, z29.d, z28.d ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z6.d, p10/m, #0 // =0x0 -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: sel z7.d, p9, z29.d, z8.d +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p10/m, #0 // =0x0 +; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 ; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z7.d, p0/m, #0 // =0x0 +; CHECK-NEXT: mov z5.d, p6/m, #0 // =0x0 +; CHECK-NEXT: mov z6.d, p7/m, #0 // =0x0 ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z3.d, p2/m, #0 // =0x0 +; 
CHECK-NEXT: mov z4.d, p3/m, #0 // =0x0 ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: mov z7.d, p0/m, #0 // =0x0 +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.lrint.nxv16iXLen.nxv16f32( %x) @@ -952,220 +912,206 @@ define @lrint_v32f32( %x) { ; CHECK-NEXT: uunpklo z24.d, z0.s ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w9, #-553648128 // =0xdf000000 -; CHECK-NEXT: uunpklo z26.d, z1.s ; CHECK-NEXT: uunpkhi z25.d, z0.s +; CHECK-NEXT: uunpklo z26.d, z1.s ; CHECK-NEXT: uunpkhi z28.d, z1.s -; CHECK-NEXT: mov z29.s, w9 +; CHECK-NEXT: mov z30.s, w9 ; CHECK-NEXT: mov w9, #1593835519 // =0x5effffff -; CHECK-NEXT: mov z17.d, z5.d +; CHECK-NEXT: mov z18.d, z5.d ; CHECK-NEXT: mov z27.d, #0x8000000000000000 -; CHECK-NEXT: uunpkhi z30.d, z2.s -; CHECK-NEXT: uunpklo z8.d, z3.s +; CHECK-NEXT: uunpkhi z11.d, z3.s +; CHECK-NEXT: uunpklo z29.d, z3.s ; CHECK-NEXT: movprfx z0, z24 ; CHECK-NEXT: frintx z0.s, p0/m, z24.s -; CHECK-NEXT: uunpkhi z9.d, z3.s +; CHECK-NEXT: uunpklo z15.d, z4.s ; CHECK-NEXT: uunpkhi z14.d, z4.s +; CHECK-NEXT: movprfx z5, z25 +; CHECK-NEXT: frintx z5.s, p0/m, z25.s ; CHECK-NEXT: movprfx z24, z26 ; CHECK-NEXT: frintx z24.s, p0/m, z26.s -; CHECK-NEXT: movprfx z1, z25 -; CHECK-NEXT: frintx z1.s, p0/m, z25.s -; CHECK-NEXT: movprfx z5, z28 -; CHECK-NEXT: frintx z5.s, p0/m, z28.s -; CHECK-NEXT: uunpklo z26.d, z2.s -; CHECK-NEXT: uunpklo z16.d, z17.s -; CHECK-NEXT: mov z25.s, w9 -; CHECK-NEXT: movprfx z28, z30 -; CHECK-NEXT: frintx z28.s, p0/m, z30.s -; CHECK-NEXT: movprfx z30, z8 -; CHECK-NEXT: frintx z30.s, p0/m, z8.s -; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z29.s +; CHECK-NEXT: movprfx z25, z28 +; CHECK-NEXT: frintx z25.s, p0/m, z28.s +; CHECK-NEXT: uunpklo z28.d, z2.s +; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: uunpklo z16.d, z18.s +; CHECK-NEXT: uunpkhi z18.d, z18.s +; CHECK-NEXT: uunpklo z20.d, z7.s +; CHECK-NEXT: frintx 
z29.s, p0/m, z29.s ; CHECK-NEXT: movprfx z31, z0 ; CHECK-NEXT: fcvtzs z31.d, p0/m, z0.s ; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z29.s -; CHECK-NEXT: fcmge p3.s, p0/z, z24.s, z29.s -; CHECK-NEXT: fcmge p5.s, p0/z, z5.s, z29.s -; CHECK-NEXT: frintx z26.s, p0/m, z26.s -; CHECK-NEXT: movprfx z10, z1 -; CHECK-NEXT: fcvtzs z10.d, p0/m, z1.s -; CHECK-NEXT: movprfx z11, z24 -; CHECK-NEXT: fcvtzs z11.d, p0/m, z24.s -; CHECK-NEXT: movprfx z12, z5 -; CHECK-NEXT: fcvtzs z12.d, p0/m, z5.s -; CHECK-NEXT: movprfx z15, z28 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z28.s -; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: not p4.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z25.s -; CHECK-NEXT: fcmgt p9.s, p0/z, z5.s, z25.s -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z0.d, p4, z27.d, z31.d -; CHECK-NEXT: fcmge p4.s, p0/z, z26.s, z29.s -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: movprfx z13, z26 -; CHECK-NEXT: fcvtzs z13.d, p0/m, z26.s -; CHECK-NEXT: sel z31.d, p2, z27.d, z10.d -; CHECK-NEXT: uunpklo z10.d, z4.s -; CHECK-NEXT: sel z8.d, p3, z27.d, z11.d -; CHECK-NEXT: fcmge p3.s, p0/z, z28.s, z29.s -; CHECK-NEXT: sel z11.d, p5, z27.d, z12.d -; CHECK-NEXT: movprfx z4, z9 -; CHECK-NEXT: frintx z4.s, p0/m, z9.s +; CHECK-NEXT: movprfx z4, z15 +; CHECK-NEXT: frintx z4.s, p0/m, z15.s +; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z30.s +; CHECK-NEXT: movprfx z9, z24 +; CHECK-NEXT: fcvtzs z9.d, p0/m, z24.s +; CHECK-NEXT: fcmge p3.s, p0/z, z24.s, z30.s +; CHECK-NEXT: movprfx z3, z28 +; CHECK-NEXT: frintx z3.s, p0/m, z28.s +; CHECK-NEXT: movprfx z28, z2 +; CHECK-NEXT: frintx z28.s, p0/m, z2.s +; CHECK-NEXT: movprfx z8, z5 +; CHECK-NEXT: fcvtzs z8.d, p0/m, z5.s +; CHECK-NEXT: fcmge p2.s, p0/z, z5.s, z30.s +; CHECK-NEXT: movprfx z10, z25 +; CHECK-NEXT: fcvtzs z10.d, p0/m, z25.s +; CHECK-NEXT: fcmge p4.s, p0/z, z25.s, z30.s +; CHECK-NEXT: uunpklo z15.d, z6.s +; 
CHECK-NEXT: uunpkhi z19.d, z6.s +; CHECK-NEXT: uunpkhi z21.d, z7.s +; CHECK-NEXT: sel z0.d, p1, z31.d, z27.d +; CHECK-NEXT: movprfx z31, z11 +; CHECK-NEXT: frintx z31.s, p0/m, z11.s +; CHECK-NEXT: movprfx z7, z14 +; CHECK-NEXT: frintx z7.s, p0/m, z14.s +; CHECK-NEXT: movprfx z12, z28 +; CHECK-NEXT: fcvtzs z12.d, p0/m, z28.s +; CHECK-NEXT: movprfx z14, z18 +; CHECK-NEXT: frintx z14.s, p0/m, z18.s +; CHECK-NEXT: movprfx z18, z20 +; CHECK-NEXT: frintx z18.s, p0/m, z20.s +; CHECK-NEXT: sel z8.d, p2, z8.d, z27.d +; CHECK-NEXT: sel z10.d, p4, z10.d, z27.d +; CHECK-NEXT: movprfx z13, z3 +; CHECK-NEXT: fcvtzs z13.d, p0/m, z3.s +; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sel z0.d, p3, z9.d, z27.d +; CHECK-NEXT: fcmge p3.s, p0/z, z28.s, z30.s +; CHECK-NEXT: movprfx z17, z31 +; CHECK-NEXT: fcvtzs z17.d, p0/m, z31.s +; CHECK-NEXT: fcmge p5.s, p0/z, z31.s, z30.s +; CHECK-NEXT: fcmge p4.s, p0/z, z3.s, z30.s +; CHECK-NEXT: movprfx z11, z29 +; CHECK-NEXT: fcvtzs z11.d, p0/m, z29.s +; CHECK-NEXT: fcmge p2.s, p0/z, z29.s, z30.s +; CHECK-NEXT: frintx z15.s, p0/m, z15.s +; CHECK-NEXT: mov z26.s, w9 +; CHECK-NEXT: movprfx z22, z14 +; CHECK-NEXT: fcvtzs z22.d, p0/m, z14.s +; CHECK-NEXT: fcmge p8.s, p0/z, z18.s, z30.s +; CHECK-NEXT: sel z9.d, p3, z12.d, z27.d +; CHECK-NEXT: movprfx z12, z16 +; CHECK-NEXT: frintx z12.s, p0/m, z16.s +; CHECK-NEXT: movprfx z1, z18 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z18.s +; CHECK-NEXT: sel z6.d, p5, z17.d, z27.d +; CHECK-NEXT: movprfx z17, z19 +; CHECK-NEXT: frintx z17.s, p0/m, z19.s +; CHECK-NEXT: movprfx z19, z21 +; CHECK-NEXT: frintx z19.s, p0/m, z21.s +; CHECK-NEXT: fcmge p5.s, p0/z, z14.s, z30.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z5.s, z26.s ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: not p5.b, p0/z, p4.b -; CHECK-NEXT: fcmge p4.s, p0/z, z30.s, z29.s -; CHECK-NEXT: fcmgt p2.s, p0/z, z24.s, z25.s -; CHECK-NEXT: sel z12.d, p5, z27.d, z13.d -; CHECK-NEXT: uunpkhi z13.d, z17.s -; CHECK-NEXT: movprfx z9, 
z10 -; CHECK-NEXT: frintx z9.s, p0/m, z10.s -; CHECK-NEXT: movprfx z10, z14 -; CHECK-NEXT: frintx z10.s, p0/m, z14.s -; CHECK-NEXT: uunpkhi z17.d, z6.s -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: uunpklo z14.d, z6.s -; CHECK-NEXT: movprfx z6, z16 -; CHECK-NEXT: frintx z6.s, p0/m, z16.s -; CHECK-NEXT: uunpklo z16.d, z7.s -; CHECK-NEXT: uunpkhi z7.d, z7.s -; CHECK-NEXT: sel z3.d, p3, z27.d, z15.d -; CHECK-NEXT: fcmge p3.s, p0/z, z4.s, z29.s -; CHECK-NEXT: frintx z13.s, p0/m, z13.s -; CHECK-NEXT: movprfx z15, z30 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z30.s -; CHECK-NEXT: fcmge p5.s, p0/z, z9.s, z29.s -; CHECK-NEXT: fcmge p6.s, p0/z, z10.s, z29.s -; CHECK-NEXT: frintx z17.s, p0/m, z17.s -; CHECK-NEXT: movprfx z18, z4 -; CHECK-NEXT: fcvtzs z18.d, p0/m, z4.s -; CHECK-NEXT: movprfx z20, z10 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z10.s -; CHECK-NEXT: frintx z16.s, p0/m, z16.s -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: movprfx z19, z14 -; CHECK-NEXT: frintx z19.s, p0/m, z14.s -; CHECK-NEXT: movprfx z14, z9 -; CHECK-NEXT: fcvtzs z14.d, p0/m, z9.s -; CHECK-NEXT: fcmge p7.s, p0/z, z6.s, z29.s -; CHECK-NEXT: fcmge p8.s, p0/z, z13.s, z29.s -; CHECK-NEXT: movprfx z21, z7 -; CHECK-NEXT: frintx z21.s, p0/m, z7.s -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: mov z15.d, p4/m, z27.d -; CHECK-NEXT: fcmge p4.s, p0/z, z17.s, z29.s -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: sel z7.d, p3, z27.d, z18.d +; CHECK-NEXT: sel z13.d, p4, z13.d, z27.d +; CHECK-NEXT: sel z11.d, p2, z11.d, z27.d +; CHECK-NEXT: movprfx z16, z4 +; CHECK-NEXT: fcvtzs z16.d, p0/m, z4.s +; CHECK-NEXT: fcmge p2.s, p0/z, z4.s, z30.s +; CHECK-NEXT: movprfx z20, z7 +; CHECK-NEXT: fcvtzs z20.d, p0/m, z7.s +; CHECK-NEXT: fcmge p3.s, p0/z, z7.s, z30.s +; CHECK-NEXT: movprfx z21, z12 +; CHECK-NEXT: fcvtzs z21.d, p0/m, z12.s +; CHECK-NEXT: fcmge p4.s, p0/z, z12.s, z30.s +; CHECK-NEXT: movprfx z23, z15 +; CHECK-NEXT: fcvtzs z23.d, p0/m, z15.s +; CHECK-NEXT: fcmge p6.s, p0/z, 
z15.s, z30.s +; CHECK-NEXT: fcmge p7.s, p0/z, z17.s, z30.s +; CHECK-NEXT: fcmge p9.s, p0/z, z19.s, z30.s ; CHECK-NEXT: movprfx z0, z17 ; CHECK-NEXT: fcvtzs z0.d, p0/m, z17.s -; CHECK-NEXT: sel z18.d, p6, z27.d, z20.d -; CHECK-NEXT: movprfx z20, z6 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z6.s -; CHECK-NEXT: fcmge p6.s, p0/z, z16.s, z29.s -; CHECK-NEXT: fcmge p3.s, p0/z, z19.s, z29.s -; CHECK-NEXT: mov z14.d, p5/m, z27.d -; CHECK-NEXT: not p5.b, p0/z, p7.b -; CHECK-NEXT: not p7.b, p0/z, p8.b -; CHECK-NEXT: fcmge p8.s, p0/z, z21.s, z29.s -; CHECK-NEXT: movprfx z1, z16 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z16.s -; CHECK-NEXT: movprfx z22, z13 -; CHECK-NEXT: fcvtzs z22.d, p0/m, z13.s -; CHECK-NEXT: movprfx z23, z19 -; CHECK-NEXT: fcvtzs z23.d, p0/m, z19.s -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: movprfx z2, z21 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z21.s -; CHECK-NEXT: mov z29.d, #0x7fffffffffffffff -; CHECK-NEXT: mov z20.d, p5/m, z27.d -; CHECK-NEXT: not p5.b, p0/z, p6.b -; CHECK-NEXT: mov z0.d, p4/m, z27.d -; CHECK-NEXT: fcmgt p4.s, p0/z, z16.s, z25.s -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: mov z1.d, p5/m, z27.d -; CHECK-NEXT: mov z22.d, p7/m, z27.d -; CHECK-NEXT: mov z23.d, p3/m, z27.d -; CHECK-NEXT: fcmgt p3.s, p0/z, z21.s, z25.s -; CHECK-NEXT: fcmuo p5.s, p0/z, z16.s, z16.s -; CHECK-NEXT: mov z2.d, p6/m, z27.d -; CHECK-NEXT: sel z27.d, p1, z29.d, z31.d -; CHECK-NEXT: fcmgt p1.s, p0/z, z17.s, z25.s -; CHECK-NEXT: mov z1.d, p4/m, z29.d -; CHECK-NEXT: fcmgt p6.s, p0/z, z26.s, z25.s -; CHECK-NEXT: fcmgt p7.s, p0/z, z30.s, z25.s -; CHECK-NEXT: sel z31.d, p2, z29.d, z8.d -; CHECK-NEXT: fcmgt p2.s, p0/z, z13.s, z25.s -; CHECK-NEXT: fcmuo p8.s, p0/z, z21.s, z21.s -; CHECK-NEXT: mov z2.d, p3/m, z29.d +; CHECK-NEXT: movprfx z2, z19 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z19.s +; CHECK-NEXT: mov z30.d, #0x7fffffffffffffff +; CHECK-NEXT: sel z22.d, p5, z22.d, z27.d +; CHECK-NEXT: fcmgt p5.s, p0/z, z18.s, z26.s +; CHECK-NEXT: sel z1.d, 
p8, z1.d, z27.d +; CHECK-NEXT: sel z16.d, p2, z16.d, z27.d +; CHECK-NEXT: fcmgt p2.s, p0/z, z24.s, z26.s +; CHECK-NEXT: sel z20.d, p3, z20.d, z27.d +; CHECK-NEXT: fcmgt p3.s, p0/z, z25.s, z26.s +; CHECK-NEXT: sel z21.d, p4, z21.d, z27.d +; CHECK-NEXT: sel z23.d, p6, z23.d, z27.d +; CHECK-NEXT: sel z0.d, p7, z0.d, z27.d +; CHECK-NEXT: sel z2.d, p9, z2.d, z27.d +; CHECK-NEXT: sel z27.d, p1, z30.d, z8.d +; CHECK-NEXT: fcmgt p1.s, p0/z, z17.s, z26.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z19.s, z26.s +; CHECK-NEXT: fcmuo p6.s, p0/z, z18.s, z18.s +; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, p5/m, z30.d +; CHECK-NEXT: fcmuo p9.s, p0/z, z19.s, z19.s +; CHECK-NEXT: mov z10.d, p3/m, z30.d +; CHECK-NEXT: fcmgt p3.s, p0/z, z15.s, z26.s +; CHECK-NEXT: fcmuo p5.s, p0/z, z15.s, z15.s +; CHECK-NEXT: fcmgt p8.s, p0/z, z29.s, z26.s +; CHECK-NEXT: fcmgt p7.s, p0/z, z3.s, z26.s +; CHECK-NEXT: mov z8.d, p2/m, z30.d +; CHECK-NEXT: fcmgt p2.s, p0/z, z14.s, z26.s +; CHECK-NEXT: mov z0.d, p1/m, z30.d +; CHECK-NEXT: fcmgt p1.s, p0/z, z12.s, z26.s +; CHECK-NEXT: mov z2.d, p4/m, z30.d ; CHECK-NEXT: fcmuo p4.s, p0/z, z17.s, z17.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z19.s, z25.s -; CHECK-NEXT: mov z0.d, p1/m, z29.d -; CHECK-NEXT: fcmgt p1.s, p0/z, z6.s, z25.s -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: sel z8.d, p9, z29.d, z11.d -; CHECK-NEXT: sel z11.d, p6, z29.d, z12.d -; CHECK-NEXT: sel z12.d, p7, z29.d, z15.d -; CHECK-NEXT: fcmgt p5.s, p0/z, z10.s, z25.s -; CHECK-NEXT: sel z15.d, p2, z29.d, z22.d -; CHECK-NEXT: fcmuo p2.s, p0/z, z13.s, z13.s +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 +; CHECK-NEXT: sel z17.d, p3, z30.d, z23.d +; CHECK-NEXT: fcmuo p3.s, p0/z, z12.s, z12.s +; CHECK-NEXT: fcmgt p6.s, p0/z, z7.s, z26.s +; CHECK-NEXT: mov z11.d, p8/m, z30.d +; CHECK-NEXT: mov z13.d, p7/m, z30.d +; CHECK-NEXT: sel z15.d, p2, z30.d, z22.d +; CHECK-NEXT: fcmuo p2.s, p0/z, z14.s, z14.s +; CHECK-NEXT: mov z2.d, p9/m, #0 // =0x0 ; CHECK-NEXT: str z1, [x8, #14, mul 
vl] -; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 +; CHECK-NEXT: sel z1.d, p1, z30.d, z21.d +; CHECK-NEXT: fcmgt p1.s, p0/z, z4.s, z26.s ; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p1, z29.d, z20.d -; CHECK-NEXT: fcmgt p1.s, p0/z, z9.s, z25.s -; CHECK-NEXT: fcmuo p6.s, p0/z, z19.s, z19.s -; CHECK-NEXT: sel z16.d, p3, z29.d, z23.d -; CHECK-NEXT: fcmuo p3.s, p0/z, z6.s, z6.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z4.s, z25.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z31.s, z26.s +; CHECK-NEXT: mov z17.d, p5/m, #0 // =0x0 ; CHECK-NEXT: str z2, [x8, #15, mul vl] -; CHECK-NEXT: sel z2.d, p5, z29.d, z18.d -; CHECK-NEXT: fcmuo p5.s, p0/z, z10.s, z10.s -; CHECK-NEXT: str z0, [x8, #13, mul vl] +; CHECK-NEXT: fcmuo p5.s, p0/z, z7.s, z7.s +; CHECK-NEXT: sel z2.d, p6, z30.d, z20.d ; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.s, p0/z, z9.s, z9.s -; CHECK-NEXT: sel z0.d, p1, z29.d, z14.d -; CHECK-NEXT: mov z16.d, p6/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p1.s, p0/z, z4.s, z4.s +; CHECK-NEXT: fcmuo p2.s, p0/z, z4.s, z4.s ; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p3.s, p0/z, z28.s, z25.s -; CHECK-NEXT: sel z4.d, p4, z29.d, z7.d +; CHECK-NEXT: str z0, [x8, #13, mul vl] +; CHECK-NEXT: fcmgt p3.s, p0/z, z28.s, z26.s +; CHECK-NEXT: sel z0.d, p1, z30.d, z16.d +; CHECK-NEXT: str z17, [x8, #12, mul vl] +; CHECK-NEXT: fcmuo p1.s, p0/z, z31.s, z31.s +; CHECK-NEXT: sel z4.d, p4, z30.d, z6.d ; CHECK-NEXT: str z15, [x8, #11, mul vl] ; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.s, p0/z, z28.s, z28.s -; CHECK-NEXT: str z16, [x8, #12, mul vl] -; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.s, p0/z, z30.s, z30.s +; CHECK-NEXT: fcmuo p4.s, p0/z, z29.s, z29.s ; CHECK-NEXT: str z1, [x8, #10, mul vl] -; CHECK-NEXT: mov z4.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p1.s, p0/z, z5.s, z5.s -; CHECK-NEXT: sel z1.d, p3, z29.d, z3.d +; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.s, p0/z, z28.s, z28.s +; 
CHECK-NEXT: sel z1.d, p3, z30.d, z9.d +; CHECK-NEXT: fcmuo p3.s, p0/z, z3.s, z3.s ; CHECK-NEXT: ldr z3, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: str z2, [x8, #9, mul vl] +; CHECK-NEXT: mov z4.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p1.s, p0/z, z25.s, z25.s ; CHECK-NEXT: str z0, [x8, #8, mul vl] -; CHECK-NEXT: fcmuo p3.s, p0/z, z26.s, z26.s -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: str z4, [x8, #7, mul vl] -; CHECK-NEXT: mov z12.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p2.s, p0/z, z3.s, z25.s -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: mov z11.d, p4/m, #0 // =0x0 ; CHECK-NEXT: fcmuo p4.s, p0/z, z24.s, z24.s -; CHECK-NEXT: mov z8.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.s, p0/z, z0.s, z0.s -; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: str z12, [x8, #6, mul vl] -; CHECK-NEXT: str z1, [x8, #5, mul vl] +; CHECK-NEXT: fcmgt p2.s, p0/z, z3.s, z26.s +; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.s, p0/z, z5.s, z5.s +; CHECK-NEXT: str z4, [x8, #7, mul vl] ; CHECK-NEXT: fcmuo p0.s, p0/z, z3.s, z3.s -; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 -; CHECK-NEXT: str z8, [x8, #3, mul vl] -; CHECK-NEXT: mov z31.d, p4/m, #0 // =0x0 -; CHECK-NEXT: mov z0.d, p2/m, z29.d -; CHECK-NEXT: str z11, [x8, #4, mul vl] +; CHECK-NEXT: mov z13.d, p3/m, #0 // =0x0 +; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z11, [x8, #6, mul vl] +; CHECK-NEXT: mov z10.d, p1/m, #0 // =0x0 +; CHECK-NEXT: str z1, [x8, #5, mul vl] +; CHECK-NEXT: mov z8.d, p4/m, #0 // =0x0 +; CHECK-NEXT: str z13, [x8, #4, mul vl] ; CHECK-NEXT: mov z27.d, p5/m, #0 // =0x0 -; CHECK-NEXT: str z31, [x8, #2, mul vl] -; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 +; CHECK-NEXT: str z10, [x8, #3, mul vl] +; CHECK-NEXT: mov z0.d, p2/m, z30.d +; CHECK-NEXT: str z8, [x8, #2, mul vl] ; CHECK-NEXT: str z27, [x8, #1, mul vl] +; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: str z0, [x8] ; CHECK-NEXT: 
addvl sp, sp, #3 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload @@ -1203,17 +1149,16 @@ define @lrint_v1f64( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff ; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.d ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.d +; CHECK-NEXT: mov z1.d, #0x8000000000000000 ; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z3.d ; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -1229,17 +1174,16 @@ define @lrint_v2f64( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff ; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.d ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.d +; CHECK-NEXT: mov z1.d, #0x8000000000000000 ; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z3.d ; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d ; CHECK-NEXT: mov z1.d, p1/m, z2.d ; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d @@ -1260,29 +1204,27 @@ define @lrint_v4f64( %x) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z3.d, 
#0x8000000000000000 ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff ; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: frintx z1.d, p0/m, z1.d -; CHECK-NEXT: mov z3.d, x8 -; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z2.d -; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z2.d -; CHECK-NEXT: mov z2.d, #0x8000000000000000 +; CHECK-NEXT: mov z5.d, x8 ; CHECK-NEXT: movprfx z4, z0 ; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d -; CHECK-NEXT: movprfx z5, z1 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z1.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z3.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z1.d, z3.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d +; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z2.d +; CHECK-NEXT: movprfx z6, z1 +; CHECK-NEXT: fcvtzs z6.d, p0/m, z1.d +; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z2.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z5.d +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p4.d, p0/z, z1.d, z5.d +; CHECK-NEXT: sel z4.d, p1, z4.d, z3.d ; CHECK-NEXT: fcmuo p1.d, p0/z, z0.d, z0.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z1.d, z1.d -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d +; CHECK-NEXT: mov z3.d, p2/m, z6.d +; CHECK-NEXT: sel z0.d, p3, z2.d, z4.d +; CHECK-NEXT: sel z1.d, p4, z2.d, z3.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -1310,51 +1252,47 @@ define @lrint_v8f64( %x) { ; CHECK-NEXT: mov z5.d, #0x8000000000000000 ; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff ; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: frintx z1.d, p0/m, z1.d ; CHECK-NEXT: frintx z2.d, p0/m, z2.d ; CHECK-NEXT: frintx z3.d, p0/m, z3.d ; CHECK-NEXT: mov z6.d, x8 +; CHECK-NEXT: movprfx z7, z0 +; CHECK-NEXT: fcvtzs z7.d, p0/m, z0.d ; 
CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z4.d +; CHECK-NEXT: movprfx z24, z1 +; CHECK-NEXT: fcvtzs z24.d, p0/m, z1.d ; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z4.d +; CHECK-NEXT: movprfx z25, z2 +; CHECK-NEXT: fcvtzs z25.d, p0/m, z2.d ; CHECK-NEXT: fcmge p3.d, p0/z, z2.d, z4.d +; CHECK-NEXT: movprfx z26, z3 +; CHECK-NEXT: fcvtzs z26.d, p0/m, z3.d ; CHECK-NEXT: fcmge p4.d, p0/z, z3.d, z4.d -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d -; CHECK-NEXT: movprfx z7, z1 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z1.d -; CHECK-NEXT: movprfx z24, z2 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z2.d -; CHECK-NEXT: movprfx z25, z3 -; CHECK-NEXT: fcvtzs z25.d, p0/m, z3.d +; CHECK-NEXT: mov z4.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmgt p7.d, p0/z, z2.d, z6.d ; CHECK-NEXT: fcmgt p5.d, p0/z, z0.d, z6.d ; CHECK-NEXT: fcmgt p6.d, p0/z, z1.d, z6.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z4.d, p1/m, z5.d +; CHECK-NEXT: sel z7.d, p1, z7.d, z5.d ; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z6.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: sel z6.d, p2, z5.d, z7.d +; CHECK-NEXT: sel z6.d, p2, z24.d, z5.d +; CHECK-NEXT: sel z24.d, p3, z25.d, z5.d ; CHECK-NEXT: fcmuo p2.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z7.d, p3, z5.d, z24.d -; CHECK-NEXT: fcmuo p3.d, p0/z, z1.d, z1.d -; CHECK-NEXT: sel z5.d, p4, z5.d, z25.d +; CHECK-NEXT: mov z5.d, p4/m, z26.d ; CHECK-NEXT: fcmuo p4.d, p0/z, z2.d, z2.d +; CHECK-NEXT: fcmuo p3.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z3.d, z3.d -; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d -; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z2.d, p7, z26.d, z7.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z26.d, z5.d +; CHECK-NEXT: sel z0.d, p5, z4.d, z7.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel z2.d, p7, z4.d, z24.d +; 
CHECK-NEXT: sel z1.d, p6, z4.d, z6.d +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel z3.d, p1, z4.d, z5.d +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1368,7 +1306,7 @@ define @lrint_v16f64( %x) { ; CHECK-LABEL: lrint_v16f64: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill @@ -1376,109 +1314,106 @@ define @lrint_v16f64( %x) { ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG ; CHECK-NEXT: ptrue p0.d ; 
CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 ; CHECK-NEXT: mov z24.d, #0x7fffffffffffffff -; CHECK-NEXT: mov z25.d, x8 +; CHECK-NEXT: mov z27.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff ; CHECK-NEXT: movprfx z26, z0 ; CHECK-NEXT: frintx z26.d, p0/m, z0.d -; CHECK-NEXT: movprfx z27, z1 -; CHECK-NEXT: frintx z27.d, p0/m, z1.d -; CHECK-NEXT: frintx z2.d, p0/m, z2.d +; CHECK-NEXT: movprfx z25, z1 +; CHECK-NEXT: frintx z25.d, p0/m, z1.d ; CHECK-NEXT: mov z0.d, #0x8000000000000000 -; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: frintx z3.d, p0/m, z3.d -; CHECK-NEXT: movprfx z28, z4 -; CHECK-NEXT: frintx z28.d, p0/m, z4.d +; CHECK-NEXT: movprfx z28, z2 +; CHECK-NEXT: frintx z28.d, p0/m, z2.d +; CHECK-NEXT: movprfx z2, z3 +; CHECK-NEXT: frintx z2.d, p0/m, z3.d +; CHECK-NEXT: movprfx z29, z4 +; CHECK-NEXT: frintx z29.d, p0/m, z4.d ; CHECK-NEXT: frintx z5.d, p0/m, z5.d +; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: frintx z6.d, p0/m, z6.d ; CHECK-NEXT: frintx z7.d, p0/m, z7.d -; CHECK-NEXT: fcmge p1.d, p0/z, z26.d, z25.d -; CHECK-NEXT: fcmge p2.d, p0/z, z27.d, z25.d -; CHECK-NEXT: movprfx z4, z26 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z26.d -; CHECK-NEXT: fcmge p5.d, p0/z, z2.d, z25.d -; CHECK-NEXT: movprfx z29, z27 -; CHECK-NEXT: fcvtzs z29.d, p0/m, z27.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z26.d, z1.d -; CHECK-NEXT: fcmge p6.d, p0/z, z3.d, z25.d -; CHECK-NEXT: fcmge p8.d, p0/z, z5.d, z25.d -; CHECK-NEXT: fcmgt p7.d, p0/z, z27.d, z1.d -; CHECK-NEXT: fcmge p9.d, p0/z, z6.d, z25.d -; CHECK-NEXT: movprfx z30, z28 -; CHECK-NEXT: fcvtzs z30.d, p0/m, z28.d -; CHECK-NEXT: fcmge p10.d, p0/z, z7.d, z25.d -; CHECK-NEXT: not p4.b, p0/z, p1.b -; CHECK-NEXT: fcmuo p1.d, p0/z, z26.d, z26.d -; CHECK-NEXT: movprfx z26, z2 -; CHECK-NEXT: fcvtzs z26.d, p0/m, z2.d -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: movprfx z31, z6 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z6.d -; CHECK-NEXT: movprfx z8, z7 -; CHECK-NEXT: fcvtzs z8.d, p0/m, z7.d -; CHECK-NEXT: mov z4.d, 
p4/m, z0.d -; CHECK-NEXT: fcmge p4.d, p0/z, z28.d, z25.d -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: mov z29.d, p2/m, z0.d -; CHECK-NEXT: fcmuo p2.d, p0/z, z27.d, z27.d -; CHECK-NEXT: movprfx z27, z3 -; CHECK-NEXT: fcvtzs z27.d, p0/m, z3.d -; CHECK-NEXT: sel z25.d, p5, z0.d, z26.d -; CHECK-NEXT: movprfx z26, z5 -; CHECK-NEXT: fcvtzs z26.d, p0/m, z5.d -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: not p5.b, p0/z, p8.b -; CHECK-NEXT: fcmgt p8.d, p0/z, z2.d, z1.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z27.d, p6/m, z0.d -; CHECK-NEXT: not p6.b, p0/z, p9.b -; CHECK-NEXT: fcmuo p9.d, p0/z, z2.d, z2.d -; CHECK-NEXT: mov z30.d, p4/m, z0.d -; CHECK-NEXT: not p4.b, p0/z, p10.b -; CHECK-NEXT: fcmgt p10.d, p0/z, z3.d, z1.d -; CHECK-NEXT: mov z26.d, p5/m, z0.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z28.d, z1.d -; CHECK-NEXT: mov z31.d, p6/m, z0.d -; CHECK-NEXT: mov z8.d, p4/m, z0.d -; CHECK-NEXT: sel z0.d, p3, z24.d, z4.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z5.d, z1.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z6.d, z1.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z7.d, z1.d -; CHECK-NEXT: sel z1.d, p7, z24.d, z29.d -; CHECK-NEXT: fcmuo p7.d, p0/z, z3.d, z3.d -; CHECK-NEXT: sel z2.d, p8, z24.d, z25.d -; CHECK-NEXT: sel z3.d, p10, z24.d, z27.d -; CHECK-NEXT: sel z4.d, p5, z24.d, z30.d -; CHECK-NEXT: fcmuo p5.d, p0/z, z28.d, z28.d -; CHECK-NEXT: fcmuo p8.d, p0/z, z5.d, z5.d -; CHECK-NEXT: fcmuo p10.d, p0/z, z6.d, z6.d -; CHECK-NEXT: sel z5.d, p3, z24.d, z26.d +; CHECK-NEXT: movprfx z30, z26 +; CHECK-NEXT: fcvtzs z30.d, p0/m, z26.d +; CHECK-NEXT: fcmge p1.d, p0/z, z26.d, z27.d +; CHECK-NEXT: movprfx z31, z25 +; CHECK-NEXT: fcvtzs z31.d, p0/m, z25.d +; CHECK-NEXT: fcmge p2.d, p0/z, z25.d, z27.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z26.d, z1.d +; CHECK-NEXT: fcmuo p3.d, p0/z, z26.d, z26.d +; CHECK-NEXT: movprfx z26, z28 +; CHECK-NEXT: fcvtzs z26.d, p0/m, z28.d +; CHECK-NEXT: fcmge p6.d, p0/z, z29.d, z27.d +; CHECK-NEXT: movprfx z8, z5 +; CHECK-NEXT: fcvtzs z8.d, p0/m, z5.d +; CHECK-NEXT: 
fcmge p7.d, p0/z, z5.d, z27.d +; CHECK-NEXT: movprfx z9, z6 +; CHECK-NEXT: fcvtzs z9.d, p0/m, z6.d +; CHECK-NEXT: fcmge p8.d, p0/z, z6.d, z27.d +; CHECK-NEXT: sel z3.d, p1, z30.d, z0.d +; CHECK-NEXT: fcmge p1.d, p0/z, z28.d, z27.d +; CHECK-NEXT: movprfx z30, z2 +; CHECK-NEXT: fcvtzs z30.d, p0/m, z2.d +; CHECK-NEXT: sel z4.d, p2, z31.d, z0.d +; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, z27.d +; CHECK-NEXT: movprfx z31, z29 +; CHECK-NEXT: fcvtzs z31.d, p0/m, z29.d +; CHECK-NEXT: fcmge p9.d, p0/z, z7.d, z27.d +; CHECK-NEXT: movprfx z27, z7 +; CHECK-NEXT: fcvtzs z27.d, p0/m, z7.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z25.d, z1.d +; CHECK-NEXT: fcmuo p10.d, p0/z, z25.d, z25.d +; CHECK-NEXT: sel z25.d, p1, z26.d, z0.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z28.d, z1.d +; CHECK-NEXT: sel z26.d, p2, z30.d, z0.d +; CHECK-NEXT: fcmuo p2.d, p0/z, z28.d, z28.d +; CHECK-NEXT: sel z28.d, p6, z31.d, z0.d +; CHECK-NEXT: fcmgt p6.d, p0/z, z2.d, z1.d +; CHECK-NEXT: sel z30.d, p7, z8.d, z0.d +; CHECK-NEXT: fcmgt p7.d, p0/z, z29.d, z1.d +; CHECK-NEXT: sel z31.d, p8, z9.d, z0.d +; CHECK-NEXT: sel z27.d, p9, z27.d, z0.d +; CHECK-NEXT: sel z0.d, p4, z24.d, z3.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z5.d, z1.d +; CHECK-NEXT: fcmgt p8.d, p0/z, z6.d, z1.d +; CHECK-NEXT: fcmgt p9.d, p0/z, z7.d, z1.d +; CHECK-NEXT: sel z1.d, p5, z24.d, z4.d +; CHECK-NEXT: fcmuo p5.d, p0/z, z2.d, z2.d +; CHECK-NEXT: sel z2.d, p1, z24.d, z25.d +; CHECK-NEXT: sel z3.d, p6, z24.d, z26.d +; CHECK-NEXT: sel z4.d, p7, z24.d, z28.d +; CHECK-NEXT: fcmuo p6.d, p0/z, z5.d, z5.d +; CHECK-NEXT: fcmuo p7.d, p0/z, z6.d, z6.d +; CHECK-NEXT: fcmuo p1.d, p0/z, z29.d, z29.d +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sel z5.d, p4, z24.d, z30.d +; CHECK-NEXT: sel z6.d, p8, z24.d, z31.d +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: fcmuo p0.d, p0/z, z7.d, z7.d -; CHECK-NEXT: sel z6.d, p4, z24.d, z31.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel 
z7.d, p6, z24.d, z8.d -; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z2.d, p9/m, #0 // =0x0 +; CHECK-NEXT: sel z7.d, p9, z24.d, z27.d ; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z3.d, p7/m, #0 // =0x0 -; CHECK-NEXT: mov z4.d, p5/m, #0 // =0x0 -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z5.d, p8/m, #0 // =0x0 -; CHECK-NEXT: mov z6.d, p10/m, #0 // =0x0 +; CHECK-NEXT: mov z1.d, p10/m, #0 // =0x0 +; CHECK-NEXT: mov z3.d, p5/m, #0 // =0x0 ; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z5.d, p6/m, #0 // =0x0 +; CHECK-NEXT: mov z6.d, p7/m, #0 // =0x0 ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z2.d, p2/m, #0 // =0x0 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z4.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z7.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.lrint.nxv16iXLen.nxv16f64( %x) @@ -1526,218 +1461,199 @@ define @lrint_v32f64( %x) { ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ldr z2, [x0, #2, mul vl] +; CHECK-NEXT: ldr z3, [x0, #2, mul vl] ; CHECK-NEXT: mov x9, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: ldr z24, [x0, #6, mul vl] ; CHECK-NEXT: ldr z1, [x0, #1, mul vl] -; CHECK-NEXT: 
mov z7.d, x9 -; CHECK-NEXT: mov z26.d, #0x8000000000000000 -; CHECK-NEXT: ldr z3, [x0, #3, mul vl] -; CHECK-NEXT: frintx z0.d, p0/m, z0.d -; CHECK-NEXT: movprfx z30, z2 -; CHECK-NEXT: frintx z30.d, p0/m, z2.d -; CHECK-NEXT: ldr z6, [x0, #5, mul vl] -; CHECK-NEXT: movprfx z25, z24 -; CHECK-NEXT: frintx z25.d, p0/m, z24.d -; CHECK-NEXT: movprfx z12, z1 -; CHECK-NEXT: frintx z12.d, p0/m, z1.d +; CHECK-NEXT: mov z25.d, #0x8000000000000000 +; CHECK-NEXT: mov z26.d, x9 ; CHECK-NEXT: ldr z5, [x0, #4, mul vl] -; CHECK-NEXT: frintx z3.d, p0/m, z3.d +; CHECK-NEXT: ldr z4, [x0, #3, mul vl] +; CHECK-NEXT: frintx z0.d, p0/m, z0.d +; CHECK-NEXT: movprfx z8, z3 +; CHECK-NEXT: frintx z8.d, p0/m, z3.d ; CHECK-NEXT: mov x9, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: frintx z6.d, p0/m, z6.d -; CHECK-NEXT: mov z4.d, x9 -; CHECK-NEXT: fcmge p3.d, p0/z, z0.d, z7.d -; CHECK-NEXT: movprfx z24, z0 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.d -; CHECK-NEXT: fcmge p5.d, p0/z, z30.d, z7.d -; CHECK-NEXT: movprfx z28, z30 -; CHECK-NEXT: fcvtzs z28.d, p0/m, z30.d -; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: movprfx z10, z1 +; CHECK-NEXT: frintx z10.d, p0/m, z1.d +; CHECK-NEXT: mov z2.d, x9 +; CHECK-NEXT: frintx z4.d, p0/m, z4.d ; CHECK-NEXT: frintx z5.d, p0/m, z5.d -; CHECK-NEXT: fcmge p4.d, p0/z, z12.d, z7.d -; CHECK-NEXT: ldr z8, [x0, #7, mul vl] -; CHECK-NEXT: ldr z9, [x0, #15, mul vl] -; CHECK-NEXT: movprfx z27, z12 -; CHECK-NEXT: fcvtzs z27.d, p0/m, z12.d -; CHECK-NEXT: fcmge p6.d, p0/z, z3.d, z7.d -; CHECK-NEXT: fcmge p9.d, p0/z, z6.d, z7.d -; CHECK-NEXT: not p7.b, p0/z, p3.b -; CHECK-NEXT: movprfx z31, z3 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z3.d -; CHECK-NEXT: movprfx z15, z6 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z6.d -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: fcmge p8.d, p0/z, z5.d, z7.d -; CHECK-NEXT: movprfx z13, z5 -; CHECK-NEXT: fcvtzs z13.d, p0/m, z5.d -; CHECK-NEXT: sel z0.d, p7, z26.d, z24.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; 
CHECK-NEXT: movprfx z17, z25 -; CHECK-NEXT: fcvtzs z17.d, p0/m, z25.d -; CHECK-NEXT: not p3.b, p0/z, p6.b -; CHECK-NEXT: fcmge p6.d, p0/z, z25.d, z7.d -; CHECK-NEXT: movprfx z22, z9 -; CHECK-NEXT: frintx z22.d, p0/m, z9.d -; CHECK-NEXT: sel z29.d, p4, z26.d, z27.d -; CHECK-NEXT: movprfx z27, z8 -; CHECK-NEXT: frintx z27.d, p0/m, z8.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z12.d, z4.d +; CHECK-NEXT: movprfx z7, z0 +; CHECK-NEXT: fcvtzs z7.d, p0/m, z0.d +; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z26.d +; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ldr z24, [x0, #6, mul vl] +; CHECK-NEXT: movprfx z29, z8 +; CHECK-NEXT: fcvtzs z29.d, p0/m, z8.d +; CHECK-NEXT: fcmge p4.d, p0/z, z8.d, z26.d +; CHECK-NEXT: ldr z27, [x0, #7, mul vl] +; CHECK-NEXT: movprfx z28, z10 +; CHECK-NEXT: fcvtzs z28.d, p0/m, z10.d +; CHECK-NEXT: fcmge p3.d, p0/z, z10.d, z26.d +; CHECK-NEXT: ldr z6, [x0, #5, mul vl] +; CHECK-NEXT: movprfx z31, z4 +; CHECK-NEXT: fcvtzs z31.d, p0/m, z4.d +; CHECK-NEXT: fcmge p5.d, p0/z, z4.d, z26.d +; CHECK-NEXT: sel z0.d, p2, z7.d, z25.d +; CHECK-NEXT: frintx z24.d, p0/m, z24.d +; CHECK-NEXT: movprfx z9, z5 +; CHECK-NEXT: fcvtzs z9.d, p0/m, z5.d +; CHECK-NEXT: frintx z27.d, p0/m, z27.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z10.d, z2.d +; CHECK-NEXT: frintx z6.d, p0/m, z6.d +; CHECK-NEXT: sel z30.d, p3, z28.d, z25.d ; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: sel z0.d, p5, z26.d, z28.d -; CHECK-NEXT: not p4.b, p0/z, p8.b -; CHECK-NEXT: ldr z10, [x0, #8, mul vl] -; CHECK-NEXT: not p5.b, p0/z, p9.b -; CHECK-NEXT: sel z24.d, p3, z26.d, z31.d -; CHECK-NEXT: not p3.b, p0/z, p6.b -; CHECK-NEXT: movprfx z2, z22 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z22.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z30.d, z4.d +; CHECK-NEXT: sel z0.d, p4, z29.d, z25.d +; CHECK-NEXT: fcmge p4.d, p0/z, z5.d, z26.d +; CHECK-NEXT: movprfx z11, z24 +; CHECK-NEXT: fcvtzs z11.d, p0/m, z24.d +; CHECK-NEXT: fcmge p2.d, p0/z, z24.d, z26.d +; CHECK-NEXT: sel z31.d, p5, 
z31.d, z25.d +; CHECK-NEXT: movprfx z13, z27 +; CHECK-NEXT: fcvtzs z13.d, p0/m, z27.d +; CHECK-NEXT: fcmge p3.d, p0/z, z27.d, z26.d ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: fcmge p7.d, p0/z, z27.d, z7.d -; CHECK-NEXT: sel z31.d, p5, z26.d, z15.d -; CHECK-NEXT: ldr z11, [x0, #9, mul vl] -; CHECK-NEXT: movprfx z28, z10 -; CHECK-NEXT: frintx z28.d, p0/m, z10.d -; CHECK-NEXT: ldr z10, [x0, #10, mul vl] -; CHECK-NEXT: ldr z18, [x0, #11, mul vl] +; CHECK-NEXT: movprfx z28, z6 +; CHECK-NEXT: fcvtzs z28.d, p0/m, z6.d +; CHECK-NEXT: fcmge p5.d, p0/z, z6.d, z26.d +; CHECK-NEXT: ldr z12, [x0, #9, mul vl] +; CHECK-NEXT: ldr z14, [x0, #10, mul vl] +; CHECK-NEXT: ldr z15, [x0, #11, mul vl] +; CHECK-NEXT: ldr z17, [x0, #12, mul vl] +; CHECK-NEXT: ldr z19, [x0, #14, mul vl] ; CHECK-NEXT: ldr z16, [x0, #13, mul vl] -; CHECK-NEXT: ldr z14, [x0, #14, mul vl] -; CHECK-NEXT: ldr z19, [x0, #12, mul vl] -; CHECK-NEXT: mov z17.d, p3/m, z26.d -; CHECK-NEXT: fcmgt p9.d, p0/z, z3.d, z4.d -; CHECK-NEXT: movprfx z8, z11 -; CHECK-NEXT: frintx z8.d, p0/m, z11.d -; CHECK-NEXT: sel z11.d, p4, z26.d, z13.d -; CHECK-NEXT: frintx z10.d, p0/m, z10.d -; CHECK-NEXT: movprfx z13, z18 -; CHECK-NEXT: frintx z13.d, p0/m, z18.d -; CHECK-NEXT: fcmge p5.d, p0/z, z28.d, z7.d -; CHECK-NEXT: movprfx z18, z27 -; CHECK-NEXT: fcvtzs z18.d, p0/m, z27.d +; CHECK-NEXT: ldr z29, [x0, #8, mul vl] +; CHECK-NEXT: ldr z18, [x0, #15, mul vl] +; CHECK-NEXT: sel z7.d, p4, z9.d, z25.d +; CHECK-NEXT: movprfx z9, z12 +; CHECK-NEXT: frintx z9.d, p0/m, z12.d +; CHECK-NEXT: movprfx z12, z14 +; CHECK-NEXT: frintx z12.d, p0/m, z14.d +; CHECK-NEXT: sel z14.d, p2, z11.d, z25.d +; CHECK-NEXT: sel z11.d, p3, z13.d, z25.d +; CHECK-NEXT: movprfx z13, z15 +; CHECK-NEXT: frintx z13.d, p0/m, z15.d +; CHECK-NEXT: movprfx z15, z17 +; CHECK-NEXT: frintx z15.d, p0/m, z17.d +; CHECK-NEXT: movprfx z17, z19 +; CHECK-NEXT: frintx z17.d, p0/m, z19.d +; CHECK-NEXT: frintx z29.d, p0/m, z29.d ; CHECK-NEXT: frintx z16.d, p0/m, 
z16.d -; CHECK-NEXT: movprfx z15, z19 -; CHECK-NEXT: frintx z15.d, p0/m, z19.d -; CHECK-NEXT: movprfx z19, z28 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z28.d -; CHECK-NEXT: movprfx z21, z14 -; CHECK-NEXT: frintx z21.d, p0/m, z14.d -; CHECK-NEXT: not p4.b, p0/z, p7.b -; CHECK-NEXT: fcmge p6.d, p0/z, z8.d, z7.d -; CHECK-NEXT: movprfx z20, z8 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z8.d -; CHECK-NEXT: fcmge p7.d, p0/z, z10.d, z7.d -; CHECK-NEXT: fcmge p8.d, p0/z, z13.d, z7.d -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: sel z9.d, p4, z26.d, z18.d -; CHECK-NEXT: fcmge p4.d, p0/z, z16.d, z7.d -; CHECK-NEXT: fcmge p3.d, p0/z, z15.d, z7.d -; CHECK-NEXT: movprfx z0, z16 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z16.d -; CHECK-NEXT: sel z14.d, p5, z26.d, z19.d -; CHECK-NEXT: movprfx z19, z10 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z10.d -; CHECK-NEXT: movprfx z1, z21 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z21.d -; CHECK-NEXT: not p6.b, p0/z, p6.b +; CHECK-NEXT: frintx z18.d, p0/m, z18.d +; CHECK-NEXT: sel z28.d, p5, z28.d, z25.d +; CHECK-NEXT: movprfx z19, z9 +; CHECK-NEXT: fcvtzs z19.d, p0/m, z9.d +; CHECK-NEXT: fcmge p3.d, p0/z, z9.d, z26.d +; CHECK-NEXT: movprfx z21, z12 +; CHECK-NEXT: fcvtzs z21.d, p0/m, z12.d +; CHECK-NEXT: movprfx z22, z13 +; CHECK-NEXT: fcvtzs z22.d, p0/m, z13.d +; CHECK-NEXT: fcmge p5.d, p0/z, z13.d, z26.d +; CHECK-NEXT: fcmge p4.d, p0/z, z12.d, z26.d +; CHECK-NEXT: fcmge p8.d, p0/z, z17.d, z26.d +; CHECK-NEXT: movprfx z1, z17 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z17.d +; CHECK-NEXT: movprfx z20, z29 +; CHECK-NEXT: fcvtzs z20.d, p0/m, z29.d +; CHECK-NEXT: fcmge p2.d, p0/z, z29.d, z26.d ; CHECK-NEXT: movprfx z23, z15 ; CHECK-NEXT: fcvtzs z23.d, p0/m, z15.d -; CHECK-NEXT: not p5.b, p0/z, p7.b -; CHECK-NEXT: sel z18.d, p6, z26.d, z20.d -; CHECK-NEXT: fcmge p6.d, p0/z, z21.d, z7.d -; CHECK-NEXT: not p7.b, p0/z, p8.b -; CHECK-NEXT: fcmge p8.d, p0/z, z22.d, z7.d -; CHECK-NEXT: movprfx z20, z13 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z13.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; 
CHECK-NEXT: mov z7.d, #0x7fffffffffffffff -; CHECK-NEXT: mov z19.d, p5/m, z26.d -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z0.d, p4/m, z26.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z21.d, z4.d -; CHECK-NEXT: not p5.b, p0/z, p6.b -; CHECK-NEXT: mov z23.d, p3/m, z26.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z22.d, z4.d -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: mov z20.d, p7/m, z26.d -; CHECK-NEXT: fcmuo p8.d, p0/z, z22.d, z22.d +; CHECK-NEXT: fcmge p6.d, p0/z, z15.d, z26.d +; CHECK-NEXT: fcmge p7.d, p0/z, z16.d, z26.d +; CHECK-NEXT: fcmge p9.d, p0/z, z18.d, z26.d +; CHECK-NEXT: movprfx z0, z16 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z16.d +; CHECK-NEXT: movprfx z3, z18 +; CHECK-NEXT: fcvtzs z3.d, p0/m, z18.d +; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff +; CHECK-NEXT: sel z22.d, p5, z22.d, z25.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z17.d, z2.d +; CHECK-NEXT: sel z1.d, p8, z1.d, z25.d +; CHECK-NEXT: sel z19.d, p3, z19.d, z25.d +; CHECK-NEXT: sel z20.d, p2, z20.d, z25.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z8.d, z2.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z4.d, z2.d +; CHECK-NEXT: sel z21.d, p4, z21.d, z25.d +; CHECK-NEXT: sel z23.d, p6, z23.d, z25.d +; CHECK-NEXT: sel z0.d, p7, z0.d, z25.d +; CHECK-NEXT: sel z3.d, p9, z3.d, z25.d +; CHECK-NEXT: sel z25.d, p1, z26.d, z30.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z16.d, z2.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z18.d, z2.d +; CHECK-NEXT: fcmuo p6.d, p0/z, z17.d, z17.d +; CHECK-NEXT: ldr z30, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov z1.d, p5/m, z26.d -; CHECK-NEXT: fcmuo p5.d, p0/z, z21.d, z21.d -; CHECK-NEXT: fcmgt p7.d, p0/z, z25.d, z4.d -; CHECK-NEXT: mov z2.d, p6/m, z26.d -; CHECK-NEXT: sel z26.d, p1, z7.d, z29.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z16.d, z4.d -; CHECK-NEXT: ldr z29, [sp] // 16-byte Folded Reload -; CHECK-NEXT: fcmgt p6.d, p0/z, z5.d, z4.d -; CHECK-NEXT: mov z24.d, p9/m, z7.d -; CHECK-NEXT: mov z1.d, p4/m, z7.d +; CHECK-NEXT: mov z31.d, p3/m, z26.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z15.d, z2.d +; CHECK-NEXT: fcmuo p9.d, 
p0/z, z18.d, z18.d +; CHECK-NEXT: fcmuo p5.d, p0/z, z15.d, z15.d +; CHECK-NEXT: fcmgt p8.d, p0/z, z24.d, z2.d +; CHECK-NEXT: mov z30.d, p2/m, z26.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z13.d, z2.d +; CHECK-NEXT: mov z0.d, p1/m, z26.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z12.d, z2.d +; CHECK-NEXT: mov z3.d, p4/m, z26.d ; CHECK-NEXT: fcmuo p4.d, p0/z, z16.d, z16.d -; CHECK-NEXT: mov z2.d, p3/m, z7.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z15.d, z4.d -; CHECK-NEXT: mov z17.d, p7/m, z7.d -; CHECK-NEXT: mov z29.d, p2/m, z7.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z13.d, z4.d -; CHECK-NEXT: mov z0.d, p1/m, z7.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z10.d, z4.d -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: mov z11.d, p6/m, z7.d -; CHECK-NEXT: fcmuo p6.d, p0/z, z15.d, z15.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z8.d, z4.d -; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 -; CHECK-NEXT: sel z16.d, p3, z7.d, z23.d -; CHECK-NEXT: fcmuo p3.d, p0/z, z10.d, z10.d -; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 -; CHECK-NEXT: sel z15.d, p2, z7.d, z20.d +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 +; CHECK-NEXT: sel z16.d, p3, z26.d, z23.d +; CHECK-NEXT: fcmuo p3.d, p0/z, z12.d, z12.d +; CHECK-NEXT: fcmgt p6.d, p0/z, z9.d, z2.d +; CHECK-NEXT: fcmgt p7.d, p0/z, z5.d, z2.d +; CHECK-NEXT: mov z14.d, p8/m, z26.d +; CHECK-NEXT: sel z15.d, p2, z26.d, z22.d ; CHECK-NEXT: fcmuo p2.d, p0/z, z13.d, z13.d +; CHECK-NEXT: mov z3.d, p9/m, #0 // =0x0 ; CHECK-NEXT: str z1, [x8, #14, mul vl] -; CHECK-NEXT: sel z1.d, p1, z7.d, z19.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z28.d, z4.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z27.d, z4.d -; CHECK-NEXT: str z2, [x8, #15, mul vl] -; CHECK-NEXT: sel z2.d, p5, z7.d, z18.d -; CHECK-NEXT: mov z16.d, p6/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.d, p0/z, z8.d, z8.d -; CHECK-NEXT: str z0, [x8, #13, mul vl] +; CHECK-NEXT: sel z1.d, p1, z26.d, z21.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z29.d, z2.d +; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p4.d, p0/z, z27.d, z2.d +; CHECK-NEXT: mov z16.d, p5/m, #0 
// =0x0 +; CHECK-NEXT: fcmuo p5.d, p0/z, z9.d, z9.d +; CHECK-NEXT: str z3, [x8, #15, mul vl] +; CHECK-NEXT: sel z3.d, p6, z26.d, z19.d ; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.d, p0/z, z28.d, z28.d +; CHECK-NEXT: fcmuo p2.d, p0/z, z29.d, z29.d ; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p3.d, p0/z, z6.d, z4.d -; CHECK-NEXT: sel z0.d, p1, z7.d, z14.d +; CHECK-NEXT: str z0, [x8, #13, mul vl] +; CHECK-NEXT: fcmgt p3.d, p0/z, z6.d, z2.d +; CHECK-NEXT: sel z0.d, p1, z26.d, z20.d ; CHECK-NEXT: fcmuo p1.d, p0/z, z27.d, z27.d -; CHECK-NEXT: sel z27.d, p4, z7.d, z9.d +; CHECK-NEXT: sel z27.d, p4, z26.d, z11.d ; CHECK-NEXT: str z16, [x8, #12, mul vl] -; CHECK-NEXT: fcmuo p4.d, p0/z, z25.d, z25.d ; CHECK-NEXT: str z15, [x8, #11, mul vl] -; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.d, p0/z, z6.d, z6.d +; CHECK-NEXT: mov z3.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.d, p0/z, z24.d, z24.d ; CHECK-NEXT: str z1, [x8, #10, mul vl] ; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p3, z7.d, z31.d +; CHECK-NEXT: fcmuo p5.d, p0/z, z6.d, z6.d +; CHECK-NEXT: sel z1.d, p3, z26.d, z28.d ; CHECK-NEXT: fcmuo p3.d, p0/z, z5.d, z5.d ; CHECK-NEXT: ldr z5, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: mov z27.d, p1/m, #0 // =0x0 -; CHECK-NEXT: str z2, [x8, #9, mul vl] -; CHECK-NEXT: fcmuo p1.d, p0/z, z3.d, z3.d +; CHECK-NEXT: str z3, [x8, #9, mul vl] +; CHECK-NEXT: fcmuo p1.d, p0/z, z4.d, z4.d ; CHECK-NEXT: str z0, [x8, #8, mul vl] -; CHECK-NEXT: mov z17.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.d, p0/z, z30.d, z30.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z5.d, z4.d +; CHECK-NEXT: mov z14.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.d, p0/z, z8.d, z8.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z5.d, z2.d ; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.d, p0/z, z12.d, z12.d +; CHECK-NEXT: fcmuo p5.d, p0/z, z10.d, z10.d ; CHECK-NEXT: str z27, [x8, #7, mul vl] ; CHECK-NEXT: fcmuo p0.d, p0/z, z5.d, z5.d -; 
CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z7.d, p7/m, z26.d ; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: mov z24.d, p1/m, #0 // =0x0 -; CHECK-NEXT: str z17, [x8, #6, mul vl] -; CHECK-NEXT: mov z29.d, p4/m, #0 // =0x0 +; CHECK-NEXT: mov z31.d, p1/m, #0 // =0x0 +; CHECK-NEXT: str z14, [x8, #6, mul vl] +; CHECK-NEXT: mov z30.d, p4/m, #0 // =0x0 ; CHECK-NEXT: str z1, [x8, #5, mul vl] -; CHECK-NEXT: mov z26.d, p5/m, #0 // =0x0 -; CHECK-NEXT: str z11, [x8, #4, mul vl] -; CHECK-NEXT: mov z0.d, p2/m, z7.d -; CHECK-NEXT: str z24, [x8, #3, mul vl] -; CHECK-NEXT: str z29, [x8, #2, mul vl] -; CHECK-NEXT: str z26, [x8, #1, mul vl] +; CHECK-NEXT: mov z7.d, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z25.d, p5/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p2/m, z26.d +; CHECK-NEXT: str z31, [x8, #3, mul vl] +; CHECK-NEXT: str z30, [x8, #2, mul vl] +; CHECK-NEXT: str z7, [x8, #4, mul vl] ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 +; CHECK-NEXT: str z25, [x8, #1, mul vl] ; CHECK-NEXT: str z0, [x8] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll index bbc94f568dd0a..0c0762da5bba2 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll @@ -989,9 +989,9 @@ define @fadd_nxv4f32_x( %x, %n, zeroinitializer @@ -1004,9 +1004,9 @@ define @fadd_nxv8f16_x( %x, %n, zeroinitializer @@ -1019,9 +1019,9 @@ define @fadd_nxv2f64_x( %x, %n, zeroinitializer @@ -1034,9 +1034,9 @@ define @fsub_nxv4f32_x( %x, %n, zeroinitializer @@ -1049,9 +1049,9 @@ define @fsub_nxv8f16_x( %x, %n, zeroinitializer @@ -1064,9 +1064,9 @@ define @fsub_nxv2f64_x( %x, %n, zeroinitializer @@ -1079,9 +1079,9 @@ define @fmul_nxv4f32_x( %x, %n, zeroinitializer @@ -1094,9 +1094,9 @@ define @fmul_nxv8f16_x( %x, %n, zeroinitializer @@ -1109,9 +1109,9 @@ define @fmul_nxv2f64_x( %x, %n, 
zeroinitializer @@ -1125,9 +1125,8 @@ define @fdiv_nxv4f32_x( %x, %n, zeroinitializer @@ -1141,9 +1140,8 @@ define @fdiv_nxv8f16_x( %x, %n, zeroinitializer @@ -1157,9 +1155,8 @@ define @fdiv_nxv2f64_x( %x, %n, zeroinitializer @@ -1173,8 +1170,8 @@ define @minnum_nxv4f32_x( %x, %n, zeroinitializer @@ -1188,8 +1185,8 @@ define @minnum_nxv8f16_x( %x, %n, zeroinitializer @@ -1203,8 +1200,8 @@ define @minnum_nxv2f64_x( %x, %n, zeroinitializer @@ -1218,8 +1215,8 @@ define @maxnum_nxv4f32_x( %x, %n, zeroinitializer @@ -1233,8 +1230,8 @@ define @maxnum_nxv8f16_x( %x, %n, zeroinitializer @@ -1248,8 +1245,8 @@ define @maxnum_nxv2f64_x( %x, %n, zeroinitializer @@ -1263,8 +1260,8 @@ define @minimum_nxv4f32_x( %x, %n, zeroinitializer @@ -1278,8 +1275,8 @@ define @minimum_nxv8f16_x( %x, %n, zeroinitializer @@ -1293,8 +1290,8 @@ define @minimum_nxv2f64_x( %x, %n, zeroinitializer @@ -1308,8 +1305,8 @@ define @maximum_nxv4f32_x( %x, %n, zeroinitializer @@ -1323,8 +1320,8 @@ define @maximum_nxv8f16_x( %x, %n, zeroinitializer @@ -1338,8 +1335,8 @@ define @maximum_nxv2f64_x( %x, %n, zeroinitializer @@ -1353,8 +1350,8 @@ define @fmai_nxv4f32_x( %x, %n, zeroinitializer @@ -1368,8 +1365,8 @@ define @fmai_nxv8f16_x( %x, %n, zeroinitializer @@ -1383,8 +1380,8 @@ define @fmai_nxv2f64_x( %x, %n, zeroinitializer @@ -1398,8 +1395,8 @@ define @fma_nxv4f32_x( %x, %n, zeroinitializer @@ -1414,8 +1411,8 @@ define @fma_nxv8f16_x( %x, %n, zeroinitializer @@ -1430,8 +1427,8 @@ define @fma_nxv2f64_x( %x, %n, zeroinitializer @@ -2470,9 +2467,8 @@ define @fadd_nxv4f32_y( %x, %n, zeroinitializer @@ -2486,9 +2482,8 @@ define @fadd_nxv8f16_y( %x, %n, zeroinitializer @@ -2502,9 +2497,8 @@ define @fadd_nxv2f64_y( %x, %n, zeroinitializer @@ -2517,10 +2511,9 @@ define @fsub_nxv4f32_y( %x, %n, zeroinitializer @@ -2533,10 +2526,9 @@ define @fsub_nxv8f16_y( %x, %n, zeroinitializer @@ -2549,10 +2541,9 @@ define @fsub_nxv2f64_y( %x, %n, zeroinitializer @@ -2566,9 +2557,8 @@ define @fmul_nxv4f32_y( %x, %n, 
zeroinitializer @@ -2582,9 +2572,8 @@ define @fmul_nxv8f16_y( %x, %n, zeroinitializer @@ -2598,9 +2587,8 @@ define @fmul_nxv2f64_y( %x, %n, zeroinitializer @@ -2614,9 +2602,8 @@ define @fdiv_nxv4f32_y( %x, %n, zeroinitializer @@ -2630,9 +2617,8 @@ define @fdiv_nxv8f16_y( %x, %n, zeroinitializer @@ -2646,9 +2632,8 @@ define @fdiv_nxv2f64_y( %x, %n, zeroinitializer @@ -2662,9 +2647,8 @@ define @minnum_nxv4f32_y( %x, %n, zeroinitializer @@ -2678,9 +2662,8 @@ define @minnum_nxv8f16_y( %x, %n, zeroinitializer @@ -2694,9 +2677,8 @@ define @minnum_nxv2f64_y( %x, %n, zeroinitializer @@ -2710,9 +2692,8 @@ define @maxnum_nxv4f32_y( %x, %n, zeroinitializer @@ -2726,9 +2707,8 @@ define @maxnum_nxv8f16_y( %x, %n, zeroinitializer @@ -2742,9 +2722,8 @@ define @maxnum_nxv2f64_y( %x, %n, zeroinitializer @@ -2758,9 +2737,8 @@ define @minimum_nxv4f32_y( %x, %n, zeroinitializer @@ -2774,9 +2752,8 @@ define @minimum_nxv8f16_y( %x, %n, zeroinitializer @@ -2790,9 +2767,8 @@ define @minimum_nxv2f64_y( %x, %n, zeroinitializer @@ -2806,9 +2782,8 @@ define @maximum_nxv4f32_y( %x, %n, zeroinitializer @@ -2822,9 +2797,8 @@ define @maximum_nxv8f16_y( %x, %n, zeroinitializer @@ -2838,9 +2812,8 @@ define @maximum_nxv2f64_y( %x, %n, zeroinitializer @@ -2855,8 +2828,7 @@ define @fmai_nxv4f32_y( %x, %n, zeroinitializer @@ -2871,8 +2843,7 @@ define @fmai_nxv8f16_y( %x, %n, zeroinitializer @@ -2887,8 +2858,7 @@ define @fmai_nxv2f64_y( %x, %n, zeroinitializer @@ -2903,8 +2873,7 @@ define @fma_nxv4f32_y( %x, %n, zeroinitializer @@ -2920,8 +2889,7 @@ define @fma_nxv8f16_y( %x, %n, zeroinitializer @@ -2937,8 +2905,7 @@ define @fma_nxv2f64_y( %x, %n, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll index 66dece82a0ac5..58d6149b94d3a 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll @@ -641,9 +641,9 @@ define @fadd_nxv4f32_x( %x, %n, zeroinitializer @@ -656,9 +656,9 
@@ define @fadd_nxv8f16_x( %x, %n, zeroinitializer @@ -671,9 +671,9 @@ define @fadd_nxv2f64_x( %x, %n, zeroinitializer @@ -686,9 +686,9 @@ define @fsub_nxv4f32_x( %x, %n, zeroinitializer @@ -701,9 +701,9 @@ define @fsub_nxv8f16_x( %x, %n, zeroinitializer @@ -716,9 +716,9 @@ define @fsub_nxv2f64_x( %x, %n, zeroinitializer @@ -731,9 +731,9 @@ define @fmul_nxv4f32_x( %x, %n, zeroinitializer @@ -746,9 +746,9 @@ define @fmul_nxv8f16_x( %x, %n, zeroinitializer @@ -761,9 +761,9 @@ define @fmul_nxv2f64_x( %x, %n, zeroinitializer @@ -777,9 +777,8 @@ define @fdiv_nxv4f32_x( %x, %n, zeroinitializer @@ -793,9 +792,8 @@ define @fdiv_nxv8f16_x( %x, %n, zeroinitializer @@ -809,9 +807,8 @@ define @fdiv_nxv2f64_x( %x, %n, zeroinitializer @@ -825,8 +822,8 @@ define @fma_nxv4f32_x( %x, %n, zeroinitializer @@ -841,8 +838,8 @@ define @fma_nxv8f16_x( %x, %n, zeroinitializer @@ -857,8 +854,8 @@ define @fma_nxv2f64_x( %x, %n, zeroinitializer @@ -1540,10 +1537,9 @@ define @fadd_nxv4f32_y( %x, %n, zeroinitializer @@ -1556,10 +1552,9 @@ define @fadd_nxv8f16_y( %x, %n, zeroinitializer @@ -1572,10 +1567,9 @@ define @fadd_nxv2f64_y( %x, %n, zeroinitializer @@ -1588,10 +1582,9 @@ define @fsub_nxv4f32_y( %x, %n, zeroinitializer @@ -1604,10 +1597,9 @@ define @fsub_nxv8f16_y( %x, %n, zeroinitializer @@ -1620,10 +1612,9 @@ define @fsub_nxv2f64_y( %x, %n, zeroinitializer @@ -1636,10 +1627,9 @@ define @fmul_nxv4f32_y( %x, %n, zeroinitializer @@ -1652,10 +1642,9 @@ define @fmul_nxv8f16_y( %x, %n, zeroinitializer @@ -1668,10 +1657,9 @@ define @fmul_nxv2f64_y( %x, %n, zeroinitializer @@ -1685,9 +1673,8 @@ define @fdiv_nxv4f32_y( %x, %n, zeroinitializer @@ -1701,9 +1688,8 @@ define @fdiv_nxv8f16_y( %x, %n, zeroinitializer @@ -1717,9 +1703,8 @@ define @fdiv_nxv2f64_y( %x, %n, zeroinitializer @@ -1734,8 +1719,7 @@ define @fmai_nxv4f32_y( %x, %n, zeroinitializer @@ -1750,8 +1734,7 @@ define @fmai_nxv8f16_y( %x, %n, zeroinitializer @@ -1766,8 +1749,7 @@ define @fmai_nxv2f64_y( %x, %n, zeroinitializer @@ 
-1782,8 +1764,7 @@ define @fma_nxv4f32_y( %x, %n, zeroinitializer @@ -1799,8 +1780,7 @@ define @fma_nxv8f16_y( %x, %n, zeroinitializer @@ -1816,8 +1796,7 @@ define @fma_nxv2f64_y( %x, %n, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll index 22956f8fe3551..9d3fe3a90b463 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll @@ -47,9 +47,9 @@ define <4 x i32> @select_addsub_v4i32(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) define <4 x i32> @select_addsub_v4i32_select_swapped(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: select_addsub_v4i32_select_swapped: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vmnot.m v0, v0 -; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vrsub.vi v10, v9, 0 +; CHECK-NEXT: vmerge.vvm v9, v10, v9, v0 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b @@ -74,9 +74,9 @@ define <4 x i32> @select_addsub_v4i32_add_swapped(<4 x i1> %cc, <4 x i32> %a, <4 define <4 x i32> @select_addsub_v4i32_both_swapped(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: select_addsub_v4i32_both_swapped: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vmnot.m v0, v0 -; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vrsub.vi v10, v9, 0 +; CHECK-NEXT: vmerge.vvm v9, v10, v9, v0 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b diff --git a/llvm/test/CodeGen/X86/pr78897.ll b/llvm/test/CodeGen/X86/pr78897.ll index 0caa569107c0c..4613c2bcdcaf4 100644 --- a/llvm/test/CodeGen/X86/pr78897.ll +++ b/llvm/test/CodeGen/X86/pr78897.ll @@ -225,9 +225,9 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind { ; X86-AVX512-NEXT: pushl 
%esi ; X86-AVX512-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %xmm0 ; X86-AVX512-NEXT: vmovd %xmm0, %eax -; X86-AVX512-NEXT: kmovd %eax, %k0 -; X86-AVX512-NEXT: knotw %k0, %k1 -; X86-AVX512-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k1} {z} = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] +; X86-AVX512-NEXT: kmovd %eax, %k1 +; X86-AVX512-NEXT: knotw %k1, %k2 +; X86-AVX512-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k2} {z} = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] ; X86-AVX512-NEXT: vpextrd $1, %xmm0, %eax ; X86-AVX512-NEXT: vmovd %xmm0, %edx ; X86-AVX512-NEXT: movl $286331152, %ecx # imm = 0x11111110 @@ -247,9 +247,9 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind { ; X86-AVX512-NEXT: addl %edx, %eax ; X86-AVX512-NEXT: vmovd %esi, %xmm1 ; X86-AVX512-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; X86-AVX512-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} -; X86-AVX512-NEXT: vpsrlw $4, %xmm1, %xmm0 -; X86-AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X86-AVX512-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} +; X86-AVX512-NEXT: vpsrlw $4, %xmm0, %xmm1 +; X86-AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X86-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 ; X86-AVX512-NEXT: popl %esi ; X86-AVX512-NEXT: popl %edi @@ -258,9 +258,9 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind { ; ; X64-AVX512-LABEL: produceShuffleVectorForByte: ; X64-AVX512: # %bb.0: # %entry -; X64-AVX512-NEXT: kmovd %edi, %k0 -; X64-AVX512-NEXT: knotw %k0, %k1 -; X64-AVX512-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k1} {z} = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] +; X64-AVX512-NEXT: kmovd %edi, %k1 +; X64-AVX512-NEXT: knotw %k1, %k2 +; X64-AVX512-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k2} {z} = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] ; X64-AVX512-NEXT: vmovq %xmm0, %rax 
; X64-AVX512-NEXT: movabsq $1229782938247303440, %rcx # imm = 0x1111111111111110 ; X64-AVX512-NEXT: movabsq $76861433640456465, %rdx # imm = 0x111111111111111 @@ -269,9 +269,9 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind { ; X64-AVX512-NEXT: vmovq %rax, %xmm0 ; X64-AVX512-NEXT: imulq %rcx, %rdx ; X64-AVX512-NEXT: vmovq %rdx, %xmm1 -; X64-AVX512-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} -; X64-AVX512-NEXT: vpsrlw $4, %xmm1, %xmm0 -; X64-AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X64-AVX512-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} +; X64-AVX512-NEXT: vpsrlw $4, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X64-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-AVX512-NEXT: retq entry: