diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 82c82c1c19bf0..86c28b31efe85 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -26196,20 +26196,20 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG, EVT EltVT = VT.getVectorElementType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // TODO: Remove/replace the extract cost check? If the elements are available - // as scalars, then there may be no extract cost. Should we ask if - // inserting a scalar back into a vector is cheap instead? int Index0, Index1; SDValue Src0 = DAG.getSplatSourceVector(N0, Index0); SDValue Src1 = DAG.getSplatSourceVector(N1, Index1); - // Extract element from splat_vector should be free. - // TODO: use DAG.isSplatValue instead? - bool IsBothSplatVector = N0.getOpcode() == ISD::SPLAT_VECTOR && - N1.getOpcode() == ISD::SPLAT_VECTOR; + // Extracting from a shuffle_vector might cost something, but extracting from + // a splat_vector or a splatted build_vector should be free since the operands + // are scalars anyway. + bool IsExtractFree = (N0.getOpcode() == ISD::SPLAT_VECTOR || + N0.getOpcode() == ISD::BUILD_VECTOR) && + (N1.getOpcode() == ISD::SPLAT_VECTOR || + N1.getOpcode() == ISD::BUILD_VECTOR); if (!Src0 || !Src1 || Index0 != Index1 || Src0.getValueType().getVectorElementType() != EltVT || Src1.getValueType().getVectorElementType() != EltVT || - !(IsBothSplatVector || TLI.isExtractVecEltCheap(VT, Index0)) || + !(IsExtractFree || TLI.isExtractVecEltCheap(VT, Index0)) || !TLI.isOperationLegalOrCustom(Opcode, EltVT)) return SDValue(); diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll index e8437b5cd801f..b42ba4816f7be 100644 --- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll +++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll @@ -473,16 +473,16 @@ define <4 x i1> @lane_mask_v4i1_i8(i8 %index, i8 %TC) { define <2 x i1> @lane_mask_v2i1_i8(i8 %index, i8 %TC) { ; CHECK-LABEL: lane_mask_v2i1_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: movi d0, #0x0000ff000000ff -; CHECK-NEXT: dup v1.2s, w0 +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: movi d2, #0x0000ff000000ff +; CHECK-NEXT: dup v0.2s, w8 ; CHECK-NEXT: adrp x8, .LCPI27_0 -; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI27_0] -; CHECK-NEXT: dup v3.2s, w1 -; CHECK-NEXT: and v1.8b, v1.8b, v0.8b -; CHECK-NEXT: add v1.2s, v1.2s, v2.2s -; CHECK-NEXT: umin v1.2s, v1.2s, v0.2s -; CHECK-NEXT: and v0.8b, v3.8b, v0.8b -; CHECK-NEXT: cmhi v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI27_0] +; CHECK-NEXT: and w8, w1, #0xff +; CHECK-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-NEXT: dup v1.2s, w8 +; CHECK-NEXT: umin v0.2s, v0.2s, v2.2s +; CHECK-NEXT: cmhi v0.2s, v1.2s, v0.2s ; CHECK-NEXT: ret %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i8(i8 %index, i8 %TC) ret <2 x i1> %active.lane.mask diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll b/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll index cad3fb58086d6..6e8daef96549d 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll @@ -267,8 +267,12 @@ define <2 x i32> @test_select_cc_v2i32_icmpi1(i1 %cc, <2 x i32> %a, <2 x i32> %b ; CHECK: // %bb.0: ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: mvn w9, w8 ; CHECK-NEXT: dup v2.2s, w8 -; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: dup v3.2s, w9 +; 
CHECK-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-NEXT: and v1.8b, v1.8b, v3.8b +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %cmp = icmp ne i1 %cc, 0 %e = select i1 %cmp, <2 x i32> %a, <2 x i32> %b diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll index 3542b26b53539..eb16b80ec316c 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll @@ -593,8 +593,9 @@ entry: define <2 x i32> @fct20(ptr nocapture %sp0) { ; CHECK-LABEL: fct20: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr s0, [x0, #4] -; CHECK-NEXT: mul.2s v0, v0, v0 +; CHECK-NEXT: ldr w8, [x0, #4] +; CHECK-NEXT: mul w8, w8, w8 +; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: ret entry: %addr = getelementptr i32, ptr %sp0, i64 1 @@ -607,8 +608,9 @@ entry: define <4 x i32> @fct21(ptr nocapture %sp0) { ; CHECK-LABEL: fct21: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr s0, [x0, #4] -; CHECK-NEXT: mul.4s v0, v0, v0 +; CHECK-NEXT: ldr w8, [x0, #4] +; CHECK-NEXT: mul w8, w8, w8 +; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: ret entry: %addr = getelementptr i32, ptr %sp0, i64 1 @@ -703,8 +705,9 @@ entry: define <2 x i32> @fct28(ptr nocapture %sp0, i64 %offset) { ; CHECK-LABEL: fct28: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr s0, [x0, x1, lsl #2] -; CHECK-NEXT: mul.2s v0, v0, v0 +; CHECK-NEXT: ldr w8, [x0, x1, lsl #2] +; CHECK-NEXT: mul w8, w8, w8 +; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: ret entry: %addr = getelementptr i32, ptr %sp0, i64 %offset @@ -717,8 +720,9 @@ entry: define <4 x i32> @fct29(ptr nocapture %sp0, i64 %offset) { ; CHECK-LABEL: fct29: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr s0, [x0, x1, lsl #2] -; CHECK-NEXT: mul.4s v0, v0, v0 +; CHECK-NEXT: ldr w8, [x0, x1, lsl #2] +; CHECK-NEXT: mul w8, w8, w8 +; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: ret entry: %addr = getelementptr i32, ptr %sp0, i64 %offset diff --git a/llvm/test/CodeGen/AArch64/fdiv-combine.ll b/llvm/test/CodeGen/AArch64/fdiv-combine.ll index 1ed63f3ef2507..7f3d96388aefa 100644 --- a/llvm/test/CodeGen/AArch64/fdiv-combine.ll +++ b/llvm/test/CodeGen/AArch64/fdiv-combine.ll @@ -100,13 +100,11 @@ define void @two_fdiv_double(double %D, double %a, double %b) #0 { define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { ; CHECK-LABEL: splat_three_fdiv_4xfloat: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: fmov v4.4s, #1.00000000 -; CHECK-NEXT: dup v0.4s, v0.s[0] -; CHECK-NEXT: fdiv v4.4s, v4.4s, v0.4s -; CHECK-NEXT: fmul v0.4s, v1.4s, v4.4s -; CHECK-NEXT: fmul v1.4s, v2.4s, v4.4s -; CHECK-NEXT: fmul v2.4s, v3.4s, v4.4s +; CHECK-NEXT: fmov s4, #1.00000000 +; CHECK-NEXT: fdiv s4, s4, s0 +; CHECK-NEXT: fmul v0.4s, v1.4s, v4.s[0] +; CHECK-NEXT: fmul v1.4s, v2.4s, v4.s[0] +; CHECK-NEXT: fmul v2.4s, v3.4s, v4.s[0] ; CHECK-NEXT: b foo_3_4xf %D.ins = insertelement <4 x float> poison, float %D, i64 0 %splat = shufflevector <4 x float> %D.ins, <4 x float> poison, <4 x i32> zeroinitializer @@ -120,11 +118,9 @@ define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b, define <4 x float> @splat_fdiv_v4f32(float %D, <4 x float> %a) #1 { ; CHECK-LABEL: splat_fdiv_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: fmov v2.4s, #1.00000000 -; CHECK-NEXT: dup v0.4s, v0.s[0] -; CHECK-NEXT: fdiv v0.4s, v2.4s, v0.4s -; CHECK-NEXT: fmul v0.4s, v1.4s, v0.4s +; CHECK-NEXT: fmov s2, #1.00000000 +; CHECK-NEXT: fdiv s0, s2, 
s0 +; CHECK-NEXT: fmul v0.4s, v1.4s, v0.s[0] ; CHECK-NEXT: ret entry: %D.ins = insertelement <4 x float> poison, float %D, i64 0 diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll index 595991e86a91c..28bdca6cdad1b 100644 --- a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll @@ -76,28 +76,28 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; CHECK-NEXT: add x11, x11, x11, lsl #3 ; CHECK-NEXT: add x9, x9, x9, lsl #3 ; CHECK-NEXT: sub x8, x8, x11 -; CHECK-NEXT: sub x11, x13, x12 +; CHECK-NEXT: sub x12, x13, x12 ; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: mov x8, #8589934591 // =0x1ffffffff ; CHECK-NEXT: sub x9, x10, x9 -; CHECK-NEXT: asr x10, x11, #3 +; CHECK-NEXT: lsr x10, x12, #3 ; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: mov v0.d[1], x9 -; CHECK-NEXT: add x9, x10, x11, lsr #63 +; CHECK-NEXT: add x9, x10, x12, lsr #63 ; CHECK-NEXT: add x8, x9, x9, lsl #3 ; CHECK-NEXT: adrp x9, .LCPI3_0 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI3_0] -; CHECK-NEXT: add x8, x12, x8 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: fmov d3, x8 +; CHECK-NEXT: add x8, x2, x8 +; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI3_0] +; CHECK-NEXT: and x8, x8, #0x1ffffffff +; CHECK-NEXT: dup v2.2d, x8 ; CHECK-NEXT: adrp x8, .LCPI3_1 -; CHECK-NEXT: cmeq v0.2d, v0.2d, v2.2d -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] -; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: cmeq v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: cmeq v1.2d, v2.2d, v1.2d ; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: cmeq v1.2d, v1.2d, v2.2d -; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: mvn v1.16b, v1.16b +; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: xtn v1.2s, v1.2d ; CHECK-NEXT: mov w1, v0.s[1] ; CHECK-NEXT: fmov w0, s0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll index 13ebda1df7f9d..0890c6c2581d6 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll @@ -135,8 +135,12 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) v ; CHECK: // %bb.0: ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: mvn w9, w8 ; CHECK-NEXT: dup v2.2s, w8 -; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: dup v3.2s, w9 +; CHECK-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-NEXT: and v1.8b, v1.8b, v3.8b +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %sel = select i1 %mask, <2 x float> %op1, <2 x float> %op2 ret <2 x float> %sel @@ -148,8 +152,12 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) v ; CHECK: // %bb.0: ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: mvn w9, w8 ; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: dup v3.4s, w9 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %sel = select i1 %mask, <4 x float> %op1, <4 x float> %op2 ret <4 x float> %sel @@ -259,8 +267,12 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask ; CHECK: // %bb.0: ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csetm x8, ne +; CHECK-NEXT: mvn x9, x8 ; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: fmov d3, x9 +; CHECK-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-NEXT: and v1.8b, 
v1.8b, v3.8b +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %sel = select i1 %mask, <1 x double> %op1, <1 x double> %op2 ret <1 x double> %sel @@ -272,8 +284,12 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask ; CHECK: // %bb.0: ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csetm x8, ne +; CHECK-NEXT: mvn x9, x8 ; CHECK-NEXT: dup v2.2d, x8 -; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: dup v3.2d, x9 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %sel = select i1 %mask, <2 x double> %op1, <2 x double> %op2 ret <2 x double> %sel diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll index 3566bbc2b4561..45f100a3b68e1 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll @@ -237,8 +237,12 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> ; CHECK: // %bb.0: ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csetm x8, ne +; CHECK-NEXT: mvn x9, x8 ; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: fmov d3, x9 +; CHECK-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-NEXT: and v1.8b, v1.8b, v3.8b +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %sel = select <1 x i1> %mask, <1 x double> %op1, <1 x double> %op2 ret <1 x double> %sel diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll index 710dce4de6dda..6809b8f69bda0 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll @@ -254,8 +254,12 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) vscale_ ; CHECK: // %bb.0: ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: mvn w9, w8 ; CHECK-NEXT: dup v2.2s, w8 -; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: dup v3.2s, w9 +; CHECK-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-NEXT: and v1.8b, v1.8b, v3.8b +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2 ret <2 x i32> %sel @@ -267,8 +271,12 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) vscale_ ; CHECK: // %bb.0: ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: mvn w9, w8 ; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: dup v3.4s, w9 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2 ret <4 x i32> %sel @@ -378,8 +386,12 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) vscale_ ; CHECK: // %bb.0: ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csetm x8, ne +; CHECK-NEXT: mvn x9, x8 ; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: fmov d3, x9 +; CHECK-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-NEXT: and v1.8b, v1.8b, v3.8b +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2 ret <1 x i64> %sel @@ -391,8 +403,12 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) vscale_ ; CHECK: // %bb.0: ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csetm x8, ne +; CHECK-NEXT: mvn x9, x8 ; CHECK-NEXT: dup v2.2d, x8 -; 
CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: dup v3.2d, x9 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2 ret <2 x i64> %sel diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll index 2c6ffeaeefd6d..f13abd56ea07e 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll @@ -349,8 +349,12 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) v ; CHECK: // %bb.0: ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csetm x8, ne +; CHECK-NEXT: mvn x9, x8 ; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: fmov d3, x9 +; CHECK-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-NEXT: and v1.8b, v1.8b, v3.8b +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %sel = select <1 x i1> %mask, <1 x i64> %op1, <1 x i64> %op2 ret <1 x i64> %sel diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll index 46b2f82d9de2a..432ba48da14f9 100644 --- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll @@ -244,7 +244,8 @@ define void @undef_hi_op_v2f16(half %arg0) { ; GFX9-LABEL: undef_hi_op_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX9-NEXT: v_pack_b32_f16 v0, v0, v0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -254,7 +255,8 @@ define void @undef_hi_op_v2f16(half %arg0) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX8-NEXT: v_or_b32_e32 v0, 0x7e000000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 ; GFX8-NEXT: ;;#ASMEND @@ -269,8 +271,9 @@ define void @undef_hi_op_v2i16(i16 %arg0) { ; GFX9-LABEL: undef_hi_op_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x63 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_add_u16_e32 v0, 0x63, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -280,6 +283,8 @@ define void @undef_hi_op_v2i16(i16 %arg0) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u16_e32 v0, 0x63, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 ; GFX8-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index 7c469c9f4ccae..1bf5c1348cdbe 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -104,14 +104,15 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB0_4: ; %exit ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0] -; GFX9-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-NEXT: v_ashrrev_i16_e32 v0, 15, v3 +; GFX9-NEXT: 
s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: s_movk_i32 s5, 0x8000 ; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 -; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -132,19 +133,21 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB0_4: ; %exit ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0] +; GFX11-NEXT: v_ashrrev_i16 v0, 15, v3 +; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v2 op_sel_hi:[0,1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_or_b32_e32 v1, 0xffff8000, v1 -; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v2, 0xffff8000, v2 -; GFX11-NEXT: v_or_b32_e32 v3, 0xffff8000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_or_b32_e32 v4, 0xffff8000, v0 +; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0xffff8000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v2, 0xffff8000, v3 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v2, v4, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -634,14 +637,15 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1 ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX9-NEXT: .LBB3_4: ; %exit ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0] -; GFX9-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-NEXT: v_ashrrev_i16_e32 v0, 15, v5 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: s_movk_i32 s5, 0x8000 ; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 -; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] ; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; 
GFX9-NEXT: v_or_b32_sdwa v0, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -666,19 +670,21 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB3_4: ; %exit ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_or_b32_e32 v1, 0xffff8000, v1 -; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v2, 0xffff8000, v2 -; GFX11-NEXT: v_or_b32_e32 v3, 0xffff8000, v3 +; GFX11-NEXT: v_ashrrev_i16 v0, 15, v3 +; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v2 op_sel_hi:[0,1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_or_b32_e32 v4, 0xffff8000, v0 +; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0xffff8000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v2, 0xffff8000, v3 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v2, v4, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll index d200b25c17d33..70c16d550a208 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -95,9 +95,10 @@ define amdgpu_kernel void @truncate_high_elt_extract_vector(ptr addrspace(1) noc ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sext_i32_i16 s4, s4 ; SI-NEXT: s_sext_i32_i16 s5, s5 -; SI-NEXT: s_mul_i32 s5, s5, s4 -; SI-NEXT: s_lshr_b32 s4, s5, 16 +; SI-NEXT: s_mul_i32 s4, s5, s4 +; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_alignbit_b32 v0, s5, v0, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -113,9 +114,10 @@ define amdgpu_kernel void @truncate_high_elt_extract_vector(ptr addrspace(1) noc ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s0, s2 ; VI-NEXT: s_sext_i32_i16 s1, s3 -; VI-NEXT: s_mul_i32 s1, s1, s0 -; VI-NEXT: s_lshr_b32 s0, s1, 16 +; VI-NEXT: s_mul_i32 s0, s1, s0 +; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll index 8186f6c9b42fb..44d5862fab44f 100644 --- a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll +++ b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll @@ -8,9 +8,8 @@ define i32 @foo(ptr %descs, i32 %num, i32 %cw) local_unnamed_addr #0 { ; CHECK-LABEL: foo: ; CHECK: @ 
%bb.0: @ %entry -; CHECK-NEXT: vldr d16, [r0, #32] -; CHECK-NEXT: vadd.i32 d16, d16, d16 -; CHECK-NEXT: vmov.32 r0, d16[0] +; CHECK-NEXT: ldr r0, [r0, #32] +; CHECK-NEXT: add r0, r0, r0 ; CHECK-NEXT: bx lr entry: %wide.vec = load <16 x i32>, ptr %descs, align 4 diff --git a/llvm/test/CodeGen/PowerPC/combine-fneg.ll b/llvm/test/CodeGen/PowerPC/combine-fneg.ll index 3015e68e471a7..be7ad8f18178c 100644 --- a/llvm/test/CodeGen/PowerPC/combine-fneg.ll +++ b/llvm/test/CodeGen/PowerPC/combine-fneg.ll @@ -5,19 +5,17 @@ define <4 x double> @fneg_fdiv_splat(double %a0, <4 x double> %a1) { ; CHECK-LABEL: fneg_fdiv_splat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 -; CHECK-NEXT: xxspltd 0, 1, 0 -; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l -; CHECK-NEXT: lxvd2x 2, 0, 3 -; CHECK-NEXT: xvredp 1, 0 -; CHECK-NEXT: xxlor 3, 2, 2 -; CHECK-NEXT: xvmaddadp 3, 0, 1 -; CHECK-NEXT: xvnmsubadp 1, 1, 3 -; CHECK-NEXT: xvmaddadp 2, 0, 1 -; CHECK-NEXT: xvmsubadp 1, 1, 2 -; CHECK-NEXT: xvmuldp 34, 34, 1 -; CHECK-NEXT: xvmuldp 35, 35, 1 +; CHECK-NEXT: vspltisw 4, 1 +; CHECK-NEXT: xsredp 0, 1 +; CHECK-NEXT: xvcvsxwdp 2, 36 +; CHECK-NEXT: fmr 3, 2 +; CHECK-NEXT: xsnmsubadp 3, 1, 0 +; CHECK-NEXT: xsmaddadp 0, 0, 3 +; CHECK-NEXT: xsnmsubadp 2, 1, 0 +; CHECK-NEXT: xsnmaddadp 0, 0, 2 +; CHECK-NEXT: xxspltd 0, 0, 0 +; CHECK-NEXT: xvmuldp 34, 34, 0 +; CHECK-NEXT: xvmuldp 35, 35, 0 ; CHECK-NEXT: blr entry: %splat.splatinsert = insertelement <4 x double> undef, double %a0, i32 0 diff --git a/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll b/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll index 68db90ad2e198..b94091f106250 100644 --- a/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll +++ b/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll @@ -5,17 +5,16 @@ define <4 x float> @repeated_fp_divisor_noest(float %a, <4 x float> %b) { ; CHECK-LABEL: repeated_fp_divisor_noest: ; CHECK: # %bb.0: -; CHECK-NEXT: xscvdpspn 0, 1 -; CHECK-NEXT: addis 3, 2, .LCPI0_1@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI0_1@toc@l -; CHECK-NEXT: lxvd2x 1, 0, 3 +; CHECK-NEXT: vspltisw 3, 1 ; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha ; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l -; CHECK-NEXT: xxspltw 0, 0, 0 -; CHECK-NEXT: xvdivsp 0, 1, 0 +; CHECK-NEXT: xvcvsxwdp 0, 35 +; CHECK-NEXT: xsdivsp 0, 0, 1 ; CHECK-NEXT: lxvd2x 1, 0, 3 ; CHECK-NEXT: xxswapd 35, 1 ; CHECK-NEXT: xvmulsp 1, 34, 35 +; CHECK-NEXT: xscvdpspn 0, 0 +; CHECK-NEXT: xxspltw 0, 0, 0 ; CHECK-NEXT: xvmulsp 34, 1, 0 ; CHECK-NEXT: blr %ins = insertelement <4 x float> undef, float %a, i32 0 diff --git a/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll index e7a0f149ac976..d78b29415b61a 100644 --- a/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll @@ -188,61 +188,61 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; PPC64LE-NEXT: lis 6, 1820 ; PPC64LE-NEXT: sldi 3, 3, 31 ; PPC64LE-NEXT: ori 6, 6, 29127 -; PPC64LE-NEXT: sldi 4, 4, 31 +; PPC64LE-NEXT: sldi 7, 5, 31 ; PPC64LE-NEXT: rldic 6, 6, 34, 3 -; PPC64LE-NEXT: sradi 3, 3, 31 +; PPC64LE-NEXT: sldi 4, 4, 31 ; PPC64LE-NEXT: oris 6, 6, 29127 -; PPC64LE-NEXT: sradi 4, 4, 31 -; PPC64LE-NEXT: ori 7, 6, 7282 -; PPC64LE-NEXT: sldi 5, 5, 31 +; PPC64LE-NEXT: sradi 3, 3, 31 +; PPC64LE-NEXT: ori 8, 6, 7282 +; PPC64LE-NEXT: sradi 7, 7, 31 ; PPC64LE-NEXT: ori 6, 6, 7281 -; PPC64LE-NEXT: mulhd 8, 3, 7 -; PPC64LE-NEXT: mulhd 7, 4, 7 -; PPC64LE-NEXT: sradi 5, 5, 31 
-; PPC64LE-NEXT: mulhd 6, 5, 6 -; PPC64LE-NEXT: rldicl 9, 8, 1, 63 -; PPC64LE-NEXT: rldicl 10, 7, 1, 63 -; PPC64LE-NEXT: add 8, 8, 9 -; PPC64LE-NEXT: add 7, 7, 10 -; PPC64LE-NEXT: sldi 10, 8, 3 -; PPC64LE-NEXT: sub 6, 6, 5 -; PPC64LE-NEXT: add 8, 8, 10 -; PPC64LE-NEXT: sldi 10, 7, 3 -; PPC64LE-NEXT: rldicl 9, 6, 1, 63 -; PPC64LE-NEXT: add 7, 7, 10 -; PPC64LE-NEXT: sub 3, 3, 8 -; PPC64LE-NEXT: addis 8, 2, .LCPI3_1@toc@ha +; PPC64LE-NEXT: sradi 4, 4, 31 +; PPC64LE-NEXT: mulhd 9, 3, 8 +; PPC64LE-NEXT: mulhd 8, 4, 8 +; PPC64LE-NEXT: mulhd 6, 7, 6 +; PPC64LE-NEXT: rldicl 10, 9, 1, 63 +; PPC64LE-NEXT: sub 6, 6, 7 +; PPC64LE-NEXT: rldicl 7, 8, 1, 63 +; PPC64LE-NEXT: add 9, 9, 10 +; PPC64LE-NEXT: add 7, 8, 7 +; PPC64LE-NEXT: sldi 8, 9, 3 +; PPC64LE-NEXT: rldicl 10, 6, 1, 63 ; PPC64LE-NEXT: sradi 6, 6, 3 -; PPC64LE-NEXT: sub 4, 4, 7 +; PPC64LE-NEXT: add 8, 9, 8 +; PPC64LE-NEXT: sldi 9, 7, 3 +; PPC64LE-NEXT: sub 3, 3, 8 +; PPC64LE-NEXT: add 6, 6, 10 +; PPC64LE-NEXT: add 7, 7, 9 ; PPC64LE-NEXT: mtfprd 0, 3 ; PPC64LE-NEXT: addis 3, 2, .LCPI3_0@toc@ha -; PPC64LE-NEXT: addi 7, 8, .LCPI3_1@toc@l -; PPC64LE-NEXT: add 6, 6, 9 -; PPC64LE-NEXT: mtfprd 1, 4 -; PPC64LE-NEXT: addi 3, 3, .LCPI3_0@toc@l -; PPC64LE-NEXT: lxvd2x 2, 0, 7 ; PPC64LE-NEXT: sldi 8, 6, 3 -; PPC64LE-NEXT: lxvd2x 3, 0, 3 -; PPC64LE-NEXT: add 4, 6, 8 -; PPC64LE-NEXT: addis 6, 2, .LCPI3_2@toc@ha +; PPC64LE-NEXT: sub 4, 4, 7 +; PPC64LE-NEXT: addis 7, 2, .LCPI3_1@toc@ha +; PPC64LE-NEXT: addi 3, 3, .LCPI3_0@toc@l +; PPC64LE-NEXT: add 6, 6, 8 +; PPC64LE-NEXT: mtfprd 1, 4 +; PPC64LE-NEXT: addis 4, 2, .LCPI3_2@toc@ha +; PPC64LE-NEXT: addi 7, 7, .LCPI3_1@toc@l +; PPC64LE-NEXT: lxvd2x 2, 0, 3 +; PPC64LE-NEXT: add 3, 5, 6 +; PPC64LE-NEXT: addi 4, 4, .LCPI3_2@toc@l +; PPC64LE-NEXT: lxvd2x 3, 0, 7 +; PPC64LE-NEXT: clrldi 3, 3, 31 ; PPC64LE-NEXT: xxmrghd 34, 1, 0 -; PPC64LE-NEXT: add 3, 5, 4 -; PPC64LE-NEXT: addi 4, 6, .LCPI3_2@toc@l -; PPC64LE-NEXT: xxswapd 35, 2 -; PPC64LE-NEXT: mtfprd 0, 3 -; PPC64LE-NEXT: lxvd2x 1, 0, 4 -; PPC64LE-NEXT: xxland 34, 34, 3 +; PPC64LE-NEXT: lxvd2x 0, 0, 4 +; PPC64LE-NEXT: mtfprd 1, 3 +; PPC64LE-NEXT: xxswapd 35, 3 +; PPC64LE-NEXT: xxspltd 37, 1, 0 ; PPC64LE-NEXT: xxswapd 36, 0 +; PPC64LE-NEXT: xxland 34, 34, 2 ; PPC64LE-NEXT: vcmpequd 2, 2, 3 -; PPC64LE-NEXT: xxswapd 35, 1 -; PPC64LE-NEXT: xxland 36, 36, 3 -; PPC64LE-NEXT: vcmpequd 3, 4, 3 +; PPC64LE-NEXT: vcmpequd 3, 5, 4 ; PPC64LE-NEXT: xxlnor 0, 34, 34 +; PPC64LE-NEXT: xxlnor 34, 35, 35 ; PPC64LE-NEXT: xxswapd 1, 0 ; PPC64LE-NEXT: mffprwz 4, 0 -; PPC64LE-NEXT: xxlnor 34, 35, 35 -; PPC64LE-NEXT: mffprwz 3, 1 ; PPC64LE-NEXT: xxswapd 2, 34 +; PPC64LE-NEXT: mffprwz 3, 1 ; PPC64LE-NEXT: mffprwz 5, 2 ; PPC64LE-NEXT: blr %srem = srem <3 x i33> %X, diff --git a/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll b/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll new file mode 100644 index 0000000000000..f9e6f5fe06df9 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll @@ -0,0 +1,622 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV64 + +define @nxv1i1(i1 %x, i1 %y) { +; CHECK-LABEL: nxv1i1: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vmsne.vi v8, v8, 0 +; CHECK-NEXT: andi a1, a1, 1 +; 
CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vmsne.vi v9, v9, 0 +; CHECK-NEXT: vmxor.mm v0, v8, v9 +; CHECK-NEXT: ret + %head.x = insertelement poison, i1 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i1 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv2i1(i1 %x, i1 %y) { +; CHECK-LABEL: nxv2i1: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vmsne.vi v8, v8, 0 +; CHECK-NEXT: andi a1, a1, 1 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vmsne.vi v9, v9, 0 +; CHECK-NEXT: vmxor.mm v0, v8, v9 +; CHECK-NEXT: ret + %head.x = insertelement poison, i1 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i1 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv4i1(i1 %x, i1 %y) { +; CHECK-LABEL: nxv4i1: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vmsne.vi v8, v8, 0 +; CHECK-NEXT: andi a1, a1, 1 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vmsne.vi v9, v9, 0 +; CHECK-NEXT: vmxor.mm v0, v8, v9 +; CHECK-NEXT: ret + %head.x = insertelement poison, i1 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i1 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv8i1(i1 %x, i1 %y) { +; CHECK-LABEL: nxv8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vmsne.vi v8, v8, 0 +; CHECK-NEXT: andi a1, a1, 1 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vmsne.vi v9, v9, 0 +; CHECK-NEXT: vmxor.mm v0, v8, v9 +; CHECK-NEXT: ret + %head.x = insertelement poison, i1 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i1 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv16i1(i1 %x, i1 %y) { +; CHECK-LABEL: nxv16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vmsne.vi v10, v8, 0 +; CHECK-NEXT: andi a1, a1, 1 +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vmsne.vi v11, v8, 0 +; CHECK-NEXT: vmxor.mm v0, v10, v11 +; CHECK-NEXT: ret + %head.x = insertelement poison, i1 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i1 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv32i1(i1 %x, i1 %y) { +; CHECK-LABEL: nxv32i1: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: vsetvli a2, zero, e8, m4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vmsne.vi v12, v8, 0 +; CHECK-NEXT: andi a1, a1, 1 +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vmsne.vi v13, v8, 0 +; CHECK-NEXT: vmxor.mm v0, v12, v13 +; CHECK-NEXT: ret + %head.x = insertelement poison, i1 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i1 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv64i1(i1 %x, i1 %y) { +; CHECK-LABEL: 
nxv64i1: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vmsne.vi v16, v8, 0 +; CHECK-NEXT: andi a1, a1, 1 +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vmsne.vi v17, v8, 0 +; CHECK-NEXT: vmxor.mm v0, v16, v17 +; CHECK-NEXT: ret + %head.x = insertelement poison, i1 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i1 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv1i8(i8 %x, i8 %y) { +; CHECK-LABEL: nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: ret + %head.x = insertelement poison, i8 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i8 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv2i8(i8 %x, i8 %y) { +; CHECK-LABEL: nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: ret + %head.x = insertelement poison, i8 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i8 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv4i8(i8 %x, i8 %y) { +; CHECK-LABEL: nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: ret + %head.x = insertelement poison, i8 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i8 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv8i8(i8 %x, i8 %y) { +; CHECK-LABEL: nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: ret + %head.x = insertelement poison, i8 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i8 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv16i8(i8 %x, i8 %y) { +; CHECK-LABEL: nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: ret + %head.x = insertelement poison, i8 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i8 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv32i8(i8 %x, i8 %y) { +; CHECK-LABEL: nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: ret + %head.x = insertelement poison, i8 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i8 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv64i8(i8 %x, i8 %y) { +; CHECK-LABEL: nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vadd.vx v8, 
v8, a1 +; CHECK-NEXT: ret + %head.x = insertelement poison, i8 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i8 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv1i16(i16 %x, i16 %y) { +; CHECK-LABEL: nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: ret + %head.x = insertelement poison, i16 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i16 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv2i16(i16 %x, i16 %y) { +; CHECK-LABEL: nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: ret + %head.x = insertelement poison, i16 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i16 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv4i16(i16 %x, i16 %y) { +; CHECK-LABEL: nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: ret + %head.x = insertelement poison, i16 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i16 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv8i16(i16 %x, i16 %y) { +; CHECK-LABEL: nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: ret + %head.x = insertelement poison, i16 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i16 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv16i16(i16 %x, i16 %y) { +; CHECK-LABEL: nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: ret + %head.x = insertelement poison, i16 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i16 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv32i16(i16 %x, i16 %y) { +; CHECK-LABEL: nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: ret + %head.x = insertelement poison, i16 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i16 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv1i32(i32 %x, i32 %y) { +; RV32-LABEL: nxv1i32: +; RV32: # %bb.0: +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; RV32-NEXT: vmv.v.x v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: nxv1i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: vadd.vx v8, v8, a1 +; RV64-NEXT: ret + %head.x = insertelement poison, i32 %x, i32 0 + %splat.x = 
shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i32 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv2i32(i32 %x, i32 %y) { +; RV32-LABEL: nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: vadd.vx v8, v8, a1 +; RV64-NEXT: ret + %head.x = insertelement poison, i32 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i32 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv4i32(i32 %x, i32 %y) { +; RV32-LABEL: nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: vadd.vx v8, v8, a1 +; RV64-NEXT: ret + %head.x = insertelement poison, i32 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i32 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv8i32(i32 %x, i32 %y) { +; RV32-LABEL: nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: vadd.vx v8, v8, a1 +; RV64-NEXT: ret + %head.x = insertelement poison, i32 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i32 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv16i32(i32 %x, i32 %y) { +; RV32-LABEL: nxv16i32: +; RV32: # %bb.0: +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: nxv16i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: vadd.vx v8, v8, a1 +; RV64-NEXT: ret + %head.x = insertelement poison, i32 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i32 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv1i64(i64 %x, i64 %y) { +; RV32-LABEL: nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero +; RV32-NEXT: sw a3, 4(sp) +; RV32-NEXT: sw a2, 0(sp) +; RV32-NEXT: mv a0, sp +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: ret + %head.x = insertelement poison, i64 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement 
poison, i64 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv2i64(i64 %x, i64 %y) { +; RV32-LABEL: nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero +; RV32-NEXT: sw a3, 4(sp) +; RV32-NEXT: sw a2, 0(sp) +; RV32-NEXT: mv a0, sp +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: ret + %head.x = insertelement poison, i64 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i64 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv4i64(i64 %x, i64 %y) { +; RV32-LABEL: nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero +; RV32-NEXT: sw a3, 4(sp) +; RV32-NEXT: sw a2, 0(sp) +; RV32-NEXT: mv a0, sp +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vadd.vv v8, v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: ret + %head.x = insertelement poison, i64 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i64 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv8i64(i64 %x, i64 %y) { +; RV32-LABEL: nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero +; RV32-NEXT: sw a3, 4(sp) +; RV32-NEXT: sw a2, 0(sp) +; RV32-NEXT: mv a0, sp +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: ret + %head.x = insertelement poison, i64 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i64 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = add %splat.x, %splat.y + ret %v +} + +define @nxv4f16(half %x, half %y) { +; CHECK-LABEL: nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: fadd.h fa5, fa0, fa1 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa5 +; CHECK-NEXT: ret + %head.x = insertelement poison, half %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, half %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = fadd %splat.x, %splat.y + ret %v +} + +define @nxv2f32(float %x, float %y) { +; CHECK-LABEL: nxv2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: fadd.s fa5, fa0, fa1 
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa5 +; CHECK-NEXT: ret + %head.x = insertelement poison, float %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, float %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = fadd %splat.x, %splat.y + ret %v +} + +define @nxv2f64(double %x, double %y) { +; CHECK-LABEL: nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: fadd.d fa5, fa0, fa1 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa5 +; CHECK-NEXT: ret + %head.x = insertelement poison, double %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, double %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = fadd %splat.x, %splat.y + ret %v +} + diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll new file mode 100644 index 0000000000000..d72dfa9ae21cb --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll @@ -0,0 +1,669 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV64 + +define <1 x i1> @v1i1(i1 %x, i1 %y) { +; CHECK-LABEL: v1i1: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vmsne.vi v8, v8, 0 +; CHECK-NEXT: andi a1, a1, 1 +; CHECK-NEXT: vmv.s.x v9, a1 +; CHECK-NEXT: vmsne.vi v9, v9, 0 +; CHECK-NEXT: vmxor.mm v0, v8, v9 +; CHECK-NEXT: ret + %head.x = insertelement <1 x i1> poison, i1 %x, i32 0 + %splat.x = shufflevector <1 x i1> %head.x, <1 x i1> poison, <1 x i32> zeroinitializer + %head.y = insertelement <1 x i1> poison, i1 %y, i32 0 + %splat.y = shufflevector <1 x i1> %head.y, <1 x i1> poison, <1 x i32> zeroinitializer + %v = add <1 x i1> %splat.x, %splat.y + ret <1 x i1> %v +} + +define <2 x i1> @v2i1(i1 %x, i1 %y) { +; CHECK-LABEL: v2i1: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vmsne.vi v8, v8, 0 +; CHECK-NEXT: andi a1, a1, 1 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vmsne.vi v9, v9, 0 +; CHECK-NEXT: vmxor.mm v0, v8, v9 +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vrgather.vi v9, v8, 0 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret + %head.x = insertelement <2 x i1> poison, i1 %x, i32 0 + %splat.x = shufflevector <2 x i1> %head.x, <2 x i1> poison, <2 x i32> zeroinitializer + %head.y = insertelement <2 x i1> poison, i1 %y, i32 0 + %splat.y = shufflevector <2 x i1> %head.y, <2 x i1> poison, <2 x i32> zeroinitializer + %v = add <2 x i1> %splat.x, %splat.y + ret <2 x i1> %v +} + +define <4 x i1> @v4i1(i1 %x, i1 %y) { +; CHECK-LABEL: v4i1: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vmsne.vi v8, v8, 0 +; CHECK-NEXT: andi a1, a1, 1 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vmsne.vi v9, v9, 0 +; CHECK-NEXT: vmxor.mm v0, v8, v9 +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vrgather.vi v9, v8, 0 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; 
+; CHECK-NEXT: ret
+ %head.x = insertelement <4 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <4 x i1> %head.x, <4 x i1> poison, <4 x i32> zeroinitializer
+ %head.y = insertelement <4 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <4 x i1> %head.y, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = add <4 x i1> %splat.x, %splat.y
+ ret <4 x i1> %v
+}
+
+define <8 x i1> @v8i1(i1 %x, i1 %y) {
+; CHECK-LABEL: v8i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmsne.vi v8, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vmsne.vi v9, v9, 0
+; CHECK-NEXT: vmxor.mm v0, v8, v9
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vrgather.vi v9, v8, 0
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <8 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <8 x i1> %head.x, <8 x i1> poison, <8 x i32> zeroinitializer
+ %head.y = insertelement <8 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <8 x i1> %head.y, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = add <8 x i1> %splat.x, %splat.y
+ ret <8 x i1> %v
+}
+
+define <16 x i1> @v16i1(i1 %x, i1 %y) {
+; CHECK-LABEL: v16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmsne.vi v8, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vmsne.vi v9, v9, 0
+; CHECK-NEXT: vmxor.mm v0, v8, v9
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vrgather.vi v9, v8, 0
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <16 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <16 x i1> %head.x, <16 x i1> poison, <16 x i32> zeroinitializer
+ %head.y = insertelement <16 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <16 x i1> %head.y, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = add <16 x i1> %splat.x, %splat.y
+ ret <16 x i1> %v
+}
+
+define <32 x i1> @v32i1(i1 %x, i1 %y) {
+; CHECK-LABEL: v32i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmsne.vi v10, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.v.x v8, a1
+; CHECK-NEXT: vmsne.vi v11, v8, 0
+; CHECK-NEXT: vmxor.mm v0, v10, v11
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vrgather.vi v10, v8, 0
+; CHECK-NEXT: vmsne.vi v0, v10, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <32 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <32 x i1> %head.x, <32 x i1> poison, <32 x i32> zeroinitializer
+ %head.y = insertelement <32 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <32 x i1> %head.y, <32 x i1> poison, <32 x i32> zeroinitializer
+ %v = add <32 x i1> %splat.x, %splat.y
+ ret <32 x i1> %v
+}
+
+define <64 x i1> @v64i1(i1 %x, i1 %y) {
+; CHECK-LABEL: v64i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: li a2, 64
+; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmsne.vi v12, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.v.x v8, a1
+; CHECK-NEXT: vmsne.vi v13, v8, 0
+; CHECK-NEXT: vmxor.mm v0, v12, v13
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vrgather.vi v12, v8, 0
+; CHECK-NEXT: vmsne.vi v0, v12, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <64 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <64 x i1> %head.x, <64 x i1> poison, <64 x i32> zeroinitializer
+ %head.y = insertelement <64 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <64 x i1> %head.y, <64 x i1> poison, <64 x i32> zeroinitializer
+ %v = add <64 x i1> %splat.x, %splat.y
+ ret <64 x i1> %v
+}
+
+define <1 x i8> @v1i8(i8 %x, i8 %y) {
+; CHECK-LABEL: v1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <1 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <1 x i8> %head.x, <1 x i8> poison, <1 x i32> zeroinitializer
+ %head.y = insertelement <1 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <1 x i8> %head.y, <1 x i8> poison, <1 x i32> zeroinitializer
+ %v = add <1 x i8> %splat.x, %splat.y
+ ret <1 x i8> %v
+}
+
+define <2 x i8> @v2i8(i8 %x, i8 %y) {
+; CHECK-LABEL: v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v9, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <2 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <2 x i8> %head.x, <2 x i8> poison, <2 x i32> zeroinitializer
+ %head.y = insertelement <2 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <2 x i8> %head.y, <2 x i8> poison, <2 x i32> zeroinitializer
+ %v = add <2 x i8> %splat.x, %splat.y
+ ret <2 x i8> %v
+}
+
+define <4 x i8> @v4i8(i8 %x, i8 %y) {
+; CHECK-LABEL: v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v9, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <4 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <4 x i8> %head.x, <4 x i8> poison, <4 x i32> zeroinitializer
+ %head.y = insertelement <4 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <4 x i8> %head.y, <4 x i8> poison, <4 x i32> zeroinitializer
+ %v = add <4 x i8> %splat.x, %splat.y
+ ret <4 x i8> %v
+}
+
+define <8 x i8> @v8i8(i8 %x, i8 %y) {
+; CHECK-LABEL: v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v9, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <8 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <8 x i8> %head.x, <8 x i8> poison, <8 x i32> zeroinitializer
+ %head.y = insertelement <8 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <8 x i8> %head.y, <8 x i8> poison, <8 x i32> zeroinitializer
+ %v = add <8 x i8> %splat.x, %splat.y
+ ret <8 x i8> %v
+}
+
+define <16 x i8> @v16i8(i8 %x, i8 %y) {
+; CHECK-LABEL: v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v9, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <16 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <16 x i8> %head.x, <16 x i8> poison, <16 x i32> zeroinitializer
+ %head.y = insertelement <16 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <16 x i8> %head.y, <16 x i8> poison, <16 x i32> zeroinitializer
+ %v = add <16 x i8> %splat.x, %splat.y
+ ret <16 x i8> %v
+}
+
+define <32 x i8> @v32i8(i8 %x, i8 %y) {
+; CHECK-LABEL: v32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v10, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v10, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <32 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <32 x i8> %head.x, <32 x i8> poison, <32 x i32> zeroinitializer
+ %head.y = insertelement <32 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <32 x i8> %head.y, <32 x i8> poison, <32 x i32> zeroinitializer
+ %v = add <32 x i8> %splat.x, %splat.y
+ ret <32 x i8> %v
+}
+
+define <64 x i8> @v64i8(i8 %x, i8 %y) {
+; CHECK-LABEL: v64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 64
+; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v12, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v12, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <64 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <64 x i8> %head.x, <64 x i8> poison, <64 x i32> zeroinitializer
+ %head.y = insertelement <64 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <64 x i8> %head.y, <64 x i8> poison, <64 x i32> zeroinitializer
+ %v = add <64 x i8> %splat.x, %splat.y
+ ret <64 x i8> %v
+}
+
+define <1 x i16> @v1i16(i16 %x, i16 %y) {
+; CHECK-LABEL: v1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <1 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <1 x i16> %head.x, <1 x i16> poison, <1 x i32> zeroinitializer
+ %head.y = insertelement <1 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <1 x i16> %head.y, <1 x i16> poison, <1 x i32> zeroinitializer
+ %v = add <1 x i16> %splat.x, %splat.y
+ ret <1 x i16> %v
+}
+
+define <2 x i16> @v2i16(i16 %x, i16 %y) {
+; CHECK-LABEL: v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v9, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <2 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <2 x i16> %head.x, <2 x i16> poison, <2 x i32> zeroinitializer
+ %head.y = insertelement <2 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <2 x i16> %head.y, <2 x i16> poison, <2 x i32> zeroinitializer
+ %v = add <2 x i16> %splat.x, %splat.y
+ ret <2 x i16> %v
+}
+
+define <4 x i16> @v4i16(i16 %x, i16 %y) {
+; CHECK-LABEL: v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v9, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <4 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <4 x i16> %head.x, <4 x i16> poison, <4 x i32> zeroinitializer
+ %head.y = insertelement <4 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <4 x i16> %head.y, <4 x i16> poison, <4 x i32> zeroinitializer
+ %v = add <4 x i16> %splat.x, %splat.y
+ ret <4 x i16> %v
+}
+
+define <8 x i16> @v8i16(i16 %x, i16 %y) {
+; CHECK-LABEL: v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v9, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <8 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <8 x i16> %head.x, <8 x i16> poison, <8 x i32> zeroinitializer
+ %head.y = insertelement <8 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <8 x i16> %head.y, <8 x i16> poison, <8 x i32> zeroinitializer
+ %v = add <8 x i16> %splat.x, %splat.y
+ ret <8 x i16> %v
+}
+
+define <16 x i16> @v16i16(i16 %x, i16 %y) {
+; CHECK-LABEL: v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v10, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v10, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <16 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <16 x i16> %head.x, <16 x i16> poison, <16 x i32> zeroinitializer
+ %head.y = insertelement <16 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <16 x i16> %head.y, <16 x i16> poison, <16 x i32> zeroinitializer
+ %v = add <16 x i16> %splat.x, %splat.y
+ ret <16 x i16> %v
+}
+
+define <32 x i16> @v32i16(i16 %x, i16 %y) {
+; CHECK-LABEL: v32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v12, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v12, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <32 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <32 x i16> %head.x, <32 x i16> poison, <32 x i32> zeroinitializer
+ %head.y = insertelement <32 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <32 x i16> %head.y, <32 x i16> poison, <32 x i32> zeroinitializer
+ %v = add <32 x i16> %splat.x, %splat.y
+ ret <32 x i16> %v
+}
+
+define <1 x i32> @v1i32(i32 %x, i32 %y) {
+; RV32-LABEL: v1i32:
+; RV32: # %bb.0:
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT: vmv.s.x v8, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v1i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT: vmv.s.x v8, a0
+; RV64-NEXT: vadd.vx v8, v8, a1
+; RV64-NEXT: ret
+ %head.x = insertelement <1 x i32> poison, i32 %x, i32 0
+ %splat.x = shufflevector <1 x i32> %head.x, <1 x i32> poison, <1 x i32> zeroinitializer
+ %head.y = insertelement <1 x i32> poison, i32 %y, i32 0
+ %splat.y = shufflevector <1 x i32> %head.y, <1 x i32> poison, <1 x i32> zeroinitializer
+ %v = add <1 x i32> %splat.x, %splat.y
+ ret <1 x i32> %v
+}
+
+define <2 x i32> @v2i32(i32 %x, i32 %y) {
+; RV32-LABEL: v2i32:
+; RV32: # %bb.0:
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT: vmv.v.x v8, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v2i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vadd.vx v9, v8, a1
+; RV64-NEXT: vrgather.vi v8, v9, 0
+; RV64-NEXT: ret
+ %head.x = insertelement <2 x i32> poison, i32 %x, i32 0
+ %splat.x = shufflevector <2 x i32> %head.x, <2 x i32> poison, <2 x i32> zeroinitializer
+ %head.y = insertelement <2 x i32> poison, i32 %y, i32 0
+ %splat.y = shufflevector <2 x i32> %head.y, <2 x i32> poison, <2 x i32> zeroinitializer
+ %v = add <2 x i32> %splat.x, %splat.y
+ ret <2 x i32> %v
+}
+
+define <4 x i32> @v4i32(i32 %x, i32 %y) {
+; RV32-LABEL: v4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vmv.v.x v8, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vadd.vx v9, v8, a1
+; RV64-NEXT: vrgather.vi v8, v9, 0
+; RV64-NEXT: ret
+ %head.x = insertelement <4 x i32> poison, i32 %x, i32 0
+ %splat.x = shufflevector <4 x i32> %head.x, <4 x i32> poison, <4 x i32> zeroinitializer
+ %head.y = insertelement <4 x i32> poison, i32 %y, i32 0
+ %splat.y = shufflevector <4 x i32> %head.y, <4 x i32> poison, <4 x i32> zeroinitializer
+ %v = add <4 x i32> %splat.x, %splat.y
+ ret <4 x i32> %v
+}
+
+define <8 x i32> @v8i32(i32 %x, i32 %y) {
+; RV32-LABEL: v8i32:
+; RV32: # %bb.0:
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vmv.v.x v8, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v8i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vadd.vx v10, v8, a1
+; RV64-NEXT: vrgather.vi v8, v10, 0
+; RV64-NEXT: ret
+ %head.x = insertelement <8 x i32> poison, i32 %x, i32 0
+ %splat.x = shufflevector <8 x i32> %head.x, <8 x i32> poison, <8 x i32> zeroinitializer
+ %head.y = insertelement <8 x i32> poison, i32 %y, i32 0
+ %splat.y = shufflevector <8 x i32> %head.y, <8 x i32> poison, <8 x i32> zeroinitializer
+ %v = add <8 x i32> %splat.x, %splat.y
+ ret <8 x i32> %v
+}
+
+define <16 x i32> @v16i32(i32 %x, i32 %y) {
+; RV32-LABEL: v16i32:
+; RV32: # %bb.0:
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vmv.v.x v8, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v16i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vadd.vx v12, v8, a1
+; RV64-NEXT: vrgather.vi v8, v12, 0
+; RV64-NEXT: ret
+ %head.x = insertelement <16 x i32> poison, i32 %x, i32 0
+ %splat.x = shufflevector <16 x i32> %head.x, <16 x i32> poison, <16 x i32> zeroinitializer
+ %head.y = insertelement <16 x i32> poison, i32 %y, i32 0
+ %splat.y = shufflevector <16 x i32> %head.y, <16 x i32> poison, <16 x i32> zeroinitializer
+ %v = add <16 x i32> %splat.x, %splat.y
+ ret <16 x i32> %v
+}
+
+define <1 x i64> @v1i64(i64 %x, i64 %y) {
+; RV32-LABEL: v1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vadd.vv v8, v8, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vmv.s.x v8, a0
+; RV64-NEXT: ret
+ %head.x = insertelement <1 x i64> poison, i64 %x, i32 0
+ %splat.x = shufflevector <1 x i64> %head.x, <1 x i64> poison, <1 x i32> zeroinitializer
+ %head.y = insertelement <1 x i64> poison, i64 %y, i32 0
+ %splat.y = shufflevector <1 x i64> %head.y, <1 x i64> poison, <1 x i32> zeroinitializer
+ %v = add <1 x i64> %splat.x, %splat.y
+ ret <1 x i64> %v
+}
+
+define <2 x i64> @v2i64(i64 %x, i64 %y) {
+; RV32-LABEL: v2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vadd.vv v9, v8, v9
+; RV32-NEXT: vrgather.vi v8, v9, 0
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: ret
+ %head.x = insertelement <2 x i64> poison, i64 %x, i32 0
+ %splat.x = shufflevector <2 x i64> %head.x, <2 x i64> poison, <2 x i32> zeroinitializer
+ %head.y = insertelement <2 x i64> poison, i64 %y, i32 0
+ %splat.y = shufflevector <2 x i64> %head.y, <2 x i64> poison, <2 x i32> zeroinitializer
+ %v = add <2 x i64> %splat.x, %splat.y
+ ret <2 x i64> %v
+}
+
+define <4 x i64> @v4i64(i64 %x, i64 %y) {
+; RV32-LABEL: v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vadd.vv v10, v8, v10
+; RV32-NEXT: vrgather.vi v8, v10, 0
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: ret
+ %head.x = insertelement <4 x i64> poison, i64 %x, i32 0
+ %splat.x = shufflevector <4 x i64> %head.x, <4 x i64> poison, <4 x i32> zeroinitializer
+ %head.y = insertelement <4 x i64> poison, i64 %y, i32 0
+ %splat.y = shufflevector <4 x i64> %head.y, <4 x i64> poison, <4 x i32> zeroinitializer
+ %v = add <4 x i64> %splat.x, %splat.y
+ ret <4 x i64> %v
+}
+
+define <8 x i64> @v8i64(i64 %x, i64 %y) {
+; RV32-LABEL: v8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vadd.vv v12, v8, v12
+; RV32-NEXT: vrgather.vi v8, v12, 0
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: ret
+ %head.x = insertelement <8 x i64> poison, i64 %x, i32 0
+ %splat.x = shufflevector <8 x i64> %head.x, <8 x i64> poison, <8 x i32> zeroinitializer
+ %head.y = insertelement <8 x i64> poison, i64 %y, i32 0
+ %splat.y = shufflevector <8 x i64> %head.y, <8 x i64> poison, <8 x i32> zeroinitializer
+ %v = add <8 x i64> %splat.x, %splat.y
+ ret <8 x i64> %v
+}
+
+define <4 x half> @v4f16(half %x, half %y) {
+; CHECK-LABEL: v4f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fadd.h fa5, fa0, fa1
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vfmv.v.f v8, fa5
+; CHECK-NEXT: ret
+ %head.x = insertelement <4 x half> poison, half %x, i32 0
+ %splat.x = shufflevector <4 x half> %head.x, <4 x half> poison, <4 x i32> zeroinitializer
+ %head.y = insertelement <4 x half> poison, half %y, i32 0
+ %splat.y = shufflevector <4 x half> %head.y, <4 x half> poison, <4 x i32> zeroinitializer
+ %v = fadd <4 x half> %splat.x, %splat.y
+ ret <4 x half> %v
+}
+
+define <2 x float> @v2f32(float %x, float %y) {
+; CHECK-LABEL: v2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fadd.s fa5, fa0, fa1
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vfmv.v.f v8, fa5
+; CHECK-NEXT: ret
+ %head.x = insertelement <2 x float> poison, float %x, i32 0
+ %splat.x = shufflevector <2 x float> %head.x, <2 x float> poison, <2 x i32> zeroinitializer
+ %head.y = insertelement <2 x float> poison, float %y, i32 0
+ %splat.y = shufflevector <2 x float> %head.y, <2 x float> poison, <2 x i32> zeroinitializer
+ %v = fadd <2 x float> %splat.x, %splat.y
+ ret <2 x float> %v
+}
+
+define <1 x double> @v2f64(double %x, double %y) {
+; CHECK-LABEL: v2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fadd.d fa5, fa0, fa1
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa5
+; CHECK-NEXT: ret
+ %head.x = insertelement <1 x double> poison, double %x, i32 0
+ %splat.x = shufflevector <1 x double> %head.x, <1 x double> poison, <1 x i32> zeroinitializer
+ %head.y = insertelement <1 x double> poison, double %y, i32 0
+ %splat.y = shufflevector <1 x double> %head.y, <1 x double> poison, <1 x i32> zeroinitializer
+ %v = fadd <1 x double> %splat.x, %splat.y
+ ret <1 x double> %v
+}
+
diff --git a/llvm/test/CodeGen/SystemZ/dag-combine-03.ll b/llvm/test/CodeGen/SystemZ/dag-combine-03.ll
index 3625ac68b327f..15750f54ed054 100644
--- a/llvm/test/CodeGen/SystemZ/dag-combine-03.ll
+++ b/llvm/test/CodeGen/SystemZ/dag-combine-03.ll
@@ -24,11 +24,7 @@ define void @fun(i64 %a0) {
; CHECK-NEXT: cgr %r0, %r2
; CHECK-NEXT: lhi %r0, 0
; CHECK-NEXT: lochie %r0, 1
-; CHECK-NEXT: vlvgp %v0, %r3, %r3
-; CHECK-NEXT: vlvgp %v1, %r0, %r0
-; CHECK-NEXT: vx %v0, %v0, %v1
-; CHECK-NEXT: vlgvf %r0, %v0, 1
-; CHECK-NEXT: chi %r0, 0
+; CHECK-NEXT: xr %r0, %r3
; CHECK-NEXT: locghie %r1, 0
; CHECK-NEXT: j .LBB0_1
entry:
diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index 532dbde89c826..f76569630dd10 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -27,12 +27,11 @@ define dso_local void @m() local_unnamed_addr #1 {
; CHECK-NEXT: vlvgp %v0, %r0, %r1
; CHECK-NEXT: vlvgf %v0, %r0, 0
; CHECK-NEXT: vlvgf %v0, %r0, 2
-; CHECK-NEXT: vlvgp %v2, %r1, %r1
+; CHECK-NEXT: vgbm %v2, 30583
+; CHECK-NEXT: vn %v0, %v0, %v2
+; CHECK-NEXT: vn %v1, %v1, %v2
+; CHECK-NEXT: vlvgp %v2, %r0, %r0
; CHECK-NEXT: vrepf %v2, %v2, 1
-; CHECK-NEXT: vgbm %v3, 30583
-; CHECK-NEXT: vn %v0, %v0, %v3
-; CHECK-NEXT: vn %v1, %v1, %v3
-; CHECK-NEXT: vn %v2, %v2, %v3
; CHECK-NEXT: vrepif %v3, 127
; CHECK-NEXT: vchlf %v1, %v1, %v3
; CHECK-NEXT: vlgvf %r13, %v1, 0
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll
index 6d9dc46e1caa0..a60c804bea1df 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll
@@ -11,13 +11,10 @@ define arm_aapcs_vfpcc float @vctpi32(ptr %0, i32 %1) {
; CHECK-LABEL: vctpi32:
; CHECK: @ %bb.0:
; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: mvn r3, #31
-; CHECK-NEXT: vmov.32 q2[0], r0
-; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: subs r2, r1, #1
-; CHECK-NEXT: vadd.i32 q2, q2, r3
+; CHECK-NEXT: movs r4, #0
+; CHECK-NEXT: subs r0, #32
; CHECK-NEXT: vidup.u32 q1, r4, #8
-; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: dlstp.32 lr, r2
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index ad63e9ee9ff4c..310f964740208 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -460,16 +460,15 @@ define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32*
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: add.w r7, r6, r7, lsr #1
-; CHECK-NEXT: vdup.32 q1, r9
+; CHECK-NEXT: vdup.32 q0, r9
; CHECK-NEXT: bic r7, r7, #3
-; CHECK-NEXT: vshl.i32 q3, q1, #3
+; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: subs r7, #4
; CHECK-NEXT: add.w r10, r6, r7, lsr #2
; CHECK-NEXT: adr r7, .LCPI9_0
; CHECK-NEXT: adr r6, .LCPI9_1
; CHECK-NEXT: vldrw.u32 q2, [r7]
-; CHECK-NEXT: vldrw.u32 q0, [r6]
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [r6]
; CHECK-NEXT: .LBB9_1: @ %for.cond8.preheader.us.us.preheader
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB9_2 Depth 2
@@ -481,33 +480,32 @@ define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32*
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
-; CHECK-NEXT: vdup.32 q5, r7
-; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT: vshl.i32 q5, q5, #2
-; CHECK-NEXT: vmov q6, q1
-; CHECK-NEXT: vadd.i32 q5, q5, r0
+; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload
+; CHECK-NEXT: add.w r4, r0, r7, lsl #2
+; CHECK-NEXT: lsl.w r6, r9, #3
; CHECK-NEXT: dls lr, r10
-; CHECK-NEXT: vmov.i32 q4, #0x0
-; CHECK-NEXT: vadd.i32 q5, q5, q0
-; CHECK-NEXT: vmlas.i32 q6, q2, r5
+; CHECK-NEXT: vmov.i32 q3, #0x0
+; CHECK-NEXT: vmlas.i32 q6, q1, r5
+; CHECK-NEXT: vadd.i32 q4, q2, r4
+; CHECK-NEXT: vdup.32 q5, r6
; CHECK-NEXT: .LBB9_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vadd.i32 q7, q6, q3
+; CHECK-NEXT: vadd.i32 q7, q6, q5
; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2]
-; CHECK-NEXT: vldrw.u32 q6, [q5, #32]!
+; CHECK-NEXT: vldrw.u32 q6, [q4, #32]!
; CHECK-NEXT: vmul.i32 q0, q0, q6
; CHECK-NEXT: vmov q6, q7
-; CHECK-NEXT: vadd.i32 q4, q0, q4
+; CHECK-NEXT: vadd.i32 q3, q0, q3
; CHECK-NEXT: le lr, .LBB9_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2
-; CHECK-NEXT: add.w r4, r5, r11
+; CHECK-NEXT: add.w r6, r5, r11
; CHECK-NEXT: adds r5, #1
-; CHECK-NEXT: vaddv.u32 r6, q4
+; CHECK-NEXT: vaddv.u32 r4, q3
; CHECK-NEXT: cmp r5, r9
-; CHECK-NEXT: str.w r6, [r2, r4, lsl #2]
+; CHECK-NEXT: str.w r4, [r2, r6, lsl #2]
; CHECK-NEXT: bne .LBB9_2
; CHECK-NEXT: @ %bb.5: @ %for.cond4.for.cond.cleanup6_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB9_1 Depth=1
@@ -522,15 +520,15 @@ define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32*
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.7:
; CHECK-NEXT: .LCPI9_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 4 @ 0x4
-; CHECK-NEXT: .long 6 @ 0x6
-; CHECK-NEXT: .LCPI9_1:
; CHECK-NEXT: .long 4294967264 @ 0xffffffe0
; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
; CHECK-NEXT: .long 4294967280 @ 0xfffffff0
; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
+; CHECK-NEXT: .LCPI9_1:
+; CHECK-NEXT: .long 0 @ 0x0
+; CHECK-NEXT: .long 2 @ 0x2
+; CHECK-NEXT: .long 4 @ 0x4
+; CHECK-NEXT: .long 6 @ 0x6
for.cond8.preheader.us.us.preheader.preheader: ; preds = %entry
%0 = add i32 %l, -1
@@ -607,11 +605,11 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: strd r0, r2, [sp, #24] @ 8-byte Folded Spill
; CHECK-NEXT: cmp r3, #0
-; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: mov r0, r3
; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrne r0, [sp, #136]
-; CHECK-NEXT: cmpne r0, #0
+; CHECK-NEXT: ldrne.w r12, [sp, #136]
+; CHECK-NEXT: cmpne.w r12, #0
; CHECK-NEXT: bne .LBB10_2
; CHECK-NEXT: .LBB10_1: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #32
@@ -619,24 +617,23 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader
-; CHECK-NEXT: ldr.w r12, [sp, #140]
+; CHECK-NEXT: ldr.w r11, [sp, #140]
; CHECK-NEXT: movs r7, #1
-; CHECK-NEXT: mov.w r11, #0
+; CHECK-NEXT: lsl.w r6, r12, #1
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: bic r2, r12, #3
-; CHECK-NEXT: subs r3, r2, #4
+; CHECK-NEXT: bic r0, r11, #3
+; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: subs r3, r0, #4
+; CHECK-NEXT: vdup.32 q1, r12
+; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: add.w r0, r7, r3, lsr #2
-; CHECK-NEXT: ldr r7, [sp, #136]
; CHECK-NEXT: adr r3, .LCPI10_0
-; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT: lsl.w r0, r12, #1
-; CHECK-NEXT: vdup.32 q1, r7
+; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: lsl.w r0, r11, #1
; CHECK-NEXT: vldrw.u32 q2, [r3]
-; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
+; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT: lsls r6, r7, #1
-; CHECK-NEXT: vshl.i32 q3, q1, #2
-; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: b .LBB10_5
; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader
@@ -647,15 +644,15 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: b .LBB10_15
; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
+; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT: add r7, r11
+; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: add r2, r0
+; CHECK-NEXT: str r2, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: add r11, r12
-; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: add r3, r0
-; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: adds r3, #1
-; CHECK-NEXT: cmp r3, r0
+; CHECK-NEXT: adds r2, #1
+; CHECK-NEXT: cmp r2, r0
; CHECK-NEXT: beq .LBB10_1
; CHECK-NEXT: .LBB10_5: @ %for.cond1.preheader.us
; CHECK-NEXT: @ =>This Loop Header: Depth=1
@@ -663,9 +660,9 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
; CHECK-NEXT: @ Child Loop BB10_15 Depth 2
-; CHECK-NEXT: mul r5, r3, r7
-; CHECK-NEXT: cmp.w r12, #0
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: mul r5, r2, r12
+; CHECK-NEXT: cmp.w r11, #0
+; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: beq .LBB10_3
; CHECK-NEXT: @ %bb.6: @ %for.cond5.preheader.us.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
@@ -676,7 +673,7 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r8, r5
; CHECK-NEXT: add.w r8, r8, #1
-; CHECK-NEXT: cmp r8, r7
+; CHECK-NEXT: cmp r8, r12
; CHECK-NEXT: strh.w r10, [r3, r0, lsl #1]
; CHECK-NEXT: beq .LBB10_4
; CHECK-NEXT: .LBB10_8: @ %for.cond5.preheader.us.us
@@ -684,7 +681,7 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
-; CHECK-NEXT: cmp.w r12, #3
+; CHECK-NEXT: cmp.w r11, #3
; CHECK-NEXT: bhi .LBB10_10
; CHECK-NEXT: @ %bb.9: @ in Loop: Header=BB10_8 Depth=2
; CHECK-NEXT: movs r4, #0
@@ -692,9 +689,11 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: b .LBB10_13
; CHECK-NEXT: .LBB10_10: @ %vector.ph
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: lsl.w r0, r12, #2
; CHECK-NEXT: vmov q5, q1
-; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: vdup.32 q4, r0
+; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q3, #0x0
; CHECK-NEXT: vmlas.i32 q5, q2, r8
; CHECK-NEXT: dls lr, r0
; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
@@ -702,27 +701,26 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vadd.i32 q6, q5, q3
+; CHECK-NEXT: vadd.i32 q6, q5, q4
; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1]
; CHECK-NEXT: vldrh.s32 q5, [r3], #8
; CHECK-NEXT: vmul.i32 q5, q7, q5
-; CHECK-NEXT: vadd.i32 q4, q5, q4
+; CHECK-NEXT: vadd.i32 q3, q5, q3
; CHECK-NEXT: vmov q5, q6
; CHECK-NEXT: le lr, .LBB10_11
; CHECK-NEXT: @ %bb.12: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: vaddv.u32 r10, q4
-; CHECK-NEXT: cmp r2, r12
-; CHECK-NEXT: mov r4, r2
+; CHECK-NEXT: ldr r4, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: vaddv.u32 r10, q3
+; CHECK-NEXT: cmp r4, r11
; CHECK-NEXT: beq .LBB10_7
; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: mla r3, r7, r4, r8
-; CHECK-NEXT: add.w r0, r11, r4
-; CHECK-NEXT: ldr r7, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT: sub.w lr, r12, r4
-; CHECK-NEXT: add.w r9, r7, r0, lsl #1
-; CHECK-NEXT: ldr r7, [sp, #136]
+; CHECK-NEXT: mla r3, r12, r4, r8
+; CHECK-NEXT: adds r0, r7, r4
+; CHECK-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: sub.w lr, r11, r4
+; CHECK-NEXT: add.w r9, r2, r0, lsl #1
; CHECK-NEXT: add.w r3, r1, r3, lsl #1
; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
@@ -867,12 +865,11 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: add.w r8, r7, #10
; CHECK-NEXT: adr r7, .LCPI11_0
; CHECK-NEXT: ldr r1, [sp, #96]
-; CHECK-NEXT: vdup.32 q0, r2
-; CHECK-NEXT: vldrw.u32 q1, [r7]
+; CHECK-NEXT: vldrw.u32 q0, [r7]
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: mov.w r10, #0
-; CHECK-NEXT: mov.w r9, #6
+; CHECK-NEXT: movs r3, #6
; CHECK-NEXT: movs r6, #11
-; CHECK-NEXT: vshl.i32 q0, q0, #2
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: .LBB11_1: @ %for.body10.i
; CHECK-NEXT: @ =>This Loop Header: Depth=1
@@ -881,7 +878,7 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: @ Child Loop BB11_4 Depth 4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: movs r7, #0
-; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: str r5, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB11_2: @ %for.cond22.preheader.i
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
@@ -895,7 +892,7 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: @ => This Loop Header: Depth=3
; CHECK-NEXT: @ Child Loop BB11_4 Depth 4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
-; CHECK-NEXT: dls lr, r9
+; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov.w r11, #4
; CHECK-NEXT: .LBB11_4: @ %for.body78.us.i
@@ -906,11 +903,13 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: mul r4, r11, r6
; CHECK-NEXT: vdup.32 q3, r5
+; CHECK-NEXT: lsl.w r9, r2, #2
; CHECK-NEXT: vdup.32 q2, r7
-; CHECK-NEXT: vadd.i32 q4, q1, r4
+; CHECK-NEXT: vadd.i32 q4, q0, r4
+; CHECK-NEXT: vdup.32 q1, r9
; CHECK-NEXT: vmla.i32 q3, q4, r2
; CHECK-NEXT: adds r4, #113
-; CHECK-NEXT: vadd.i32 q4, q1, r4
+; CHECK-NEXT: vadd.i32 q4, q0, r4
; CHECK-NEXT: mov r4, r8
; CHECK-NEXT: vmla.i32 q2, q4, r2
; CHECK-NEXT: .LBB11_5: @ %vector.body
@@ -920,8 +919,8 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4
; CHECK-NEXT: @ => This Inner Loop Header: Depth=5
; CHECK-NEXT: vldrb.s32 q6, [r0, q2]
-; CHECK-NEXT: vadd.i32 q5, q2, q0
-; CHECK-NEXT: vadd.i32 q4, q3, q0
+; CHECK-NEXT: vadd.i32 q5, q2, q1
+; CHECK-NEXT: vadd.i32 q4, q3, q1
; CHECK-NEXT: subs r4, #4
; CHECK-NEXT: vadd.i32 q2, q6, r2
; CHECK-NEXT: vldrb.s32 q6, [r1, q3]
@@ -941,12 +940,13 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: bne .LBB11_3
; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup26.i
; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=2
+; CHECK-NEXT: ldr r5, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: adds r7, #1
-; CHECK-NEXT: cmp r7, r3
+; CHECK-NEXT: cmp r7, r5
; CHECK-NEXT: bne .LBB11_2
; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup20.i
; CHECK-NEXT: @ in Loop: Header=BB11_1 Depth=1
-; CHECK-NEXT: ldr r5, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload
; CHECK-NEXT: ldr r7, [sp, #148]
; CHECK-NEXT: adds r5, #1
; CHECK-NEXT: cmp r5, r7
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
index 8f969b8ad4c61..64057470be7fc 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
@@ -245,9 +245,9 @@ entry:
define arm_aapcs_vfpcc void @ptr_v4i16_dup(i32 %v, <4 x ptr> %offs) {
; CHECK-LABEL: ptr_v4i16_dup:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.32 q1, r0
+; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: vmovlb.u16 q1, q1
+; CHECK-NEXT: vdup.32 q1, r0
; CHECK-NEXT: vstrh.32 q1, [r1, q0]
; CHECK-NEXT: bx lr
entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-selectcc.ll b/llvm/test/CodeGen/Thumb2/mve-selectcc.ll
index 63c1d8a975877..1c4255f850550 100644
--- a/llvm/test/CodeGen/Thumb2/mve-selectcc.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-selectcc.ll
@@ -210,19 +210,21 @@ define i32 @e() {
; CHECK-NEXT: adr r0, .LCPI14_0
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: movs r0, #4
+; CHECK-NEXT: mov.w r12, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: .LBB14_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r1, #4
-; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r12
; CHECK-NEXT: cmp r1, #8
; CHECK-NEXT: csetm r2, eq
-; CHECK-NEXT: subs.w r3, r1, #8
+; CHECK-NEXT: subs.w r0, r1, #8
+; CHECK-NEXT: mvn.w r3, r2
; CHECK-NEXT: vdup.32 q2, r2
-; CHECK-NEXT: csel r1, r1, r3, ne
-; CHECK-NEXT: vbic q1, q1, q2
+; CHECK-NEXT: vdup.32 q3, r3
; CHECK-NEXT: vand q2, q2, q0
+; CHECK-NEXT: vand q1, q1, q3
+; CHECK-NEXT: csel r1, r1, r0, ne
; CHECK-NEXT: vorr q1, q2, q1
; CHECK-NEXT: b .LBB14_1
; CHECK-NEXT: .p2align 4
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
index 217caeebe6335..4445f0e84dc8f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
@@ -774,8 +774,9 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext16_02468101214_0ext(<8 x i16> %src1, i16 %src2) {
; CHECK-LABEL: zext16_02468101214_0ext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.32 q1, r0
-; CHECK-NEXT: vmullb.u16 q0, q0, q1
+; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: uxth r0, r0
+; CHECK-NEXT: vmul.i32 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32>
@@ -790,8 +791,9 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext16_0ext_02468101214(<8 x i16> %src1, i16 %src2) {
; CHECK-LABEL: zext16_0ext_02468101214:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.32 q1, r0
-; CHECK-NEXT: vmullb.u16 q0, q1, q0
+; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: uxth r0, r0
+; CHECK-NEXT: vmul.i32 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32>
@@ -840,9 +842,9 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext16_13579111315_0ext(<8 x i16> %src1, i16 %src2) {
; CHECK-LABEL: zext16_13579111315_0ext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.32 q1, r0
-; CHECK-NEXT: vrev32.16 q0, q0
-; CHECK-NEXT: vmullb.u16 q0, q0, q1
+; CHECK-NEXT: vmovlt.u16 q0, q0
+; CHECK-NEXT: uxth r0, r0
+; CHECK-NEXT: vmul.i32 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32>
@@ -857,9 +859,9 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext16_0ext_13579111315(<8 x i16> %src1, i16 %src2) {
; CHECK-LABEL: zext16_0ext_13579111315:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vrev32.16 q0, q0
-; CHECK-NEXT: vdup.32 q1, r0
-; CHECK-NEXT: vmullb.u16 q0, q1, q0
+; CHECK-NEXT: vmovlt.u16 q0, q0
+; CHECK-NEXT: uxth r0, r0
+; CHECK-NEXT: vmul.i32 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32>
diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-23634.ll b/llvm/test/CodeGen/X86/avx512-bugfix-23634.ll
index ee20747e23dcc..e393acfa9fdd8 100644
--- a/llvm/test/CodeGen/X86/avx512-bugfix-23634.ll
+++ b/llvm/test/CodeGen/X86/avx512-bugfix-23634.ll
@@ -8,16 +8,17 @@ define void @f_fu(ptr %ret, ptr %aa, float %b) {
; CHECK-LABEL: f_fu:
; CHECK: ## %bb.0: ## %allocas
; CHECK-NEXT: vcvttss2si %xmm0, %eax
-; CHECK-NEXT: vpbroadcastd %eax, %zmm0
-; CHECK-NEXT: vcvttps2dq (%rsi), %zmm1
-; CHECK-NEXT: vpsrld $31, %zmm0, %zmm2
-; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm2
-; CHECK-NEXT: vpsrad $1, %zmm2, %zmm2
-; CHECK-NEXT: movw $-21846, %ax ## imm = 0xAAAA
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovdqa32 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vcvttps2dq (%rsi), %zmm0
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shrl $31, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: sarl %ecx
+; CHECK-NEXT: movw $-21846, %dx ## imm = 0xAAAA
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqa32 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k1}
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vpbroadcastd %ecx, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0
; CHECK-NEXT: vmovups %zmm0, (%rdi)
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll
index 6c266be808eaf..d323178532d03 100644
--- a/llvm/test/CodeGen/X86/combine-bitselect.ll
+++ b/llvm/test/CodeGen/X86/combine-bitselect.ll
@@ -195,24 +195,35 @@ define <2 x i64> @bitselect_v2i64_broadcast_rrr(<2 x i64> %a0, <2 x i64> %a1, i6
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; SSE-NEXT: notq %rdi
+; SSE-NEXT: movq %rdi, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: por %xmm3, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v2i64_broadcast_rrr:
; XOP: # %bb.0:
; XOP-NEXT: vmovq %rdi, %xmm2
; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; XOP-NEXT: notq %rdi
+; XOP-NEXT: vmovq %rdi, %xmm3
+; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; XOP-NEXT: vpand %xmm2, %xmm0, %xmm0
+; XOP-NEXT: vpand %xmm3, %xmm1, %xmm1
+; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX1-LABEL: bitselect_v2i64_broadcast_rrr:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %rdi, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT: notq %rdi
+; AVX1-NEXT: vmovq %rdi, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -220,8 +231,11 @@ define <2 x i64> @bitselect_v2i64_broadcast_rrr(<2 x i64> %a0, <2 x i64> %a1, i6
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %rdi, %xmm2
; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX2-NEXT: notq %rdi
+; AVX2-NEXT: vmovq %rdi, %xmm3
+; AVX2-NEXT: vpbroadcastq %xmm3, %xmm3
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpandn %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
@@ -229,15 +243,21 @@ define <2 x i64> @bitselect_v2i64_broadcast_rrr(<2 x i64> %a0, <2 x i64> %a1, i6
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq %rdi, %xmm2
; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512F-NEXT: notq %rdi
+; AVX512F-NEXT: vmovq %rdi, %xmm3
+; AVX512F-NEXT: vpbroadcastq %xmm3, %xmm3
; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpandn %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bitselect_v2i64_broadcast_rrr:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %rdi, %xmm2
-; AVX512VL-NEXT: vpternlogq $226, %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: notq %rdi
+; AVX512VL-NEXT: vpbroadcastq %rdi, %xmm3
+; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpternlogq $248, %xmm3, %xmm1, %xmm0
; AVX512VL-NEXT: retq
%1 = insertelement <2 x i64> undef, i64 %a2, i32 0
%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -252,38 +272,77 @@ define <2 x i64> @bitselect_v2i64_broadcast_rrm(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) {
; SSE-LABEL: bitselect_v2i64_broadcast_rrm:
; SSE: # %bb.0:
-; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; SSE-NEXT: notq %rax
+; SSE-NEXT: movq %rax, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: por %xmm3, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v2i64_broadcast_rrm:
; XOP: # %bb.0:
-; XOP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
-; XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; XOP-NEXT: movq (%rdi), %rax
+; XOP-NEXT: vmovq %rax, %xmm2
+; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; XOP-NEXT: notq %rax
+; XOP-NEXT: vmovq %rax, %xmm3
+; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; XOP-NEXT: vpand %xmm2, %xmm0, %xmm0
+; XOP-NEXT: vpand %xmm3, %xmm1, %xmm1
+; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX-LABEL: bitselect_v2i64_broadcast_rrm:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vandnps %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: bitselect_v2i64_broadcast_rrm:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movq (%rdi), %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT: notq %rax
+; AVX1-NEXT: vmovq %rax, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: bitselect_v2i64_broadcast_rrm:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX2-NEXT: notq %rax
+; AVX2-NEXT: vmovq %rax, %xmm3
+; AVX2-NEXT: vpbroadcastq %xmm3, %xmm3
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: bitselect_v2i64_broadcast_rrm:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
-; AVX512F-NEXT: vandps %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vandnps %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: movq (%rdi), %rax
+; AVX512F-NEXT: vmovq %rax, %xmm2
+; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512F-NEXT: notq %rax
+; AVX512F-NEXT: vmovq %rax, %xmm3
+; AVX512F-NEXT: vpbroadcastq %xmm3, %xmm3
+; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bitselect_v2i64_broadcast_rrm:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpternlogq $228, (%rdi){1to2}, %xmm1, %xmm0
+; AVX512VL-NEXT: movq (%rdi), %rax
+; AVX512VL-NEXT: vpbroadcastq %rax, %xmm2
+; AVX512VL-NEXT: notq %rax
+; AVX512VL-NEXT: vpbroadcastq %rax, %xmm3
+; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpternlogq $248, %xmm3, %xmm1, %xmm0
; AVX512VL-NEXT: retq
%a2 = load i64, ptr %p2
%1 = insertelement <2 x i64> undef, i64 %a2, i32 0
@@ -510,13 +569,15 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; SSE-NEXT: notq %rdi
+; SSE-NEXT: movq %rdi, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: pandn %xmm3, %xmm5
-; SSE-NEXT: por %xmm5, %xmm1
-; SSE-NEXT: pandn %xmm2, %xmm4
-; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: pand %xmm5, %xmm3
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm5
+; SSE-NEXT: por %xmm5, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v4i64_broadcast_rrr:
@@ -524,7 +585,13 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
; XOP-NEXT: vmovq %rdi, %xmm2
; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOP-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; XOP-NEXT: notq %rdi
+; XOP-NEXT: vmovq %rdi, %xmm3
+; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; XOP-NEXT: vandps %ymm2, %ymm0, %ymm0
+; XOP-NEXT: vandps %ymm3, %ymm1, %ymm1
+; XOP-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: bitselect_v4i64_broadcast_rrr:
@@ -532,8 +599,12 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
; AVX1-NEXT: vmovq %rdi, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; AVX1-NEXT: notq %rdi
+; AVX1-NEXT: vmovq %rdi, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -541,8 +612,11 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %rdi, %xmm2
; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX2-NEXT: notq %rdi
+; AVX2-NEXT: vmovq %rdi, %xmm3
+; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
@@ -550,15 +624,21 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq %rdi, %xmm2
; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX512F-NEXT: notq %rdi
+; AVX512F-NEXT: vmovq %rdi, %xmm3
+; AVX512F-NEXT: vpbroadcastq %xmm3, %ymm3
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrr:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %rdi, %ymm2
-; AVX512VL-NEXT: vpternlogq $226, %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT: notq %rdi
+; AVX512VL-NEXT: vpbroadcastq %rdi, %ymm3
+; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpternlogq $248, %ymm3, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%1 = insertelement <4 x i64> undef, i64 %a2, i32 0
%2 = shufflevector <4 x i64> %1, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -573,42 +653,84 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrm(<4 x i64> %a0, <4 x i64> %a1, ptr %p2) {
; SSE-LABEL: bitselect_v4i64_broadcast_rrm:
; SSE: # %bb.0:
-; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq %rax, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; SSE-NEXT: notq %rax
+; SSE-NEXT: movq %rax, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: pandn %xmm3, %xmm5
-; SSE-NEXT: por %xmm5, %xmm1
-; SSE-NEXT: pandn %xmm2, %xmm4
-; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: pand %xmm5, %xmm3
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm5
+; SSE-NEXT: por %xmm5, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v4i64_broadcast_rrm:
; XOP: # %bb.0:
-; XOP-NEXT: vbroadcastsd (%rdi), %ymm2
-; XOP-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; XOP-NEXT: movq (%rdi), %rax
+; XOP-NEXT: vmovq %rax, %xmm2
+; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; XOP-NEXT: notq %rax
+; XOP-NEXT: vmovq %rax, %xmm3
+; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; XOP-NEXT: vandps %ymm2, %ymm0, %ymm0
+; XOP-NEXT: vandps %ymm3, %ymm1, %ymm1
+; XOP-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
-; AVX-LABEL: bitselect_v4i64_broadcast_rrm:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastsd (%rdi), %ymm2
-; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1
-; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX-NEXT: retq
+; AVX1-LABEL: bitselect_v4i64_broadcast_rrm:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movq (%rdi), %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; AVX1-NEXT: notq %rax
+; AVX1-NEXT: vmovq %rax, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: bitselect_v4i64_broadcast_rrm:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX2-NEXT: notq %rax
+; AVX2-NEXT: vmovq %rax, %xmm3
+; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: bitselect_v4i64_broadcast_rrm:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vbroadcastsd (%rdi), %ymm2
-; AVX512F-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vandnps %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: movq (%rdi), %rax
+; AVX512F-NEXT: vmovq %rax, %xmm2
+; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX512F-NEXT: notq %rax
+; AVX512F-NEXT: vmovq %rax, %xmm3
+; AVX512F-NEXT: vpbroadcastq %xmm3, %ymm3
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrm:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpternlogq $228, (%rdi){1to4}, %ymm1, %ymm0
+; AVX512VL-NEXT: movq (%rdi), %rax
+; AVX512VL-NEXT: vpbroadcastq %rax, %ymm2
+; AVX512VL-NEXT: notq %rax
+; AVX512VL-NEXT: vpbroadcastq %rax, %ymm3
+; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpternlogq $248, %ymm3, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%a2 = load i64, ptr %p2
%1 = insertelement <4 x i64> undef, i64 %a2, i32 0
@@ -871,21 +993,21 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %xmm8
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
+; SSE-NEXT: notq %rdi
+; SSE-NEXT: movq %rdi, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1]
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: movdqa %xmm8, %xmm9
-; SSE-NEXT: pandn %xmm7, %xmm9
-; SSE-NEXT: por %xmm9, %xmm3
-; SSE-NEXT: movdqa %xmm8, %xmm7
-; SSE-NEXT: pandn %xmm6, %xmm7
-; SSE-NEXT: por %xmm7, %xmm2
-; SSE-NEXT: movdqa %xmm8, %xmm6
-; SSE-NEXT: pandn %xmm5, %xmm6
-; SSE-NEXT: por %xmm6, %xmm1
-; SSE-NEXT: pandn %xmm4, %xmm8
-; SSE-NEXT: por %xmm8, %xmm0
+; SSE-NEXT: pand %xmm9, %xmm7
+; SSE-NEXT: por %xmm7, %xmm3
+; SSE-NEXT: pand %xmm9, %xmm6
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: pand %xmm9, %xmm5
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm9
+; SSE-NEXT: por %xmm9, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v8i64_broadcast_rrr:
@@ -893,8 +1015,16 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6
; XOP-NEXT: vmovq %rdi, %xmm4
; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
-; XOP-NEXT: vpcmov %ymm4, %ymm2, %ymm0, %ymm0
-; XOP-NEXT: vpcmov %ymm4, %ymm3, %ymm1, %ymm1
+; XOP-NEXT: notq %rdi
+; XOP-NEXT: vmovq %rdi, %xmm5
+; XOP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
+; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5
+; XOP-NEXT: vandps %ymm4, %ymm1, %ymm1
+; XOP-NEXT: vandps %ymm4, %ymm0, %ymm0
+; XOP-NEXT: vandps %ymm5, %ymm3, %ymm3
+; XOP-NEXT: vorps %ymm3, %ymm1, %ymm1
+; XOP-NEXT: vandps %ymm5, %ymm2, %ymm2
+; XOP-NEXT: vorps %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: bitselect_v8i64_broadcast_rrr:
@@ -902,11 +1032,15 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6
; AVX1-NEXT: vmovq %rdi, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
+; AVX1-NEXT: notq %rdi
+; AVX1-NEXT: vmovq %rdi, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm3, %ymm4, %ymm3
+; AVX1-NEXT: vandps %ymm5, %ymm3, %ymm3
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vandnps %ymm2, %ymm4, %ymm2
+; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -914,18 +1048,24 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %rdi, %xmm4
; AVX2-NEXT: vpbroadcastq %xmm4, %ymm4
+; AVX2-NEXT: notq %rdi
+; AVX2-NEXT: vmovq %rdi, %xmm5
+; AVX2-NEXT: vpbroadcastq %xmm5, %ymm5
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: bitselect_v8i64_broadcast_rrr:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq %rdi, %zmm2
-; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
+; AVX512-NEXT: notq %rdi
+; AVX512-NEXT: vpbroadcastq %rdi, %zmm3
+; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogq $248, %zmm3, %zmm1, %zmm0
; AVX512-NEXT: retq
%1 = insertelement <8 x i64> undef, i64 %a2, i32 0
%2 = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -940,46 +1080,86 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrm(<8 x i64> %a0, <8 x i64> %a1, ptr %p2) {
; SSE-LABEL: bitselect_v8i64_broadcast_rrm:
; SSE: # %bb.0:
-; SSE-NEXT: movq {{.*#+}} xmm8 = mem[0],zero
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq %rax, %xmm8
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
+; SSE-NEXT: notq %rax
+; SSE-NEXT: movq %rax, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1]
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: movdqa %xmm8, %xmm9
-; SSE-NEXT: pandn %xmm7, %xmm9
-; SSE-NEXT: por %xmm9, %xmm3
-; SSE-NEXT: movdqa %xmm8, %xmm7
-; SSE-NEXT: pandn %xmm6, %xmm7
-; SSE-NEXT: por %xmm7, %xmm2
-; SSE-NEXT: movdqa %xmm8, %xmm6
-; SSE-NEXT: pandn %xmm5, %xmm6
-; SSE-NEXT: por %xmm6, %xmm1
-; SSE-NEXT: pandn %xmm4, %xmm8
-; SSE-NEXT: por %xmm8, %xmm0
+; SSE-NEXT: pand %xmm9, %xmm7
+; SSE-NEXT: por %xmm7, %xmm3
+; SSE-NEXT: pand %xmm9, %xmm6
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: pand %xmm9, %xmm5
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm9
+; SSE-NEXT: por %xmm9, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v8i64_broadcast_rrm:
; XOP: # %bb.0:
-; XOP-NEXT: vbroadcastsd (%rdi), %ymm4
-; XOP-NEXT: vpcmov %ymm4, %ymm2, %ymm0, %ymm0
-; XOP-NEXT: vpcmov %ymm4, %ymm3, %ymm1, %ymm1
+; XOP-NEXT: movq (%rdi), %rax
+; XOP-NEXT: vmovq %rax, %xmm4
+; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
+; XOP-NEXT: notq %rax
+; XOP-NEXT: vmovq %rax, %xmm5
+; XOP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
+; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5
+; XOP-NEXT: vandps %ymm4, %ymm1, %ymm1
+; XOP-NEXT: vandps %ymm4, %ymm0, %ymm0
+; XOP-NEXT: vandps %ymm5, %ymm3, %ymm3
+; XOP-NEXT: vorps %ymm3, %ymm1, %ymm1
+; XOP-NEXT: vandps %ymm5, %ymm2, %ymm2
+; XOP-NEXT: vorps %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
-; AVX-LABEL: bitselect_v8i64_broadcast_rrm:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastsd (%rdi), %ymm4
-; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1
-; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX-NEXT: vandnps %ymm3, %ymm4, %ymm3
-; AVX-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX-NEXT: vandnps %ymm2, %ymm4, %ymm2
-; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0
-; AVX-NEXT: retq
+; AVX1-LABEL: bitselect_v8i64_broadcast_rrm:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movq (%rdi), %rax
+; AVX1-NEXT: vmovq %rax, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
+; AVX1-NEXT: notq %rax
+; AVX1-NEXT: vmovq %rax, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5
+; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm5, %ymm3, %ymm3
+; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: bitselect_v8i64_broadcast_rrm:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: vmovq %rax, %xmm4
+; AVX2-NEXT: vpbroadcastq %xmm4, %ymm4
+; AVX2-NEXT: notq %rax
+; AVX2-NEXT: vmovq %rax, %xmm5
+; AVX2-NEXT: vpbroadcastq %xmm5, %ymm5
+; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm5, %ymm3, %ymm3
+; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
;
; AVX512-LABEL: bitselect_v8i64_broadcast_rrm:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpternlogq $228, (%rdi){1to8}, %zmm1, %zmm0
+; AVX512-NEXT: movq (%rdi), %rax
+; AVX512-NEXT: vpbroadcastq %rax, %zmm2
+; AVX512-NEXT: notq %rax
+; AVX512-NEXT: vpbroadcastq %rax, %zmm3
+; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogq $248, %zmm3, %zmm1, %zmm0
; AVX512-NEXT: retq
%a2 = load i64, ptr %p2
%1 = insertelement <8 x i64> undef, i64 %a2, i32 0
diff --git a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
index db52dd6b47b46..4d5f55b331267 100644
--- a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
@@ -67,9 +67,9 @@ declare <4 x i32> @llvm.vp.mul.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_sdiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_sdiv_v4i32:
; SSE: # %bb.0:
+; SSE-NEXT: addl $-2147483648, %esi # imm = 0x80000000
; SSE-NEXT: movd %esi, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
@@ -210,9 +210,9 @@ declare <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_udiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_udiv_v4i32:
; SSE: # %bb.0:
+; SSE-NEXT: addl $-2147483648, %esi # imm = 0x80000000
; SSE-NEXT: movd %esi, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
@@ -353,9 +353,9 @@ declare <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_srem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_srem_v4i32:
; SSE: # %bb.0:
+; SSE-NEXT: addl $-2147483648, %esi # imm = 0x80000000
; SSE-NEXT: movd %esi, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
@@ -496,9 +496,9 @@ declare <4 x i32> @llvm.vp.srem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_urem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_urem_v4i32:
; SSE: # %bb.0:
+; SSE-NEXT: addl $-2147483648, %esi # imm = 0x80000000
; SSE-NEXT: movd %esi, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
index e500801b69c4d..462327b324c3d 100644
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -232,13 +232,27 @@ define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwin
; X86-NEXT: popl %eax
; X86-NEXT: retl
;
-; X64-LABEL: signbits_ashr_insert_ashr_extract_sitofp:
-; X64: # %bb.0:
-; X64-NEXT: sarq $30, %rdi
-; X64-NEXT: vmovq %rdi, %xmm0
-; X64-NEXT: vpsrlq $3, %xmm0, %xmm0
-; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
-; X64-NEXT: retq
+; X64-AVX1-LABEL: signbits_ashr_insert_ashr_extract_sitofp:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vmovq %rdi, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X64-AVX1-NEXT: vpsrad $30, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpsrlq $30, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; X64-AVX1-NEXT: vpsrlq $3, %xmm0, %xmm0
+; X64-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: signbits_ashr_insert_ashr_extract_sitofp:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vmovq %rdi, %xmm0
+; X64-AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrad $30, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpsrlq $30, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; X64-AVX2-NEXT: vpsrlq $3, %xmm0, %xmm0
+; X64-AVX2-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%1 = ashr i64 %a0, 30
%2 = insertelement <2 x i64> undef, i64 %1, i32 0
%3 = insertelement <2 x i64> %2, i64 %a1, i32 1
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 1289eef7795dc..b4595fb42ef61 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -816,11 +816,13 @@ define <16 x float> @test14(ptr %base, i32 %ind, <16 x ptr> %vec) {
;
; KNL_32-LABEL: test14:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: vmovd %xmm0, %eax
-; KNL_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vmovd %xmm0, %ecx
+; KNL_32-NEXT: shll $2, %eax
+; KNL_32-NEXT: vpbroadcastd %eax, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
+; KNL_32-NEXT: vgatherdps (%ecx,%zmm1), %zmm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test14:
@@ -836,11 +838,13 @@ define <16 x float> @test14(ptr %base, i32 %ind, <16 x ptr> %vec) {
;
; SKX_32-LABEL: test14:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vmovd %xmm0, %eax
-; SKX_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vmovd %xmm0, %ecx
+; SKX_32-NEXT: shll $2, %eax
+; SKX_32-NEXT: vpbroadcastd %eax, %zmm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
+; SKX_32-NEXT: vgatherdps (%ecx,%zmm1), %zmm0 {%k1}
; SKX_32-NEXT: retl
%broadcast.splatinsert = insertelement <16 x ptr> %vec, ptr %base, i32 1
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
index 1e31ee7ad6b59..6f2732095ac9c 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -1074,6 +1074,7 @@ define void @merge_4i32_i32_combine(ptr %dst, ptr %src) {
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE1-NEXT: andps %xmm0, %xmm1
; X86-SSE1-NEXT: movaps %xmm1, (%eax)
diff --git a/llvm/test/CodeGen/X86/pr50609.ll b/llvm/test/CodeGen/X86/pr50609.ll
index ea85c312f38bd..960d24c0c23c3 100644
--- a/llvm/test/CodeGen/X86/pr50609.ll
+++ b/llvm/test/CodeGen/X86/pr50609.ll
@@ -4,17 +4,7 @@ define void @PR50609(ptr noalias nocapture %RET, ptr noalias %aFOO, <16 x i32> %__mask) nounwind {
; CHECK-LABEL: PR50609:
; CHECK: # %bb.0: # %allocas
-; CHECK-NEXT: leal 40(%rsi), %eax
-; CHECK-NEXT: vmovq %rsi, %xmm2
-; CHECK-NEXT: vmovd %eax, %xmm3
-; CHECK-NEXT: vpsubq %xmm2, %xmm3, %xmm2
-; CHECK-NEXT: vpsrad $31, %xmm2, %xmm3
-; CHECK-NEXT: vpsrld $30, %xmm3, %xmm3
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; CHECK-NEXT: vpsrad $2, %xmm2, %xmm2
-; CHECK-NEXT: vcvtdq2ps %ymm2, %ymm2
-; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; CHECK-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1]
; CHECK-NEXT: vmaskmovps %ymm2, %ymm0, (%rdi)
; CHECK-NEXT: vmaskmovps %ymm2, %ymm1, 32(%rdi)
; CHECK-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
index 565946d342e93..a6119ef3189e3 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
@@ -171,23 +171,25 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; SSE2-LABEL: test_srem_vec:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdx, %rcx
-; SSE2-NEXT: shlq $31, %rcx
-; SSE2-NEXT: sarq $31, %rcx
-; SSE2-NEXT: shlq $31, %rdi
-; SSE2-NEXT: sarq $31, %rdi
+; SSE2-NEXT: movq %rsi, %r8
+; SSE2-NEXT: movq %rdx, %rsi
; SSE2-NEXT: shlq $31, %rsi
; SSE2-NEXT: sarq $31, %rsi
-; SSE2-NEXT: movabsq $2049638230412172402, %r8 # imm = 0x1C71C71C71C71C72
-; SSE2-NEXT: movq %rsi, %rax
-; SSE2-NEXT: imulq %r8
+; SSE2-NEXT: shlq $31, %rdi
+; SSE2-NEXT: sarq $31, %rdi
+; SSE2-NEXT: shlq $31, %r8
+; SSE2-NEXT: sarq $31, %r8
+; SSE2-NEXT: movabsq $2049638230412172402, %r9 # imm = 0x1C71C71C71C71C72
+; SSE2-NEXT: movq %r8, %rax
+; SSE2-NEXT: imulq %r9
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: addq %rdx, %rax
; SSE2-NEXT: leaq (%rax,%rax,8), %rax
-; SSE2-NEXT: subq %rax, %rsi
-; SSE2-NEXT: movq %rsi, %xmm1
+; SSE2-NEXT: subq %rax, %r8
+; SSE2-NEXT: movq %r8, %xmm1
; SSE2-NEXT: movq %rdi, %rax
-; SSE2-NEXT: imulq %r8
+; SSE2-NEXT: imulq %r9
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: addq %rdx, %rax
@@ -195,26 +197,27 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; SSE2-NEXT: subq %rax, %rdi
; SSE2-NEXT: movq %rdi, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8589934591,8589934591]
-; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71
-; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: movq %rsi, %rax
; SSE2-NEXT: imulq %rdx
-; SSE2-NEXT: subq %rcx, %rdx
+; SSE2-NEXT: subq %rsi, %rdx
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: sarq $3, %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: leaq (%rdx,%rdx,8), %rax
; SSE2-NEXT: addq %rcx, %rax
-; SSE2-NEXT: movq %rax, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: movabsq $8589934591, %rcx # imm = 0x1FFFFFFFF
+; SSE2-NEXT: andq %rax, %rcx
+; SSE2-NEXT: movq %rcx, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,3]
-; SSE2-NEXT: andps %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
+; SSE2-NEXT: andps %xmm2, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
@@ -226,23 +229,25 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; SSE41-LABEL: test_srem_vec:
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdx, %rcx
-; SSE41-NEXT: shlq $31, %rcx
-; SSE41-NEXT: sarq $31, %rcx
-; SSE41-NEXT: shlq $31, %rdi
-; SSE41-NEXT: sarq $31, %rdi
+; SSE41-NEXT: movq %rsi, %r8
+; SSE41-NEXT: movq %rdx, %rsi
; SSE41-NEXT: shlq $31, %rsi
; SSE41-NEXT: sarq $31, %rsi
-; SSE41-NEXT: movabsq $2049638230412172402, %r8 # imm = 0x1C71C71C71C71C72
-; SSE41-NEXT: movq %rsi, %rax
-; SSE41-NEXT: imulq %r8
+; SSE41-NEXT: shlq $31, %rdi
+; SSE41-NEXT: sarq $31, %rdi
+; SSE41-NEXT: shlq $31, %r8
+; SSE41-NEXT: sarq $31, %r8
+;
SSE41-NEXT: movabsq $2049638230412172402, %r9 # imm = 0x1C71C71C71C71C72 +; SSE41-NEXT: movq %r8, %rax +; SSE41-NEXT: imulq %r9 ; SSE41-NEXT: movq %rdx, %rax ; SSE41-NEXT: shrq $63, %rax ; SSE41-NEXT: addq %rdx, %rax ; SSE41-NEXT: leaq (%rax,%rax,8), %rax -; SSE41-NEXT: subq %rax, %rsi -; SSE41-NEXT: movq %rsi, %xmm1 +; SSE41-NEXT: subq %rax, %r8 +; SSE41-NEXT: movq %r8, %xmm1 ; SSE41-NEXT: movq %rdi, %rax -; SSE41-NEXT: imulq %r8 +; SSE41-NEXT: imulq %r9 ; SSE41-NEXT: movq %rdx, %rax ; SSE41-NEXT: shrq $63, %rax ; SSE41-NEXT: addq %rdx, %rax @@ -250,28 +255,29 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; SSE41-NEXT: subq %rax, %rdi ; SSE41-NEXT: movq %rdi, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [8589934591,8589934591] -; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71 -; SSE41-NEXT: movq %rcx, %rax +; SSE41-NEXT: movq %rsi, %rax ; SSE41-NEXT: imulq %rdx -; SSE41-NEXT: subq %rcx, %rdx +; SSE41-NEXT: subq %rsi, %rdx ; SSE41-NEXT: movq %rdx, %rax ; SSE41-NEXT: shrq $63, %rax ; SSE41-NEXT: sarq $3, %rdx ; SSE41-NEXT: addq %rax, %rdx ; SSE41-NEXT: leaq (%rdx,%rdx,8), %rax ; SSE41-NEXT: addq %rcx, %rax -; SSE41-NEXT: movq %rax, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: movabsq $8589934591, %rcx # imm = 0x1FFFFFFFF +; SSE41-NEXT: andq %rax, %rcx +; SSE41-NEXT: movq %rcx, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE41-NEXT: pcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm1 ; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: pextrb $8, %xmm0, %edx -; SSE41-NEXT: pextrb $0, %xmm2, %ecx +; SSE41-NEXT: pextrb $0, %xmm1, %ecx ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: # kill: def $dl killed $dl killed $edx ; SSE41-NEXT: # kill: def $cl killed $cl killed $ecx diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll index 809d94b649fb4..94aabbe730278 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -915,11 +915,8 @@ define <16 x i1> @PR52500(<16 x i1> %msk, i32 %in) { ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vmovd %edi, %xmm0 -; AVX512F-NEXT: movl $789, %eax # imm = 0x315 -; AVX512F-NEXT: vmovd %eax, %xmm1 -; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 +; AVX512F-NEXT: imull $789, %edi, %eax # imm = 0x315 +; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 {%k1} ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 @@ -931,11 +928,8 @@ define <16 x i1> @PR52500(<16 x i1> %msk, i32 %in) { ; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512VL-NEXT: vmovd %edi, %xmm0 -; AVX512VL-NEXT: movl $789, %eax # imm = 0x315 -; AVX512VL-NEXT: vmovd %eax, %xmm1 -; AVX512VL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0 +; 
AVX512VL-NEXT: imull $789, %edi, %eax # imm = 0x315 +; AVX512VL-NEXT: vpbroadcastd %eax, %zmm0 ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1 {%k1} ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 @@ -946,11 +940,8 @@ define <16 x i1> @PR52500(<16 x i1> %msk, i32 %in) { ; VL_BW_DQ: # %bb.0: ; VL_BW_DQ-NEXT: vpsllw $7, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vpmovb2m %xmm0, %k1 -; VL_BW_DQ-NEXT: vmovd %edi, %xmm0 -; VL_BW_DQ-NEXT: movl $789, %eax # imm = 0x315 -; VL_BW_DQ-NEXT: vmovd %eax, %xmm1 -; VL_BW_DQ-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0 +; VL_BW_DQ-NEXT: imull $789, %edi, %eax # imm = 0x315 +; VL_BW_DQ-NEXT: vpbroadcastd %eax, %zmm0 ; VL_BW_DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0 {%k1} ; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0 ; VL_BW_DQ-NEXT: vzeroupper