From ee12cc24cf9e8844d3afaaa9f20d261367f674a9 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 5 Feb 2025 15:47:38 +0000
Subject: [PATCH] [X86] Fold (f16 bitcast extract_vectorelt(v,0)) to
 (extract_vectorelt (v8f16 bitcast(v)),0)

Also handles possible truncations from i32 to i16.

Cleans up some of the poor codegen identified in #98630
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  13 +
 llvm/test/CodeGen/X86/bfloat.ll               |  12 -
 .../CodeGen/X86/canonicalize-vars-f16-type.ll |   6 -
 .../CodeGen/X86/fminimumnum-fmaximumnum.ll    | 400 ++++++++----------
 .../X86/fold-int-pow2-with-fmul-or-fdiv.ll    |  32 +-
 llvm/test/CodeGen/X86/fp-round.ll             |   2 -
 llvm/test/CodeGen/X86/fp-roundeven.ll         |   2 -
 .../test/CodeGen/X86/fp-strict-scalar-fp16.ll |  12 -
 .../X86/fp-strict-scalar-inttofp-fp16.ll      |  24 --
 .../X86/fp-strict-scalar-round-fp16.ll        |  14 -
 llvm/test/CodeGen/X86/half.ll                 | 144 +++----
 llvm/test/CodeGen/X86/pr31088.ll              |   8 -
 .../CodeGen/X86/select-narrow-int-to-fp.ll    |   8 -
 .../CodeGen/X86/vector-half-conversions.ll    | 227 +++++-----
 14 files changed, 353 insertions(+), 551 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6cf6061deba70..b0cebea5f2988 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45160,6 +45160,19 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // Attempt to peek through f16 bitcasted extractions hidden by truncation.
+  if (VT == MVT::f16 && SrcVT == MVT::i16) {
+    SDValue Src = peekThroughTruncates(N0);
+    if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        Src.getOperand(0).getValueSizeInBits() == 128 &&
+        isNullConstant(Src.getOperand(1))) {
+      SDLoc DL(N);
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+                         DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
+                         DAG.getVectorIdxConstant(0, DL));
+    }
+  }
+
   // Since MMX types are special and don't usually play with other vector types,
   // it's better to handle them early to be sure we emit efficient code by
   // avoiding store-load conversions.
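For illustration, a minimal IR sketch of the pattern the new combine targets (this example is an assumption for exposition, not taken from the patch's test files, and the function name is hypothetical):

define half @extract_elt0_as_half(<8 x i16> %v) {
  %e = extractelement <8 x i16> %v, i64 0    ; lane 0 of a 128-bit vector
  %h = bitcast i16 %e to half                ; (f16 bitcast (i16 extract_vectorelt v, 0))
  ret half %h
}

Previously the extracted scalar was bounced through a GPR (vmovd followed by vpinsrw); with the fold, lane 0 is read directly from a v8f16 bitcast of %v, which is what removes the vmovd/vpinsrw pairs throughout the test updates below. The peekThroughTruncates call covers the related case where legalization widens the extracted element to i32 and truncates it back to i16 before the bitcast.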
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index d67cd6b62c2b9..4d269cfff2afe 100644 --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -82,8 +82,6 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind { ; X86-NEXT: vmovd %eax, %xmm1 ; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0 -; X86-NEXT: vmovw %xmm0, %eax -; X86-NEXT: vmovw %eax, %xmm0 ; X86-NEXT: retl ; ; SSE2-LABEL: add2: @@ -110,8 +108,6 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind { ; FP16-NEXT: vmovd %eax, %xmm1 ; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 -; FP16-NEXT: vmovw %xmm0, %eax -; FP16-NEXT: vmovw %eax, %xmm0 ; FP16-NEXT: retq ; ; AVXNC-LABEL: add2: @@ -124,8 +120,6 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind { ; AVXNC-NEXT: vmovd %eax, %xmm1 ; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 -; AVXNC-NEXT: vmovd %xmm0, %eax -; AVXNC-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVXNC-NEXT: retq %add = fadd bfloat %a, %b ret bfloat %add @@ -432,8 +426,6 @@ define bfloat @add_constant2(bfloat %a) nounwind { ; X86-NEXT: vmovd %eax, %xmm0 ; X86-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0 -; X86-NEXT: vmovw %xmm0, %eax -; X86-NEXT: vmovw %eax, %xmm0 ; X86-NEXT: retl ; ; SSE2-LABEL: add_constant2: @@ -454,8 +446,6 @@ define bfloat @add_constant2(bfloat %a) nounwind { ; FP16-NEXT: vmovd %eax, %xmm0 ; FP16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 -; FP16-NEXT: vmovw %xmm0, %eax -; FP16-NEXT: vmovw %eax, %xmm0 ; FP16-NEXT: retq ; ; AVXNC-LABEL: add_constant2: @@ -465,8 +455,6 @@ define bfloat @add_constant2(bfloat %a) nounwind { ; AVXNC-NEXT: vmovd %eax, %xmm0 ; AVXNC-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 -; AVXNC-NEXT: vmovd %xmm0, %eax -; AVXNC-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVXNC-NEXT: retq %add = fadd bfloat %a, 1.0 ret bfloat %add diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll index 04087c4f0dd5e..556b0deaf4c83 100644 --- a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll +++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll @@ -154,8 +154,6 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind { ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX512-NEXT: retq entry: @@ -239,15 +237,11 @@ define void @v_test_canonicalize_v2half(<2 x half> addrspace(1)* %out) nounwind ; AVX512-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3] ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512-NEXT: vmovd %xmm2, %eax -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX512-NEXT: vmovd %xmm0, (%rdi) ; AVX512-NEXT: retq diff --git 
a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll index c617b45707f8f..c7f5e13cb7464 100644 --- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll +++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll @@ -1812,212 +1812,186 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; ; AVX512-LABEL: test_fmaximumnum_v4f16: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $88, %rsp +; AVX512-NEXT: subq $72, %rsp ; AVX512-NEXT: vmovdqa %xmm1, %xmm4 -; AVX512-NEXT: vmovdqa %xmm0, %xmm6 +; AVX512-NEXT: vmovdqa %xmm0, %xmm8 ; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vucomiss %xmm0, %xmm0 ; AVX512-NEXT: setp %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vucomiss %xmm2, %xmm2 +; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vucomiss %xmm1, %xmm1 ; AVX512-NEXT: setp %al ; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1 +; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2} +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm2 -; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm2 +; AVX512-NEXT: vucomiss %xmm0, %xmm1 ; AVX512-NEXT: seta %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm7 -; AVX512-NEXT: vmulss %xmm7, %xmm0, %xmm0 -; AVX512-NEXT: vxorps %xmm9, %xmm9, %xmm9 -; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm9[1,2,3] -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vucomiss %xmm2, %xmm2 +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm9 +; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vucomiss %xmm1, %xmm1 ; AVX512-NEXT: setp %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3] -; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512-NEXT: vucomiss %xmm3, %xmm3 +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vucomiss %xmm2, %xmm2 ; AVX512-NEXT: setp %al ; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k2} -; 
AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm1 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm3 -; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm2 -; AVX512-NEXT: vucomiss %xmm2, %xmm3 +; AVX512-NEXT: vucomiss %xmm3, %xmm2 ; AVX512-NEXT: seta %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vmulss %xmm7, %xmm2, %xmm2 -; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1,2,3] -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vucomiss %xmm1, %xmm1 ; AVX512-NEXT: setp %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512-NEXT: vucomiss %xmm3, %xmm3 +; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vucomiss %xmm2, %xmm2 ; AVX512-NEXT: setp %al ; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm1 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm5 -; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm3 -; AVX512-NEXT: vucomiss %xmm3, %xmm5 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vucomiss %xmm1, %xmm2 ; AVX512-NEXT: seta %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm5, %xmm3, %xmm3 {%k1} -; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm4[1,0] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm4[1,0] +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vucomiss %xmm2, %xmm2 ; AVX512-NEXT: setp %al ; AVX512-NEXT: 
kmovw %eax, %k1 -; AVX512-NEXT: vshufpd {{.*#+}} xmm5 = xmm6[1,0] -; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512-NEXT: vucomiss %xmm5, %xmm5 +; AVX512-NEXT: vshufpd {{.*#+}} xmm7 = xmm8[1,0] +; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512-NEXT: vucomiss %xmm7, %xmm7 ; AVX512-NEXT: setp %al ; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm5, %xmm5 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm5 -; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm5 +; AVX512-NEXT: vmovss %xmm2, %xmm7, %xmm7 {%k2} +; AVX512-NEXT: vcvtps2ph $4, %xmm7, %xmm14 +; AVX512-NEXT: vcvtph2ps %xmm14, %xmm7 +; AVX512-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vucomiss %xmm2, %xmm7 ; AVX512-NEXT: seta %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512-NEXT: vmulss %xmm7, %xmm3, %xmm3 -; AVX512-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm9[1,2,3] -; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm1 +; AVX512-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vxorps %xmm15, %xmm15, %xmm15 +; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm5 +; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 +; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm3 +; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 +; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm0 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmulss %xmm7, %xmm0, %xmm0 -; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm9[1,2,3] -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovd %xmm0, %ecx -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm3 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 +; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm2 +; AVX512-NEXT: 
vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7] ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vucomiss %xmm0, %xmm0 ; AVX512-NEXT: setp %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,3,3,3,4,5,6,7] -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vucomiss %xmm2, %xmm2 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vucomiss %xmm1, %xmm1 ; AVX512-NEXT: setp %al ; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm12 -; AVX512-NEXT: vcvtph2ps %xmm12, %xmm3 -; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm15 -; AVX512-NEXT: vcvtph2ps %xmm15, %xmm2 -; AVX512-NEXT: vucomiss %xmm2, %xmm3 +; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2} +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm12 +; AVX512-NEXT: vcvtph2ps %xmm12, %xmm1 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm13 +; AVX512-NEXT: vcvtph2ps %xmm13, %xmm6 +; AVX512-NEXT: vucomiss %xmm6, %xmm1 ; AVX512-NEXT: seta %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vmovss %xmm1, %xmm6, %xmm6 {%k1} ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vucomiss %xmm0, %xmm0 ; AVX512-NEXT: setp %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm6[1,1,3,3] -; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512-NEXT: vucomiss %xmm3, %xmm3 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm8[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vucomiss %xmm1, %xmm1 ; AVX512-NEXT: setp %al ; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm10 +; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2} +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm10 ; AVX512-NEXT: vcvtph2ps %xmm10, %xmm3 ; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm11 -; AVX512-NEXT: vcvtph2ps %xmm11, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm3 +; AVX512-NEXT: vcvtph2ps %xmm11, %xmm5 +; AVX512-NEXT: vucomiss %xmm5, %xmm3 ; AVX512-NEXT: seta %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vmulss %xmm7, %xmm2, %xmm2 -; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1,2,3] -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm14 -; AVX512-NEXT: vmovd %xmm14, %eax -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmulss %xmm7, %xmm0, %xmm0 -; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm9[1,2,3] -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm13 -; AVX512-NEXT: vmovd %xmm13, %ecx -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; AVX512-NEXT: 
vpinsrw $0, %ecx, %xmm0, %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512-NEXT: vmovss %xmm3, %xmm5, %xmm5 {%k1} ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm0 ; AVX512-NEXT: vucomiss %xmm0, %xmm0 ; AVX512-NEXT: setp %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vcvtph2ps %xmm6, %xmm2 -; AVX512-NEXT: vucomiss %xmm2, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm8, %xmm3 +; AVX512-NEXT: vucomiss %xmm3, %xmm3 ; AVX512-NEXT: setp %al ; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm3 +; AVX512-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k2} +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm1 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm5 -; AVX512-NEXT: vcvtph2ps %xmm5, %xmm2 +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm7 +; AVX512-NEXT: vcvtph2ps %xmm7, %xmm2 ; AVX512-NEXT: vucomiss %xmm2, %xmm1 ; AVX512-NEXT: seta %al ; AVX512-NEXT: kmovw %eax, %k1 @@ -2027,133 +2001,109 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: vucomiss %xmm1, %xmm1 ; AVX512-NEXT: setp %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[1,1,1,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[1,1,1,1,4,5,6,7] ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 ; AVX512-NEXT: vucomiss %xmm4, %xmm4 ; AVX512-NEXT: setp %al ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovss %xmm1, %xmm4, %xmm4 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; AVX512-NEXT: vcvtph2ps %xmm4, %xmm6 -; AVX512-NEXT: vmovss %xmm6, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm8 +; AVX512-NEXT: vcvtph2ps %xmm8, %xmm4 +; AVX512-NEXT: vmovss %xmm4, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm6 +; AVX512-NEXT: vucomiss %xmm0, %xmm4 ; AVX512-NEXT: seta %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm6, %xmm4 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512-NEXT: vmulss %xmm4, %xmm9, %xmm4 +; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm5 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX512-NEXT: vmulss %xmm5, %xmm9, %xmm5 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vmulss %xmm7, %xmm2, %xmm2 +; AVX512-NEXT: vmulss %xmm2, %xmm9, %xmm2 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmulss %xmm7, %xmm0, %xmm0 -; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1,2,3] -; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm9[1,2,3] -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm7 -; AVX512-NEXT: vmovd %xmm7, %eax -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm9 -; AVX512-NEXT: vmovd %xmm9, %ecx -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 +; AVX512-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm15[1,2,3] +; AVX512-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm15[1,2,3] +; AVX512-NEXT: vblendps {{.*#+}} xmm9 = xmm2[0],xmm15[1,2,3] +; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm2 +; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm6 +; AVX512-NEXT: vcvtps2ph $4, %xmm9, %xmm4 +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512-NEXT: vmovd %xmm2, %eax -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512-NEXT: vmovd %xmm2, %ecx -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 -; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm6 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512-NEXT: vmovd %xmm6, %eax -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512-NEXT: vmovd %xmm6, %ecx -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm6 -; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm8 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX512-NEXT: vmovd %xmm12, %eax -; AVX512-NEXT: vmovd %xmm10, %ecx -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm6 -; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm8 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512-NEXT: vmovd %xmm3, %eax -; AVX512-NEXT: vmovd %xmm4, %ecx -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3 -; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm4 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpeqw %xmm2, %xmm3, %xmm4 -; AVX512-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm4 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512-NEXT: vmovd %xmm3, %eax -; AVX512-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX512-NEXT: vmovd %xmm3, %ecx -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3 -; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm6 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512-NEXT: vmovd %xmm6, %eax -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512-NEXT: vmovd %xmm6, %ecx -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm6 -; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm8 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX512-NEXT: vmovd %xmm15, %eax 
-; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm6 -; AVX512-NEXT: vmovd %xmm11, %eax -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm8 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512-NEXT: vmovd %xmm5, %eax -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm5 -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; AVX512-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX512-NEXT: vpblendvb %xmm2, %xmm1, %xmm4, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm9, %xmm2 +; AVX512-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX512-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX512-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm9[0] +; AVX512-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; AVX512-NEXT: vpcmpeqw %xmm3, %xmm8, %xmm9 +; AVX512-NEXT: vpblendvb %xmm9, %xmm3, %xmm0, %xmm3 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX512-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX512-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm9[0] +; AVX512-NEXT: vpcmpeqw %xmm1, %xmm8, %xmm7 +; AVX512-NEXT: vpblendvb %xmm7, %xmm1, %xmm3, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm5, %xmm3 ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512-NEXT: vucomiss %xmm3, %xmm2 +; AVX512-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512-NEXT: vucomiss %xmm5, %xmm3 ; AVX512-NEXT: movl $65535, %ecx # imm = 0xFFFF ; AVX512-NEXT: movl $0, %edx ; AVX512-NEXT: cmovel %ecx, %edx -; AVX512-NEXT: vcvtph2ps %xmm7, %xmm2 -; AVX512-NEXT: vucomiss %xmm3, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm3 +; AVX512-NEXT: vucomiss %xmm5, %xmm3 ; AVX512-NEXT: movl $0, %esi ; AVX512-NEXT: cmovel %ecx, %esi -; AVX512-NEXT: vcvtph2ps %xmm13, %xmm2 -; AVX512-NEXT: vucomiss %xmm3, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm6, %xmm3 +; AVX512-NEXT: vucomiss %xmm5, %xmm3 ; AVX512-NEXT: movl $0, %edi ; 
AVX512-NEXT: cmovel %ecx, %edi -; AVX512-NEXT: vcvtph2ps %xmm14, %xmm2 -; AVX512-NEXT: vucomiss %xmm3, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vucomiss %xmm5, %xmm2 ; AVX512-NEXT: movl $0, %r8d ; AVX512-NEXT: cmovel %ecx, %r8d ; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512-NEXT: vucomiss %xmm3, %xmm2 +; AVX512-NEXT: vucomiss %xmm5, %xmm2 ; AVX512-NEXT: movl $0, %r9d ; AVX512-NEXT: cmovel %ecx, %r9d ; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512-NEXT: vucomiss %xmm3, %xmm2 +; AVX512-NEXT: vucomiss %xmm5, %xmm2 ; AVX512-NEXT: movl $0, %r10d ; AVX512-NEXT: cmovel %ecx, %r10d ; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512-NEXT: vucomiss %xmm3, %xmm2 +; AVX512-NEXT: vucomiss %xmm5, %xmm2 ; AVX512-NEXT: movl $0, %r11d ; AVX512-NEXT: cmovel %ecx, %r11d ; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512-NEXT: vucomiss %xmm3, %xmm2 +; AVX512-NEXT: vucomiss %xmm5, %xmm2 ; AVX512-NEXT: vmovd %esi, %xmm2 ; AVX512-NEXT: vpinsrw $1, %edx, %xmm2, %xmm2 ; AVX512-NEXT: vpinsrw $2, %edi, %xmm2, %xmm2 @@ -2164,7 +2114,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: cmovel %ecx, %eax ; AVX512-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; AVX512-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: addq $88, %rsp +; AVX512-NEXT: addq $72, %rsp ; AVX512-NEXT: retq ; ; AVX10_2-LABEL: test_fmaximumnum_v4f16: diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 2163121410553..53517373d3e4d 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -569,17 +569,13 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX512F-NEXT: callq ldexpf@PLT ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-AVX512F-NEXT: vpextrw $6, %xmm0, %eax ; CHECK-AVX512F-NEXT: movswl %ax, %edi ; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX512F-NEXT: callq ldexpf@PLT ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -589,17 +585,13 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX512F-NEXT: callq ldexpf@PLT ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-AVX512F-NEXT: vmovdqa (%rsp), 
%xmm0 # 16-byte Reload ; CHECK-AVX512F-NEXT: vpextrw $4, %xmm0, %eax ; CHECK-AVX512F-NEXT: movswl %ax, %edi ; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX512F-NEXT: callq ldexpf@PLT ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -611,17 +603,13 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX512F-NEXT: callq ldexpf@PLT ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-AVX512F-NEXT: vpextrw $2, %xmm0, %eax ; CHECK-AVX512F-NEXT: movswl %ax, %edi ; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX512F-NEXT: callq ldexpf@PLT ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -631,17 +619,13 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX512F-NEXT: callq ldexpf@PLT ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax ; CHECK-AVX512F-NEXT: movswl %ax, %edi ; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX512F-NEXT: callq ldexpf@PLT ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -1429,8 +1413,6 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind { ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax -; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: @@ -1443,8 +1425,6 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) 
nounwind { ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovd %xmm0, %eax -; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to half @@ -1545,8 +1525,6 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax -; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: @@ -1560,8 +1538,6 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovd %xmm0, %eax -; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to half diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll index 1665ef98dd861..58c4f71892e90 100644 --- a/llvm/test/CodeGen/X86/fp-round.ll +++ b/llvm/test/CodeGen/X86/fp-round.ll @@ -59,8 +59,6 @@ define half @round_f16(half %h) { ; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512FP16-LABEL: round_f16: diff --git a/llvm/test/CodeGen/X86/fp-roundeven.ll b/llvm/test/CodeGen/X86/fp-roundeven.ll index 7d1c52cd65451..ee298fd47f728 100644 --- a/llvm/test/CodeGen/X86/fp-roundeven.ll +++ b/llvm/test/CodeGen/X86/fp-roundeven.ll @@ -53,8 +53,6 @@ define half @roundeven_f16(half %h) { ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512F-NEXT: vroundss $8, %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512FP16-LABEL: roundeven_f16: diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll index e1b677e855094..fbc798d8bbe48 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll @@ -46,8 +46,6 @@ define half @fadd_f16(half %a, half %b) nounwind strictfp { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: fadd_f16: @@ -96,8 +94,6 @@ define half @fsub_f16(half %a, half %b) nounwind strictfp { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: fsub_f16: @@ -146,8 +142,6 @@ define half @fmul_f16(half %a, half %b) nounwind strictfp { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: 
fmul_f16: @@ -196,8 +190,6 @@ define half @fdiv_f16(half %a, half %b) nounwind strictfp { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: fdiv_f16: @@ -479,8 +471,6 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp { ; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-NEXT: vmovd %xmm0, %eax -; F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; F16C-NEXT: popq %rax ; F16C-NEXT: retq ; @@ -502,8 +492,6 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp { ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; X86-LABEL: fma_f16: diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll index 3c99d4daa806f..7c0386f0e784e 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll @@ -37,8 +37,6 @@ define half @sitofp_i1tof16(i1 %x) #0 { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: sitofp_i1tof16: @@ -80,8 +78,6 @@ define half @sitofp_i8tof16(i8 %x) #0 { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: sitofp_i8tof16: @@ -118,8 +114,6 @@ define half @sitofp_i16tof16(i16 %x) #0 { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: sitofp_i16tof16: @@ -154,8 +148,6 @@ define half @sitofp_i32tof16(i32 %x) #0 { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: sitofp_i32tof16: @@ -188,8 +180,6 @@ define half @sitofp_i64tof16(i64 %x) #0 { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: sitofp_i64tof16: @@ -225,8 +215,6 @@ define half @uitofp_i1tof16(i1 %x) #0 { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: uitofp_i1tof16: @@ -265,8 +253,6 @@ define half @uitofp_i8tof16(i8 %x) #0 { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: 
uitofp_i8tof16: @@ -303,8 +289,6 @@ define half @uitofp_i16tof16(i16 %x) #0 { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: uitofp_i16tof16: @@ -341,8 +325,6 @@ define half @uitofp_i32tof16(i32 %x) #0 { ; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-NEXT: vmovd %xmm0, %eax -; F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; F16C-NEXT: retq ; ; AVX512-LABEL: uitofp_i32tof16: @@ -351,8 +333,6 @@ define half @uitofp_i32tof16(i32 %x) #0 { ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; X86-LABEL: uitofp_i32tof16: @@ -409,8 +389,6 @@ define half @uitofp_i64tof16(i64 %x) #0 { ; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-NEXT: vmovd %xmm0, %eax -; F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; F16C-NEXT: retq ; ; AVX512-LABEL: uitofp_i64tof16: @@ -419,8 +397,6 @@ define half @uitofp_i64tof16(i64 %x) #0 { ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; X86-LABEL: uitofp_i64tof16: diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll index 6fe5dcd292930..1ab97dafb8514 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll @@ -33,8 +33,6 @@ define half @fceil32(half %f) #0 { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: fceil32: @@ -71,8 +69,6 @@ define half @ffloor32(half %f) #0 { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: ffloor32: @@ -109,8 +105,6 @@ define half @ftrunc32(half %f) #0 { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: ftrunc32: @@ -147,8 +141,6 @@ define half @frint32(half %f) #0 { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: frint32: @@ -186,8 +178,6 @@ define half @fnearbyint32(half %f) #0 { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: fnearbyint32: @@ -225,8 +215,6 @@ define half @froundeven16(half %f) #0 
{ ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: froundeven16: @@ -265,8 +253,6 @@ define half @fround16(half %f) #0 { ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX-NEXT: popq %rax ; AVX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll index 9ae4a64cfafab..2472e6e19c862 100644 --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -800,8 +800,6 @@ define half @test_f80trunc_nodagcombine() #0 { ; BWON-F16C-NEXT: pushq %rax ; BWON-F16C-NEXT: callq test_floatret@PLT ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; BWON-F16C-NEXT: vmovd %xmm0, %eax -; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; BWON-F16C-NEXT: popq %rax ; BWON-F16C-NEXT: retq ; @@ -1015,8 +1013,6 @@ define half @test_sqrt(half %0) #0 { ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; BWON-F16C-NEXT: vmovd %xmm0, %eax -; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_sqrt: @@ -1115,36 +1111,36 @@ entry: define void @main.45() #0 { ; CHECK-LIBCALL-LABEL: main.45: ; CHECK-LIBCALL: # %bb.0: # %entry -; CHECK-LIBCALL-NEXT: pushq %rbp ; CHECK-LIBCALL-NEXT: pushq %r15 ; CHECK-LIBCALL-NEXT: pushq %r14 ; CHECK-LIBCALL-NEXT: pushq %rbx -; CHECK-LIBCALL-NEXT: pushq %rax +; CHECK-LIBCALL-NEXT: subq $16, %rsp ; CHECK-LIBCALL-NEXT: pinsrw $0, (%rax), %xmm0 ; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; CHECK-LIBCALL-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; CHECK-LIBCALL-NEXT: movq %xmm1, %rbx ; CHECK-LIBCALL-NEXT: movq %rbx, %r14 ; CHECK-LIBCALL-NEXT: shrq $48, %r14 ; CHECK-LIBCALL-NEXT: movq %rbx, %r15 ; CHECK-LIBCALL-NEXT: shrq $32, %r15 -; CHECK-LIBCALL-NEXT: movl %ebx, %ebp -; CHECK-LIBCALL-NEXT: shrl $16, %ebp +; CHECK-LIBCALL-NEXT: shrl $16, %ebx ; CHECK-LIBCALL-NEXT: callq __extendhfsf2@PLT ; CHECK-LIBCALL-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-LIBCALL-NEXT: movl $32256, %eax # imm = 0x7E00 -; CHECK-LIBCALL-NEXT: cmovpl %eax, %ebp -; CHECK-LIBCALL-NEXT: cmovpl %eax, %r15d -; CHECK-LIBCALL-NEXT: cmovpl %eax, %r14d -; CHECK-LIBCALL-NEXT: cmovpl %eax, %ebx -; CHECK-LIBCALL-NEXT: movw %bx, (%rax) +; CHECK-LIBCALL-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movl $32256, %ecx # imm = 0x7E00 +; CHECK-LIBCALL-NEXT: cmovpl %ecx, %eax +; CHECK-LIBCALL-NEXT: cmovpl %ecx, %ebx +; CHECK-LIBCALL-NEXT: cmovpl %ecx, %r15d +; CHECK-LIBCALL-NEXT: cmovpl %ecx, %r14d ; CHECK-LIBCALL-NEXT: movw %r14w, (%rax) ; CHECK-LIBCALL-NEXT: movw %r15w, (%rax) -; CHECK-LIBCALL-NEXT: movw %bp, (%rax) -; CHECK-LIBCALL-NEXT: addq $8, %rsp +; CHECK-LIBCALL-NEXT: movw %bx, (%rax) +; CHECK-LIBCALL-NEXT: movw %ax, (%rax) +; CHECK-LIBCALL-NEXT: addq $16, %rsp ; CHECK-LIBCALL-NEXT: popq %rbx ; CHECK-LIBCALL-NEXT: popq %r14 ; CHECK-LIBCALL-NEXT: popq %r15 -; CHECK-LIBCALL-NEXT: popq %rbp ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: main.45: @@ -1161,28 +1157,20 @@ define void @main.45() #0 { ; ; CHECK-I686-LABEL: main.45: ; CHECK-I686: # %bb.0: # %entry -; CHECK-I686-NEXT: pushl %edi ; CHECK-I686-NEXT: pushl %esi -; 
CHECK-I686-NEXT: subl $20, %esp
+; CHECK-I686-NEXT: subl $8, %esp
; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0
-; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-I686-NEXT: movd %xmm1, %esi
-; CHECK-I686-NEXT: movl %esi, %edi
-; CHECK-I686-NEXT: shrl $16, %edi
-; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT: movw %ax, (%esp)
+; CHECK-I686-NEXT: pextrw $0, %xmm0, %esi
+; CHECK-I686-NEXT: movw %si, (%esp)
; CHECK-I686-NEXT: calll __extendhfsf2
; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-I686-NEXT: ucomiss %xmm0, %xmm0
; CHECK-I686-NEXT: movl $32256, %eax # imm = 0x7E00
-; CHECK-I686-NEXT: cmovpl %eax, %esi
-; CHECK-I686-NEXT: cmovpl %eax, %edi
-; CHECK-I686-NEXT: movw %di, (%eax)
-; CHECK-I686-NEXT: movw %si, (%eax)
-; CHECK-I686-NEXT: addl $20, %esp
+; CHECK-I686-NEXT: cmovnpl %esi, %eax
+; CHECK-I686-NEXT: movw %ax, (%eax)
+; CHECK-I686-NEXT: addl $8, %esp
; CHECK-I686-NEXT: popl %esi
-; CHECK-I686-NEXT: popl %edi
; CHECK-I686-NEXT: retl
entry:
  %0 = load half, ptr undef, align 8
@@ -1319,8 +1307,6 @@ define half @pr61271(half %0, half %1) #0 {
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; BWON-F16C-NEXT: vminss %xmm1, %xmm0, %xmm0
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT: vmovd %xmm0, %eax
-; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: pr61271:
@@ -1581,79 +1567,67 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
; BWON-F16C-LABEL: maxnum_v8f16:
; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm3
+; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
-; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
-; BWON-F16C-NEXT: vucomiss %xmm2, %xmm3
+; BWON-F16C-NEXT: vucomiss %xmm3, %xmm2
; BWON-F16C-NEXT: ja .LBB26_2
; BWON-F16C-NEXT: # %bb.1:
-; BWON-F16C-NEXT: vmovaps %xmm2, %xmm3
+; BWON-F16C-NEXT: vmovaps %xmm3, %xmm2
; BWON-F16C-NEXT: .LBB26_2:
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm3, %xmm2
; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm4
+; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
-; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; BWON-F16C-NEXT: vcvtph2ps %xmm4, %xmm4
-; BWON-F16C-NEXT: vucomiss %xmm3, %xmm4
+; BWON-F16C-NEXT: vucomiss %xmm4, %xmm3
; BWON-F16C-NEXT: ja .LBB26_4
; BWON-F16C-NEXT: # %bb.3:
-; BWON-F16C-NEXT: vmovaps %xmm3, %xmm4
+; BWON-F16C-NEXT: vmovaps %xmm4, %xmm3
; BWON-F16C-NEXT: .LBB26_4:
-; BWON-F16C-NEXT: vmovd %xmm2, %eax
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm4, %xmm2
-; BWON-F16C-NEXT: vmovd %xmm2, %ecx
-; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
-; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
-; BWON-F16C-NEXT: vucomiss %xmm2, %xmm3
+; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; BWON-F16C-NEXT: vcvtph2ps %xmm4, %xmm5
+; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; BWON-F16C-NEXT: vcvtph2ps %xmm4, %xmm4
+; BWON-F16C-NEXT: vucomiss %xmm5, %xmm4
; BWON-F16C-NEXT: ja .LBB26_6
; BWON-F16C-NEXT: # %bb.5:
-; BWON-F16C-NEXT: vmovaps %xmm2, %xmm3
+; BWON-F16C-NEXT: vmovaps %xmm5, %xmm4
; BWON-F16C-NEXT: .LBB26_6:
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm3, %xmm2
-; BWON-F16C-NEXT: vmovd %xmm2, %edx
-; BWON-F16C-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm3
-; BWON-F16C-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
-; BWON-F16C-NEXT: vucomiss %xmm3, %xmm2
+; BWON-F16C-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
+; BWON-F16C-NEXT: vcvtph2ps %xmm5, %xmm5
+; BWON-F16C-NEXT: vshufpd {{.*#+}} xmm6 = xmm0[1,0]
+; BWON-F16C-NEXT: vcvtph2ps %xmm6, %xmm6
+; BWON-F16C-NEXT: vucomiss %xmm5, %xmm6
; BWON-F16C-NEXT: ja .LBB26_8
; BWON-F16C-NEXT: # %bb.7:
-; BWON-F16C-NEXT: vmovaps %xmm3, %xmm2
+; BWON-F16C-NEXT: vmovaps %xmm5, %xmm6
; BWON-F16C-NEXT: .LBB26_8:
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; BWON-F16C-NEXT: vmovd %xmm2, %esi
-; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7]
-; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
-; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7]
-; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm6
-; BWON-F16C-NEXT: vucomiss %xmm2, %xmm6
+; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[3,3,3,3,4,5,6,7]
+; BWON-F16C-NEXT: vcvtph2ps %xmm5, %xmm7
+; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[3,3,3,3,4,5,6,7]
+; BWON-F16C-NEXT: vcvtph2ps %xmm5, %xmm5
+; BWON-F16C-NEXT: vucomiss %xmm7, %xmm5
; BWON-F16C-NEXT: ja .LBB26_10
; BWON-F16C-NEXT: # %bb.9:
-; BWON-F16C-NEXT: vmovaps %xmm2, %xmm6
+; BWON-F16C-NEXT: vmovaps %xmm7, %xmm5
; BWON-F16C-NEXT: .LBB26_10:
-; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
-; BWON-F16C-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm3
-; BWON-F16C-NEXT: vpinsrw $0, %edx, %xmm0, %xmm4
-; BWON-F16C-NEXT: vpinsrw $0, %esi, %xmm0, %xmm5
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm4, %xmm4
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm6, %xmm6
-; BWON-F16C-NEXT: vmovd %xmm6, %eax
-; BWON-F16C-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; BWON-F16C-NEXT: vcvtph2ps %xmm6, %xmm7
-; BWON-F16C-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; BWON-F16C-NEXT: vcvtph2ps %xmm6, %xmm6
-; BWON-F16C-NEXT: vucomiss %xmm7, %xmm6
+; BWON-F16C-NEXT: vmovshdup {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; BWON-F16C-NEXT: vcvtph2ps %xmm7, %xmm8
+; BWON-F16C-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; BWON-F16C-NEXT: vcvtph2ps %xmm7, %xmm7
+; BWON-F16C-NEXT: vucomiss %xmm8, %xmm7
; BWON-F16C-NEXT: ja .LBB26_12
; BWON-F16C-NEXT: # %bb.11:
-; BWON-F16C-NEXT: vmovaps %xmm7, %xmm6
+; BWON-F16C-NEXT: vmovaps %xmm8, %xmm7
; BWON-F16C-NEXT: .LBB26_12:
; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm6, %xmm5
-; BWON-F16C-NEXT: vmovd %xmm5, %eax
-; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm5
+; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm5, %xmm4
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm7, %xmm5
; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm7
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm6
; BWON-F16C-NEXT: vucomiss %xmm7, %xmm6
@@ -1664,8 +1638,6 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
; BWON-F16C-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm6, %xmm4
-; BWON-F16C-NEXT: vmovd %xmm4, %eax
-; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4
; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
@@ -1676,8 +1648,6 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
; BWON-F16C-NEXT: vmovaps %xmm1, %xmm0
; BWON-F16C-NEXT: .LBB26_16:
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT: vmovd %xmm0, %eax
-; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; BWON-F16C-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; BWON-F16C-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
diff --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll
index 1c9c8e40c009d..5ecb67ba7ffa6 100644
--- a/llvm/test/CodeGen/X86/pr31088.ll
+++ b/llvm/test/CodeGen/X86/pr31088.ll
@@ -45,8 +45,6 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; F16C-NEXT: retq
;
; F16C-O0-LABEL: ir_fadd_v1f16:
@@ -55,12 +53,6 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-O0-NEXT: vaddss %xmm1, %xmm0, %xmm0
; F16C-O0-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-O0-NEXT: vmovd %xmm0, %eax
-; F16C-O0-NEXT: movw %ax, %cx
-; F16C-O0-NEXT: # implicit-def: $eax
-; F16C-O0-NEXT: movw %cx, %ax
-; F16C-O0-NEXT: # implicit-def: $xmm0
-; F16C-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; F16C-O0-NEXT: retq
  %retval = fadd <1 x half> %arg0, %arg1
  ret <1 x half> %retval
diff --git a/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll b/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll
index 9bc0dbf5dcb58..28b405799dfd0 100644
--- a/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll
+++ b/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll
@@ -64,8 +64,6 @@ define half @uint8ToHalf(i8 %int8) {
; CHECK-NO_FP16-NEXT: movzbl %dil, %eax
; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
; CHECK-NO_FP16-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO_FP16-NEXT: vmovd %xmm0, %eax
-; CHECK-NO_FP16-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; CHECK-NO_FP16-NEXT: retq
;
; CHECK-WITH_FP16-LABEL: uint8ToHalf:
@@ -147,8 +145,6 @@ define half @sint8ToHalf(i8 %int8) {
; CHECK-NO_FP16-NEXT: movsbl %dil, %eax
; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
; CHECK-NO_FP16-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO_FP16-NEXT: vmovd %xmm0, %eax
-; CHECK-NO_FP16-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; CHECK-NO_FP16-NEXT: retq
;
; CHECK-WITH_FP16-LABEL: sint8ToHalf:
@@ -222,8 +218,6 @@ define half @uint16ToHalf(i16 %int16) {
; CHECK-NO_FP16-NEXT: movzwl %di, %eax
; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
; CHECK-NO_FP16-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO_FP16-NEXT: vmovd %xmm0, %eax
-; CHECK-NO_FP16-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; CHECK-NO_FP16-NEXT: retq
;
; CHECK-WITH_FP16-LABEL: uint16ToHalf:
@@ -289,8 +283,6 @@ define half @sint16ToHalf(i16 %int16) {
; CHECK-NO_FP16-NEXT: movswl %di, %eax
; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
; CHECK-NO_FP16-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO_FP16-NEXT: vmovd %xmm0, %eax
-; CHECK-NO_FP16-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; CHECK-NO_FP16-NEXT: retq
;
; CHECK-WITH_FP16-LABEL: sint16ToHalf:
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index 4e50b56323311..1105909699d4f 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -39,35 +39,33 @@ define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; AVX-LABEL: cvt_4i16_to_4f32:
; AVX: # %bb.0:
; AVX-NEXT: subq $72, %rsp
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: movq %rax, %rcx
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $16, %ecx
; AVX-NEXT: movq %rax, %rdx
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: shrq $48, %rax
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX-NEXT: shrl $16, %eax
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: shrq $48, %rdx
-; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: addq $72, %rsp
; AVX-NEXT: retq
@@ -90,35 +88,33 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
; AVX-LABEL: cvt_8i16_to_4f32:
; AVX: # %bb.0:
; AVX-NEXT: subq $72, %rsp
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: movq %rax, %rcx
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $16, %ecx
; AVX-NEXT: movq %rax, %rdx
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: shrq $48, %rax
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX-NEXT: shrl $16, %eax
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: shrq $48, %rdx
-; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: addq $72, %rsp
; AVX-NEXT: retq
@@ -430,17 +426,15 @@ define <2 x float> @cvt_2i16_to_2f32_constrained(<2 x i16> %a0) nounwind strictf
; AVX-LABEL: cvt_2i16_to_2f32_constrained:
; AVX: # %bb.0:
; AVX-NEXT: subq $40, %rsp
-; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpextrw $1, %xmm0, %eax
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX-NEXT: shrl $16, %eax
-; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
-; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
;
@@ -465,35 +459,33 @@ define <4 x float> @cvt_4i16_to_4f32_constrained(<4 x i16> %a0) nounwind strictf
; AVX-LABEL: cvt_4i16_to_4f32_constrained:
; AVX: # %bb.0:
; AVX-NEXT: subq $72, %rsp
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: movq %rax, %rcx
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $16, %ecx
; AVX-NEXT: movq %rax, %rdx
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: shrq $48, %rax
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX-NEXT: shrl $16, %eax
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: shrq $48, %rdx
-; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: addq $72, %rsp
; AVX-NEXT: retq
@@ -878,35 +870,34 @@ define <4 x float> @load_cvt_8i16_to_4f32(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_8i16_to_4f32:
; AVX: # %bb.0:
; AVX-NEXT: subq $72, %rsp
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: movq %rax, %rcx
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $16, %ecx
; AVX-NEXT: movq %rax, %rdx
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: shrq $48, %rax
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX-NEXT: shrl $16, %eax
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: shrq $48, %rdx
-; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: addq $72, %rsp
; AVX-NEXT: retq
@@ -1301,35 +1292,34 @@ define <4 x float> @load_cvt_8i16_to_4f32_constrained(ptr %a0) nounwind strictfp
; AVX-LABEL: load_cvt_8i16_to_4f32_constrained:
; AVX: # %bb.0:
; AVX-NEXT: subq $72, %rsp
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: movq %rax, %rcx
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $16, %ecx
; AVX-NEXT: movq %rax, %rdx
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: shrq $48, %rax
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX-NEXT: shrl $16, %eax
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: shrq $48, %rdx
-; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: addq $72, %rsp
; AVX-NEXT: retq
@@ -1386,9 +1376,7 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
; AVX-LABEL: cvt_2i16_to_2f64:
; AVX: # %bb.0:
; AVX-NEXT: subq $40, %rsp
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX-NEXT: shrl $16, %eax
+; AVX-NEXT: vpextrw $1, %xmm0, %eax
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: callq __extendhfsf2@PLT
@@ -1422,19 +1410,18 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; AVX-LABEL: cvt_4i16_to_4f64:
; AVX: # %bb.0:
; AVX-NEXT: subq $72, %rsp
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: shrq $48, %rcx
+; AVX-NEXT: movq %rax, %rdx
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: shrl $16, %eax
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: shrq $48, %rax
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: shrl $16, %edx
; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1477,9 +1464,7 @@ define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
; AVX-LABEL: cvt_8i16_to_2f64:
; AVX: # %bb.0:
; AVX-NEXT: subq $40, %rsp
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX-NEXT: shrl $16, %eax
+; AVX-NEXT: vpextrw $1, %xmm0, %eax
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: callq __extendhfsf2@PLT
@@ -1514,19 +1499,18 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
; AVX-LABEL: cvt_8i16_to_4f64:
; AVX: # %bb.0:
; AVX-NEXT: subq $72, %rsp
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: shrq $48, %rcx
+; AVX-NEXT: movq %rax, %rdx
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: shrl $16, %eax
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: shrq $48, %rax
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: shrl $16, %edx
; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1645,9 +1629,7 @@ define <2 x double> @cvt_2i16_to_2f64_constrained(<2 x i16> %a0) nounwind strict
; AVX-LABEL: cvt_2i16_to_2f64_constrained:
; AVX: # %bb.0:
; AVX-NEXT: subq $40, %rsp
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX-NEXT: shrl $16, %eax
+; AVX-NEXT: vpextrw $1, %xmm0, %eax
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: callq __extendhfsf2@PLT
@@ -1684,19 +1666,18 @@ define <4 x double> @cvt_4i16_to_4f64_constrained(<4 x i16> %a0) nounwind strict
; AVX-LABEL: cvt_4i16_to_4f64_constrained:
; AVX: # %bb.0:
; AVX-NEXT: subq $72, %rsp
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: shrq $48, %rcx
+; AVX-NEXT: movq %rax, %rdx
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: shrl $16, %eax
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: shrq $48, %rax
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: shrl $16, %edx
; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1936,19 +1917,19 @@ define <4 x double> @load_cvt_8i16_to_4f64(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_8i16_to_4f64:
; AVX: # %bb.0:
; AVX-NEXT: subq $72, %rsp
-; AVX-NEXT: movq (%rdi), %rax
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: shrq $48, %rcx
+; AVX-NEXT: movq %rax, %rdx
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: shrl $16, %eax
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: shrq $48, %rax
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: shrl $16, %edx
; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2142,14 +2123,14 @@ define i16 @cvt_f32_to_i16(float %a0) nounwind {
; F16C-LABEL: cvt_f32_to_i16:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: vpextrw $0, %xmm0, %eax
; F16C-NEXT: # kill: def $ax killed $ax killed $eax
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_f32_to_i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpextrw $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
  %1 = fptrunc float %a0 to half