diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 66717135c9adf..a0b5f67c2e6c7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3587,6 +3587,19 @@ bool TargetLowering::SimplifyDemandedVectorElts( DemandedRHS.setBit(M - NumElts); } + // If either side isn't demanded, replace it by UNDEF. We handle this + // explicitly here to also simplify in case of multiple uses (unlike the + // SimplifyDemandedVectorElts calls below). + bool FoldLHS = !DemandedLHS && !LHS.isUndef(); + bool FoldRHS = !DemandedRHS && !RHS.isUndef(); + if (FoldLHS || FoldRHS) { + LHS = FoldLHS ? TLO.DAG.getUNDEF(LHS.getValueType()) : LHS; + RHS = FoldRHS ? TLO.DAG.getUNDEF(RHS.getValueType()) : RHS; + SDValue NewOp = + TLO.DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, ShuffleMask); + return TLO.CombineTo(Op, NewOp); + } + // See if we can simplify either shuffle operand. 
APInt UndefLHS, ZeroLHS; APInt UndefRHS, ZeroRHS; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll index 008e19b620520..5914253b5f58e 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll @@ -1228,51 +1228,49 @@ define void @v_shuffle_v3bf16_v2bf16__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v1 ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v2 ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16 -; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 ; GFX900-NEXT: global_store_dword v0, v2, s[16:17] -; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v1 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16 -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 ; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] -; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: 
;;#ASMSTART ; GFX942-NEXT: ; def v1 ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v2 ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_alignbit_b32 v2, s0, v2, 16 -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4 ; GFX942-NEXT: global_store_dword v0, v2, s[0:1] -; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x bfloat> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll index 99c9480adc410..cd4dbe93e8a11 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll @@ -1928,48 +1928,45 @@ define void @v_shuffle_v3bf16_v3bf16__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 ; GFX900-NEXT: global_store_dword v3, v2, s[16:17] -; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 ; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] -; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 ; GFX942-NEXT: global_store_dword v4, v3, s[0:1] -; GFX942-NEXT: global_store_short v4, v0, s[0:1] offset:4 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x bfloat> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll index e34becc1065ff..99cb8a38f57c3 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll @@ -1228,51 +1228,49 @@ define void @v_shuffle_v3f16_v2f16__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v1 ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v2 ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16 -; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 ; GFX900-NEXT: global_store_dword v0, v2, s[16:17] -; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; 
GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v1 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16 -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 ; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] -; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f16_v2f16__3_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v1 ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v2 ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_alignbit_b32 v2, s0, v2, 16 -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4 ; GFX942-NEXT: global_store_dword v0, v2, s[0:1] -; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x half> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll index 84d42c882494c..0854ff2ebfc5d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll @@ -1928,48 +1928,45 @@ define void @v_shuffle_v3f16_v3f16__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 ; GFX900-NEXT: global_store_dword v3, v2, s[16:17] -; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 ; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] -; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f16_v3f16__5_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 ; GFX942-NEXT: global_store_dword v4, v3, s[0:1] -; GFX942-NEXT: global_store_short v4, v0, s[0:1] offset:4 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x half> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 62ab5d82bfbb6..910dd1ee6c419 100644 --- 
a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -2099,21 +2099,19 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psrlq $1, %xmm2 ; SSE41-NEXT: por %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm3 +; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: xorps %xmm2, %xmm2 ; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: addps %xmm1, %xmm2 -; SSE41-NEXT: xorps %xmm3, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[2,3] -; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],zero,zero +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: addps %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_4i64_to_4f32_undef: