From 7e582f1a4b81bc589b6b0176b6ea474247384722 Mon Sep 17 00:00:00 2001
From: Bjorn Pettersson
Date: Tue, 24 Jun 2025 16:16:48 +0200
Subject: [PATCH 1/2] [SelectionDAG] Fold undemanded operand to UNDEF for
 VECTOR_SHUFFLE

Always let SimplifyDemandedVectorElts fold either side of a
VECTOR_SHUFFLE to UNDEF if no elements are demanded from that side.

For a single use this could be done by SimplifyDemandedVectorElts
already, but in case the operand had multiple uses we did not
eliminate the use.
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp     | 13 ++++++++++
 .../AMDGPU/shufflevector.v3bf16.v2bf16.ll       | 16 ++++++-------
 .../AMDGPU/shufflevector.v3bf16.v3bf16.ll       | 15 +++++-------
 .../AMDGPU/shufflevector.v3f16.v2f16.ll         | 16 ++++++-------
 .../AMDGPU/shufflevector.v3f16.v3f16.ll         | 15 +++++-------
 llvm/test/CodeGen/X86/vec_int_to_fp.ll          | 24 +++++++++----------
 6 files changed, 50 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 66717135c9adf..e40a592ecb57c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3587,6 +3587,19 @@ bool TargetLowering::SimplifyDemandedVectorElts(
         DemandedRHS.setBit(M - NumElts);
     }
 
+    // If either side isn't demanded, replace it by UNDEF. We handle this
+    // explicitly here to also simplify in case of mulitple uses (on the
+    // contrary to the SimplifyDemandedVectorElts calls below).
+    bool FoldLHS = !DemandedLHS && !LHS.isUndef();
+    bool FoldRHS = !DemandedRHS && !RHS.isUndef();
+    if (FoldLHS || FoldRHS) {
+      LHS = FoldLHS ? TLO.DAG.getUNDEF(LHS.getValueType()) : LHS;
+      RHS = FoldRHS ? TLO.DAG.getUNDEF(RHS.getValueType()) : RHS;
+      SDValue NewOp =
+          TLO.DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, ShuffleMask);
+      return TLO.CombineTo(Op, NewOp);
+    }
+
     // See if we can simplify either shuffle operand.
     APInt UndefLHS, ZeroLHS;
     APInt UndefRHS, ZeroRHS;
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll
index 008e19b620520..5914253b5f58e 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll
@@ -1228,51 +1228,49 @@ define void @v_shuffle_v3bf16_v2bf16__3_u_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def v1
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def v2
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX900-NEXT:    v_alignbit_b32 v2, s4, v2, 16
-; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT:    global_store_short_d16_hi v0, v1, s[16:17] offset:4
 ; GFX900-NEXT:    global_store_dword v0, v2, s[16:17]
-; GFX900-NEXT:    global_store_short v0, v1, s[16:17] offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v1
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v2
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    v_alignbit_b32 v2, s4, v2, 16
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX90A-NEXT:    global_store_short_d16_hi v0, v1, s[16:17] offset:4
 ; GFX90A-NEXT:    global_store_dword v0, v2, s[16:17]
-; GFX90A-NEXT:    global_store_short v0, v1, s[16:17] offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v1
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v2
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_alignbit_b32 v2, s0, v2, 16
-; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX942-NEXT:    global_store_short_d16_hi v0, v1, s[0:1] offset:4
 ; GFX942-NEXT:    global_store_dword v0, v2, s[0:1]
-; GFX942-NEXT:    global_store_short v0, v1, s[0:1] offset:4
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x bfloat> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll
index 99c9480adc410..cd4dbe93e8a11 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll
@@ -1928,48 +1928,45 @@ define void @v_shuffle_v3bf16_v3bf16__5_u_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def v[0:1]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_mov_b32_e32 v3, 0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def v[1:2]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    global_store_short_d16_hi v3, v0, s[16:17] offset:4
 ; GFX900-NEXT:    global_store_dword v3, v2, s[16:17]
-; GFX900-NEXT:    global_store_short v3, v0, s[16:17] offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[2:3]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    global_store_short_d16_hi v4, v0, s[16:17] offset:4
 ; GFX90A-NEXT:    global_store_dword v4, v3, s[16:17]
-; GFX90A-NEXT:    global_store_short v4, v0, s[16:17] offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:1]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:3]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_short_d16_hi v4, v0, s[0:1] offset:4
 ; GFX942-NEXT:    global_store_dword v4, v3, s[0:1]
-; GFX942-NEXT:    global_store_short v4, v0, s[0:1] offset:4
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll
index e34becc1065ff..99cb8a38f57c3 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll
@@ -1228,51 +1228,49 @@ define void @v_shuffle_v3f16_v2f16__3_u_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_u_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def v1
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def v2
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX900-NEXT:    v_alignbit_b32 v2, s4, v2, 16
-; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT:    global_store_short_d16_hi v0, v1, s[16:17] offset:4
 ; GFX900-NEXT:    global_store_dword v0, v2, s[16:17]
-; GFX900-NEXT:    global_store_short v0, v1, s[16:17] offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_u_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v1
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v2
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    v_alignbit_b32 v2, s4, v2, 16
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX90A-NEXT:    global_store_short_d16_hi v0, v1, s[16:17] offset:4
 ; GFX90A-NEXT:    global_store_dword v0, v2, s[16:17]
-; GFX90A-NEXT:    global_store_short v0, v1, s[16:17] offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v3f16_v2f16__3_u_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v1
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v2
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_alignbit_b32 v2, s0, v2, 16
-; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX942-NEXT:    global_store_short_d16_hi v0, v1, s[0:1] offset:4
 ; GFX942-NEXT:    global_store_dword v0, v2, s[0:1]
-; GFX942-NEXT:    global_store_short v0, v1, s[0:1] offset:4
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x half> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll
index 84d42c882494c..0854ff2ebfc5d 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll
@@ -1928,48 +1928,45 @@ define void @v_shuffle_v3f16_v3f16__5_u_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_u_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def v[0:1]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    v_mov_b32_e32 v3, 0
-; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def v[1:2]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    global_store_short_d16_hi v3, v0, s[16:17] offset:4
 ; GFX900-NEXT:    global_store_dword v3, v2, s[16:17]
-; GFX900-NEXT:    global_store_short v3, v0, s[16:17] offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_u_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[2:3]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    global_store_short_d16_hi v4, v0, s[16:17] offset:4
 ; GFX90A-NEXT:    global_store_dword v4, v3, s[16:17]
-; GFX90A-NEXT:    global_store_short v4, v0, s[16:17] offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_shuffle_v3f16_v3f16__5_u_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[0:1]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def v[2:3]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    global_store_short_d16_hi v4, v0, s[0:1] offset:4
 ; GFX942-NEXT:    global_store_dword v4, v3, s[0:1]
-; GFX942-NEXT:    global_store_short v4, v0, s[0:1] offset:4
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x half> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 62ab5d82bfbb6..910dd1ee6c419 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -2099,21 +2099,19 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    psrlq $1, %xmm2
 ; SSE41-NEXT:    por %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm0
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cvtsi2ss %rax, %xmm3
+; SSE41-NEXT:    movq %xmm0, %rax
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE41-NEXT:    movq %xmm1, %rax
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
-; SSE41-NEXT:    movaps %xmm1, %xmm2
-; SSE41-NEXT:    addps %xmm1, %xmm2
-; SSE41-NEXT:    xorps %xmm3, %xmm3
-; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[2,3]
-; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],zero,zero
+; SSE41-NEXT:    movaps %xmm2, %xmm3
+; SSE41-NEXT:    addps %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movaps %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: uitofp_4i64_to_4f32_undef:

From 8903013e46323a855badeb3f22ced5a309e734f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Pettersson?=
Date: Wed, 25 Jun 2025 14:44:18 +0200
Subject: [PATCH 2/2] Fix spelling: mulitple -> multiple

---
 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e40a592ecb57c..a0b5f67c2e6c7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3588,7 +3588,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     }
 
     // If either side isn't demanded, replace it by UNDEF. We handle this
-    // explicitly here to also simplify in case of mulitple uses (on the
+    // explicitly here to also simplify in case of multiple uses (on the
     // contrary to the SimplifyDemandedVectorElts calls below).
     bool FoldLHS = !DemandedLHS && !LHS.isUndef();
    bool FoldRHS = !DemandedRHS && !RHS.isUndef();
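
As a rough illustration of the fold introduced by PATCH 1/2 (a hypothetical LLVM IR sketch, not one of the patch's test cases; the function name and values below are made up), consider a shuffle whose mask only reads lanes of its second operand while the first operand has an additional use:

; The mask <4, 5, 6, 7> only demands elements from %b, so once this is
; selected as a VECTOR_SHUFFLE the first operand can be folded to UNDEF
; even though %a stays live through the store (a second use).
define <4 x float> @shuffle_lhs_undemanded(<4 x float> %a, <4 x float> %b, ptr %p) {
  store <4 x float> %a, ptr %p
  %shuf = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x float> %shuf
}

Per the commit message, the existing SimplifyDemandedVectorElts recursion already handled the single-use case; the explicit check added here rebuilds the shuffle with UNDEF in the undemanded position even when that operand has multiple uses.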