Skip to content

Commit 6c119cf

Browse files
committed
[X86] combineConcatVectorOps - extend PACKSS/PACKUS handling to 512-bit nodes on BWI targets.
Fixes another TRUNCATE -> PACKSS/PACKUS regression when #63710 finally gets fixed
1 parent 0464a8f commit 6c119cf

File tree

2 files changed

+42
-14
lines changed

2 files changed

+42
-14
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -54759,8 +54759,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
5475954759
case X86ISD::HSUB:
5476054760
case X86ISD::FHADD:
5476154761
case X86ISD::FHSUB:
54762-
case X86ISD::PACKSS:
54763-
case X86ISD::PACKUS:
5476454762
if (!IsSplat && VT.is256BitVector() &&
5476554763
(VT.isFloatingPoint() || Subtarget.hasInt256())) {
5476654764
MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
@@ -54771,6 +54769,18 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
5477154769
ConcatSubOperand(SrcVT, Ops, 1));
5477254770
}
5477354771
break;
54772+
case X86ISD::PACKSS:
54773+
case X86ISD::PACKUS:
54774+
if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
54775+
(VT.is512BitVector() && Subtarget.useBWIRegs()))) {
54776+
MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
54777+
SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
54778+
NumOps * SrcVT.getVectorNumElements());
54779+
return DAG.getNode(Op0.getOpcode(), DL, VT,
54780+
ConcatSubOperand(SrcVT, Ops, 0),
54781+
ConcatSubOperand(SrcVT, Ops, 1));
54782+
}
54783+
break;
5477454784
case X86ISD::PALIGNR:
5477554785
if (!IsSplat &&
5477654786
((VT.is256BitVector() && Subtarget.hasInt256()) ||

llvm/test/CodeGen/X86/vector-pack-512.ll

Lines changed: 30 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -257,12 +257,21 @@ define <64 x i8> @concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) noun
257257
}
258258

259259
define <32 x i16> @concat_packsswd_int_2x256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
260-
; AVX512-LABEL: concat_packsswd_int_2x256:
261-
; AVX512: # %bb.0:
262-
; AVX512-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
263-
; AVX512-NEXT: vpackssdw %ymm3, %ymm2, %ymm1
264-
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
265-
; AVX512-NEXT: retq
260+
; AVX512F-LABEL: concat_packsswd_int_2x256:
261+
; AVX512F: # %bb.0:
262+
; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
263+
; AVX512F-NEXT: vpackssdw %ymm3, %ymm2, %ymm1
264+
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
265+
; AVX512F-NEXT: retq
266+
;
267+
; AVX512BW-LABEL: concat_packsswd_int_2x256:
268+
; AVX512BW: # %bb.0:
269+
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
270+
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
271+
; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
272+
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
273+
; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
274+
; AVX512BW-NEXT: retq
266275
%lo = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
267276
%hi = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a2, <8 x i32> %a3)
268277
%res = shufflevector <16 x i16> %lo, <16 x i16> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -271,12 +280,21 @@ define <32 x i16> @concat_packsswd_int_2x256(<8 x i32> %a0, <8 x i32> %a1, <8 x
271280
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>)
272281

273282
define <32 x i16> @concat_packuswd_int_2x256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
274-
; AVX512-LABEL: concat_packuswd_int_2x256:
275-
; AVX512: # %bb.0:
276-
; AVX512-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
277-
; AVX512-NEXT: vpackusdw %ymm3, %ymm2, %ymm1
278-
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
279-
; AVX512-NEXT: retq
283+
; AVX512F-LABEL: concat_packuswd_int_2x256:
284+
; AVX512F: # %bb.0:
285+
; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
286+
; AVX512F-NEXT: vpackusdw %ymm3, %ymm2, %ymm1
287+
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
288+
; AVX512F-NEXT: retq
289+
;
290+
; AVX512BW-LABEL: concat_packuswd_int_2x256:
291+
; AVX512BW: # %bb.0:
292+
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
293+
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
294+
; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
295+
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
296+
; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
297+
; AVX512BW-NEXT: retq
280298
%lo = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
281299
%hi = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a2, <8 x i32> %a3)
282300
%res = shufflevector <16 x i16> %lo, <16 x i16> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>

0 commit comments

Comments (0)