[X86] combineBitcastvxi1 - don't prematurely create PACKSS nodes.

RKSimon · RKSimon · commit 65c9153cf055 · 2023-07-21T19:10:18.000+01:00
Similar to Issue #63710 - by truncating the v8i16 result with a PACKSS node before type legalization, we fail to make use of various folds that rely on TRUNCATE nodes. This required tweaks to LowerTruncateVecPackWithSignBits to recognise when the truncation source has been widened and to more closely match combineVectorSignBitsTruncation wrt truncating with PACKSS/PACKUS on AVX512 targets. One of the last stages before we can finally get rid of combineVectorSignBitsTruncation.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -22945,6 +22945,26 @@ static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
         (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
     return SDValue();
 
+  // Don't lower with PACK nodes on AVX512 targets if we'd need more than one.
+  if (Subtarget.hasAVX512() &&
+      SrcSVT.getSizeInBits() > (DstSVT.getSizeInBits() * 2))
+    return SDValue();
+
+  // If the upper half of the source is undef, then attempt to split and
+  // only truncate the lower half.
+  if (DstVT.getSizeInBits() >= 128) {
+    SmallVector<SDValue> LowerOps;
+    if (isUpperSubvectorUndef(In, LowerOps, DAG)) {
+      MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
+      MVT SrcHalfVT = SrcVT.getHalfNumVectorElementsVT();
+      SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcHalfVT, LowerOps);
+      if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
+                                                         Subtarget, DAG))
+        return widenSubVector(Res, false, Subtarget, DAG, DL,
+                              DstVT.getSizeInBits());
+    }
+  }
+
   unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
   unsigned NumPackedSignBits = std::min<unsigned>(DstSVT.getSizeInBits(), 16);
   unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
@@ -45059,9 +45079,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
   if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
     V = getPMOVMSKB(DL, V, DAG, Subtarget);
   } else {
-    if (SExtVT == MVT::v8i16)
-      V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
-                      DAG.getUNDEF(MVT::v8i16));
+    if (SExtVT == MVT::v8i16) {
+      V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
+      V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
+    }
     V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
   }
 
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -1193,19 +1193,17 @@ define i8 @icmp0_v8i1(<8 x i8>) nounwind {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psllw $15, %xmm0
-; SSE2-NEXT:    psraw $15, %xmm0
 ; SSE2-NEXT:    pmovmskb %xmm0, %eax
-; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    testl $43690, %eax # imm = 0xAAAA
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; SSE41-LABEL: icmp0_v8i1:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    psllw $15, %xmm0
-; SSE41-NEXT:    psraw $15, %xmm0
 ; SSE41-NEXT:    pmovmskb %xmm0, %eax
-; SSE41-NEXT:    testl %eax, %eax
+; SSE41-NEXT:    testl $43690, %eax # imm = 0xAAAA
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;