[X86] Adding lowerings for vector ISD::LRINT and ISD::LLRINT #90065

phoebewang · 2024-04-25T14:32:12Z

[V]CVTP[D,S]2DQ supports f64/f32 -> i32 conversions that can be mapped to llvm.lrint.vNi32.vNf64/32 since SSE2. AVX and AVX512 added 256-bit and 512-bit support;
VCVTP[D,S]2QQ supports f64/f32 -> i64 conversions that can be mapped to llvm.l[l]rint.vNi64.vNf64/32 since AVX512DQ. All 128-bit, 256-bit (require AVX512VL) and 512-bit are supported.

llvmbot · 2024-04-25T14:32:46Z

@llvm/pr-subscribers-backend-x86

Author: Phoebe Wang (phoebewang)

Changes

Patch is 52.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/90065.diff

6 Files Affected:

(modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+51-1)
(modified) llvm/lib/Target/X86/X86ISelLowering.h (+2-1)
(modified) llvm/lib/Target/X86/X86InstrAVX512.td (+35)
(modified) llvm/lib/Target/X86/X86InstrSSE.td (+14-1)
(modified) llvm/test/CodeGen/X86/vector-llrint.ll (+214-10)
(modified) llvm/test/CodeGen/X86/vector-lrint.ll (+95-408)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bb43cbe15f5225..827537818f059f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1092,6 +1092,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
     setOperationAction(ISD::FCOPYSIGN,          MVT::v2f64, Custom);
 
+    setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
+
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
       setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
       setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
@@ -1431,6 +1433,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FMINIMUM,          VT, Custom);
     }
 
+    setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
+
     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
     // even though v8i16 is a legal type.
     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i16, MVT::v8i32);
@@ -1731,6 +1735,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   }
+  if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
+    setOperationAction(ISD::LRINT, MVT::v4f32, Legal);
+    setOperationAction(ISD::LRINT, MVT::v8f32, Legal);
+    setOperationAction(ISD::LLRINT, MVT::v4f32, Legal);
+    setOperationAction(ISD::LLRINT, MVT::v8f32, Legal);
+    setOperationAction(ISD::LRINT, MVT::v2f64, Legal);
+    setOperationAction(ISD::LRINT, MVT::v4f64, Legal);
+    setOperationAction(ISD::LLRINT, MVT::v2f64, Legal);
+    setOperationAction(ISD::LLRINT, MVT::v4f64, Legal);
+  }
 
   // This block controls legalization for 512-bit operations with 8/16/32/64 bit
   // elements. 512-bits can be disabled based on prefer-vector-width and
@@ -1765,6 +1779,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::STRICT_FMA, VT, Legal);
       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
     }
+    setOperationAction(ISD::LRINT, MVT::v16f32,
+                       Subtarget.hasDQI() ? Legal : Custom);
+    setOperationAction(ISD::LRINT, MVT::v8f64,
+                       Subtarget.hasDQI() ? Legal : Custom);
+    if (Subtarget.hasDQI())
+      setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
 
     for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
       setOperationPromotedToType(ISD::FP_TO_SINT       , VT, MVT::v16i32);
@@ -2488,6 +2508,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                        ISD::FMAXNUM,
                        ISD::SUB,
                        ISD::LOAD,
+                       ISD::LRINT,
+                       ISD::LLRINT,
                        ISD::MLOAD,
                        ISD::STORE,
                        ISD::MSTORE,
@@ -21159,10 +21181,15 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
 }
 
 SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
+                                             const X86Subtarget &Subtarget,
                                              SelectionDAG &DAG) const {
   SDValue Src = Op.getOperand(0);
+  EVT DstVT = Op.getSimpleValueType();
   MVT SrcVT = Src.getSimpleValueType();
 
+  if (SrcVT.isVector())
+    return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
+
   if (SrcVT == MVT::f16)
     return SDValue();
 
@@ -32217,7 +32244,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
   case ISD::LRINT:
-  case ISD::LLRINT:             return LowerLRINT_LLRINT(Op, DAG);
+  case ISD::LLRINT:             return LowerLRINT_LLRINT(Op, Subtarget, DAG);
   case ISD::SETCC:
   case ISD::STRICT_FSETCC:
   case ISD::STRICT_FSETCCS:     return LowerSETCC(Op, DAG);
@@ -51556,6 +51583,22 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
+                                   const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  SDLoc DL(N);
+
+  if (!Subtarget.hasDQI() || !Subtarget.hasVLX() || VT != MVT::v2i64 ||
+      SrcVT != MVT::v2f32)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::CVTP2SI, DL, VT,
+                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
+                                 DAG.getUNDEF(SrcVT)));
+}
+
 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
 /// the codegen.
 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
@@ -51902,6 +51945,11 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
   }
 
+  // Try to combine (trunc (vNi64 (lrint x))) to (vNi32 (lrint x)).
+  if (Src.getOpcode() == ISD::LRINT && VT.getScalarType() == MVT::i32 &&
+      Src.hasOneUse())
+    return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
+
   return SDValue();
 }
 
@@ -56848,6 +56896,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::UINT_TO_FP:
   case ISD::STRICT_UINT_TO_FP:
     return combineUIntToFP(N, DAG, Subtarget);
+  case ISD::LRINT:
+  case ISD::LLRINT:         return combineLRINT_LLRINT(N, DAG, Subtarget);
   case ISD::FADD:
   case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
   case X86ISD::VFCMULC:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index e348ba6e8ac085..eea771d235b2da 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1673,7 +1673,8 @@ namespace llvm {
     SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerLRINT_LLRINT(SDValue Op, const X86Subtarget &STI,
+                              SelectionDAG &DAG) const;
     SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 43a40f5e691ea3..ec2a5f52a7b6aa 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -8811,7 +8811,18 @@ let Predicates = [HasVLX] in {
   def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)),
                           v4i32x_info.ImmAllZerosV, VK2WM:$mask),
             (VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>;
+
+  def : Pat<(v4i32 (lrint VR128X:$src)), (VCVTPS2DQZ128rr VR128X:$src)>;
+  def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (VCVTPS2DQZ128rm addr:$src)>;
+  def : Pat<(v8i32 (lrint VR256X:$src)), (VCVTPS2DQZ256rr VR256X:$src)>;
+  def : Pat<(v8i32 (lrint (loadv8f32 addr:$src))), (VCVTPS2DQZ256rm addr:$src)>;
+  def : Pat<(v4i32 (lrint VR256X:$src)), (VCVTPD2DQZ256rr VR256X:$src)>;
+  def : Pat<(v4i32 (lrint (loadv4f64 addr:$src))), (VCVTPD2DQZ256rm addr:$src)>;
 }
+def : Pat<(v16i32 (lrint VR512:$src)), (VCVTPS2DQZrr VR512:$src)>;
+def : Pat<(v16i32 (lrint (loadv16f32 addr:$src))), (VCVTPS2DQZrm addr:$src)>;
+def : Pat<(v8i32 (lrint VR512:$src)), (VCVTPD2DQZrr VR512:$src)>;
+def : Pat<(v8i32 (lrint (loadv8f64 addr:$src))), (VCVTPD2DQZrm addr:$src)>;
 
 let Predicates = [HasDQI, HasVLX] in {
   def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
@@ -8857,6 +8868,30 @@ let Predicates = [HasDQI, HasVLX] in {
                                  (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
                                  v2i64x_info.ImmAllZerosV)),
             (VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+  def : Pat<(v4i64 (lrint VR128X:$src)), (VCVTPS2QQZ256rr VR128X:$src)>;
+  def : Pat<(v4i64 (lrint (loadv4f32 addr:$src))), (VCVTPS2QQZ256rm addr:$src)>;
+  def : Pat<(v4i64 (llrint VR128X:$src)), (VCVTPS2QQZ256rr VR128X:$src)>;
+  def : Pat<(v4i64 (llrint (loadv4f32 addr:$src))), (VCVTPS2QQZ256rm addr:$src)>;
+  def : Pat<(v2i64 (lrint VR128X:$src)), (VCVTPD2QQZ128rr VR128X:$src)>;
+  def : Pat<(v2i64 (lrint (loadv2f64 addr:$src))), (VCVTPD2QQZ128rm addr:$src)>;
+  def : Pat<(v4i64 (lrint VR256X:$src)), (VCVTPD2QQZ256rr VR256X:$src)>;
+  def : Pat<(v4i64 (lrint (loadv4f64 addr:$src))), (VCVTPD2QQZ256rm addr:$src)>;
+  def : Pat<(v2i64 (llrint VR128X:$src)), (VCVTPD2QQZ128rr VR128X:$src)>;
+  def : Pat<(v2i64 (llrint (loadv2f64 addr:$src))), (VCVTPD2QQZ128rm addr:$src)>;
+  def : Pat<(v4i64 (llrint VR256X:$src)), (VCVTPD2QQZ256rr VR256X:$src)>;
+  def : Pat<(v4i64 (llrint (loadv4f64 addr:$src))), (VCVTPD2QQZ256rm addr:$src)>;
+}
+
+let Predicates = [HasDQI] in {
+  def : Pat<(v8i64 (lrint VR256X:$src)), (VCVTPS2QQZrr VR256X:$src)>;
+  def : Pat<(v8i64 (lrint (loadv8f32 addr:$src))), (VCVTPS2QQZrm addr:$src)>;
+  def : Pat<(v8i64 (llrint VR256X:$src)), (VCVTPS2QQZrr VR256X:$src)>;
+  def : Pat<(v8i64 (llrint (loadv8f32 addr:$src))), (VCVTPS2QQZrm addr:$src)>;
+  def : Pat<(v8i64 (lrint VR512:$src)), (VCVTPD2QQZrr VR512:$src)>;
+  def : Pat<(v8i64 (lrint (loadv8f64 addr:$src))), (VCVTPD2QQZrm addr:$src)>;
+  def : Pat<(v8i64 (llrint VR512:$src)), (VCVTPD2QQZrr VR512:$src)>;
+  def : Pat<(v8i64 (llrint (loadv8f64 addr:$src))), (VCVTPD2QQZrm addr:$src)>;
 }
 
 let Predicates = [HasVLX] in {
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 063b572761e7d1..62b9b93953ad5a 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -1554,7 +1554,6 @@ def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
                      Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
 
-
 // Convert Packed Double FP to Packed DW Integers
 let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
 // The assembler can recognize rr 256-bit instructions by seeing a ymm
@@ -1586,6 +1585,20 @@ def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, WIG;
 }
 
+let Predicates = [HasAVX] in {
+  def : Pat<(v4i32 (lrint VR128:$src)), (VCVTPS2DQrr VR128:$src)>;
+  def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (VCVTPS2DQrm addr:$src)>;
+  def : Pat<(v8i32 (lrint VR256:$src)), (VCVTPS2DQYrr VR256:$src)>;
+  def : Pat<(v8i32 (lrint (loadv8f32 addr:$src))), (VCVTPS2DQYrm addr:$src)>;
+  def : Pat<(v4i32 (lrint VR256:$src)), (VCVTPD2DQYrr VR256:$src)>;
+  def : Pat<(v4i32 (lrint (loadv4f64 addr:$src))), (VCVTPD2DQYrm addr:$src)>;
+}
+
+let Predicates = [HasSSE2] in {
+  def : Pat<(v4i32 (lrint VR128:$src)), (CVTPS2DQrr VR128:$src)>;
+  def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (CVTPS2DQrm addr:$src)>;
+}
+
 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                 (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
diff --git a/llvm/test/CodeGen/X86/vector-llrint.ll b/llvm/test/CodeGen/X86/vector-llrint.ll
index 46904f82fd5d6d..0be58ca86aa626 100644
--- a/llvm/test/CodeGen/X86/vector-llrint.ll
+++ b/llvm/test/CodeGen/X86/vector-llrint.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64-SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefix=X64-AVX
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefix=X64-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=CHECK,X64-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=CHECK,X64-AVX-512
 
 define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
 ; X64-SSE-LABEL: llrint_v1i64_v1f32:
@@ -9,10 +9,10 @@ define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
 ; X64-SSE-NEXT:    cvtss2si %xmm0, %rax
 ; X64-SSE-NEXT:    retq
 ;
-; X64-AVX-LABEL: llrint_v1i64_v1f32:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX-NEXT:    retq
+; CHECK-LABEL: llrint_v1i64_v1f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvtss2si %xmm0, %rax
+; CHECK-NEXT:    retq
   %a = call <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float> %x)
   ret <1 x i64> %a
 }
@@ -39,6 +39,11 @@ define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) {
 ; X64-AVX-NEXT:    vmovq %rax, %xmm0
 ; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; X64-AVX-NEXT:    retq
+;
+; X64-AVX-512-LABEL: llrint_v2i64_v2f32:
+; X64-AVX-512:       # %bb.0:
+; X64-AVX-512-NEXT:    vcvtps2qq %xmm0, %xmm0
+; X64-AVX-512-NEXT:    retq
   %a = call <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float> %x)
   ret <2 x i64> %a
 }
@@ -64,6 +69,29 @@ define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) {
 ; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
 ; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
 ; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: llrint_v4i64_v4f32:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X64-AVX-NEXT:    vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm1
+; X64-AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; X64-AVX-NEXT:    vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm2
+; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; X64-AVX-NEXT:    vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm2
+; X64-AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-AVX-NEXT:    vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm0
+; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; X64-AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX-NEXT:    retq
+;
+; X64-AVX-512-LABEL: llrint_v4i64_v4f32:
+; X64-AVX-512:       # %bb.0:
+; X64-AVX-512-NEXT:    vcvtps2qq %xmm0, %ymm0
+; X64-AVX-512-NEXT:    retq
   %a = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> %x)
   ret <4 x i64> %a
 }
@@ -105,6 +133,45 @@ define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) {
 ; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
 ; X64-SSE-NEXT:    movdqa %xmm4, %xmm1
 ; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: llrint_v8i64_v8f32:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X64-AVX-NEXT:    vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm1
+; X64-AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; X64-AVX-NEXT:    vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm2
+; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; X64-AVX-NEXT:    vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm2
+; X64-AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X64-AVX-NEXT:    vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm3
+; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X64-AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; X64-AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X64-AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X64-AVX-NEXT:    vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm1
+; X64-AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
+; X64-AVX-NEXT:    vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm3
+; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; X64-AVX-NEXT:    vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm3
+; X64-AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-AVX-NEXT:    vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm0
+; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; X64-AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; X64-AVX-NEXT:    vmovaps %ymm2, %ymm0
+; X64-AVX-NEXT:    retq
+;
+; X64-AVX-512-LABEL: llrint_v8i64_v8f32:
+; X64-AVX-512:       # %bb.0:
+; X64-AVX-512-NEXT:    vcvtps2qq %ymm0, %zmm0
+; X64-AVX-512-NEXT:    retq
   %a = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> %x)
   ret <8 x i64> %a
 }
@@ -183,6 +250,78 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
 ; X64-SSE-NEXT:    movdqa %xmm0, 16(%rdi)
 ; X64-SSE-NEXT:    movdqa %xmm4, (%rdi)
 ; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: llrint_v16i64_v16f32:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovaps %ymm0, %ymm2
+; X64-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[3,3,3,3]
+; X64-AVX-NEXT:    vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm0
+; X64-AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
+; X64-AVX-NEXT:    vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm3
+; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; X64-AVX-NEXT:    vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm3
+; X64-AVX-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; X64-AVX-NEXT:    vcvtss2si %xmm4, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm4
+; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; X64-AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; X64-AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[3,3,3,3]
+; X64-AVX-NEXT:    vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm3
+; X64-AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm2[1,0]
+; X64-AVX-NEXT:    vcvtss2si %xmm4, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm4
+; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; X64-AVX-NEXT:    vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm4
+; X64-AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-AVX-NEXT:    vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm2
+; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
+; X64-AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm4
+; X64-AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; X64-AVX-NEXT:    vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm2
+; X64-AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; X64-AVX-NEXT:    vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm3
+; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; X64-AVX-NEXT:    vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm3
+; X64-AVX-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; X64-AVX-NEXT:    vcvtss2si %xmm5, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm5
+; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; X64-AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; X64-AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; X64-AVX-NEXT:    vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm3
+; X64-AVX-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
+; X64-AVX-NEXT:    vcvtss2si %xmm5, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm5
+; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm3[0]
+; X64-AVX-NEXT:    vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm5
+; X64-AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X64-AVX-NEXT:    vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm1
+; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
+; X64-AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm3
+; X64-AVX-NEXT:    vmovaps %ymm4, %ymm1
+; X64-AVX-NEXT:    retq
+;
+; X64-AVX-512-LABEL: llrint_v16i64_v16f32:
+; X64-AVX-512:       # %bb.0:
+; X64-AVX-512-NEXT:    vcvtps2qq %ymm0, %zmm2
+; X64-AVX-512-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; X64-AVX-512-NEXT:    vcvtps2qq %ymm0, %zmm1
+; X64-AVX-512-NEXT:    vmovaps %zmm2, %zmm0
+; X64-AVX-512-NEXT:    retq
   %a = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> %x)
   ret <16 x i64> %a
 }
@@ -194,10 +333,10 @@ define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) {
 ; X64-SSE-NEXT:    cvtsd2si %xmm0, %rax
 ; X64-SSE-NEXT:    retq
 ;
-; X64-AVX-LABEL: llrint_v1i64_v1f64:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX-NEXT:    retq
+; CHECK-LABEL: llrint_v1i64_v1f64:
+; CHECK:      ...
[truncated]

artagnon

What I gather is that you've added support for lowering vector [l]lrint for AVX. Further, it seems AVX can only pack vectors with i64. Please mention these in the commit subject and body. Otherwise, I'd like a clarification on the MVT you've matched.

artagnon · 2024-04-25T15:23:08Z

llvm/lib/Target/X86/X86ISelLowering.cpp

@@ -1092,6 +1092,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN,          MVT::v2f64, Custom);

+    setOperationAction(ISD::LRINT, MVT::v4f32, Custom);


Is this correct? The MVT of ISD::LRINT is usually set to its output type, not input type. Shouldn't this be MVT::v4i32?

Yes, it's correct. ISD::LRINT uses input type, see https://github.com/llvm/llvm-project/blob/main/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp#L1002-L1006

artagnon · 2024-04-25T15:24:16Z

llvm/lib/Target/X86/X86ISelLowering.cpp

@@ -1731,6 +1735,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
  }
+  if (Subtarget.hasDQI() && Subtarget.hasVLX()) {


I'm not qualified to review this line, as I don't know what DQI or VLX are.

It's mapped to avx512dq and avx512vl features.

phoebewang · 2024-04-26T06:04:20Z

What I gather is that you've added support for lowering vector [l]lrint for AVX. Further, it seems AVX can only pack vectors with i64. Please mention these in the commit subject and body. Otherwise, I'd like a clarification on the MVT you've matched.

Added the relationship in description, thanks!

llvm/lib/Target/X86/X86ISelLowering.cpp

RKSimon · 2024-04-26T14:17:45Z

llvm/lib/Target/X86/X86ISelLowering.h

@@ -1673,7 +1673,8 @@ namespace llvm {
    SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerLRINT_LLRINT(SDValue Op, const X86Subtarget &STI,


Do you need to provide the X86Subtarget arg? Subtarget is available in the X86TargetLowering class

Good catch! Done.

RKSimon · 2024-04-26T14:20:30Z

llvm/lib/Target/X86/X86InstrAVX512.td

 }
+def : Pat<(v16i32 (lrint VR512:$src)), (VCVTPS2DQZrr VR512:$src)>;


Do these need to be wrapped in a Predicates = [HasAvx512] check?

I think we don't bother to check it since VR512 is not available without AVX512F + EXEV512. We saved a lot of HasEVEX512 check when introduing AVX10.

RKSimon · 2024-04-26T14:21:18Z

llvm/lib/Target/X86/X86InstrSSE.td

+  def : Pat<(v4i32 (lrint (loadv4f64 addr:$src))), (VCVTPD2DQYrm addr:$src)>;
+}
+
+let Predicates = [HasSSE2] in {


Done, thanks!

RKSimon · 2024-04-26T14:21:49Z

llvm/test/CodeGen/X86/vector-llrint.ll

-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefix=X64-AVX
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefix=X64-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=CHECK,X64-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=CHECK,X64-AVX-512


Better to use AVX instead of CHECK (which tends to be used for universal checks)?

Should we bother testing on a non-DQ/non-VLX AVX512 target?

RKSimon · 2024-04-26T14:23:17Z

llvm/test/CodeGen/X86/vector-lrint.ll

 ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=X64-AVX-i64,X64-AVX1-i64
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX-i64,X64-AVX512-i64
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=X64-AVX-i64,X64-AVX512-i64


worth keeping avx512f only test coverage as well as dq+vl?

RKSimon · 2024-04-26T14:25:34Z

llvm/lib/Target/X86/X86ISelLowering.cpp

+
+  return DAG.getNode(X86ISD::CVTP2SI, DL, VT,
+                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
+                                 DAG.getUNDEF(SrcVT)));


Should this be handled in in ReplaceNodeResults ?

No, the result type is legal.

github-actions · 2024-04-27T14:26:42Z

✅ With the latest revision this PR passed the C/C++ code formatter.

X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which have the same costs as the CVTTP2SI (fptosi) instructions Followup to llvm#90065

RKSimon

LGTM

X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which have the same costs as the CVTTP2SI (fptosi) instructions Followup to llvm#90065

X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which have the same costs as the CVTTP2SI (fptosi) instructions Followup to #90065

fhahn · 2024-10-01T14:57:30Z

It looks like this change is causing different results in some cases when using @llvm.lrint.v4i64.v4f32.

For

define i32 @foo(ptr %dst, <4 x float> %in) {
entry:
  %0 = call <4 x i64> @llvm.lrint.v4i64.v4f32(<4 x float> %in)
  %1 = trunc <4 x i64> %0 to <4 x i32>
  store <4 x i32> %1, ptr %dst, align 4
  ret i32 0
}

we now generate the assembly below, which does the conversion to signed double word.

_foo:                                   ## @foo
        cvtps2dq        xmm0, xmm0
        movupd  xmmword ptr [rdi], xmm0
        xor     eax, eax
        ret

Before this change, we generated the assembly below, which does the conversion to signed quad word, then truncating the result

_foo:                                   ## @foo
        movaps  xmm1, xmm0
        shufps  xmm1, xmm0, 85                  ## xmm1 = xmm1[1,1],xmm0[1,1]
        cvtss2si        rax, xmm1
        cvtss2si        rcx, xmm0
        movaps  xmm1, xmm0
        unpckhpd        xmm1, xmm0                      ## xmm1 = xmm1[1],xmm0[1]
        cvtss2si        rdx, xmm1
        shufps  xmm0, xmm0, 255                 ## xmm0 = xmm0[3,3,3,3]
        cvtss2si        rsi, xmm0
        movd    xmm0, esi
        movd    xmm1, edx
        punpckldq       xmm1, xmm0              ## xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
        movd    xmm0, ecx
        movd    xmm2, eax
        punpckldq       xmm0, xmm2              ## xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
        punpcklqdq      xmm0, xmm1              ## xmm0 = xmm0[0],xmm1[0]
        movdqu  xmmword ptr [rdi], xmm0
        xor     eax, eax
        ret

I might be missing something, but I think for inputs like 2^33 new codegen will overflow, while the second won't.

See https://llvm.godbolt.org/z/E7W89q59M for a comparison main vs 18.x

topperc · 2024-10-01T17:20:14Z

It looks like this change is causing different results in some cases when using @llvm.lrint.v4i64.v4f32.

For

define i32 @foo(ptr %dst, <4 x float> %in) {
entry:
  %0 = call <4 x i64> @llvm.lrint.v4i64.v4f32(<4 x float> %in)
  %1 = trunc <4 x i64> %0 to <4 x i32>
  store <4 x i32> %1, ptr %dst, align 4
  ret i32 0
}

we now generate the assembly below, which does the conversion to signed double word.

_foo:                                   ## @foo
        cvtps2dq        xmm0, xmm0
        movupd  xmmword ptr [rdi], xmm0
        xor     eax, eax
        ret

Before this change, we generated the assembly below, which does the conversion to signed quad word, then truncating the result

_foo:                                   ## @foo
        movaps  xmm1, xmm0
        shufps  xmm1, xmm0, 85                  ## xmm1 = xmm1[1,1],xmm0[1,1]
        cvtss2si        rax, xmm1
        cvtss2si        rcx, xmm0
        movaps  xmm1, xmm0
        unpckhpd        xmm1, xmm0                      ## xmm1 = xmm1[1],xmm0[1]
        cvtss2si        rdx, xmm1
        shufps  xmm0, xmm0, 255                 ## xmm0 = xmm0[3,3,3,3]
        cvtss2si        rsi, xmm0
        movd    xmm0, esi
        movd    xmm1, edx
        punpckldq       xmm1, xmm0              ## xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
        movd    xmm0, ecx
        movd    xmm2, eax
        punpckldq       xmm0, xmm2              ## xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
        punpcklqdq      xmm0, xmm1              ## xmm0 = xmm0[0],xmm1[0]
        movdqu  xmmword ptr [rdi], xmm0
        xor     eax, eax
        ret

I might be missing something, but I think for inputs like 2^33 new codegen will overflow, while the second won't.

See https://llvm.godbolt.org/z/E7W89q59M for a comparison main vs 18.x

I think its this combine which isn't valid.

  // Try to combine (trunc (vNi64 (lrint x))) to (vNi32 (lrint x)).
  if (Src.getOpcode() == ISD::LRINT && VT.getScalarType() == MVT::i32 &&
      Src.hasOneUse())
    return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));

phoebewang · 2024-10-02T12:16:04Z

Isn't it a poison vaule if the inputs larger than 2^32 according to LLVM LangRef?

fhahn · 2024-10-02T12:20:08Z

Isn't it a poison vaule if the inputs larger than 2^32 according to LLVM LangRef?

I think only if the trunc would have a nuw flag, which isn't present in the reproducer?

phoebewang · 2024-10-02T13:01:37Z

Isn't it a poison vaule if the inputs larger than 2^32 according to LLVM LangRef?

I think only if the trunc would have a nuw flag, which isn't present in the reproducer?

Yes, you are correct!

fhahn · 2024-10-03T11:43:31Z

@phoebewang are you working on a fix? Would be good to resolve this miscompile either with a fix or a revert

phoebewang · 2024-10-03T12:53:09Z

@phoebewang are you working on a fix? Would be good to resolve this miscompile either with a fix or a revert

Sorry, I'm on vacation and don't have a chunk of time to do so. I will be able to work on it in a few days.

phoebewang · 2024-10-06T03:30:55Z

Isn't it a poison vaule if the inputs larger than 2^32 according to LLVM LangRef?

I think only if the trunc would have a nuw flag, which isn't present in the reproducer?

@fhahn Reviewed the code again, I think it's arguable to consider it's a mis optimization. The reason is not about trunc with nuw or not, but llvm.lrint itself. I think we should consider the difference between llvm.lrint.i64.f32 and llvm.llrint.i64.f32. The front end won't generate llvm.lrint.i32.f32 for lrint, but user might expect foo is more optimal than bar when they writing

int foo(float x) {
  return lrintf(x);
}

int bar(float x) {
  return llrintf(x);
}

I admit we don't describe llvm.lrint.i64.f32 in this way, but we can improve it and suggest user to use llrintf/llvm.llrint.i64.f32 instead when they care about inputs larger than 2^32. WDYT?

topperc · 2024-10-06T03:33:14Z

Isn't it a poison vaule if the inputs larger than 2^32 according to LLVM LangRef?

I think only if the trunc would have a nuw flag, which isn't present in the reproducer?

@fhahn Reviewed the code again, I think it's arguable to consider it's a mis optimization. The reason is not about trunc with nuw or not, but llvm.lrint itself. I think we should consider the difference between llvm.lrint.i64.f32 and llvm.llrint.i64.f32. The front end won't generate llvm.lrint.i32.f32 for lrint, but user might expect foo is more optimal than bar when they writing
int foo(float x) {
  return lrintf(x);
}

int bar(float x) {
  return llrintf(x);
}
I admit we don't describe llvm.lrint.i64.f32 in this way, but we can improve it and suggest user to use llrintf/llvm.llrint.i64.f32 instead when they care about inputs larger than 2^32. WDYT?

Doesn't the frontend generate llvm.lrint.i32.f32 on 32-bit targets and on 64-bit Windows? sizeof(long) is 4 bytes in those cases.

phoebewang · 2024-10-06T03:44:25Z

Isn't it a poison vaule if the inputs larger than 2^32 according to LLVM LangRef?

I think only if the trunc would have a nuw flag, which isn't present in the reproducer?

@fhahn Reviewed the code again, I think it's arguable to consider it's a mis optimization. The reason is not about trunc with nuw or not, but llvm.lrint itself. I think we should consider the difference between llvm.lrint.i64.f32 and llvm.llrint.i64.f32. The front end won't generate llvm.lrint.i32.f32 for lrint, but user might expect foo is more optimal than bar when they writing
int foo(float x) {
  return lrintf(x);
}

int bar(float x) {
  return llrintf(x);
}
I admit we don't describe llvm.lrint.i64.f32 in this way, but we can improve it and suggest user to use llrintf/llvm.llrint.i64.f32 instead when they care about inputs larger than 2^32. WDYT?
Doesn't the frontend generate llvm.lrint.i32.f32 on 32-bit targets and on 64-bit Windows? sizeof(long) is 4 bytes in those cases.

But they don't have difference on 64-bit Linux, I think it's good to expand its semantic given it's already inconsistent across different target and OS.

topperc · 2024-10-06T03:51:52Z

Isn't it a poison vaule if the inputs larger than 2^32 according to LLVM LangRef?

I think only if the trunc would have a nuw flag, which isn't present in the reproducer?

@fhahn Reviewed the code again, I think it's arguable to consider it's a mis optimization. The reason is not about trunc with nuw or not, but llvm.lrint itself. I think we should consider the difference between llvm.lrint.i64.f32 and llvm.llrint.i64.f32. The front end won't generate llvm.lrint.i32.f32 for lrint, but user might expect foo is more optimal than bar when they writing
int foo(float x) {
  return lrintf(x);
}

int bar(float x) {
  return llrintf(x);
}
I admit we don't describe llvm.lrint.i64.f32 in this way, but we can improve it and suggest user to use llrintf/llvm.llrint.i64.f32 instead when they care about inputs larger than 2^32. WDYT?
Doesn't the frontend generate llvm.lrint.i32.f32 on 32-bit targets and on 64-bit Windows? sizeof(long) is 4 bytes in those cases.
But they don't have difference on 64-bit Linux, I think it's good to expand its semantic given it's already inconsistent across different target and OS.

Aren't the semantics defined by C not by LLVM IR. How can you change it?

phoebewang · 2024-10-06T04:05:37Z

Isn't it a poison vaule if the inputs larger than 2^32 according to LLVM LangRef?

I think only if the trunc would have a nuw flag, which isn't present in the reproducer?

@fhahn Reviewed the code again, I think it's arguable to consider it's a mis optimization. The reason is not about trunc with nuw or not, but llvm.lrint itself. I think we should consider the difference between llvm.lrint.i64.f32 and llvm.llrint.i64.f32. The front end won't generate llvm.lrint.i32.f32 for lrint, but user might expect foo is more optimal than bar when they writing
int foo(float x) {
  return lrintf(x);
}

int bar(float x) {
  return llrintf(x);
}
I admit we don't describe llvm.lrint.i64.f32 in this way, but we can improve it and suggest user to use llrintf/llvm.llrint.i64.f32 instead when they care about inputs larger than 2^32. WDYT?
Doesn't the frontend generate llvm.lrint.i32.f32 on 32-bit targets and on 64-bit Windows? sizeof(long) is 4 bytes in those cases.
But they don't have difference on 64-bit Linux, I think it's good to expand its semantic given it's already inconsistent across different target and OS.
Aren't the semantics defined by C not by LLVM IR. How can you change it?

The LangRef says This function returns the same values as the libm lrint functions would, but without setting errno. In my opinion, it is not a clear definition given:

It doesn't cover the Windows behavior because it doesn't use libm;
It doesn't mention the implicit inconsistent from libm;
It doesn't cover the vector type behavior because they don't map to libm directly;

So I think we can improve its semantic, it's not necessarily identical with C (e.g., we already excluded errno).

topperc · 2024-10-06T05:32:37Z

Isn't it a poison vaule if the inputs larger than 2^32 according to LLVM LangRef?

I think only if the trunc would have a nuw flag, which isn't present in the reproducer?

@fhahn Reviewed the code again, I think it's arguable to consider it's a mis optimization. The reason is not about trunc with nuw or not, but llvm.lrint itself. I think we should consider the difference between llvm.lrint.i64.f32 and llvm.llrint.i64.f32. The front end won't generate llvm.lrint.i32.f32 for lrint, but user might expect foo is more optimal than bar when they writing
int foo(float x) {
  return lrintf(x);
}

int bar(float x) {
  return llrintf(x);
}
I admit we don't describe llvm.lrint.i64.f32 in this way, but we can improve it and suggest user to use llrintf/llvm.llrint.i64.f32 instead when they care about inputs larger than 2^32. WDYT?
Doesn't the frontend generate llvm.lrint.i32.f32 on 32-bit targets and on 64-bit Windows? sizeof(long) is 4 bytes in those cases.
But they don't have difference on 64-bit Linux, I think it's good to expand its semantic given it's already inconsistent across different target and OS.
Aren't the semantics defined by C not by LLVM IR. How can you change it?
The LangRef says This function returns the same values as the libm lrint functions would, but without setting errno. In my opinion, it is not a clear definition given:

It doesn't cover the Windows behavior because it doesn't use libm;

I think that's poor wording. I think the intent is to cover any implementation of the C math library.

It doesn't mention the implicit inconsistent from libm;

It doesn't cover the vector type behavior because they don't map to libm directly;

So I think we can improve its semantic, it's not necessarily identical with C (e.g., we already excluded errno).

Is your proposal to tell C programmers they shouldn't use lrint if they care about values larger than 2^32 by changing the LLVM IR semantics? Most C programmers don't know and/or care that LLVM IR exists. Wouldn't it be a deviation from the C standard?

phoebewang · 2024-10-06T07:16:41Z

Isn't it a poison vaule if the inputs larger than 2^32 according to LLVM LangRef?

I think only if the trunc would have a nuw flag, which isn't present in the reproducer?

@fhahn Reviewed the code again, I think it's arguable to consider it's a mis optimization. The reason is not about trunc with nuw or not, but llvm.lrint itself. I think we should consider the difference between llvm.lrint.i64.f32 and llvm.llrint.i64.f32. The front end won't generate llvm.lrint.i32.f32 for lrint, but user might expect foo is more optimal than bar when they writing
int foo(float x) {
  return lrintf(x);
}

int bar(float x) {
  return llrintf(x);
}
I admit we don't describe llvm.lrint.i64.f32 in this way, but we can improve it and suggest user to use llrintf/llvm.llrint.i64.f32 instead when they care about inputs larger than 2^32. WDYT?
Doesn't the frontend generate llvm.lrint.i32.f32 on 32-bit targets and on 64-bit Windows? sizeof(long) is 4 bytes in those cases.
But they don't have difference on 64-bit Linux, I think it's good to expand its semantic given it's already inconsistent across different target and OS.
Aren't the semantics defined by C not by LLVM IR. How can you change it?
The LangRef says This function returns the same values as the libm lrint functions would, but without setting errno. In my opinion, it is not a clear definition given:

It doesn't cover the Windows behavior because it doesn't use libm;
I think that's poor wording. I think the intent is to cover any implementation of the C math library.

It doesn't mention the implicit inconsistent from libm;

It doesn't cover the vector type behavior because they don't map to libm directly;

So I think we can improve its semantic, it's not necessarily identical with C (e.g., we already excluded errno).

Is your proposal to tell C programmers they shouldn't use lrint if they care about values larger than 2^32 by changing the LLVM IR semantics? Most C programmers don't know and/or care that LLVM IR exists. Wouldn't it be a deviation from the C standard?

The C code won't generate llvm.lrint by default https://godbolt.org/z/WMroczG5r. Yes, they may still observe behavior change when using -ffast-math, so we need to tell them through release note or somewhere, but that's change regarding fast math rather than C standard.

topperc · 2024-10-06T08:30:04Z

Isn't it a poison vaule if the inputs larger than 2^32 according to LLVM LangRef?

I think only if the trunc would have a nuw flag, which isn't present in the reproducer?

@fhahn Reviewed the code again, I think it's arguable to consider it's a mis optimization. The reason is not about trunc with nuw or not, but llvm.lrint itself. I think we should consider the difference between llvm.lrint.i64.f32 and llvm.llrint.i64.f32. The front end won't generate llvm.lrint.i32.f32 for lrint, but user might expect foo is more optimal than bar when they writing
int foo(float x) {
  return lrintf(x);
}

int bar(float x) {
  return llrintf(x);
}
I admit we don't describe llvm.lrint.i64.f32 in this way, but we can improve it and suggest user to use llrintf/llvm.llrint.i64.f32 instead when they care about inputs larger than 2^32. WDYT?
Doesn't the frontend generate llvm.lrint.i32.f32 on 32-bit targets and on 64-bit Windows? sizeof(long) is 4 bytes in those cases.
But they don't have difference on 64-bit Linux, I think it's good to expand its semantic given it's already inconsistent across different target and OS.
Aren't the semantics defined by C not by LLVM IR. How can you change it?
The LangRef says This function returns the same values as the libm lrint functions would, but without setting errno. In my opinion, it is not a clear definition given:

It doesn't cover the Windows behavior because it doesn't use libm;
I think that's poor wording. I think the intent is to cover any implementation of the C math library.

It doesn't mention the implicit inconsistent from libm;

It doesn't cover the vector type behavior because they don't map to libm directly;

So I think we can improve its semantic, it's not necessarily identical with C (e.g., we already excluded errno).

Is your proposal to tell C programmers they shouldn't use lrint if they care about values larger than 2^32 by changing the LLVM IR semantics? Most C programmers don't know and/or care that LLVM IR exists. Wouldn't it be a deviation from the C standard?
The C code won't generate llvm.lrint by default https://godbolt.org/z/WMroczG5r. Yes, they may still observe behavior change when using -ffast-math, so we need to tell them through release note or somewhere, but that's change regarding fast math rather than C standard.

Is -fno-math-errno sufficient?

phoebewang · 2024-10-06T09:40:25Z

Isn't it a poison vaule if the inputs larger than 2^32 according to LLVM LangRef?

I think only if the trunc would have a nuw flag, which isn't present in the reproducer?

@fhahn Reviewed the code again, I think it's arguable to consider it's a mis optimization. The reason is not about trunc with nuw or not, but llvm.lrint itself. I think we should consider the difference between llvm.lrint.i64.f32 and llvm.llrint.i64.f32. The front end won't generate llvm.lrint.i32.f32 for lrint, but user might expect foo is more optimal than bar when they writing
int foo(float x) {
  return lrintf(x);
}

int bar(float x) {
  return llrintf(x);
}
I admit we don't describe llvm.lrint.i64.f32 in this way, but we can improve it and suggest user to use llrintf/llvm.llrint.i64.f32 instead when they care about inputs larger than 2^32. WDYT?
Doesn't the frontend generate llvm.lrint.i32.f32 on 32-bit targets and on 64-bit Windows? sizeof(long) is 4 bytes in those cases.
But they don't have difference on 64-bit Linux, I think it's good to expand its semantic given it's already inconsistent across different target and OS.
Aren't the semantics defined by C not by LLVM IR. How can you change it?
The LangRef says This function returns the same values as the libm lrint functions would, but without setting errno. In my opinion, it is not a clear definition given:

It doesn't cover the Windows behavior because it doesn't use libm;
I think that's poor wording. I think the intent is to cover any implementation of the C math library.

It doesn't mention the implicit inconsistent from libm;

It doesn't cover the vector type behavior because they don't map to libm directly;

So I think we can improve its semantic, it's not necessarily identical with C (e.g., we already excluded errno).

Is your proposal to tell C programmers they shouldn't use lrint if they care about values larger than 2^32 by changing the LLVM IR semantics? Most C programmers don't know and/or care that LLVM IR exists. Wouldn't it be a deviation from the C standard?
The C code won't generate llvm.lrint by default https://godbolt.org/z/WMroczG5r. Yes, they may still observe behavior change when using -ffast-math, so we need to tell them through release note or somewhere, but that's change regarding fast math rather than C standard.
Is -fno-math-errno sufficient?

Good point! If we refer to GCC's 32-bit case, we can see there is differece between -ffast-math and -fno-math-errno https://godbolt.org/z/b4oE67bGb. But Clang doesn't distinguish them so far. So it would be an independent problem to what we are discussing.

topperc · 2024-10-06T16:55:30Z

Isn't it a poison vaule if the inputs larger than 2^32 according to LLVM LangRef?

I think only if the trunc would have a nuw flag, which isn't present in the reproducer?

@fhahn Reviewed the code again, I think it's arguable to consider it's a mis optimization. The reason is not about trunc with nuw or not, but llvm.lrint itself. I think we should consider the difference between llvm.lrint.i64.f32 and llvm.llrint.i64.f32. The front end won't generate llvm.lrint.i32.f32 for lrint, but user might expect foo is more optimal than bar when they writing
int foo(float x) {
  return lrintf(x);
}

int bar(float x) {
  return llrintf(x);
}
I admit we don't describe llvm.lrint.i64.f32 in this way, but we can improve it and suggest user to use llrintf/llvm.llrint.i64.f32 instead when they care about inputs larger than 2^32. WDYT?
Doesn't the frontend generate llvm.lrint.i32.f32 on 32-bit targets and on 64-bit Windows? sizeof(long) is 4 bytes in those cases.
But they don't have difference on 64-bit Linux, I think it's good to expand its semantic given it's already inconsistent across different target and OS.
Aren't the semantics defined by C not by LLVM IR. How can you change it?
The LangRef says This function returns the same values as the libm lrint functions would, but without setting errno. In my opinion, it is not a clear definition given:

It doesn't cover the Windows behavior because it doesn't use libm;
I think that's poor wording. I think the intent is to cover any implementation of the C math library.

It doesn't mention the implicit inconsistent from libm;

It doesn't cover the vector type behavior because they don't map to libm directly;

So I think we can improve its semantic, it's not necessarily identical with C (e.g., we already excluded errno).

Is your proposal to tell C programmers they shouldn't use lrint if they care about values larger than 2^32 by changing the LLVM IR semantics? Most C programmers don't know and/or care that LLVM IR exists. Wouldn't it be a deviation from the C standard?
The C code won't generate llvm.lrint by default https://godbolt.org/z/WMroczG5r. Yes, they may still observe behavior change when using -ffast-math, so we need to tell them through release note or somewhere, but that's change regarding fast math rather than C standard.
Is -fno-math-errno sufficient?
Good point! If we refer to GCC's 32-bit case, we can see there is differece between -ffast-math and -fno-math-errno https://godbolt.org/z/b4oE67bGb. But Clang doesn't distinguish them so far. So it would be an independent problem to what we are discussing.

gcc requires -O1 -fno-math-errno -m32 -mfpmath=sse -fno-trapping-math

phoebewang · 2024-10-07T01:31:27Z

gcc requires -O1 -fno-math-errno -m32 -mfpmath=sse -fno-trapping-math

-fno-trapping-math is not required. I think it's on by default.

topperc · 2024-10-07T02:13:38Z

gcc requires -O1 -fno-math-errno -m32 -mfpmath=sse -fno-trapping-math

-fno-trapping-math is not required. I think it's on by default.

-fno-trapping-math is not default on gcc. I was trying to find the subset of -ffast-math that gcc requires. it appears to be fno-math-errno and fno-trapping-math.

phoebewang · 2024-10-07T02:48:00Z

gcc requires -O1 -fno-math-errno -m32 -mfpmath=sse -fno-trapping-math

-fno-trapping-math is not required. I think it's on by default.

-fno-trapping-math is not default on gcc. I was trying to find the subset of -ffast-math that gcc requires. it appears to be fno-math-errno and fno-trapping-math.

Yes, -fno-trapping-math is not default on gcc. But we don't need it here https://godbolt.org/z/TsKTP4TxK

topperc · 2024-10-07T05:20:32Z

gcc requires -O1 -fno-math-errno -m32 -mfpmath=sse -fno-trapping-math

-fno-trapping-math is not required. I think it's on by default.

-fno-trapping-math is not default on gcc. I was trying to find the subset of -ffast-math that gcc requires. it appears to be fno-math-errno and fno-trapping-math.

Yes, -fno-trapping-math is not default on gcc. But we don't need it here https://godbolt.org/z/TsKTP4TxK

Oops you're right. I don't know what I did earlier. I guess it was the -mfpmath=sse that fixed it.

[X86] Adding lowerings for vector ISD::LRINT and ISD::LLRINT

e16d56e

phoebewang requested review from artagnon, RKSimon and topperc April 25, 2024 14:32

llvmbot added the backend:X86 label Apr 25, 2024

artagnon reviewed Apr 25, 2024

View reviewed changes

RKSimon reviewed Apr 26, 2024

View reviewed changes

Address review comments

9b02ca3

clang-format

7cd28e4

phoebewang closed this May 2, 2024

phoebewang deleted the lrint branch May 2, 2024 09:15

phoebewang restored the lrint branch May 2, 2024 09:19

phoebewang reopened this May 2, 2024

RKSimon mentioned this pull request May 2, 2024

[CostModel][X86] Treat lrint/llrint as fptosi calls #90883

Merged

RKSimon approved these changes May 2, 2024

View reviewed changes

phoebewang merged commit fd3e7e3 into llvm:main May 3, 2024
7 of 8 checks passed

phoebewang deleted the lrint branch May 3, 2024 01:31

RKSimon added a commit that referenced this pull request May 3, 2024

[CostModel][X86] Treat lrint/llrint as fptosi calls (#90883)

8a0073a

X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which have the same costs as the CVTTP2SI (fptosi) instructions Followup to #90065

phoebewang mentioned this pull request Jul 16, 2024

Missed Optimization - Replacement of rint/lrint with X87/SSE specific instructions #55202

Open

topperc mentioned this pull request Feb 1, 2025

[clang] [x86-64] lrint()/lrintf() using instruction writing to 32-bit register if assigned to 32-bit int even though long is 64-bit #125324

Closed

		}
		def : Pat<(v16i32 (lrint VR512:$src)), (VCVTPS2DQZrr VR512:$src)>;

[X86] Adding lowerings for vector ISD::LRINT and ISD::LLRINT #90065

[X86] Adding lowerings for vector ISD::LRINT and ISD::LLRINT #90065

Uh oh!

Conversation

phoebewang commented Apr 25, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Apr 25, 2024

Uh oh!

artagnon left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

phoebewang commented Apr 26, 2024

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

github-actions bot commented Apr 27, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

fhahn commented Oct 1, 2024

Uh oh!

topperc commented Oct 1, 2024

Uh oh!

phoebewang commented Oct 2, 2024

Uh oh!

fhahn commented Oct 2, 2024

Uh oh!

phoebewang commented Oct 2, 2024

Uh oh!

fhahn commented Oct 3, 2024

Uh oh!

phoebewang commented Oct 3, 2024

Uh oh!

phoebewang commented Oct 6, 2024

Uh oh!

topperc commented Oct 6, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

phoebewang commented Oct 6, 2024

Uh oh!

topperc commented Oct 6, 2024

Uh oh!

phoebewang commented Oct 6, 2024

Uh oh!

topperc commented Oct 6, 2024

phoebewang commented Apr 25, 2024 •

edited

Loading

github-actions bot commented Apr 27, 2024 •

edited

Loading

topperc commented Oct 6, 2024 •

edited

Loading