[X86] Fold some (truncate (srl (add X, C1), C2)) patterns to (add (truncate (srl X, C2)), C1') (#126448)

joaotgouveia · web-flow · commit 0a913b5e3a23 · 2025-02-21T17:17:09.000+08:00
Addresses the poor codegen identified in #123239 and a few extra cases. This transformation is correct for `eq` (https://alive2.llvm.org/ce/z/qZhwtT), `ne` (https://alive2.llvm.org/ce/z/6gsmNz), `ult` (https://alive2.llvm.org/ce/z/xip_td) and `ugt` (https://alive2.llvm.org/ce/z/39XQkX). Fixes #123239
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53697,6 +53697,41 @@ static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
                                  DAG.getUNDEF(SrcVT)));
 }
 
+// Attempt to fold some (truncate (srl (add X, C1), C2)) patterns to
+// (add (truncate (srl X, C2)), C1'). C1' will be smaller than C1 so we are able
+// to avoid generating code with MOVABS and large constants in certain cases.
+static SDValue combinei64TruncSrlAdd(SDValue N, EVT VT, SelectionDAG &DAG,
+                                     const SDLoc &DL) {
+  using namespace llvm::SDPatternMatch;
+
+  SDValue AddLhs;
+  APInt AddConst, SrlConst;
+  if (VT != MVT::i32 ||
+      !sd_match(N, m_AllOf(m_SpecificVT(MVT::i64),
+                           m_Srl(m_OneUse(m_Add(m_Value(AddLhs),
+                                                m_ConstInt(AddConst))),
+                                 m_ConstInt(SrlConst)))))
+    return SDValue();
+
+  if (SrlConst.ule(31) || AddConst.lshr(SrlConst).shl(SrlConst) != AddConst)
+    return SDValue();
+
+  SDValue AddLHSSrl =
+      DAG.getNode(ISD::SRL, DL, MVT::i64, AddLhs, N.getOperand(1));
+  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, AddLHSSrl);
+
+  APInt NewAddConstVal =
+      (~((~AddConst).lshr(SrlConst))).trunc(VT.getSizeInBits());
+  SDValue NewAddConst = DAG.getConstant(NewAddConstVal, DL, VT);
+  SDValue NewAddNode = DAG.getNode(ISD::ADD, DL, VT, Trunc, NewAddConst);
+
+  APInt CleanupSizeConstVal = (SrlConst - 32).zextOrTrunc(VT.getSizeInBits());
+  EVT CleanUpVT =
+      EVT::getIntegerVT(*DAG.getContext(), CleanupSizeConstVal.getZExtValue());
+  SDValue CleanUp = DAG.getAnyExtOrTrunc(NewAddNode, DL, CleanUpVT);
+  return DAG.getAnyExtOrTrunc(CleanUp, DL, VT);
+}
+
 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
 /// the codegen.
 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
@@ -53742,6 +53777,9 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
   if (!Src.hasOneUse())
     return SDValue();
 
+  if (SDValue R = combinei64TruncSrlAdd(Src, VT, DAG, DL))
+    return R;
+
   // Only support vector truncation for now.
   // TODO: i64 scalar math would benefit as well.
   if (!VT.isVector())
diff --git a/llvm/test/CodeGen/X86/combine-i64-trunc-srl-add.ll b/llvm/test/CodeGen/X86/combine-i64-trunc-srl-add.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefixes=X64
+
+; Test for https://github.com/llvm/llvm-project/issues/123239
+
+define i1 @test_ult_trunc_add(i64 %x) {
+; X64-LABEL: test_ult_trunc_add:
+; X64:       # %bb.0:
+; X64-NEXT:    shrq $48, %rdi
+; X64-NEXT:    addl $-65522, %edi # imm = 0xFFFF000E
+; X64-NEXT:    cmpl $3, %edi
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+  %add = add i64 %x, 3940649673949184
+  %shr = lshr i64 %add, 48
+  %conv = trunc i64 %shr to i32
+  %res = icmp ult i32 %conv, 3
+  ret i1 %res
+}
+
+define i1 @test_ult_add(i64 %x) {
+; X64-LABEL: test_ult_add:
+; X64:       # %bb.0:
+; X64-NEXT:    shrq $48, %rdi
+; X64-NEXT:    addl $-65522, %edi # imm = 0xFFFF000E
+; X64-NEXT:    cmpl $3, %edi
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+  %add = add i64 3940649673949184, %x
+  %cmp = icmp ult i64 %add, 844424930131968
+  ret i1 %cmp
+}
+
+define i1 @test_ugt_trunc_add(i64 %x) {
+; X64-LABEL: test_ugt_trunc_add:
+; X64:       # %bb.0:
+; X64-NEXT:    shrq $48, %rdi
+; X64-NEXT:    addl $-65522, %edi # imm = 0xFFFF000E
+; X64-NEXT:    cmpl $4, %edi
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
+  %add = add i64 %x, 3940649673949184
+  %shr = lshr i64 %add, 48
+  %conv = trunc i64 %shr to i32
+  %res = icmp ugt i32 %conv, 3
+  ret i1 %res
+}
+
+define i1 @test_ugt_add(i64 %x) {
+; X64-LABEL: test_ugt_add:
+; X64:       # %bb.0:
+; X64-NEXT:    movabsq $3940649673949184, %rax # imm = 0xE000000000000
+; X64-NEXT:    addq %rdi, %rax
+; X64-NEXT:    movabsq $844424930131968, %rcx # imm = 0x3000000000000
+; X64-NEXT:    cmpq %rcx, %rax
+; X64-NEXT:    seta %al
+; X64-NEXT:    retq
+  %add = add i64 3940649673949184, %x
+  %cmp = icmp ugt i64 %add, 844424930131968
+  ret i1 %cmp
+}
+
+define i1 @test_eq_trunc_add(i64 %x) {
+; X64-LABEL: test_eq_trunc_add:
+; X64:       # %bb.0:
+; X64-NEXT:    shrq $48, %rdi
+; X64-NEXT:    cmpl $65525, %edi # imm = 0xFFF5
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %add = add i64 %x, 3940649673949184
+  %shr = lshr i64 %add, 48
+  %conv = trunc i64 %shr to i32
+  %res = icmp eq i32 %conv, 3
+  ret i1 %res
+}
+
+define i1 @test_eq_add(i64 %x) {
+; X64-LABEL: test_eq_add:
+; X64:       # %bb.0:
+; X64-NEXT:    movabsq $-3096224743817216, %rax # imm = 0xFFF5000000000000
+; X64-NEXT:    cmpq %rax, %rdi
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %add = add i64 3940649673949184, %x
+  %cmp = icmp eq i64 %add, 844424930131968
+  ret i1 %cmp
+}
+
+define i1 @test_ne_trunc_add(i64 %x) {
+; X64-LABEL: test_ne_trunc_add:
+; X64:       # %bb.0:
+; X64-NEXT:    shrq $48, %rdi
+; X64-NEXT:    cmpl $65525, %edi # imm = 0xFFF5
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %add = add i64 %x, 3940649673949184
+  %shr = lshr i64 %add, 48
+  %conv = trunc i64 %shr to i32
+  %res = icmp ne i32 %conv, 3
+  ret i1 %res
+}
+
+define i1 @test_ne_add(i64 %x) {
+; X64-LABEL: test_ne_add:
+; X64:       # %bb.0:
+; X64-NEXT:    movabsq $-3096224743817216, %rax # imm = 0xFFF5000000000000
+; X64-NEXT:    cmpq %rax, %rdi
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %add = add i64 3940649673949184, %x
+  %cmp = icmp ne i64 %add, 844424930131968
+  ret i1 %cmp
+}
+
+define i32 @test_trunc_add(i64 %x) {
+; X64-LABEL: test_trunc_add:
+; X64:       # %bb.0:
+; X64-NEXT:    shrq $48, %rdi
+; X64-NEXT:    leal -65522(%rdi), %eax
+; X64-NEXT:    retq
+  %add = add i64 %x, 3940649673949184
+  %shr = lshr i64 %add, 48
+  %conv = trunc i64 %shr to i32
+  ret i32 %conv
+}