Conversation

nikic
Contributor

@nikic nikic commented Sep 5, 2025

foldCmpLoadFromIndexedGlobal() currently requires specific relationships between the global's value type, the GEP source element type, and the load type. Replace these structural type checks with generic logic based on offsets.

This is a reboot of #67093. This PR is less ambitious: it requires the constant offset to be smaller than the stride, which avoids the additional complexity of that PR.
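
For illustration, here is one pattern this now handles, taken from the load-cmp.ll tests added in this PR; the i16 GEP source type and load type do not match the [4 x i32] element type of the global, so the previous structural checks bailed out:

@g_i32_lo = internal constant [4 x i32] [i32 1, i32 2, i32 3, i32 4]

define i1 @load_vs_array_type_mismatch1(i32 %idx) {
  ; The GEP strides by i16 over an array of i32, and the load reads i16.
  %gep = getelementptr inbounds i16, ptr @g_i32_lo, i32 %idx
  %load = load i16, ptr %gep
  %cmp = icmp eq i16 %load, 0
  ret i1 %cmp
}

With the offset-based logic, the transform reads each i16 at offsets 0, 2, 4, ... of the constant initializer via ConstantFoldLoadFromConst and, per the CHECK lines of the added test, folds the compare into a shl/and bit-test on %idx instead of a load.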

@nikic nikic requested review from davemgreen and dtcxzyw September 5, 2025 12:21
@llvmbot llvmbot added the llvm:instcombine and llvm:transforms labels Sep 5, 2025
@llvmbot
Member

llvmbot commented Sep 5, 2025

@llvm/pr-subscribers-llvm-transforms

Author: Nikita Popov (nikic)

Changes

foldCmpLoadFromIndexedGlobal() currently requires specific relationships between the global's value type, the GEP source element type, and the load type. Replace these structural type checks with generic logic based on offsets.

This is a reboot of #67093. This PR is less ambitious: it requires the constant offset to be smaller than the stride, which avoids the additional complexity of that PR.


Full diff: https://github.com/llvm/llvm-project/pull/157089.diff

3 Files Affected:

  • (modified) llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp (+32-71)
  • (modified) llvm/test/Transforms/InstCombine/load-cmp.ll (+88)
  • (modified) llvm/test/Transforms/InstCombine/opaque-ptr.ll (+1-4)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 90feddf6dcfe1..01b0da3469c18 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -112,73 +112,42 @@ static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) {
 Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
     LoadInst *LI, GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI,
     ConstantInt *AndCst) {
-  if (LI->isVolatile() || LI->getType() != GEP->getResultElementType() ||
-      !GV->getValueType()->isArrayTy() || !GV->isConstant() ||
-      !GV->hasDefinitiveInitializer())
-    return nullptr;
-
-  Type *GEPSrcEltTy = GEP->getSourceElementType();
-  if (GEPSrcEltTy->isArrayTy())
-    GEPSrcEltTy = GEPSrcEltTy->getArrayElementType();
-  if (GV->getValueType()->getArrayElementType() != GEPSrcEltTy)
+  if (LI->isVolatile() || !GV->isConstant() || !GV->hasDefinitiveInitializer())
     return nullptr;
 
   Constant *Init = GV->getInitializer();
-  if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
+  TypeSize GlobalSize = DL.getTypeAllocSize(Init->getType());
+  Type *EltTy = LI->getType();
+  TypeSize EltSize = DL.getTypeStoreSize(EltTy);
+  if (EltSize.isScalable())
     return nullptr;
 
-  uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
-  // Don't blow up on huge arrays.
-  if (ArrayElementCount > MaxArraySizeForCombine)
+  unsigned IndexBW = DL.getIndexTypeSizeInBits(GEP->getType());
+  SmallMapVector<Value *, APInt, 4> VarOffsets;
+  APInt ConstOffset(IndexBW, 0);
+  if (!GEP->collectOffset(DL, IndexBW, VarOffsets, ConstOffset) ||
+      VarOffsets.size() != 1 || IndexBW > 64)
     return nullptr;
 
-  // There are many forms of this optimization we can handle, for now, just do
-  // the simple index into a single-dimensional array or elements of equal size.
-  //
-  // Require: GEP [n x i8] GV, 0, Idx {{, constant indices}}
-  //      Or: GEP i8 GV, Idx
-
-  unsigned GEPIdxOp = 1;
-  if (GEP->getSourceElementType()->isArrayTy()) {
-    GEPIdxOp = 2;
-    if (!match(GEP->getOperand(1), m_ZeroInt()))
-      return nullptr;
-  }
-  if (GEP->getNumOperands() < GEPIdxOp + 1 ||
-      isa<Constant>(GEP->getOperand(GEPIdxOp)))
+  Value *Idx = VarOffsets.front().first;
+  const APInt &Stride = VarOffsets.front().second;
+  // If the index type is non-canonical, wait for it to be canonicalized.
+  if (Idx->getType()->getScalarSizeInBits() != IndexBW)
     return nullptr;
 
-  // Check that indices after the variable are constants and in-range for the
-  // type they index.  Collect the indices.  This is typically for arrays of
-  // structs.
-  SmallVector<unsigned, 4> LaterIndices;
-
-  Type *EltTy = Init->getType()->getArrayElementType();
-  for (unsigned i = GEPIdxOp + 1, e = GEP->getNumOperands(); i != e; ++i) {
-    ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i));
-    if (!Idx)
-      return nullptr; // Variable index.
-
-    uint64_t IdxVal = Idx->getZExtValue();
-    if ((unsigned)IdxVal != IdxVal)
-      return nullptr; // Too large array index.
-
-    if (StructType *STy = dyn_cast<StructType>(EltTy))
-      EltTy = STy->getElementType(IdxVal);
-    else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) {
-      if (IdxVal >= ATy->getNumElements())
-        return nullptr;
-      EltTy = ATy->getElementType();
-    } else {
-      return nullptr; // Unknown type.
-    }
+  // Allow an additional context offset, but only within the stride.
+  if (!ConstOffset.ult(Stride))
+    return nullptr;
 
-    LaterIndices.push_back(IdxVal);
-  }
+  // Don't handle overlapping loads for now.
+  if (!Stride.uge(EltSize.getFixedValue()))
+    return nullptr;
 
-  Value *Idx = GEP->getOperand(GEPIdxOp);
-  // If the index type is non-canonical, wait for it to be canonicalized.
-  if (Idx->getType() != DL.getIndexType(GEP->getType()))
+  // Don't blow up on huge arrays.
+  uint64_t ArrayElementCount =
+      divideCeil((GlobalSize.getFixedValue() - ConstOffset.getZExtValue()),
+                 Stride.getZExtValue());
+  if (ArrayElementCount > MaxArraySizeForCombine)
     return nullptr;
 
   enum { Overdefined = -3, Undefined = -2 };
@@ -211,18 +180,12 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
 
   // Scan the array and see if one of our patterns matches.
   Constant *CompareRHS = cast<Constant>(ICI.getOperand(1));
-  for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) {
-    Constant *Elt = Init->getAggregateElement(i);
+  APInt Offset = ConstOffset;
+  for (unsigned i = 0, e = ArrayElementCount; i != e; ++i, Offset += Stride) {
+    Constant *Elt = ConstantFoldLoadFromConst(Init, EltTy, Offset, DL);
     if (!Elt)
       return nullptr;
 
-    // If this is indexing an array of structures, get the structure element.
-    if (!LaterIndices.empty()) {
-      Elt = ConstantFoldExtractValueInstruction(Elt, LaterIndices);
-      if (!Elt)
-        return nullptr;
-    }
-
     // If the element is masked, handle it.
     if (AndCst) {
       Elt = ConstantFoldBinaryOpOperands(Instruction::And, Elt, AndCst, DL);
@@ -309,19 +272,17 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
   // Now that we've scanned the entire array, emit our new comparison(s).  We
   // order the state machines in complexity of the generated code.
 
-  // If inbounds keyword is not present, Idx * ElementSize can overflow.
-  // Let's assume that ElementSize is 2 and the wanted value is at offset 0.
+  // If inbounds keyword is not present, Idx * Stride can overflow.
+  // Let's assume that Stride is 2 and the wanted value is at offset 0.
   // Then, there are two possible values for Idx to match offset 0:
   // 0x00..00, 0x80..00.
   // Emitting 'icmp eq Idx, 0' isn't correct in this case because the
   // comparison is false if Idx was 0x80..00.
   // We need to erase the highest countTrailingZeros(ElementSize) bits of Idx.
-  unsigned ElementSize =
-      DL.getTypeAllocSize(Init->getType()->getArrayElementType());
   auto MaskIdx = [&](Value *Idx) {
-    if (!GEP->isInBounds() && llvm::countr_zero(ElementSize) != 0) {
+    if (!GEP->isInBounds() && Stride.countr_zero() != 0) {
       Value *Mask = Constant::getAllOnesValue(Idx->getType());
-      Mask = Builder.CreateLShr(Mask, llvm::countr_zero(ElementSize));
+      Mask = Builder.CreateLShr(Mask, Stride.countr_zero());
       Idx = Builder.CreateAnd(Idx, Mask);
     }
     return Idx;
diff --git a/llvm/test/Transforms/InstCombine/load-cmp.ll b/llvm/test/Transforms/InstCombine/load-cmp.ll
index 2a2c74dc09e2f..3a66214752951 100644
--- a/llvm/test/Transforms/InstCombine/load-cmp.ll
+++ b/llvm/test/Transforms/InstCombine/load-cmp.ll
@@ -371,3 +371,91 @@ define i1 @pr93017(i64 %idx) {
   %cmp = icmp ne ptr %v, null
   ret i1 %cmp
 }
+
+@g_i32_lo = internal constant [4 x i32] [i32 1, i32 2, i32 3, i32 4]
+
+; Mask is 0b10101010
+define i1 @load_vs_array_type_mismatch1(i32 %idx) {
+; CHECK-LABEL: @load_vs_array_type_mismatch1(
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw i32 1, [[TMP1:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[TMP2]], 170
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep = getelementptr inbounds i16, ptr @g_i32_lo, i32 %idx
+  %load = load i16, ptr %gep
+  %cmp = icmp eq i16 %load, 0
+  ret i1 %cmp
+}
+
+@g_i32_hi = internal constant [4 x i32] [i32 u0x00010000, i32 u0x00020000, i32 u0x00030000, i32 u0x00040000]
+
+; Mask is 0b01010101
+define i1 @load_vs_array_type_mismatch2(i32 %idx) {
+; CHECK-LABEL: @load_vs_array_type_mismatch2(
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw i32 1, [[TMP1:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[TMP2]], 85
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep = getelementptr inbounds i16, ptr @g_i32_hi, i32 %idx
+  %load = load i16, ptr %gep
+  %cmp = icmp eq i16 %load, 0
+  ret i1 %cmp
+}
+
+@g_i16_1 = internal constant [8 x i16] [i16 0, i16 1, i16 1, i16 0, i16 0, i16 1, i16 1, i16 0]
+
+; idx == 1 || idx == 3
+define i1 @load_vs_array_type_mismatch_offset1(i32 %idx) {
+; CHECK-LABEL: @load_vs_array_type_mismatch_offset1(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[IDX:%.*]], -3
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep = getelementptr inbounds {i16, i16}, ptr @g_i16_1, i32 %idx, i32 1
+  %load = load i16, ptr %gep
+  %cmp = icmp eq i16 %load, 0
+  ret i1 %cmp
+}
+
+@g_i16_2 = internal constant [8 x i16] [i16 1, i16 0, i16 0, i16 1, i16 1, i16 0, i16 0, i16 1]
+
+; idx == 0 || idx == 2
+define i1 @load_vs_array_type_mismatch_offset2(i32 %idx) {
+; CHECK-LABEL: @load_vs_array_type_mismatch_offset2(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[IDX:%.*]], -3
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep = getelementptr inbounds {i16, i16}, ptr @g_i16_2, i32 %idx, i32 1
+  %load = load i16, ptr %gep
+  %cmp = icmp eq i16 %load, 0
+  ret i1 %cmp
+}
+
+define i1 @offset_larger_than_stride(i32 %idx) {
+; CHECK-LABEL: @offset_larger_than_stride(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [2 x i16], ptr @g_i16_1, i32 1, i32 [[TMP1:%.*]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i16, ptr [[GEP]], align 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i16 [[LOAD]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep = getelementptr [2 x i16], ptr @g_i16_1, i64 1, i32 %idx
+  %load = load i16, ptr %gep
+  %cmp = icmp eq i16 %load, 0
+  ret i1 %cmp
+}
+
+define i1 @load_size_larger_stride(i32 %idx) {
+; CHECK-LABEL: @load_size_larger_stride(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr @g_i16_1, i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i16, ptr [[GEP]], align 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i16 [[LOAD]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep = getelementptr i8, ptr @g_i16_1, i32 %idx
+  %load = load i16, ptr %gep
+  %cmp = icmp eq i16 %load, 0
+  ret i1 %cmp
+}
diff --git a/llvm/test/Transforms/InstCombine/opaque-ptr.ll b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
index 047698102a695..b54170ed3a2e8 100644
--- a/llvm/test/Transforms/InstCombine/opaque-ptr.ll
+++ b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
@@ -543,10 +543,7 @@ define i1 @cmp_load_gep_global_different_load_type(i64 %idx) {
 
 define i1 @cmp_load_gep_global_different_gep_type(i64 %idx) {
 ; CHECK-LABEL: @cmp_load_gep_global_different_gep_type(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr @ary, i64 [[IDX:%.*]]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i16, ptr [[GEP]], align 2
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i16 [[LOAD]], 3
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %gep = getelementptr [4 x i16], ptr @ary, i64 0, i64 %idx
   %load = load i16, ptr %gep

@nikic
Contributor Author

nikic commented Sep 5, 2025

@zyw-bot mfuzz

@XChy
Member

@XChy XChy left a comment

The implementation LGTM.
The original logic for the state machines remains complicated. If possible, can you import some testcases from #67093?

@nikic
Copy link
Contributor Author

nikic commented Sep 8, 2025

The implementation LGTM. The original logic for the state machines remains complicated. If possible, can you import some testcases from #67093?

Done. Some of the test cases are redundant after previous changes to this function (support for missing initial zero index was added separately), but I've imported the ones that looked valuable.

@XChy
Member

@XChy XChy left a comment

Thanks. LGTM.

@nikic nikic merged commit 305cf0e into llvm:main Sep 8, 2025
9 checks passed
@nikic nikic deleted the load-indexed-global branch September 8, 2025 10:54