
Question: What is the correct interpretation of LaneBitmask? #109797 (Closed)

42 changes: 40 additions & 2 deletions llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -521,26 +521,62 @@ bool TargetRegisterInfo::getCoveringSubRegIndexes(
SmallVector<unsigned, 8> PossibleIndexes;
unsigned BestIdx = 0;
unsigned BestCover = 0;

unsigned BestSize = 0;
for (unsigned Idx = 1, E = getNumSubRegIndices(); Idx < E; ++Idx) {
// Is this index even compatible with the given class?
if (getSubClassWithSubReg(RC, Idx) != RC)
continue;
LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx);

// The code below (now disabled) is not correct when the lane mask does not
// cover the full sub-register. For example, when the 16-bit register H0 has
// only a single 8-bit subreg B0, and the high lanes are not defined in
// TableGen (which could be done by adding an artificial B0_HI register
// alongside B0), then this function may return either `bsub` or `hsub`,
// whereas in this case we'd want it to return the *largest* (in bitwidth)
// sub-register index. It is better to keep looking for the biggest one, but
// this is rather expensive.
//
// Before thinking of how to solve this, I first want to ask the
// question: What is the meaning of 'LaneBitmask'?
//
// According to LaneBitmask.h:
//
// "The individual bits in a lane mask can't be assigned
// any specific meaning. They can be used to check if two
// sub-register indices overlap."
//
// However, the code here assumes that if 'SubRegMask == LaneMask', then the
// sub-register must overlap *fully*. This does not hold for the way AArch64
// registers are currently defined.
//
// Which interpretation is correct?
//
// A) If the meaning is 'if the bits are equal, the sub-registers overlap but
// not necessarily fully', then we should fix the code in this function (in
// a better way than just disabling it).
Collaborator: Like @arsenm said, this is the correct interpretation.

Collaborator Author: Thanks for confirming! Now that this is cleared up, I'll close this PR.

//
// B) If the meaning is 'if the bits are equal, the sub-registers overlap
// fully', then we can define the high bits with an artificial register.

#if 0
// Early exit if we found a perfect match.
if (SubRegMask == LaneMask) {
BestIdx = Idx;
break;
}
#endif

// The index must not cover any lanes outside \p LaneMask.
if ((SubRegMask & ~LaneMask).any())
continue;

unsigned PopCount = SubRegMask.getNumLanes();
unsigned Size = getSubRegIdxSize(Idx);
PossibleIndexes.push_back(Idx);
-    if (PopCount > BestCover) {
+    if ((!BestCover || PopCount >= BestCover) && Size >= BestSize) {
BestCover = PopCount;
BestSize = Size;
BestIdx = Idx;
}
}
@@ -559,6 +595,8 @@ bool TargetRegisterInfo::getCoveringSubRegIndexes(
int BestCover = std::numeric_limits<int>::min();
for (unsigned Idx : PossibleIndexes) {
LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx);
// FIXME: similar issue as above.

// Early exit if we found a perfect match.
if (SubRegMask == LanesLeft) {
BestIdx = Idx;
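To make the ambiguity concrete, here is a minimal standalone sketch of interpretation A (not part of this PR; the masks and sizes are hypothetical stand-ins for `getSubRegIndexLaneMask` and `getSubRegIdxSize`, not the real AArch64 tables). With no artificial B0_HI register, `bsub` and `hsub` carry identical lane masks, so an equality early-exit can return either; breaking ties on the sub-register size, as the `BestSize` logic above does, recovers the widest covering index.

```cpp
// Standalone sketch of the BestSize tie-break. The masks and sizes below are
// hypothetical stand-ins for getSubRegIndexLaneMask()/getSubRegIdxSize().
#include <cstdint>
#include <cstdio>

struct SubRegIdx {
  const char *Name;
  uint64_t LaneMask;   // stand-in for getSubRegIndexLaneMask(Idx)
  unsigned SizeInBits; // stand-in for getSubRegIdxSize(Idx)
};

int main() {
  // With no artificial B0_HI register, the high half of H0 has no lane of
  // its own, so bsub and hsub end up with identical lane masks.
  const SubRegIdx Indexes[] = {{"bsub", 0x1, 8}, {"hsub", 0x1, 16}};
  const uint64_t LaneMask = 0x1; // we want all lanes of H0 covered

  // Interpretation A: equal masks only guarantee overlap, so an early exit
  // on 'SubRegMask == LaneMask' may pick bsub. Instead, keep scanning and
  // break ties on sub-register size, preferring the widest index.
  const SubRegIdx *Best = nullptr;
  for (const SubRegIdx &Idx : Indexes) {
    if ((Idx.LaneMask & ~LaneMask) != 0)
      continue; // must not cover any lanes outside LaneMask
    if (!Best || Idx.SizeInBits > Best->SizeInBits)
      Best = &Idx;
  }
  printf("best covering index: %s\n", Best->Name); // prints "hsub"
}
```

Under interpretation B, an artificial B0_HI register would give `hsub` a second lane bit; the masks would then differ, and the equality early-exit would again be sound.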
1 change: 1 addition & 0 deletions llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -152,6 +152,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
const Triple &getTargetTriple() const { return TargetTriple; }
bool enableMachineScheduler() const override { return true; }
bool enablePostRAScheduler() const override { return usePostRAScheduler(); }
bool enableSubRegLiveness() const override { return true; }

bool enableMachinePipeliner() const override;
bool useDFAforSMS() const override { return false; }
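As a side note: for local experiments, llc also exposes an `-enable-subreg-liveness` flag, so the behavior can be toggled from the command line without editing this hook (this assumes a recent build where the flag takes precedence over the subtarget's `enableSubRegLiveness()` result). For example, with the reproducer below saved as `repro.ll`:

```
llc -enable-subreg-liveness=true repro.ll -o -
```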
75 changes: 75 additions & 0 deletions llvm/test/CodeGen/AArch64/regalloc-subregidx-reproducer.ll
@@ -0,0 +1,75 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s | FileCheck %s

target triple = "aarch64"

define void @reproducer(ptr %ptr, ptr %ptr2, <8 x i32> %vec.arg) {
; CHECK-LABEL: reproducer:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sub sp, sp, #112
; CHECK-NEXT: stp x29, x30, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 112
; CHECK-NEXT: .cfi_offset w19, -8
; CHECK-NEXT: .cfi_offset w20, -16
; CHECK-NEXT: .cfi_offset w30, -24
; CHECK-NEXT: .cfi_offset w29, -32
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: stp q0, q1, [sp, #16] // 32-byte Folded Spill
; CHECK-NEXT: mov x19, x1
; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x8]
; CHECK-NEXT: add x8, sp, #48
; CHECK-NEXT: mov x20, x0
; CHECK-NEXT: st1 { v0.2d, v1.2d }, [x8] // 32-byte Folded Spill
; CHECK-NEXT: bl bar
; CHECK-NEXT: ldp q1, q0, [sp, #16] // 32-byte Folded Reload
; CHECK-NEXT: mov w8, #1 // =0x1
; CHECK-NEXT: uzp2 v0.4s, v1.4s, v0.4s
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: add x8, sp, #48
; CHECK-NEXT: ld1 { v5.2d, v6.2d }, [x8] // 32-byte Folded Reload
; CHECK-NEXT: ushll2 v2.2d, v6.4s, #1
; CHECK-NEXT: ushll v3.2d, v6.2s, #1
; CHECK-NEXT: ushll v4.2d, v5.2s, #0
; CHECK-NEXT: mov v6.16b, v5.16b
; CHECK-NEXT: ushll2 v5.2d, v0.4s, #1
; CHECK-NEXT: ushll v0.2d, v0.2s, #1
; CHECK-NEXT: orr v3.16b, v3.16b, v4.16b
; CHECK-NEXT: ushll2 v6.2d, v6.4s, #0
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: orr v2.16b, v2.16b, v6.16b
; CHECK-NEXT: stp q0, q3, [sp, #32] // 32-byte Folded Spill
; CHECK-NEXT: orr v0.16b, v5.16b, v1.16b
; CHECK-NEXT: stp q0, q2, [sp] // 32-byte Folded Spill
; CHECK-NEXT: bl bar
; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: stp q0, q1, [x20]
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: stp q0, q1, [x19]
; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
entry:
%wide.vec = load <8 x i32>, ptr null, align 4
call void @bar()
%strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%strided.vec2 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%strided.vec3 = shufflevector <8 x i32> %vec.arg, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%1 = zext <4 x i32> %strided.vec2 to <4 x i64>
%2 = zext <4 x i32> %strided.vec3 to <4 x i64>
%3 = shl <4 x i64> %1, <i64 1, i64 1, i64 1, i64 1>
%4 = shl <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
%5 = zext <4 x i32> %strided.vec to <4 x i64>
%6 = or <4 x i64> %3, %5
%7 = or <4 x i64> %4, <i64 1, i64 1, i64 1, i64 1>
call void @bar()
store <4 x i64> %6, ptr %ptr, align 8
store <4 x i64> %7, ptr %ptr2, align 8
ret void
}

declare void @bar()
