From 23581e381e2e357cff8684173edc3c27f980b1f9 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 24 Sep 2024 13:26:25 +0100 Subject: [PATCH 1/2] Add description and test (with sub-reg liveness enabled) --- llvm/lib/CodeGen/TargetRegisterInfo.cpp | 42 ++++++++++- llvm/lib/Target/AArch64/AArch64Subtarget.h | 1 + .../AArch64/regalloc-subregidx-reproducer.ll | 75 +++++++++++++++++++ 3 files changed, 116 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/regalloc-subregidx-reproducer.ll diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index ac9a3d6f0d1a6..910a7b6bc4a0e 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -521,26 +521,62 @@ bool TargetRegisterInfo::getCoveringSubRegIndexes( SmallVector PossibleIndexes; unsigned BestIdx = 0; unsigned BestCover = 0; - + unsigned BestSize = 0; for (unsigned Idx = 1, E = getNumSubRegIndices(); Idx < E; ++Idx) { // Is this index even compatible with the given class? if (getSubClassWithSubReg(RC, Idx) != RC) continue; LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx); + + // The code below (now disabled) is not correct when the lane mask does not + // cover the full sub-register. For example, when 16-bit H0 has only one + // subreg B0 of 8-bits, and the high lanes are not defined in TableGen (e.g. + // by defining B0 and some artificial B0_HI register), then this function + // may return either `bsub` or `hsub`, whereas in this case we'd want it to + // return the *largest* (in bitwidth) sub-register index. It is better to + // keep looking for the biggest one, but this is rather expensive. + // + // Before thinking of how to solve this, I first want to ask the + // question: What is the meaning of 'LaneBitmask'? + // + // According to LaneBitmask.h: + // + // "The individual bits in a lane mask can't be assigned + // any specific meaning.
They can be used to check if two + // sub-register indices overlap." + // + // However, in the code here the assumption is made that if + // 'SubRegMask == LaneMask', then the subregister must overlap *fully*. + // This is not the case for the way AArch64 registers are defined + // at the moment. + // + // Which interpretation is correct? + // + // A) If the meaning is 'if the bits are equal, the sub-registers overlap but + // not necessarily fully', then we should fix the code in this function (in + // a better way than just disabling it). + // + // B) If the meaning is 'if the bits are equal, the sub-registers overlap + // fully', then we can define the high bits with an artificial register. + +#if 1 // Early exit if we found a perfect match. if (SubRegMask == LaneMask) { BestIdx = Idx; break; } +#endif // The index must not cover any lanes outside \p LaneMask. if ((SubRegMask & ~LaneMask).any()) continue; unsigned PopCount = SubRegMask.getNumLanes(); + unsigned Size = getSubRegIdxSize(Idx); PossibleIndexes.push_back(Idx); - if (PopCount > BestCover) { + if ((!BestCover || PopCount >= BestCover) && Size >= BestSize) { BestCover = PopCount; + BestSize = Size; BestIdx = Idx; } } @@ -559,6 +595,8 @@ bool TargetRegisterInfo::getCoveringSubRegIndexes( int BestCover = std::numeric_limits::min(); for (unsigned Idx : PossibleIndexes) { LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx); + // FIXME: similar issue as above. + // Early exit if we found a perfect match.
if (SubRegMask == LanesLeft) { BestIdx = Idx; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index accfb49c6fbe3..338afb0cf1962 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -152,6 +152,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override { return true; } bool enablePostRAScheduler() const override { return usePostRAScheduler(); } + bool enableSubRegLiveness() const override { return true; } bool enableMachinePipeliner() const override; bool useDFAforSMS() const override { return false; } diff --git a/llvm/test/CodeGen/AArch64/regalloc-subregidx-reproducer.ll b/llvm/test/CodeGen/AArch64/regalloc-subregidx-reproducer.ll new file mode 100644 index 0000000000000..7370b7fc0c42c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/regalloc-subregidx-reproducer.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64" + +define void @reproducer(ptr %ptr, ptr %ptr2, <8 x i32> %vec.arg) { +; CHECK-LABEL: reproducer: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: stp x29, x30, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 112 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: stp q0, q1, [sp, #16] // 32-byte Folded Spill +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x8] +; CHECK-NEXT: add x8, sp, #48 +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: st1 { v0.2d, v1.2d }, [x8] // 32-byte Folded Spill +; CHECK-NEXT: bl bar +; CHECK-NEXT: ldp q1, q0, [sp, #16] // 32-byte
Folded Reload +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: uzp2 v0.4s, v1.4s, v0.4s +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: add x8, sp, #48 +; CHECK-NEXT: ld1 { v5.2d, v6.2d }, [x8] // 32-byte Folded Reload +; CHECK-NEXT: ushll2 v2.2d, v6.4s, #1 +; CHECK-NEXT: ushll v3.2d, v6.2s, #1 +; CHECK-NEXT: fmov s6, s5 // <---- this is wrong, it should copy the full 128-bit vector, rather than 32-bit subreg +; CHECK-NEXT: ushll v4.2d, v5.2s, #0 +; CHECK-NEXT: ushll2 v5.2d, v0.4s, #1 +; CHECK-NEXT: ushll v0.2d, v0.2s, #1 +; CHECK-NEXT: ushll2 v6.2d, v6.4s, #0 +; CHECK-NEXT: orr v3.16b, v3.16b, v4.16b +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: orr v2.16b, v2.16b, v6.16b +; CHECK-NEXT: stp q0, q3, [sp, #32] // 32-byte Folded Spill +; CHECK-NEXT: orr v0.16b, v5.16b, v1.16b +; CHECK-NEXT: stp q0, q2, [sp] // 32-byte Folded Spill +; CHECK-NEXT: bl bar +; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: stp q0, q1, [x20] +; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: stp q0, q1, [x19] +; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #112 +; CHECK-NEXT: ret +entry: + %wide.vec = load <8 x i32>, ptr null, align 4 + call void @bar() + %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> + %strided.vec2 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> + %strided.vec3 = shufflevector <8 x i32> %vec.arg, <8 x i32> poison, <4 x i32> + %1 = zext <4 x i32> %strided.vec2 to <4 x i64> + %2 = zext <4 x i32> %strided.vec3 to <4 x i64> + %3 = shl <4 x i64> %1, + %4 = shl <4 x i64> %2, + %5 = zext <4 x i32> %strided.vec to <4 x i64> + %6 = or <4 x i64> %3, %5 + %7 = or <4 x i64> %4, + call void @bar() + store <4 x i64> %6, ptr %ptr, align 8 + store <4 x i64> %7, ptr %ptr2,
align 8 + ret void +} + +declare void @bar() + From 0f1ed85616670fe321918f199a9e9baedc50d578 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 24 Sep 2024 13:27:16 +0100 Subject: [PATCH 2/2] Comment out code that assumes 'A == B' means 'A fully overlaps with B'. --- llvm/lib/CodeGen/TargetRegisterInfo.cpp | 2 +- llvm/test/CodeGen/AArch64/regalloc-subregidx-reproducer.ll | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index 910a7b6bc4a0e..e62cdf7765c4f 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -559,7 +559,7 @@ bool TargetRegisterInfo::getCoveringSubRegIndexes( // B) If the meaning is 'if the bits are equal, the sub-registers overlap // fully', then we can define the high bits with an artificial register. -#if 1 +#if 0 // Early exit if we found a perfect match. if (SubRegMask == LaneMask) { BestIdx = Idx; diff --git a/llvm/test/CodeGen/AArch64/regalloc-subregidx-reproducer.ll b/llvm/test/CodeGen/AArch64/regalloc-subregidx-reproducer.ll index 7370b7fc0c42c..c7e96a6877956 100644 --- a/llvm/test/CodeGen/AArch64/regalloc-subregidx-reproducer.ll +++ b/llvm/test/CodeGen/AArch64/regalloc-subregidx-reproducer.ll @@ -30,12 +30,12 @@ define void @reproducer(ptr %ptr, ptr %ptr2, <8 x i32> %vec.arg) { ; CHECK-NEXT: ld1 { v5.2d, v6.2d }, [x8] // 32-byte Folded Reload ; CHECK-NEXT: ushll2 v2.2d, v6.4s, #1 ; CHECK-NEXT: ushll v3.2d, v6.2s, #1 -; CHECK-NEXT: fmov s6, s5 // <---- this is wrong, it should copy the full 128-bit vector, rather than 32-bit subreg ; CHECK-NEXT: ushll v4.2d, v5.2s, #0 +; CHECK-NEXT: mov v6.16b, v5.16b ; CHECK-NEXT: ushll2 v5.2d, v0.4s, #1 ; CHECK-NEXT: ushll v0.2d, v0.2s, #1 -; CHECK-NEXT: ushll2 v6.2d, v6.4s, #0 ; CHECK-NEXT: orr v3.16b, v3.16b, v4.16b +; CHECK-NEXT: ushll2 v6.2d, v6.4s, #0 ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: orr v2.16b, v2.16b, v6.16b ; CHECK-NEXT:
stp q0, q3, [sp, #32] // 32-byte Folded Spill