-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[lld][ELF][LoongArch] Support relaxing R_LARCH_CALL36 #127312
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-lld @llvm/pr-subscribers-lld-elf Author: WÁNG Xuěruì (xen0n) ChangesRelax eligible PCADDU18I + JIRL sequences to B or BL depending on JIRL's output (link) register. Correctness is maintained on a best-effort basis by ensuring the underlying instruction pair is PCADDU18I and JIRL, and that the register operands involved are appropriate. This is beneficial performance-wise for code compiled with the medium code model, and enables future changing of the default code model from "small" to "medium" without runtime performance impact. Full diff: https://github.com/llvm/llvm-project/pull/127312.diff 2 Files Affected:
diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index dbf024eadf100..4cc37219e97b5 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -55,9 +55,12 @@ enum Op {
ANDI = 0x03400000,
PCADDI = 0x18000000,
PCADDU12I = 0x1c000000,
+ PCADDU18I = 0x1e000000,
LD_W = 0x28800000,
LD_D = 0x28c00000,
JIRL = 0x4c000000,
+ B = 0x50000000,
+ BL = 0x54000000,
};
enum Reg {
@@ -830,6 +833,45 @@ static void relaxPCHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i,
remove = 4;
}
+static bool isInsnPairCall36(uint64_t pair) {
+ const uint32_t insn1 = extractBits(pair, 31, 0);
+ const uint32_t insn2 = extractBits(pair, 63, 32);
+ if ((insn1 & 0xfe000000) != PCADDU18I)
+ return false;
+ if ((insn2 & 0xfc000000) != JIRL)
+ return false;
+
+ const uint32_t rd1 = extractBits(insn1, 4, 0);
+ const uint32_t rd2 = extractBits(insn2, 4, 0);
+ const uint32_t rj2 = extractBits(insn2, 9, 5);
+ if (rd1 != rj2)
+ return false;
+ if (rd2 != R_ZERO && rd2 != R_RA)
+ return false;
+
+ return true;
+}
+
+// Relax R_LARCH_CALL36 pcaddu18i+jirl to b or bl.
+static void relaxCall(Ctx &ctx, const InputSection &sec, size_t i, uint64_t loc,
+ Relocation &r, uint32_t &remove) {
+ const Symbol &sym = *r.sym;
+ const uint64_t insnPair = read64le(sec.content().data() + r.offset);
+ if (!isInsnPairCall36(insnPair))
+ return;
+
+ const bool isTail = extractBits(insnPair, 32 + 4, 32 + 0) == R_ZERO;
+ const uint64_t dest =
+ (r.expr == R_PLT_PC ? sym.getPltVA(ctx) : sym.getVA(ctx)) + r.addend;
+ const int64_t displace = dest - loc;
+
+ if (isInt<28>(displace) && !(displace & 0x3)) {
+ sec.relaxAux->relocTypes[i] = R_LARCH_B26;
+ sec.relaxAux->writes.push_back(isTail ? B : BL);
+ remove = 4;
+ }
+}
+
static bool relax(Ctx &ctx, InputSection &sec) {
const uint64_t secAddr = sec.getVA();
const MutableArrayRef<Relocation> relocs = sec.relocs();
@@ -874,6 +916,10 @@ static bool relax(Ctx &ctx, InputSection &sec) {
if (isPairRelaxable(relocs, i))
relaxPCHi20Lo12(ctx, sec, i, loc, r, relocs[i + 2], remove);
break;
+ case R_LARCH_CALL36:
+ if (relaxable(relocs, i))
+ relaxCall(ctx, sec, i, loc, r, remove);
+ break;
}
// For all anchors whose offsets are <= r.offset, they are preceded by
diff --git a/lld/test/ELF/loongarch-relax-call36.s b/lld/test/ELF/loongarch-relax-call36.s
new file mode 100644
index 0000000000000..1ba9dbd688975
--- /dev/null
+++ b/lld/test/ELF/loongarch-relax-call36.s
@@ -0,0 +1,131 @@
+# REQUIRES: loongarch
+## Relax R_LARCH_CALL36.
+## Currently only loongarch64 is covered, because the call36 pseudo-instruction
+## is valid for LA64 only, due to LA32 not having pcaddu18i.
+
+# TODO:
+#
+# * trivial cases
+# * +/- limit: -4, 0, +4
+# * align: 0, 1, 2, 3
+# * invalid pcaddu18i + jirl pairs
+# - rd1 != rj2
+# - rd2 not in (0, 1)
+
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=loongarch64 -mattr=+relax a.s -o a.o
+
+# RUN: ld.lld -T lds a.o -o a
+# RUN: llvm-objdump -d --no-show-raw-insn a | FileCheck %s
+
+## Unsure whether this needs a diagnostic. GNU ld allows this.
+# RUN: ld.lld -T lds -pie a.o -o a.pie
+# RUN: llvm-objdump -d --no-show-raw-insn a.pie | FileCheck %s
+
+# RUN: ld.lld -T lds -pie -z notext -z ifunc-noplt a.o -o a.ifunc-noplt
+# RUN: llvm-objdump -d --no-show-raw-insn a.ifunc-noplt | FileCheck %s --check-prefix=CHECK2
+
+# CHECK-LABEL: <_start>:
+# CHECK-NEXT: bl -4 <near_before>
+# CHECK-NEXT: b -8 <near_before>
+# CHECK-NEXT: bl 64 <near_after>
+# CHECK-NEXT: b 60 <near_after>
+# CHECK-NEXT: pcaddu18i $ra, -512
+# CHECK-NEXT: jirl $ra, $ra, -4
+# CHECK-NEXT: bl -134217728 <far_b>
+# CHECK-NEXT: bl 134217724 <far_y>
+# CHECK-NEXT: pcaddu18i $ra, 512
+# CHECK-NEXT: jirl $ra, $ra, 0
+# CHECK-NEXT: pcaddu18i $t0, 0
+# CHECK-NEXT: jirl $t0, $t0, -44
+# CHECK-NEXT: pcaddu18i $t0, 0
+# CHECK-NEXT: jirl $zero, $t1, 24
+# CHECK-NEXT: pcalau12i $t0, 0
+# CHECK-NEXT: jirl $zero, $t0, -60
+# CHECK-NEXT: pcaddu18i $t0, 0
+# CHECK-NEXT: addu16i.d $t0, $t0, 2
+# CHECK-EMPTY:
+
+# CHECK-LABEL: <.mid>:
+# CHECK-NEXT: b 2048
+# CHECK-NEXT: b 2044
+# CHECK-EMPTY:
+
+# CHECK2-LABEL: <.mid>:
+# CHECK2-NEXT: pcaddu18i $t0, 0
+# CHECK2-NEXT: jr $t0
+# CHECK2-NEXT: pcaddu18i $t0, 0
+# CHECK2-NEXT: jr $t0
+# CHECK2-EMPTY:
+
+#--- a.s
+.global _start, ifunc
+near_before:
+ ret
+
+_start:
+ call36 near_before
+ tail36 $t0, near_before
+
+ call36 near_after
+ tail36 $t0, near_after
+
+ call36 far_a ## just out of relaxable range: 0x08000010 - 0x10000014 = -(1 << 27) - 4
+ call36 far_b ## just in relaxable range: 0x0800001c - 0x1000001c = -(1 << 27)
+
+ call36 far_y ## just in relaxable range: 0x1800001c - 0x10000020 = (1 << 27) - 4
+ call36 far_z ## just out of relaxable range: 0x18000024 - 0x10000024 = 1 << 27
+
+ ## broken R_LARCH_CALL36 usages should not be relaxed even if relaxable
+ ## otherwise
+ ## correctness is not guaranteed for malformed input like these
+
+ ## jirl link register (rd) not $zero or $ra (hence not expressible by B or BL)
+ ## the apparent correctness here is only coincidence and should not be relied
+ ## upon
+ .reloc ., R_LARCH_CALL36, near_before
+ .reloc ., R_LARCH_RELAX, 0
+ pcaddu18i $t0, 0
+ jirl $t0, $t0, 0
+
+ ## jirl base != pcaddu18i output
+ .reloc ., R_LARCH_CALL36, near_after
+ .reloc ., R_LARCH_RELAX, 0
+ pcaddu18i $t0, 0
+ jirl $zero, $t1, 0
+
+ ## 1st insn not pcaddu18i
+ .reloc ., R_LARCH_CALL36, near_before
+ .reloc ., R_LARCH_RELAX, 0
+ pcalau12i $t0, 0
+ jirl $zero, $t0, 0
+
+ ## 2nd insn not jirl
+ .reloc ., R_LARCH_CALL36, near_after
+ .reloc ., R_LARCH_RELAX, 0
+ pcaddu18i $t0, 0
+ addu16i.d $t0, $t0, 0
+
+near_after:
+ ret
+
+.section .mid,"ax",@progbits
+.balign 16
+ tail36 $t0, ifunc@plt
+ tail36 $t0, ifunc@plt
+
+.type ifunc, @gnu_indirect_function
+ifunc:
+ ret
+
+#--- lds
+SECTIONS {
+ .text 0x10000000 : { *(.text) }
+ .mid 0x10000800 : { *(.mid) }
+ .iplt 0x10001000 : { *(.iplt) }
+}
+
+far_a = 0x08000010;
+far_b = 0x0800001c;
+far_y = 0x1800001c;
+far_z = 0x18000024;
|
cc @SixWeining @wangleiat @xry111 -- sorry for being a 鸽子 and delaying this optimization sooooo long; Real Life hits hard. I have finished the implementation back in April 2024 as you can see in the commit details, but only got time to finish the test case just now. |
Relax eligible PCADDU18I + JIRL sequences to B or BL depending on JIRL's output (link) register. Correctness is maintained on a best-effort basis by ensuring the underlying instruction pair is PCADDU18I and JIRL, and that the register operands involved are appropriate. This is beneficial performance-wise for code compiled with the medium code model, and enables future changing of the default code model from "small" to "medium" without runtime performance impact.
@@ -971,6 +1017,7 @@ void LoongArch::finalizeRelax(int passes) const { | |||
switch (newType) { | |||
case R_LARCH_RELAX: | |||
break; | |||
case R_LARCH_B26: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#123576 does not change r.expr
. Which one is appropriate?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, it should remain R_PLT_PC
. I'll see if I should just close this instead of #123576 because I didn't know that PR even existed until today morning.
static bool isInsnPairCall36(uint64_t pair) { | ||
const uint32_t insn1 = extractBits(pair, 31, 0); | ||
const uint32_t insn2 = extractBits(pair, 63, 32); | ||
if ((insn1 & 0xfe000000) != PCADDU18I) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is an instruction that #123576 doesn't have. We typically avoid instruction checks unless absolutely necessary.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ld.bfd
checks for jirl
but not pcaddu18i
, nor does ld.bfd
ensure jirl.rj == pcaddu18i.rd
. Granted, these are irrelevant for well-formed inputs, but I think such checks should get added to ld.bfd
(or indeed, the psABI) as well, for the guarantee that "relaxation does not affect code semantics" to uphold.
Sorry again for the long pause of my LLVM work; I didn't know #123576 was posted long before this, until today morning when @MaskRay pointed me to it. Given that PR has more diverse test cases, and the fact that this PR has at least one issue to be fixed, I'm going to close this PR and review that one instead. |
Relax eligible PCADDU18I + JIRL sequences to B or BL depending on JIRL's output (link) register. Correctness is maintained on a best-effort basis by ensuring the underlying instruction pair is PCADDU18I and JIRL, and that the register operands involved are appropriate.
This is beneficial performance-wise for code compiled with the medium code model, and enables future changing of the default code model from "small" to "medium" without runtime performance impact.