-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Description
With a simple for
loop around array access of an int
array, various inefficiencies are seen in x64 and arm64 generated code, when using different loop/array index types.
Related: Induction Variable widening #7312
Consider:
public static int arr_index_int(int[] a)
{
int sum = 0;
for (int i = 0; i < a.Length; i++)
{
sum += a[i];
}
return sum;
}
For x64, we have a sign extend in the loop because the index type is 32-bit int
but the register is 64-bit. We should be able to eliminate this sign extension because the max array size is unsigned 31 bits (?), so the sign bits will never be used. We eliminate the bounds check presumably because of a comparison against the array Length, and Length will never have the sign bits set.
For arm64, we have the sign extension, but we also have inefficient array index calculation because it doesn't have base + scale*index + offset
addressing mode. We currently are careful to only add a fully computed index offset to the base ref
type, to avoid creating intermediate byref
types: there have been bugs before with certain (negative) index expressions that led the JIT to create illegal byrefs (pointing out of the object). The index expression here is:
sxtw x4, x2
lsl x4, x4, #2
add x4, x4, #16
ldr w4, [x0, x4]
If we could hoist the array base address computation out of the loop (x0 + #16
), eliminate the sign extend (see above), and use the LSL
addressing mode, we could have:
ldr w4, [x0, x4, LSL #2]
We don't even need to eliminate the sign extend, as we can use the base + offset (Extended Register) form:
ldr w4, [x0, w4, SXTW #2]
Hoisting the x0 + #16
out of the loop isn't required here: simply changing the way the index expression is generated plus enhancing support for addressing modes would be a first step; hoisting the loop-invariant x0 + #16
would be an additional win.
x64 assembly
G_M5005_IG01:
;; bbWeight=1 PerfScore 0.00
G_M5005_IG02:
33C0 xor eax, eax
33D2 xor edx, edx
448B4108 mov r8d, dword ptr [rcx+8]
4585C0 test r8d, r8d
7E0F jle SHORT G_M5005_IG04
;; bbWeight=1 PerfScore 3.75
G_M5005_IG03:
4C63CA movsxd r9, edx
4203448910 add eax, dword ptr [rcx+4*r9+16]
FFC2 inc edx
443BC2 cmp r8d, edx
7FF1 jg SHORT G_M5005_IG03
;; bbWeight=4 PerfScore 15.00
G_M5005_IG04:
C3 ret
;; bbWeight=1 PerfScore 1.00
arm64 assembly
G_M5005_IG01:
A9BF7BFD stp fp, lr, [sp,#-16]!
910003FD mov fp, sp
;; bbWeight=1 PerfScore 1.50
G_M5005_IG02:
52800001 mov w1, #0
52800002 mov w2, #0
B9400803 ldr w3, [x0,#8]
7100007F cmp w3, #0
5400012D ble G_M5005_IG04
;; bbWeight=1 PerfScore 5.50
G_M5005_IG03:
93407C44 sxtw x4, x2
D37EF484 lsl x4, x4, #2
91004084 add x4, x4, #16
B8646804 ldr w4, [x0, x4]
0B010081 add w1, w4, w1
11000442 add w2, w2, #1
6B02007F cmp w3, w2
54FFFF2C bgt G_M5005_IG03
;; bbWeight=4 PerfScore 30.00
G_M5005_IG04:
2A0103E0 mov w0, w1
;; bbWeight=1 PerfScore 0.50
G_M5005_IG05:
A8C17BFD ldp fp, lr, [sp],#16
D65F03C0 ret lr
;; bbWeight=1 PerfScore 2.00
public static int arr_index_long(int[] a)
{
int sum = 0;
for (long i = 0; i < a.Length; i++)
{
sum += a[i];
}
return sum;
}
If the loop / array index variable is changed from int
to long
, there are some unexpected effects: (1) the JIT sign-extends the array size. The array size can never be negative, so this seems unnecessary. (2) the JIT doesn't eliminate the bounds check in the loop body.
The sign extend is eliminated, however, as expected.
x64 assembly
G_M57268_IG01:
4883EC28 sub rsp, 40
;; bbWeight=1 PerfScore 0.25
G_M57268_IG02:
33C0 xor eax, eax
33D2 xor rdx, rdx
448B4108 mov r8d, dword ptr [rcx+8]
4D63C0 movsxd r8, r8d
4D85C0 test r8, r8
7E11 jle SHORT G_M57268_IG04
;; bbWeight=1 PerfScore 4.00
G_M57268_IG03:
493BD0 cmp rdx, r8
7311 jae SHORT G_M57268_IG05
03449110 add eax, dword ptr [rcx+4*rdx+16]
48FFC2 inc rdx
4C3BC2 cmp r8, rdx
7FEF jg SHORT G_M57268_IG03
;; bbWeight=4 PerfScore 19.00
G_M57268_IG04:
4883C428 add rsp, 40
C3 ret
;; bbWeight=1 PerfScore 1.25
G_M57268_IG05:
E8A10D415E call CORINFO_HELP_RNGCHKFAIL
CC int3
;; bbWeight=0 PerfScore 0.00
arm64 assembly
G_M57268_IG01:
A9BF7BFD stp fp, lr, [sp,#-16]!
910003FD mov fp, sp
;; bbWeight=1 PerfScore 1.50
G_M57268_IG02:
52800001 mov w1, #0
D2800002 mov x2, #0
B9400803 ldr w3, [x0,#8]
93407C63 sxtw x3, x3
F100007F cmp x3, #0
5400014D ble G_M57268_IG04
;; bbWeight=1 PerfScore 6.00
G_M57268_IG03:
EB03005F cmp x2, x3
54000162 bhs G_M57268_IG06
D37EF444 lsl x4, x2, #2
91004084 add x4, x4, #16
B8646804 ldr w4, [x0, x4]
0B010081 add w1, w4, w1
91000442 add x2, x2, #1
EB02007F cmp x3, x2
54FFFF0C bgt G_M57268_IG03
;; bbWeight=4 PerfScore 34.00
G_M57268_IG04:
2A0103E0 mov w0, w1
;; bbWeight=1 PerfScore 0.50
G_M57268_IG05:
A8C17BFD ldp fp, lr, [sp],#16
D65F03C0 ret lr
;; bbWeight=1 PerfScore 2.00
G_M57268_IG06:
94000000 bl CORINFO_HELP_RNGCHKFAIL
D43E0000 bkpt
;; bbWeight=0 PerfScore 0.00
public static int arr_index_uint(int[] a)
{
int sum = 0;
for (uint i = 0; i < a.Length; i++)
{
sum += a[i];
}
return sum;
}
Changing the index variable to uint
retains the bounds check and array length sign extend, as for long
. Oddly, it also adds a sign extend of the index variable before the addressing mode.
x64 assembly
G_M57944_IG01:
4883EC28 sub rsp, 40
;; bbWeight=1 PerfScore 0.25
G_M57944_IG02:
33C0 xor eax, eax
33D2 xor edx, edx
448B4108 mov r8d, dword ptr [rcx+8]
4D63C8 movsxd r9, r8d
4D85C9 test r9, r9
7E17 jle SHORT G_M57944_IG04
;; bbWeight=1 PerfScore 4.00
G_M57944_IG03:
413BD0 cmp edx, r8d
7317 jae SHORT G_M57944_IG05
4C63D2 movsxd r10, edx
4203449110 add eax, dword ptr [rcx+4*r10+16]
FFC2 inc edx
448BD2 mov r10d, edx
4D3BCA cmp r9, r10
7FE9 jg SHORT G_M57944_IG03
;; bbWeight=4 PerfScore 21.00
G_M57944_IG04:
4883C428 add rsp, 40
C3 ret
;; bbWeight=1 PerfScore 1.25
G_M57944_IG05:
E84B0D405E call CORINFO_HELP_RNGCHKFAIL
CC int3
;; bbWeight=0 PerfScore 0.00
arm64 assembly
G_M57944_IG01:
A9BF7BFD stp fp, lr, [sp,#-16]!
910003FD mov fp, sp
;; bbWeight=1 PerfScore 1.50
G_M57944_IG02:
52800001 mov w1, #0
52800002 mov w2, #0
B9400803 ldr w3, [x0,#8]
93407C64 sxtw x4, x3
F100009F cmp x4, #0
5400018D ble G_M57944_IG04
;; bbWeight=1 PerfScore 6.00
G_M57944_IG03:
6B03005F cmp w2, w3
540001A2 bhs G_M57944_IG06
93407C45 sxtw x5, x2
D37EF4A5 lsl x5, x5, #2
910040A5 add x5, x5, #16
B8656805 ldr w5, [x0, x5]
0B0100A1 add w1, w5, w1
11000442 add w2, w2, #1
2A0203E5 mov w5, w2
EB05009F cmp x4, x5
54FFFECC bgt G_M57944_IG03
;; bbWeight=4 PerfScore 38.00
G_M57944_IG04:
2A0103E0 mov w0, w1
;; bbWeight=1 PerfScore 0.50
G_M57944_IG05:
A8C17BFD ldp fp, lr, [sp],#16
D65F03C0 ret lr
;; bbWeight=1 PerfScore 2.00
G_M57944_IG06:
94000000 bl CORINFO_HELP_RNGCHKFAIL
D43E0000 bkpt
;; bbWeight=0 PerfScore 0.00
public static int arr_index_ulong(int[] a)
{
int sum = 0;
for (ulong i = 0; i < (ulong)a.Length; i++)
{
sum += a[i];
}
return sum;
}
Changing the index variable to ulong
adds an overflow check to the loop as well as a range check. Why is the overflow check necessary?
x64 assembly
G_M30337_IG01:
4883EC28 sub rsp, 40
;; bbWeight=1 PerfScore 0.25
G_M30337_IG02:
33C0 xor eax, eax
33D2 xor rdx, rdx
448B4108 mov r8d, dword ptr [rcx+8]
4D63C0 movsxd r8, r8d
4D85C0 test r8, r8
741A je SHORT G_M30337_IG04
;; bbWeight=1 PerfScore 4.00
G_M30337_IG03:
4885D2 test rdx, rdx
7C1A jl SHORT G_M30337_IG05
4C8BCA mov r9, rdx
4D3BC8 cmp r9, r8
7318 jae SHORT G_M30337_IG06
4203448910 add eax, dword ptr [rcx+4*r9+16]
48FFC2 inc rdx
4C3BC2 cmp r8, rdx
77E6 ja SHORT G_M30337_IG03
;; bbWeight=4 PerfScore 25.00
G_M30337_IG04:
4883C428 add rsp, 40
C3 ret
;; bbWeight=1 PerfScore 1.25
G_M30337_IG05:
E8B82F425E call CORINFO_HELP_OVERFLOW
CC int3
;; bbWeight=0 PerfScore 0.00
G_M30337_IG06:
E8F20C425E call CORINFO_HELP_RNGCHKFAIL
CC int3
;; bbWeight=0 PerfScore 0.00
arm64 assembly
G_M30337_IG01:
A9BF7BFD stp fp, lr, [sp,#-16]!
910003FD mov fp, sp
;; bbWeight=1 PerfScore 1.50
G_M30337_IG02:
52800001 mov w1, #0
D2800002 mov x2, #0
B9400803 ldr w3, [x0,#8]
93407C63 sxtw x3, x3
B40001A3 cbz x3, G_M30337_IG04
;; bbWeight=1 PerfScore 5.50
G_M30337_IG03:
F100005F cmp x2, #0
540001CB blt G_M30337_IG06
AA0203E4 mov x4, x2
EB03009F cmp x4, x3
54000182 bhs G_M30337_IG07
D37EF484 lsl x4, x4, #2
91004084 add x4, x4, #16
B8646804 ldr w4, [x0, x4]
0B010081 add w1, w4, w1
91000442 add x2, x2, #1
EB02007F cmp x3, x2
54FFFEA8 bhi G_M30337_IG03
;; bbWeight=4 PerfScore 42.00
G_M30337_IG04:
2A0103E0 mov w0, w1
;; bbWeight=1 PerfScore 0.50
G_M30337_IG05:
A8C17BFD ldp fp, lr, [sp],#16
D65F03C0 ret lr
;; bbWeight=1 PerfScore 2.00
G_M30337_IG06:
94000000 bl CORINFO_HELP_OVERFLOW
;; bbWeight=0 PerfScore 0.00
G_M30337_IG07:
94000000 bl CORINFO_HELP_RNGCHKFAIL
D43E0000 bkpt
;; bbWeight=0 PerfScore 0.00
category:cq
theme:loop-opt
skill-level:intermediate
cost:medium