
rustc -C opt-level=3 generates bad assembly code for Vec by default #44452

Open
gnzlbg opened this issue Sep 9, 2017 · 29 comments
Labels
A-codegen Area: Code generation
C-enhancement Category: An issue proposing an enhancement or a PR with one.
C-optimization Category: An issue highlighting optimization opportunities or PRs implementing such
I-slow Issue: Problems and improvements with respect to performance of generated code.
T-compiler Relevant to the compiler team, which will review and decide on the PR/issue.

Comments

@gnzlbg
Contributor

gnzlbg commented Sep 9, 2017

Compiling the following C++ snippet with clang++ -O3 and g++ -O3 (see here):

#include <vector>

unsigned foo() {
  std::vector<unsigned> a;
  a.push_back(314);
  return a[0];
}

generates this assembly on x86_64:

foo(): # @foo()
  push rax
  mov edi, 4
  call operator new(unsigned long)
  mov rdi, rax
  call operator delete(void*)
  mov eax, 314
  pop rcx
  ret

(note: clang generates perfect assembly even with multiple push_backs; the only thing that seems to trip it up is a reallocation)

This snippet compiled with rustc -C opt-level=3 (see here):

pub fn foo() -> u32 {
  let mut v: Vec<u32> = Vec::new();
  v.push(0);
  v[0]
}

generates the following assembly:

<alloc::raw_vec::RawVec<T, A>>::double:
        push    rbp
        mov     rbp, rsp
        push    r14
        push    rbx
        sub     rsp, 64
        mov     r14, rdi
        mov     rbx, qword ptr [r14 + 8]
        test    rbx, rbx
        je      .LBB0_6
        lea     rsi, [4*rbx]
        lea     rcx, [8*rbx]
        mov     rdi, qword ptr [r14]
        lea     r9, [rbp - 40]
        mov     edx, 4
        mov     r8d, 4
        call    __rust_realloc@PLT
        test    rax, rax
        je      .LBB0_4
        add     rbx, rbx
        jmp     .LBB0_3
.LBB0_6:
        lea     rdx, [rbp - 40]
        mov     edi, 16
        mov     esi, 4
        call    __rust_alloc@PLT
        test    rax, rax
        je      .LBB0_8
        mov     ebx, 4
.LBB0_3:
        mov     qword ptr [r14], rax
        mov     qword ptr [r14 + 8], rbx
        add     rsp, 64
        pop     rbx
        pop     r14
        pop     rbp
        ret
.LBB0_4:
        mov     rax, qword ptr [rbp - 40]
        movups  xmm0, xmmword ptr [rbp - 32]
        movaps  xmmword ptr [rbp - 64], xmm0
        mov     qword ptr [rbp - 40], rax
        movaps  xmm0, xmmword ptr [rbp - 64]
        jmp     .LBB0_5
.LBB0_8:
        movups  xmm0, xmmword ptr [rbp - 32]
        movaps  xmmword ptr [rbp - 64], xmm0
        movaps  xmm0, xmmword ptr [rbp - 64]
        movaps  xmmword ptr [rbp - 80], xmm0
        movaps  xmm0, xmmword ptr [rbp - 80]
.LBB0_5:
        movups  xmmword ptr [rbp - 32], xmm0
        lea     rdi, [rbp - 40]
        call    <alloc::heap::Heap as alloc::allocator::Alloc>::oom
core::ptr::drop_in_place:
        push    rbp
        mov     rbp, rsp
        mov     rsi, qword ptr [rdi + 8]
        test    rsi, rsi
        je      .LBB1_1
        mov     rdi, qword ptr [rdi]
        shl     rsi, 2
        mov     edx, 4
        pop     rbp
        jmp     __rust_dealloc@PLT
.LBB1_1:
        pop     rbp
        ret
<alloc::heap::Heap as alloc::allocator::Alloc>::oom:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 32
        mov     rax, qword ptr [rdi + 16]
        mov     qword ptr [rbp - 16], rax
        movups  xmm0, xmmword ptr [rdi]
        movaps  xmmword ptr [rbp - 32], xmm0
        lea     rdi, [rbp - 32]
        call    __rust_oom@PLT

example::foo:
        push    rbp
        mov     rbp, rsp
        push    rbx
        sub     rsp, 24
        mov     qword ptr [rbp - 32], 4
        xorps   xmm0, xmm0
        movups  xmmword ptr [rbp - 24], xmm0
        lea     rdi, [rbp - 32]
        call    <alloc::raw_vec::RawVec<T, A>>::double
        mov     rdi, qword ptr [rbp - 32]
        mov     rax, qword ptr [rbp - 16]
        mov     dword ptr [rdi + 4*rax], 0
        inc     rax
        mov     qword ptr [rbp - 16], rax
        je      .LBB3_2
        mov     ebx, dword ptr [rdi]
        mov     rsi, qword ptr [rbp - 24]
        test    rsi, rsi
        je      .LBB3_6
        shl     rsi, 2
        mov     edx, 4
        call    __rust_dealloc@PLT
.LBB3_6:
        mov     eax, ebx
        add     rsp, 24
        pop     rbx
        pop     rbp
        ret
.LBB3_2:
        lea     rdi, [rip + panic_bounds_check_loc.2]
        xor     esi, esi
        xor     edx, edx
        call    core::panicking::panic_bounds_check@PLT
        mov     rbx, rax
        lea     rdi, [rbp - 32]
        call    core::ptr::drop_in_place
        mov     rdi, rbx
        call    _Unwind_Resume@PLT
GCC_except_table3:
        .byte   255
        .byte   155
        .asciz  "\234"
        .byte   3
        .byte   26
        .long   .Ltmp29-.Lfunc_begin3
        .long   .Ltmp32-.Ltmp29
        .long   .Ltmp33-.Lfunc_begin3
        .byte   0
        .long   .Ltmp32-.Lfunc_begin3
        .long   .Lfunc_end3-.Ltmp32
        .long   0
        .byte   0
str.1:
        .ascii  "/checkout/src/liballoc/vec.rs"
panic_bounds_check_loc.2:
        .quad   str.1
        .quad   29
        .long   1555
        .long   10
DW.ref.rust_eh_personality:
        .quad   rust_eh_personality

I've tried adding -C lto and -C panic=abort to rustc without much luck. I've also tried replacing [0] with unsafe { *v.get_unchecked(0) }, without any luck. The only thing that makes it generate good assembly is using Vec::with_capacity(N) (see here):

pub fn foo() -> u32 {
  let mut v: Vec<u32> = Vec::with_capacity(3);
  v.push(7);
  v.push(4);
  v[1]
}

generates

example::foo:
        push    rbp
        mov     rbp, rsp
        mov     eax, 4
        pop     rbp
        ret
@leonardo-m

This is the asm I am seeing:

foo:
	pushq	%rsi
	subq	$64, %rsp
	movq	$4, 40(%rsp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovups	%xmm0, 48(%rsp)
	leaq	40(%rsp), %rcx
	callq	_ZN49_$LT$alloc..raw_vec..RawVec$LT$T$C$$u20$A$GT$$GT$6double17h82e5d6919d342c2fE
	movq	40(%rsp), %rcx
	movq	56(%rsp), %rax
	movl	$0, (%rcx,%rax,4)
	addq	$1, %rax
	movq	%rax, 56(%rsp)
	je	.LBB3_2
	movl	(%rcx), %esi
	movq	48(%rsp), %rdx
	testq	%rdx, %rdx
	je	.LBB3_8
	shlq	$2, %rdx
	movl	$4, %r8d
	callq	__rust_dealloc
.LBB3_8:
	movl	%esi, %eax
	addq	$64, %rsp
	popq	%rsi
	retq
.LBB3_2:
	leaq	panic_bounds_check_loc.2(%rip), %rcx
	xorl	%edx, %edx
	xorl	%r8d, %r8d
	callq	_ZN4core9panicking18panic_bounds_check17hbf9952bd7dce1212E
	ud2
.LBB3_4:
	movq	%rax, %rcx
	callq	rust_eh_unwind_resume
	ud2
.LBB3_9:
	movq	%rax, %rsi
	leaq	40(%rsp), %rcx
	callq	_ZN4core3ptr13drop_in_place17h731ac0064b2a28c1E
	movq	%rsi, %rcx
	callq	rust_eh_unwind_resume
	ud2

@gnzlbg
Contributor Author

gnzlbg commented Sep 9, 2017

This is the asm I am seeing on the playground (link):

.text
	.intel_syntax noprefix
	.file	"playground.cgu-0.rs"
	.section	".text.cold._ZN49_$LT$alloc..raw_vec..RawVec$LT$T$C$$u20$A$GT$$GT$6double17h1882b1047d8074ffE","ax",@progbits
	.p2align	4, 0x90
	.type	_ZN49_$LT$alloc..raw_vec..RawVec$LT$T$C$$u20$A$GT$$GT$6double17h1882b1047d8074ffE,@function
_ZN49_$LT$alloc..raw_vec..RawVec$LT$T$C$$u20$A$GT$$GT$6double17h1882b1047d8074ffE:
	.cfi_startproc
	push	r14
.Lcfi0:
	.cfi_def_cfa_offset 16
	push	rbx
.Lcfi1:
	.cfi_def_cfa_offset 24
	sub	rsp, 72
.Lcfi2:
	.cfi_def_cfa_offset 96
.Lcfi3:
	.cfi_offset rbx, -24
.Lcfi4:
	.cfi_offset r14, -16
	mov	r14, rdi
	mov	rbx, qword ptr [r14 + 8]
	test	rbx, rbx
	je	.LBB0_6
	lea	rsi, [4*rbx]
	lea	rcx, [8*rbx]
	mov	rdi, qword ptr [r14]
	lea	r9, [rsp + 8]
	mov	edx, 4
	mov	r8d, 4
	call	__rust_realloc@PLT
	test	rax, rax
	je	.LBB0_4
	add	rbx, rbx
	jmp	.LBB0_3
.LBB0_6:
	lea	rdx, [rsp + 8]
	mov	edi, 16
	mov	esi, 4
	call	__rust_alloc@PLT
	test	rax, rax
	je	.LBB0_8
	mov	ebx, 4
.LBB0_3:
	mov	qword ptr [r14], rax
	mov	qword ptr [r14 + 8], rbx
	add	rsp, 72
	pop	rbx
	pop	r14
	ret
.LBB0_4:
	mov	rax, qword ptr [rsp + 8]
	movups	xmm0, xmmword ptr [rsp + 16]
	movaps	xmmword ptr [rsp + 32], xmm0
	mov	qword ptr [rsp + 8], rax
	movaps	xmm0, xmmword ptr [rsp + 32]
	jmp	.LBB0_5
.LBB0_8:
	movups	xmm0, xmmword ptr [rsp + 16]
	movaps	xmmword ptr [rsp + 32], xmm0
	movaps	xmm0, xmmword ptr [rsp + 32]
	movaps	xmmword ptr [rsp + 48], xmm0
	movaps	xmm0, xmmword ptr [rsp + 48]
.LBB0_5:
	movups	xmmword ptr [rsp + 16], xmm0
	lea	rdi, [rsp + 8]
	call	_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h8b5f820f2dd0e667E
.Lfunc_end0:
	.size	_ZN49_$LT$alloc..raw_vec..RawVec$LT$T$C$$u20$A$GT$$GT$6double17h1882b1047d8074ffE, .Lfunc_end0-_ZN49_$LT$alloc..raw_vec..RawVec$LT$T$C$$u20$A$GT$$GT$6double17h1882b1047d8074ffE
	.cfi_endproc

	.section	.text._ZN4core3ptr13drop_in_place17h62d05f1818f0f9d8E,"ax",@progbits
	.p2align	4, 0x90
	.type	_ZN4core3ptr13drop_in_place17h62d05f1818f0f9d8E,@function
_ZN4core3ptr13drop_in_place17h62d05f1818f0f9d8E:
	.cfi_startproc
	mov	rsi, qword ptr [rdi + 8]
	test	rsi, rsi
	je	.LBB1_1
	mov	rdi, qword ptr [rdi]
	shl	rsi, 2
	mov	edx, 4
	jmp	__rust_dealloc@PLT
.LBB1_1:
	ret
.Lfunc_end1:
	.size	_ZN4core3ptr13drop_in_place17h62d05f1818f0f9d8E, .Lfunc_end1-_ZN4core3ptr13drop_in_place17h62d05f1818f0f9d8E
	.cfi_endproc

	.section	".text.cold._ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h8b5f820f2dd0e667E","ax",@progbits
	.p2align	4, 0x90
	.type	_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h8b5f820f2dd0e667E,@function
_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h8b5f820f2dd0e667E:
	.cfi_startproc
	sub	rsp, 24
.Lcfi5:
	.cfi_def_cfa_offset 32
	mov	rax, qword ptr [rdi + 16]
	mov	qword ptr [rsp + 16], rax
	movups	xmm0, xmmword ptr [rdi]
	movaps	xmmword ptr [rsp], xmm0
	mov	rdi, rsp
	call	__rust_oom@PLT
.Lfunc_end2:
	.size	_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h8b5f820f2dd0e667E, .Lfunc_end2-_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h8b5f820f2dd0e667E
	.cfi_endproc

	.section	.text._ZN10playground4main17hf21e17cd5a16d5d9E,"ax",@progbits
	.p2align	4, 0x90
	.type	_ZN10playground4main17hf21e17cd5a16d5d9E,@function
_ZN10playground4main17hf21e17cd5a16d5d9E:
.Lfunc_begin0:
	.cfi_startproc
	.cfi_personality 155, DW.ref.rust_eh_personality
	.cfi_lsda 27, .Lexception0
	push	rbx
.Lcfi6:
	.cfi_def_cfa_offset 16
	sub	rsp, 32
.Lcfi7:
	.cfi_def_cfa_offset 48
.Lcfi8:
	.cfi_offset rbx, -16
	mov	qword ptr [rsp + 8], 4
	xorps	xmm0, xmm0
	movups	xmmword ptr [rsp + 16], xmm0
.Ltmp0:
	lea	rdi, [rsp + 8]
	call	_ZN49_$LT$alloc..raw_vec..RawVec$LT$T$C$$u20$A$GT$$GT$6double17h1882b1047d8074ffE
.Ltmp1:
	mov	rdi, qword ptr [rsp + 8]
	mov	rax, qword ptr [rsp + 24]
	mov	dword ptr [rdi + 4*rax], 0
	inc	rax
	mov	qword ptr [rsp + 24], rax
	je	.LBB3_2
	mov	rsi, qword ptr [rsp + 16]
	test	rsi, rsi
	je	.LBB3_6
	shl	rsi, 2
	mov	edx, 4
	call	__rust_dealloc@PLT
.LBB3_6:
	add	rsp, 32
	pop	rbx
	ret
.LBB3_2:
.Ltmp2:
	lea	rdi, [rip + panic_bounds_check_loc.2]
	xor	esi, esi
	xor	edx, edx
	call	_ZN4core9panicking18panic_bounds_check17h78beadfd8229dc37E@PLT
.Ltmp3:
.LBB3_7:
.Ltmp4:
	mov	rbx, rax
	lea	rdi, [rsp + 8]
	call	_ZN4core3ptr13drop_in_place17h62d05f1818f0f9d8E
	mov	rdi, rbx
	call	_Unwind_Resume@PLT
.Lfunc_end3:
	.size	_ZN10playground4main17hf21e17cd5a16d5d9E, .Lfunc_end3-_ZN10playground4main17hf21e17cd5a16d5d9E
	.cfi_endproc
	.section	.gcc_except_table,"a",@progbits
	.p2align	2
GCC_except_table3:
.Lexception0:
	.byte	255
	.byte	155
	.asciz	"\234"
	.byte	3
	.byte	26
	.long	.Ltmp0-.Lfunc_begin0
	.long	.Ltmp3-.Ltmp0
	.long	.Ltmp4-.Lfunc_begin0
	.byte	0
	.long	.Ltmp3-.Lfunc_begin0
	.long	.Lfunc_end3-.Ltmp3
	.long	0
	.byte	0
	.p2align	2

	.section	.text.main,"ax",@progbits
	.globl	main
	.p2align	4, 0x90
	.type	main,@function
main:
	.cfi_startproc
	mov	rax, rsi
	mov	rcx, rdi
	lea	rdi, [rip + _ZN10playground4main17hf21e17cd5a16d5d9E]
	mov	rsi, rcx
	mov	rdx, rax
	jmp	_ZN3std2rt10lang_start17h573cecb903a42a26E@PLT
.Lfunc_end4:
	.size	main, .Lfunc_end4-main
	.cfi_endproc

	.type	str.1,@object
	.section	.rodata.str.1,"a",@progbits
	.p2align	4
str.1:
	.ascii	"/checkout/src/liballoc/vec.rs"
	.size	str.1, 29

	.type	panic_bounds_check_loc.2,@object
	.section	.data.rel.ro.panic_bounds_check_loc.2,"aw",@progbits
	.p2align	3
panic_bounds_check_loc.2:
	.quad	str.1
	.quad	29
	.long	1555
	.long	10
	.size	panic_bounds_check_loc.2, 24

	.hidden	DW.ref.rust_eh_personality
	.weak	DW.ref.rust_eh_personality
	.section	.data.DW.ref.rust_eh_personality,"aGw",@progbits,DW.ref.rust_eh_personality,comdat
	.p2align	3
	.type	DW.ref.rust_eh_personality,@object
	.size	DW.ref.rust_eh_personality, 8
DW.ref.rust_eh_personality:
	.quad	rust_eh_personality

	.section	".note.GNU-stack","",@progbits

Not all of it is due to the vector code (it seems the playground doesn't support creating an object file for code without a main function).

@oyvindln
Contributor

oyvindln commented Sep 9, 2017

One difference between using with_capacity and Vec::new is that RawVec::double, which is called by Vec::push, is annotated with #[cold] and #[inline(never)], which prevents the compiler from optimizing it out (at least without LTO). Normally, preventing it from being inlined is probably ideal, but this is a bit of an edge case. With with_capacity, the code around the allocation is inlined, and the compiler is able to figure out that it's redundant; it also avoids the call to double entirely, since the capacity was just set larger than the length.

It looks like the C++ version manages to optimize out the zeroing of length/capacity; the double call probably prevents this in the Rust version. There is also a bounds check, and I'm not sure why it isn't optimized out. Maybe the compiler can't prove that the length field (which is part of Vec, not RawVec) doesn't change in RawVec::double.
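
As a hedged illustration (a standalone toy built on public Vec methods, not the real liballoc source; grow_out_of_line and push_one are made-up names), this is the shape of the problem: once the growth path sits behind an opaque, never-inlined call, LLVM has to assume the call can modify any field it can reach, including the length:

#[cold]
#[inline(never)]
fn grow_out_of_line(v: &mut Vec<u32>) {
    // Stand-in for RawVec::double: grows the buffer out of line.
    v.reserve(1);
}

pub fn push_one(v: &mut Vec<u32>, x: u32) {
    if v.len() == v.capacity() {
        // Opaque call: the optimizer can no longer track len across it.
        grow_out_of_line(v);
    }
    v.push(x);
}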

My (possibly incorrect) attempt at labelling what's happening:

...

        ; Function prologue (save stack pointer and reserve space)
        push    rbp
        mov     rbp, rsp
        push    rbx
        sub     rsp, 24
        ; This sets the internal ptr to the (eventually heap-allocated) data to 4. (The pointer is initially set to the alignment of the type while the vector is empty.)
        mov     qword ptr [rbp - 32], 4
        ; And this sets both the capacity and length to 0 using vector instructions.
        xorps   xmm0, xmm0
        movups  xmmword ptr [rbp - 24], xmm0

        ; Load pointer to the RawVec to pass to the double function (I think)
        lea     rdi, [rbp - 32]
        ; Call to capacity doubling function.
        call    <alloc::raw_vec::RawVec<T, A>>::double
        ; Load ptr to heap memory into %rdi
        mov     rdi, qword ptr [rbp - 32]
        ; Load length into %rax
        mov     rax, qword ptr [rbp - 16]
        ; Set first element to 0 (I suppose it could be possible to avoid doing the 4*rax bit here since the length should always be 0 at this point.)
        mov     dword ptr [rdi + 4*rax], 0
        ; Increment length
        inc     rax
        ; Store the length back to the length field in the vector instance
        mov     qword ptr [rbp - 16], rax
        ; Jump to the bounds check panic if incrementing the length wraps to zero (I think)
        je      .LBB3_2
        ; Load value of the first element into %ebx
        mov     ebx, dword ptr [rdi]
        ; Load the capacity into %rsi
        mov     rsi, qword ptr [rbp - 24]
        ; Check if capacity == 0
        test    rsi, rsi
        ; Jump to end and skip deallocation if the capacity of the vector was 0.
        je      .LBB3_6
        ; Set up arguments to rust_dealloc (I think)
        shl     rsi, 2
        mov     edx, 4
        ; Deallocate vector memory
        call    __rust_dealloc@PLT
.LBB3_6:
        ; Move the saved value from %ebx into the return register %eax
        mov     eax, ebx
        ; Function epilogue
        add     rsp, 24
        pop     rbx
        pop     rbp
        ret
...

@gnzlbg
Contributor Author

gnzlbg commented Sep 9, 2017

When using ::with_capacity the Rust version:

example::foo:
        push    rbp
        mov     rbp, rsp
        mov     eax, 4
        pop     rbp
        ret

is way better than the C++ version

foo(): # @foo()
  push rax
  mov edi, 4
  ;; -------------------------------------
  ;; what's the point of this memory allocation?
  call operator new(unsigned long)
  mov rdi, rax
  call operator delete(void*)
  ;; -------------------------------------
  mov eax, 314
  pop rcx
  ret

Ideally the Rust versions with and without ::with_capacity would beat the C++ version in both situations.

@oyvindln
Contributor

oyvindln commented Sep 9, 2017

By altering Vec::push to:

        let len = self.len;
        if len == self.buf.cap() {
            self.buf.double();
        }
        unsafe {
            let end = self.as_mut_ptr().offset(len as isize);
            ptr::write(end, value);
            self.len = len + 1;
        }

I got rid of the bounds check.

I then tried to remove #[inline(never)] from RawVec::double(), and lo and behold, the example with 0 resolved to this:

	xorl	%eax, %eax
	retq

Using other (non-0) values in push seems to work fine too.

I don't know whether that could have negative effects in other cases though.

@gnzlbg
Contributor Author

gnzlbg commented Sep 9, 2017

Is RawVec::double() still #[cold] in your case?

@oyvindln
Contributor

oyvindln commented Sep 9, 2017

Yeah, I didn't remove that.

@gnzlbg
Contributor Author

gnzlbg commented Sep 9, 2017

So... two questions:

  • why is double marked #[inline(never)]? I understand that the point of making it cold is to keep the compiler from inlining it and trashing the instruction cache for the "rare" situation in which the Vec grows. However, #[inline(never)] is a hammer that prevents some optimizations, as in these examples. Isn't cold enough?

  • does double actually double the size of a Vec, or does it use a memory-reuse-friendly growth factor like the one stated here and here, or, probably more relevantly, the one used at Facebook (they use jemalloc internally as well)? (In most vector implementations I know of, double is called grow instead, to denote that the growth factor is an implementation detail.)
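
As a hedged aside on the second question (a toy calculation with made-up sizes, ignoring allocator binning and address contiguity): with factor 2 the next block is always larger than the sum of all blocks freed so far, so a grow can never be satisfied from previously freed space, while with 1.5 the freed space eventually catches up:

fn main() {
    for &factor in &[2.0f64, 1.5] {
        let mut cap = 4.0f64; // current capacity in elements
        let mut freed = 0.0f64; // total capacity released by earlier grows
        println!("growth factor {}:", factor);
        for step in 0..8 {
            let next = (cap * factor).ceil();
            println!(
                "  step {}: {} -> {} elements, freed so far: {}, reusable: {}",
                step, cap, next, freed, freed >= next
            );
            freed += cap; // the old block is released after the copy
            cap = next;
        }
    }
}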

@oyvindln
Contributor

oyvindln commented Sep 9, 2017

Double is simply doubling.
There is an issue about the growth strategy of vectors: #29931

@matthewjasper
Contributor

why is double marked #[inline(never)]?

See #23670; reducing the size of Vec::push was the motivation given.

@oyvindln
Contributor

oyvindln commented Sep 9, 2017

Seems there is a bit of a tradeoff here.

@gnzlbg
Contributor Author

gnzlbg commented Sep 9, 2017

Here's one silly benchmark:

I don't think the benchmark provided is silly. It shows that something wrong is going on with #[cold]. Why is a cold function being inlined when it is not profitable to do so?

It would be nice to know what happens in that benchmark today without #[inline(never)], and to check the assembly.


EDIT: maybe cold is not enough, and we need to annotate the call to double as unlikely (using LLVM's branch prediction hints)?

@oyvindln
Contributor

oyvindln commented Sep 9, 2017

Extend doesn't use push anymore, so that benchmark isn't all that useful for current Rust.

EDIT: If I use push manually, the non-inlined version is significantly faster:

#[bench]
fn x(b: &mut Bencher) {
    let mut v = Vec::with_capacity(100);
    b.iter(|| {
        for n in 0..100 {
            v.push(n);
        }
        v.truncate(0);
    });
}
With #[inline(never)]:
test x ... bench:         202 ns/iter (+/- 5)

Without:
test x ... bench:         334 ns/iter (+/- 29)

@oyvindln
Contributor

oyvindln commented Sep 9, 2017

Actually, completely optimizing away the allocation would technically not be correct, since that would strictly speaking be changing the behaviour of the function, as the allocation could theoretically fail and cause a panic/exception. So in other words, the C++ version having a memory allocation call is correct.

It might still be possible to improve the Rust implementation so that it optimizes down to something similar to the C++ one, though.

@oyvindln
Contributor

The first change in my previous comment makes this benchmark significantly faster (I got about 120-130 ns, compared to around 200 for the current version). It increases the code size a small bit, though, so for functions where nothing is known about the length of the vector it might cause a slight performance decrease.

I'm tinkering around to see if I can reproduce it using just the assume intrinsic; with that I managed to get rid of the bounds check in the original example, at least, without any code size increase.
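
For reference, a minimal sketch of the assume idea (the intrinsic is nightly-only and unsafe; first_element is a made-up name, not the actual change being tested):

#![feature(core_intrinsics)]
use std::intrinsics::assume;

// Telling LLVM a fact it cannot prove lets it drop the matching
// bounds check; stating a false fact here would be UB.
pub fn first_element(v: &[u32]) -> u32 {
    unsafe { assume(!v.is_empty()) };
    v[0] // no panic path left once LLVM knows the slice is non-empty
}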

@gnzlbg
Contributor Author

gnzlbg commented Sep 10, 2017

I'm tinkering around to see if I can reproduce it with just using the assume intrinsic,

There is an llvm.expect intrinsic to tell LLVM that we expect the branch to not be taken in most cases.

@oyvindln
Contributor

oyvindln commented Sep 10, 2017

I'm tinkering around to see if I can reproduce it with just using the assume intrinsic,

I managed to avoid the code size increase (EDIT: apparently there was still one extra instruction), so at least I now have an improvement that avoids the bounds check and improves the benchmark (it seems LLVM manages to merge the loop counter and the length value after my changes). I don't know if it's possible to reduce it further without letting double be inlined, but I'm not sure how to manage that without increasing the code size and slowing things down. (Using unlikely() just prevented it from being inlined for the code in the OP, but not for a function that only did a push, so that wasn't very useful.)

I'm preparing a PR with my changes, so people can test if they are of any use.
EDIT:
There seem to be a few tradeoffs between the different ways of altering push, and I'm not sure which cases are best to optimize for. Putting push inside a tight loop on its own may not be the most real-world-relevant example, since one would normally use extend, which avoids push entirely and is way faster, as it can avoid checking the capacity inside the loop.
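
For context, a sketch of the two usage patterns being contrasted (plain user code, not the stdlib internals):

// With repeated push, the capacity check runs on every iteration;
// extend can reserve once up front via the iterator's size_hint.
fn fill_with_push(v: &mut Vec<u32>) {
    for n in 0..100 {
        v.push(n); // per-iteration capacity check
    }
}

fn fill_with_extend(v: &mut Vec<u32>) {
    v.extend(0..100); // one reservation, then straight writes
}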

@TimNN TimNN added A-codegen Area: Code generation C-enhancement Category: An issue proposing an enhancement or a PR with one. I-slow Issue: Problems and improvements with respect to performance of generated code. T-compiler Relevant to the compiler team, which will review and decide on the PR/issue. labels Sep 17, 2017
@H2CO3

H2CO3 commented Sep 17, 2017

@oyvindln how is getting rid of a potential panic "changing the behavior"? I mean, if the optimized vector doesn't fail with OOM when the unoptimized one would, is that really bad? Surely it doesn't affect algorithmic correctness… Memory allocation, OOM conditions, etc. are mostly implementation details (and consequently very chaotic on modern OSes) anyway.

If someone is trying to test OOM by allocating and destroying single-element vectors, s/he is better off calling malloc() directly, no? And I fail to see any other case where removing an error condition via optimization could be called "changing observable behavior". Saying this would be akin to not allowing register allocation "because local variables are supposed to be on the stack". It does not quite seem reasonable to me, frankly.

@oyvindln
Contributor

oyvindln commented Sep 17, 2017

@H2CO3 Ah sorry, I worded myself badly. I don't see a problem with optimizing out the allocations in practice (and normally that's what you would want), I was just pondering why the allocations weren't removed in the C++ version.

I tried various combinations of assume intrinsic calls on the length and on the capacity, different orderings of the increment and the pointer write, and using unlikely on the capacity check in Vec<T>::push(), but it's a bit hard to say which combination would be the ideal one without a benchmark that is more relevant to real-world code. (Also, using assume a lot is a bit iffy, as it can cause problems if used incorrectly.)

This was one compromise I came up with:

    pub fn push(&mut self, value: T) {
        unsafe {
            // This will panic or abort if we would allocate > isize::MAX bytes
            // or if the length increment would overflow for zero-sized types.
            if unlikely(self.len == self.buf.cap()) {
                let len = self.len;
                self.buf.double();
                // Let llvm know that the length didn't change
                // so if we push to a new vector of length 0,
                // we don't have to calculate the offset from
                // the start of the array.
                // This might be worth revisiting once #31681 is solved.
                // See also #44452.
                assume(self.len() == len);
            }

            let end = self.as_mut_ptr().offset(self.len as isize);
            // Copy len to a local before writing `value` to the vector. This seems to enable
            // llvm to merge the length and loop counter when writing to a fresh vector.
            let len = self.len;
            ptr::write(end, value);
            // Let llvm know that the length doesn't change in the write call.
            assume(self.len() == len);
            self.len += 1;
            // Let llvm know that len didn't overflow.
            // We can safely assume that there was no overflow,
            // since if len == usize::MAX, growing the vector would fail.
            // We already made the assumption that the invariant len <= capacity
            // holds when checked if we should grow the vector.
            //
            // Letting llvm know that there was no overflow can help llvm
            // avoid bounds checks after a push in some situations.
            assume(self.len() != 0);
        }
    }

It does result in one extra instruction for a lone raw push call, though.
It's a bit harder to experiment with changes to RawVec::double, as it uses some Rust internals, so it's difficult to test as a local function.

@H2CO3

H2CO3 commented Sep 17, 2017

@oyvindln Oh, that does make sense. Sorry, I thought you were writing in favor of not optimizing away the allocation.

@gnzlbg
Contributor Author

gnzlbg commented Sep 17, 2017

@oyvindln

unlikely(self.len == self.buf.cap())

this is what I had in mind, but I expected the code to look like this:

if unlikely(full()) {
  grow();
}

and grow should be marked #[cold], so that the cold applies to everything in the branch.
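
A compilable sketch of that shape (full's check is inlined here, grow is a stand-in built on public Vec methods, and unlikely is the nightly intrinsic, so none of this is the real liballoc code):

#![feature(core_intrinsics)]
use std::intrinsics::unlikely;

#[cold]
fn grow(v: &mut Vec<u32>) {
    // Everything reachable from here inherits the cold hint.
    v.reserve(1);
}

pub fn push_with_hint(v: &mut Vec<u32>, x: u32) {
    // The hint marks this branch as rarely taken.
    if unsafe { unlikely(v.len() == v.capacity()) } {
        grow(v);
    }
    v.push(x);
}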

@Techcable

Techcable commented Oct 1, 2017

I know that cold calls are never inlined by LLVM, while they merely get a lower threshold in the MIR inliner. Is the MIR inliner the reason why it's conventional in the Rust stdlib to mark functions as both cold and inline(never), when cold already prevents inlining in LLVM?

Would dropping cold functions from consideration in MIR inlining still prevent the double function from being inlined, while still allowing LLVM to fully optimize the call? Even if it doesn't fix the issue, we should probably prevent cold functions from being inlined, for consistency with LLVM's behavior.

@hanna-kruppe
Contributor

@Techcable MIR inlining is not currently enabled by default, and LLVM does inline coldcc functions (source). Also note that #[cold] translates to the cold attribute, not the coldcc calling convention, though that doesn't prevent inlining either.
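
For concreteness, a minimal sketch of the attribute side of that distinction (made-up function names):

// #[cold] lowers to LLVM's `cold` function attribute, a hint that
// the call is rarely executed; it does not switch the function to
// the coldcc calling convention, and per the note above it does
// not by itself forbid inlining.
#[cold]
fn report_oom() -> ! {
    panic!("allocation failed");
}

pub fn check(ptr: *mut u8) {
    if ptr.is_null() {
        report_oom();
    }
}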

@Techcable

Techcable commented Oct 2, 2017

Thanks for the clarification @rkruppe, you'd think the LLVM docs would be right! Do you think using the coldcc calling convention instead would help prevent the issue? At the very least it'd help performance by reducing the call overhead.

@lenaschoenburg
Contributor

On nightly the original code:

pub fn foo() -> u32 {
  let mut v: Vec<u32> = Vec::new();
  v.push(0);
  v[0]
}

now generates this:

example::foo:
  xor eax, eax
  ret

https://godbolt.org/g/bo4WaM

@gnzlbg
Contributor Author

gnzlbg commented Jun 25, 2018

Nice! A second push still makes it trip: https://godbolt.org/g/TZ1Mbo =/

clang appears to handle a couple of push_backs before tripping.

@Ixrec
Contributor

Ixrec commented Jul 2, 2018

No idea if it'll help, but this sounds like the optimizer limitation that this C++ lightning talk is about: https://www.youtube.com/watch?v=s4wnuiCwTGU Dunno why Rust would fare worse at this.

@the8472
Member

the8472 commented Jun 6, 2020

The only thing that makes it generate good assembly is using Vec::with_capacity(N) (see here):

pub fn foo() -> u32 {
  let mut v: Vec<u32> = Vec::with_capacity(3);
  v.push(7);
  v.push(4);
  v[1]
}

generates

example::foo:
        push    rbp
        mov     rbp, rsp
        mov     eax, 4
        pop     rbp
        ret

It doesn't anymore; it regressed from 1.42 to 1.43.

Edit: Currently O2 results in the expected assembly, O3 in a large mess.

@saethlin
Member

saethlin commented Feb 9, 2021

It looks like going from 1.49.0 to 1.50.0, we now generate the expected optimal code for any number of pushes when sufficient capacity is provided using Vec::with_capacity and -Cpanic=abort is passed. But the allocations only fold away for a few pushes without -Cpanic=abort.

@workingjubilee workingjubilee added the C-optimization Category: An issue highlighting optimization opportunities or PRs implementing such label Oct 8, 2023