Closed
Description
There are two missed optimization opportunities:
- The copy of the elements could be a `memmove`/`memcpy`, but the compiler is unable to recognize the loop idiom.
- The new copy is dead, so it could be removed entirely.
#include<vector>
#include<string>
using namespace std;
using T = int;
// Reproducer for the missed optimizations described above: copy-constructs a
// local vector from the by-value parameter, never reads the copy, and returns
// a constant.
T vat1(std::vector<T> v1) {
// `v` is never used after this line, so the copy has no observable use here.
auto v = v1;
return 10;
}
$ clang++ -std=c++17 -O3 -fno-exceptions
# Compiler output for vat1 (Intel syntax). rdi points at the incoming vector
# object; the function always returns 10 in eax on the normal path.
# Register roles after the loads below:
#   r15 = first qword of the source vector (presumably its begin pointer)
#   r14 = second qword of the source vector (presumably its end pointer)
#   rbx = r14 - r15, the size of the element range in bytes
#   rax = buffer returned by operator new (destination of the copy)
vat1(std::__1::vector<int, std::__1::allocator<int> >): # @vat1(std::__1::vector<int, std::__1::allocator<int> >)
# Save the registers this function overwrites and reserve 32 bytes of stack.
push r15
push r14
push rbx
sub rsp, 32
# Zero 24 bytes at [rsp, rsp+24): presumably the three pointer members of the
# local vector `v`.
xorps xmm0, xmm0
movaps xmmword ptr [rsp], xmm0
mov qword ptr [rsp + 16], 0
mov r15, qword ptr [rdi]
mov r14, qword ptr [rdi + 8]
# rbx = byte length of the source range; the flags from `sub` drive both
# branches below.
mov rbx, r14
sub rbx, r15
# Empty source: nothing to allocate or copy, go straight to the return.
je .LBB0_10
# Negative length: take the length-error path.
js .LBB0_11
# Allocate rbx bytes for the copy.
mov rdi, rbx
call operator new(unsigned long)@PLT
# rbx = byte length - 4. If that is below 28 (fewer than 8 four-byte elements)
# use the scalar loop only.
add rbx, -4
cmp rbx, 28
jb .LBB0_3
# rcx = dst - src. If the destination starts less than 32 bytes above the
# source, the 32-byte-wide vector loop is skipped in favour of the scalar loop.
mov rcx, rax
sub rcx, r15
cmp rcx, 32
jb .LBB0_3
# rbx = element count: ((bytes - 4) >> 2) + 1.
shr rbx, 2
inc rbx
# rsi = element count rounded down to a multiple of 8 (elements handled by
# the vector loop).
mov rsi, rbx
and rsi, -8
# rcx / rdx = source / destination positions where the scalar tail starts.
lea rcx, [r15 + 4*rsi]
lea rdx, [rax + 4*rsi]
# rdi = element index for the vector loop.
xor edi, edi
# Vector loop: copies 8 dwords (32 bytes) per iteration with two unaligned
# 16-byte loads followed by two unaligned 16-byte stores.
.LBB0_6: # =>This Inner Loop Header: Depth=1
movups xmm0, xmmword ptr [r15 + 4*rdi]
movups xmm1, xmmword ptr [r15 + 4*rdi + 16]
movups xmmword ptr [rax + 4*rdi], xmm0
movups xmmword ptr [rax + 4*rdi + 16], xmm1
add rdi, 8
cmp rsi, rdi
jne .LBB0_6
# If the count was not a multiple of 8, finish the remainder in the scalar
# loop; otherwise the copy is complete.
cmp rbx, rsi
jne .LBB0_8
jmp .LBB0_9
# Scalar-only entry: start from the beginning of source and destination.
.LBB0_3:
mov rcx, r15
mov rdx, rax
# Scalar loop: copies one dword per iteration until the source cursor reaches
# r14.
.LBB0_8: # =>This Inner Loop Header: Depth=1
mov esi, dword ptr [rcx]
mov dword ptr [rdx], esi
add rcx, 4
add rdx, 4
cmp rcx, r14
jne .LBB0_8
# The freshly filled buffer is released immediately; nothing reads it between
# the copy and this call.
.LBB0_9:
mov rdi, rax
call operator delete(void*)@PLT
# Common exit: return 10, release the stack area, restore saved registers.
.LBB0_10:
mov eax, 10
add rsp, 32
pop rbx
pop r14
pop r15
ret
# Length-error path: passes the address of the stack-resident object as the
# argument. No instructions follow the call in this listing, so the callee
# presumably does not return.
.LBB0_11:
mov rdi, rsp
call std::__1::vector<int, std::__1::allocator<int> >::__throw_length_error[abi:v170000]() const