opnsense-src/lib/libc/amd64/string/memmove.S
Alexander Motin f22068d91b amd64: Stop using REP MOVSB for backward memmove()s.
Enhanced REP MOVSB feature of CPUs starting from Ivy Bridge makes
REP MOVSB the fastest way to copy memory in most of cases. However
Intel Optimization Reference Manual says: "setting the DF to force
REP MOVSB to copy bytes from high towards low addresses will
experience significant performance degradation". Measurements on Intel
Cascade Lake and Alder Lake, same as on AMD Zen3 show that it can
drop throughput to as low as 2.5-3.5GB/s, comparing to ~10-30GB/s
of REP MOVSQ or hand-rolled loop, used for non-ERMS CPUs.

This patch keeps ERMS use for forward ordered memory copies, but
removes it for backward overlapped moves where it does not work.

This is just a cosmetic sync with kernel, since libc does not use
ERMS at this time.

Reviewed by:    mjg
MFC after:	2 weeks
2022-06-16 14:51:50 -04:00

308 lines
5.6 KiB
ArmAsm

/*-
* Copyright (c) 2018 The FreeBSD Foundation
*
* This software was developed by Mateusz Guzik <mjg@FreeBSD.org>
* under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <machine/asm.h>
__FBSDID("$FreeBSD$");
/*
* Note: this routine was written with kernel use in mind (read: no simd),
* it is only present in userspace as a temporary measure until something
* better gets imported.
*/
#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
/*
* memmove(dst, src, cnt)
* rdi, rsi, rdx
*/
/*
* Register state at entry is supposed to be as follows:
* rdi - destination
* rsi - source
* rdx - count
*
* The macro possibly clobbers the above and: rcx, r8, r9, r10
* It does not clobber rax nor r11.
*/
/*
 * MEMMOVE: copy %rcx bytes from (%rsi) to (%rdi).
 *
 * Macro arguments:
 *  erms    - 1: use REP MOVSB (ERMS) for large forward copies
 *            0: use REP MOVSQ plus a residual-byte fixup
 *  overlap - 1: detect "dst overlaps source tail" and copy backwards
 *            in that case (memmove semantics); 0: always copy forward
 *  begin   - macro expanded once at entry
 *  end     - macro expanded immediately before every ret
 *
 * Numeric local labels roughly encode the byte range they handle
 * (e.g. 101632 dispatches copies of up to 32 bytes, 100816 the
 * 8..15 byte case); the 2xxx labels are the backward copiers.
 */
.macro MEMMOVE erms overlap begin end
	\begin

	/*
	 * For sizes 0..32 all data is read before it is written, so there
	 * is no correctness issue with direction of copying.
	 */
	cmpq	$32,%rcx
	jbe	101632f

.if \overlap == 1
	/* Unsigned (dst - src) < len iff regions overlap with src < dst. */
	movq	%rdi,%r8
	subq	%rsi,%r8
	cmpq	%rcx,%r8	/* overlapping && src < dst? */
	jb	2f
.endif

	cmpq	$256,%rcx
	ja	1256f

	/* Forward copy, 32 bytes per iteration, for 33..256 bytes. */
	ALIGN_TEXT
103200:
	movq	(%rsi),%rdx
	movq	%rdx,(%rdi)
	movq	8(%rsi),%rdx
	movq	%rdx,8(%rdi)
	movq	16(%rsi),%rdx
	movq	%rdx,16(%rdi)
	movq	24(%rsi),%rdx
	movq	%rdx,24(%rdi)
	leaq	32(%rsi),%rsi
	leaq	32(%rdi),%rdi
	subq	$32,%rcx
	cmpq	$32,%rcx
	jae	103200b
	cmpb	$0,%cl
	jne	101632f
	\end
	ret

	/*
	 * 0..32 bytes: perform all loads (head and tail, possibly
	 * overlapping in the middle) before any store, so these paths
	 * are direction-agnostic and safe for overlapped regions.
	 */
	ALIGN_TEXT
101632:
	cmpb	$16,%cl
	jl	100816f
	movq	(%rsi),%rdx
	movq	8(%rsi),%r8
	movq	-16(%rsi,%rcx),%r9
	movq	-8(%rsi,%rcx),%r10
	movq	%rdx,(%rdi)
	movq	%r8,8(%rdi)
	movq	%r9,-16(%rdi,%rcx)
	movq	%r10,-8(%rdi,%rcx)
	\end
	ret
	ALIGN_TEXT
100816:
	cmpb	$8,%cl
	jl	100408f
	movq	(%rsi),%rdx
	movq	-8(%rsi,%rcx),%r8
	movq	%rdx,(%rdi)
	movq	%r8,-8(%rdi,%rcx)
	\end
	ret
	ALIGN_TEXT
100408:
	cmpb	$4,%cl
	jl	100204f
	movl	(%rsi),%edx
	movl	-4(%rsi,%rcx),%r8d
	movl	%edx,(%rdi)
	movl	%r8d,-4(%rdi,%rcx)
	\end
	ret
	ALIGN_TEXT
100204:
	cmpb	$2,%cl
	jl	100001f
	movzwl	(%rsi),%edx
	movzwl	-2(%rsi,%rcx),%r8d
	movw	%dx,(%rdi)
	movw	%r8w,-2(%rdi,%rcx)
	\end
	ret
	ALIGN_TEXT
100001:
	cmpb	$1,%cl
	jl	100000f
	movb	(%rsi),%dl
	movb	%dl,(%rdi)
100000:
	\end
	ret

	/*
	 * > 256 bytes, forward order: use string moves.  %rdx still
	 * holds the full count here (set up by \begin, untouched on
	 * this path) and supplies the residual count below.
	 */
	ALIGN_TEXT
1256:
	testb	$15,%dil
	jnz	100f
.if \erms == 1
	rep
	movsb
.else
	shrq	$3,%rcx			/* copy by 64-bit words */
	rep
	movsq
	movq	%rdx,%rcx
	andl	$7,%ecx			/* any bytes left? */
	jne	100408b
.endif
	\end
	ret
100:
	/*
	 * Destination not 16-byte aligned: stash the first 16 source
	 * bytes in %r8/%r9, round both pointers up to the next 16-byte
	 * destination boundary, string-copy the rest, then store the
	 * stashed head to the original destination (%r10) last.
	 */
	movq	(%rsi),%r8
	movq	8(%rsi),%r9
	movq	%rdi,%r10
	movq	%rdi,%rcx
	andq	$15,%rcx
	leaq	-16(%rdx,%rcx),%rdx	/* bytes remaining past the head */
	neg	%rcx
	leaq	16(%rdi,%rcx),%rdi
	leaq	16(%rsi,%rcx),%rsi
	movq	%rdx,%rcx
.if \erms == 1
	rep
	movsb
	movq	%r8,(%r10)
	movq	%r9,8(%r10)
.else
	shrq	$3,%rcx			/* copy by 64-bit words */
	rep
	movsq
	movq	%r8,(%r10)
	movq	%r9,8(%r10)
	movq	%rdx,%rcx
	andl	$7,%ecx			/* any bytes left? */
	jne	100408b
.endif
	\end
	ret
.if \overlap == 1
	/*
	 * Copy backwards (src < dst and the regions overlap).
	 */
	ALIGN_TEXT
2:
	cmpq	$256,%rcx
	ja	2256f

	/* Point both pointers at the last qword of their buffers. */
	leaq	-8(%rdi,%rcx),%rdi
	leaq	-8(%rsi,%rcx),%rsi

	cmpq	$32,%rcx
	jb	2016f

	/* Backward copy, 32 bytes per iteration. */
	ALIGN_TEXT
2032:
	movq	(%rsi),%rdx
	movq	%rdx,(%rdi)
	movq	-8(%rsi),%rdx
	movq	%rdx,-8(%rdi)
	movq	-16(%rsi),%rdx
	movq	%rdx,-16(%rdi)
	movq	-24(%rsi),%rdx
	movq	%rdx,-24(%rdi)
	leaq	-32(%rsi),%rsi
	leaq	-32(%rdi),%rdi
	subq	$32,%rcx
	cmpq	$32,%rcx
	jae	2032b
	cmpb	$0,%cl
	jne	2016f
	\end
	ret

	/* Backward residual: peel off 16, 8, 4, 2 and 1 bytes in turn. */
	ALIGN_TEXT
2016:
	cmpb	$16,%cl
	jl	2008f
	movq	(%rsi),%rdx
	movq	%rdx,(%rdi)
	movq	-8(%rsi),%rdx
	movq	%rdx,-8(%rdi)
	subb	$16,%cl
	jz	2000f
	leaq	-16(%rsi),%rsi
	leaq	-16(%rdi),%rdi
2008:
	cmpb	$8,%cl
	jl	2004f
	movq	(%rsi),%rdx
	movq	%rdx,(%rdi)
	subb	$8,%cl
	jz	2000f
	leaq	-8(%rsi),%rsi
	leaq	-8(%rdi),%rdi
2004:
	cmpb	$4,%cl
	jl	2002f
	movl	4(%rsi),%edx
	movl	%edx,4(%rdi)
	subb	$4,%cl
	jz	2000f
	leaq	-4(%rsi),%rsi
	leaq	-4(%rdi),%rdi
2002:
	cmpb	$2,%cl
	jl	2001f
	movw	6(%rsi),%dx
	movw	%dx,6(%rdi)
	subb	$2,%cl
	jz	2000f
	leaq	-2(%rsi),%rsi
	leaq	-2(%rdi),%rdi
2001:
	cmpb	$1,%cl
	jl	2000f
	movb	7(%rsi),%dl
	movb	%dl,7(%rdi)
2000:
	\end
	ret

	/*
	 * > 256 bytes backwards: REP MOVSQ with DF set -- deliberately
	 * not REP MOVSB, which per the Intel Optimization Reference
	 * Manual is heavily pessimized with DF=1 (see header note).
	 * Remaining 0..7 bytes are finished via the 2004 chain above.
	 */
	ALIGN_TEXT
2256:
	std
	leaq	-8(%rdi,%rcx),%rdi
	leaq	-8(%rsi,%rcx),%rsi
	shrq	$3,%rcx
	rep
	movsq
	cld
	movq	%rdx,%rcx
	andb	$7,%cl
	jne	2004b
	\end
	ret
.endif
.endm
/*
 * MEMMOVE_BEGIN: entry hook expanded as \begin by MEMMOVE.
 * Moves the byte count into %rcx, where the copy loops expect it,
 * and preloads the return value: memmove()/memcpy() return dst.
 * The two moves are independent; neither touches flags.
 */
.macro MEMMOVE_BEGIN
	movq	%rdx,%rcx		/* rcx = count */
	movq	%rdi,%rax		/* return value = dst */
.endm
/*
 * MEMMOVE_END: expanded as \end immediately before every ret inside
 * MEMMOVE.  Intentionally empty for this userspace build; presumably
 * retained as an epilogue hook for other users of the shared template
 * (the header mentions a kernel sync) -- confirm against sys/amd64.
 */
.macro MEMMOVE_END
.endm
#ifndef MEMCPY
/* memmove(3) must handle overlapping buffers: overlap=1. */
ENTRY(memmove)
	MEMMOVE erms=0 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END
END(memmove)
#else
/*
 * memcpy(3): the C standard makes overlapping copies undefined, so
 * skip the overlap check entirely (overlap=0), matching upstream
 * FreeBSD; this also drops the backward-copy code from the expansion.
 */
ENTRY(memcpy)
	MEMMOVE erms=0 overlap=0 begin=MEMMOVE_BEGIN end=MEMMOVE_END
END(memcpy)
#endif