From ddf6571230dd508f458982c911ba332da0fcbab4 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 1 Dec 2018 14:20:32 +0000 Subject: [PATCH] amd64: align target memmove buffer to 16 bytes before using rep movs See the review for sample test results. Reviewed by: kib (kernel part) Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D18401 --- lib/libc/amd64/string/memmove.S | 30 ++++++++++++++++++++++++++++++ sys/amd64/amd64/support.S | 30 ++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/lib/libc/amd64/string/memmove.S b/lib/libc/amd64/string/memmove.S index 5a466f7a7d6..accc8644061 100644 --- a/lib/libc/amd64/string/memmove.S +++ b/lib/libc/amd64/string/memmove.S @@ -139,6 +139,8 @@ __FBSDID("$FreeBSD$"); ALIGN_TEXT 1256: + testb $15,%dil + jnz 100f .if \erms == 1 rep movsb @@ -152,6 +154,34 @@ __FBSDID("$FreeBSD$"); .endif \end ret +100: + movq (%rsi),%r8 + movq 8(%rsi),%r9 + movq %rdi,%r10 + movq %rdi,%rcx + andq $15,%rcx + leaq -16(%rdx,%rcx),%rdx + neg %rcx + leaq 16(%rdi,%rcx),%rdi + leaq 16(%rsi,%rcx),%rsi + movq %rdx,%rcx +.if \erms == 1 + rep + movsb + movq %r8,(%r10) + movq %r9,8(%r10) +.else + shrq $3,%rcx /* copy by 64-bit words */ + rep + movsq + movq %r8,(%r10) + movq %r9,8(%r10) + movq %rdx,%rcx + andl $7,%ecx /* any bytes left? */ + jne 100408b +.endif + \end + ret .if \overlap == 1 /* diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index c5b167d1783..2f2897613d1 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -302,6 +302,8 @@ END(memcmp) ALIGN_TEXT 1256: + testb $15,%dil + jnz 100f .if \erms == 1 rep movsb @@ -315,6 +317,34 @@ END(memcmp) .endif \end ret +100: + movq (%rsi),%r8 + movq 8(%rsi),%r9 + movq %rdi,%r10 + movq %rdi,%rcx + andq $15,%rcx + leaq -16(%rdx,%rcx),%rdx + neg %rcx + leaq 16(%rdi,%rcx),%rdi + leaq 16(%rsi,%rcx),%rsi + movq %rdx,%rcx +.if \erms == 1 + rep + movsb + movq %r8,(%r10) + movq %r9,8(%r10) +.else + shrq $3,%rcx /* copy by 64-bit words */ + rep + movsq + movq %r8,(%r10) + movq %r9,8(%r10) + movq %rdx,%rcx + andl $7,%ecx /* any bytes left? */ + jne 100408b +.endif + \end + ret .if \overlap == 1 /*