mirror of
https://github.com/opnsense/src.git
synced 2026-06-04 22:32:43 -04:00
lib/libc/amd64/string: add memccpy scalar, baseline implementation
Based on the strlcpy code from D42863, this patch adds a SIMD-enhanced implementation of memccpy for amd64. A scalar implementation calling into memchr and memcpy to do the job is provided, too. Please note that this code does not behave exactly the same as the C implementation of memccpy for overlapping inputs. However, overlapping inputs are not allowed for this function by ISO/IEC 9899:1999 and neither has the C implementation any code to deal with the possibility. It just proceeds byte-by-byte, which may or may not do the expected thing for some overlaps. We do not document whether overlapping inputs are supported in memccpy(3). Tested by: developers@, exp-run Approved by: mjg MFC after: 1 month MFC to: stable/14 PR: 275785 Differential Revision: https://reviews.freebsd.org/D42902
This commit is contained in:
parent
e4b7b0bcbc
commit
fc0e38a7a6
2 changed files with 260 additions and 0 deletions
|
|
@ -3,6 +3,7 @@ MDSRCS+= \
|
|||
bcmp.S \
|
||||
memchr.S \
|
||||
memcmp.S \
|
||||
memccpy.S \
|
||||
memcpy.S \
|
||||
memmove.S \
|
||||
memset.S \
|
||||
|
|
|
|||
259
lib/libc/amd64/string/memccpy.S
Normal file
259
lib/libc/amd64/string/memccpy.S
Normal file
|
|
@ -0,0 +1,259 @@
|
|||
/*
|
||||
* Copyright (c) 2023 The FreeBSD Foundation
|
||||
*
|
||||
* This software was developed by Robert Clausecker <fuz@FreeBSD.org>
|
||||
* under sponsorship from the FreeBSD Foundation.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE
|
||||
*/
|
||||
|
||||
#include <machine/asm.h>
|
||||
|
||||
#include "amd64_archlevel.h"
|
||||
|
||||
#define ALIGN_TEXT .p2align 4, 0x90
|
||||
|
||||
/*
 * Export memccpy as a weak symbol set equal to __memccpy, then list the
 * two variants of __memccpy defined in this file (scalar and baseline).
 * The ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS macros come from amd64_archlevel.h;
 * presumably they emit the glue that selects one variant at run time
 * (NOTE(review): confirm against that header).
 */
.weak memccpy
|
||||
.set memccpy, __memccpy
|
||||
ARCHFUNCS(__memccpy)
|
||||
ARCHFUNC(__memccpy, scalar)
|
||||
ARCHFUNC(__memccpy, baseline)
|
||||
ENDARCHFUNCS(__memccpy)
|
||||
|
||||
/*
 * Scalar variant: find c with __memchr, then let memcpy do the copying.
 *
 * In:   rdi = dest, rsi = src, edx = c, rcx = len
 * Out:  rax = dest + (offset of c in src) + 1, or NULL if __memchr
 *       reported no match (in which case all len bytes are copied)
 * Uses: rbx (saved and restored) to carry a value across each call
 */
ARCHENTRY(__memccpy, scalar)
	push	%rbp
	mov	%rsp, %rbp		# frame pointer, torn down by leave
	push	%rax			# padding: keeps rsp 16-byte aligned at both calls
	push	%rbx			# callee-saved, restored before returning
	push	%rdi			# dest
	push	%rsi			# src

	mov	%rcx, %rbx		# len must survive the call
	mov	%rsi, %rdi		# arg 1: src
	mov	%edx, %esi		# arg 2: c
	mov	%rcx, %rdx		# arg 3: len
	call	CNAME(__memchr)		# hit = memchr(src, c, len)

	pop	%rsi			# src
	pop	%rdi			# dest
	mov	%rbx, %rcx		# len
	mov	%rax, %rdx
	sub	%rsi, %rdx
	add	$1, %rdx		# count = hit - src + 1
	lea	(%rdi, %rdx, 1), %rbx	# result = dest + count
	test	%rax, %rax		# hit == NULL?
	cmove	%rcx, %rdx		#   then count = len
	cmove	%rax, %rbx		#   and result = NULL
	call	CNAME(memcpy)		# memcpy(dest, src, count)

	mov	%rbx, %rax		# hand back result
	pop	%rbx
	leave
	ret
ARCHEND(__memccpy, scalar)
|
||||
|
||||
/*
 * Baseline (SSE2) variant.
 *
 * In:   rdi = dest, rsi = src, edx = c, rcx = len
 * Out:  rax = pointer just past the copy of c in dest, or NULL if c does
 *       not occur in the first len bytes of src (all len bytes are then
 *       copied); NULL without copying anything if len is 0
 *
 * The source is scanned in 16-byte aligned chunks.  An aligned chunk is
 * loaded only once it is known to hold at least one byte of the buffer,
 * so no load touches a 16-byte block that lies entirely outside
 * [src, src + len).  Unlike strlcpy, on which this code is modelled, the
 * source need not contain a terminator, so the buffer length has to be
 * checked before every new chunk is fetched.
 *
 * Register roles after the setup sequence:
 *   xmm4 = c replicated into all 16 bytes
 *   r9   = original src
 *   rsi  = src rounded down to a multiple of 16
 *   ecx  = src & 15 (misalignment of src)
 *   rdx  = len - 1 (offset of the last buffer byte from src)
 */
ARCHENTRY(__memccpy, baseline)
	sub	$1, %rcx		# RCX refers to last character in buffer
	jb	.L0			# go to special code path if len was 0

	movd	%edx, %xmm4
	mov	%rcx, %rdx
	punpcklbw %xmm4, %xmm4		# c -> cc
	mov	%esi, %ecx
	punpcklwd %xmm4, %xmm4		# cc -> cccc
	mov	%rsi, %r9		# stash a copy of the source pointer for later
	pshufd	$0, %xmm4, %xmm4	# cccc -> cccccccccccccccc
	and	$~0xf, %rsi
	movdqa	%xmm4, %xmm1
	pcmpeqb	(%rsi), %xmm1		# terminator found in head?
	mov	$-1, %r8d
	and	$0xf, %ecx
	shl	%cl, %r8d		# mask of bytes in the string
	pmovmskb %xmm1, %eax
	and	%r8d, %eax
	jnz	.Lhead_nul

	/*
	 * No terminator in the head.  If the buffer also ends inside the
	 * head chunk, the next aligned chunk holds no buffer bytes at all
	 * and may sit on an unmapped page, so it must not be loaded.
	 */
	mov	$15, %r8d
	sub	%ecx, %r8d		# offset from src of the last byte in the head chunk
	cmp	%r8, %rdx		# does the buffer end inside the head chunk?
	jbe	.Lhead_end		# if so, copy len bytes and return NULL

	movdqa	16(%rsi), %xmm3		# load second string chunk
	movdqu	(%r9), %xmm2		# load unaligned string head
	mov	$32, %r8d
	sub	%ecx, %r8d		# head length + length of second chunk
	movdqa	%xmm4, %xmm1
	pcmpeqb	%xmm3, %xmm1		# terminator found in second chunk?

	sub	%r8, %rdx		# enough space left for the second chunk?
	jb	.Lhead_buf_end

	/* process second chunk */
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	.Lsecond_nul

	/* string didn't end in second chunk and neither did buffer -- not a runt! */
	movdqa	32(%rsi), %xmm0		# load next string chunk
	movdqa	%xmm4, %xmm1
	movdqu	%xmm2, (%rdi)		# deposit head into buffer
	sub	%rcx, %rdi		# adjust RDI to correspond to RSI
	movdqu	%xmm3, 16(%rdi)		# deposit second chunk
	sub	%rsi, %rdi		# express RDI as distance from RSI
	add	$32, %rsi		# advance RSI past first two chunks
	sub	$16, %rdx		# enough left for another round?
	jb	1f

	/*
	 * main loop unrolled twice
	 *
	 * At label 0: xmm0 holds the chunk at (rsi), xmm1 holds the pattern,
	 * and rdx >= 0 is the offset of the last buffer byte from rsi + 16,
	 * i.e. the buffer reaches into the chunk after the one in xmm0.
	 */
	ALIGN_TEXT
0:	pcmpeqb	%xmm0, %xmm1		# terminator encountered?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	3f

	movdqu	%xmm0, (%rsi, %rdi)
	movdqa	16(%rsi), %xmm0		# load next string chunk
	movdqa	%xmm4, %xmm1
	cmp	$16, %rdx		# more than a full chunk left?
	jb	2f

	add	$32, %rsi		# advance pointers to next chunk
	pcmpeqb	%xmm0, %xmm1		# terminator encountered?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	4f

	movdqu	%xmm0, -16(%rsi, %rdi)
	movdqa	(%rsi), %xmm0		# load next string chunk
	movdqa	%xmm4, %xmm1
	sub	$32, %rdx
	jae	0b

1:	sub	$16, %rsi		# undo second advancement
	add	$16, %edx		# edx = offset of last buffer byte in the chunk (0--15)

	/*
	 * 1--16 bytes left in the buffer but string has not ended yet
	 *
	 * xmm0 holds the chunk at 16(rsi); edx (0--15) is the offset of the
	 * last buffer byte within that chunk.
	 */
2:	pcmpeqb	%xmm1, %xmm0		# terminator encountered?
	pmovmskb %xmm0, %r8d
	mov	%r8d, %ecx		# keep the unmodified terminator mask
	bts	%edx, %r8d		# treat end of buffer as end of string;
					# also guarantees TZCNT finds a set bit
	tzcnt	%r8d, %r8d		# find tail length
	add	%rsi, %rdi		# restore RDI
	movdqu	1(%rsi, %r8, 1), %xmm0	# load string tail
	movdqu	%xmm0, 1(%rdi, %r8, 1)	# store string tail
	lea	17(%rdi, %r8, 1), %rsi	# return value if terminator encountered
	xor	%eax, %eax		# return value if no terminator encountered
	bt	%r8d, %ecx		# terminator encountered inside buffer?
	cmovc	%rsi, %rax		# if yes, return pointer, else NULL
	ret

4:	sub	$16, %rsi		# undo second advancement
	add	$16, %rdx		# restore number of remaining bytes

	/* string has ended but buffer has not */
3:	tzcnt	%eax, %eax		# find length of string tail
	movdqu	-15(%rsi, %rax, 1), %xmm0 # load string tail (incl. terminator)
	add	%rsi, %rdi		# restore destination pointer
	movdqu	%xmm0, -15(%rdi, %rax, 1) # store string tail (incl. terminator)
	lea	1(%rdi, %rax, 1), %rax	# compute return value
	ret

	/* buffer ends inside the second chunk */
.Lhead_buf_end:
	pmovmskb %xmm1, %r8d
	add	$32, %edx		# restore edx to (len-1) + ecx
	shl	$16, %r8d		# place 2nd chunk terminator mask into bits 16--31
	mov	%r8d, %r10d
	bts	%rdx, %r8		# treat end of buffer as if terminator present
	xor	%eax, %eax		# return value if terminator not found
	tzcnt	%r8, %rdx		# find string/buffer len from alignment boundary
	lea	1(%rdi, %rdx, 1), %r8	# return value if terminator found + rcx
	sub	%rcx, %r8		# subtract rcx
	bt	%rdx, %r10		# was the terminator present?
	cmovc	%r8, %rax		# if yes, return pointer, else NULL
	sub	%ecx, %edx		# find actual string/buffer len
	jmp	.L0132

	/* terminator inside the second chunk, buffer extends beyond it */
.Lsecond_nul:
	add	%r8, %rdx		# restore buffer length
	tzcnt	%eax, %r8d		# where is the terminator?
	lea	-16(%rcx), %eax
	sub	%eax, %r8d		# string length
	lea	1(%rdi, %r8, 1), %rax	# return value if terminator before end of buffer
	xor	%ecx, %ecx		# return value if not
	cmp	%r8, %rdx		# is the string shorter than the buffer?
	cmova	%r8, %rdx		# copy only min(buflen, srclen) bytes
	cmovb	%rcx, %rax		# return NULL if buffer ended before string

	/* rdx: number of bytes to copy minus one, rax: return value */
.L0132:	cmp	$16, %rdx		# at least 17 bytes to copy?
	jb	.L0116

	/* copy 17--32 bytes */
	movdqu	(%r9), %xmm0		# load first 16 bytes
	movdqu	-15(%r9, %rdx, 1), %xmm1 # load last 16 bytes
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -15(%rdi, %rdx, 1)
	ret

	/*
	 * Buffer ends inside the head chunk and holds no terminator:
	 * copy all rdx + 1 (1--16) bytes and return NULL.
	 */
.Lhead_end:
	xor	%eax, %eax		# return value: NULL
	jmp	.L0116

	/* terminator inside the head chunk */
.Lhead_nul:
	tzcnt	%eax, %r8d		# where is the terminator?
	sub	%ecx, %r8d		# ... from the beginning of the string?
	lea	1(%rdi, %r8, 1), %rax	# return value if terminator before end of buffer
	xor	%ecx, %ecx		# return value if not
	cmp	%r8, %rdx		# is the string shorter than the buffer?
	cmova	%r8, %rdx		# copy only min(buflen, srclen) bytes
	cmovb	%rcx, %rax		# return NULL if buffer ended before string

	/*
	 * copy 1--16 bytes with two possibly overlapping moves
	 * (rdx: number of bytes to copy minus one, rax: return value)
	 */
.L0116:	cmp	$8, %rdx		# at least 9 bytes to copy?
	jae	.L0916

	cmp	$4, %rdx		# at least 5 bytes to copy?
	jae	.L0508

	cmp	$2, %rdx		# at least 3 bytes to copy?
	jae	.L0304

	/* copy one or two bytes */
	movzbl	(%r9), %ecx		# load first byte from src
	movzbl	(%r9, %rdx, 1), %esi	# load last byte from src
	mov	%cl, (%rdi)		# deposit into destination
	mov	%sil, (%rdi, %rdx, 1)
	ret

	/* copy three or four bytes */
.L0304:	movzwl	(%r9), %ecx
	movzwl	-1(%r9, %rdx, 1), %esi
	mov	%cx, (%rdi)
	mov	%si, -1(%rdi, %rdx, 1)
	ret

	/* copy five to eight bytes */
.L0508:	mov	(%r9), %ecx
	mov	-3(%r9, %rdx, 1), %esi
	mov	%ecx, (%rdi)
	mov	%esi, -3(%rdi, %rdx, 1)
	ret

	/* copy nine to sixteen bytes */
.L0916:	mov	(%r9), %rcx
	mov	-7(%r9, %rdx, 1), %rsi
	mov	%rcx, (%rdi)
	mov	%rsi, -7(%rdi, %rdx, 1)
	ret

	/* length zero destination: return null pointer */
.L0:	xor	%eax, %eax
	ret
ARCHEND(__memccpy, baseline)
|
||||
|
||||
/* Empty .note.GNU-stack section: by GNU toolchain convention this marks the object as not needing an executable stack. */
.section .note.GNU-stack,"",%progbits
|
||||
Loading…
Reference in a new issue