mirror of
https://github.com/opnsense/src.git
synced 2026-06-04 14:26:03 -04:00
lib/libc/amd64/string: add memchr(3) scalar, baseline implementation
This is conceptually similar to strchr(3), but there are slight changes to account for the buffer having an explicit buffer length. Sponsored by: The FreeBSD Foundation Approved by: mjg MFC after: 1 week MFC to: stable/14 Differential Revision: https://reviews.freebsd.org/D41598
This commit is contained in:
parent
a559ef1ac8
commit
de12a689fa
2 changed files with 205 additions and 0 deletions
|
|
@ -2,6 +2,7 @@
|
|||
MDSRCS+= \
|
||||
amd64_archlevel.c \
|
||||
bcmp.S \
|
||||
memchr.S \
|
||||
memcmp.S \
|
||||
memcpy.S \
|
||||
memmove.S \
|
||||
|
|
|
|||
204
lib/libc/amd64/string/memchr.S
Normal file
204
lib/libc/amd64/string/memchr.S
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
/*-
|
||||
* Copyright (c) 2023 The FreeBSD Foundation
|
||||
*
|
||||
* This software was developed by Robert Clausecker <fuz@FreeBSD.org>
|
||||
* under sponsorship from the FreeBSD Foundation.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE
|
||||
*/
|
||||
|
||||
#include <machine/asm.h>
|
||||
|
||||
#include "amd64_archlevel.h"
|
||||
|
||||
#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
|
||||
|
||||
.weak memchr
|
||||
.set memchr, __memchr
|
||||
ARCHFUNCS(__memchr)
|
||||
ARCHFUNC(__memchr, scalar)
|
||||
ARCHFUNC(__memchr, baseline)
|
||||
ENDARCHFUNCS(__memchr)
|
||||
|
||||
ARCHENTRY(__memchr, scalar)
|
||||
test %rdx, %rdx # empty input?
|
||||
je .Lnomatch
|
||||
|
||||
lea (, %rdi, 8), %ecx
|
||||
add %rdi, %rdx # pointer to end of buffer
|
||||
and $~7, %rdi # align to 8 bytes
|
||||
mov (%rdi), %rax # load first word
|
||||
movzbl %sil, %esi # clear stray high bits
|
||||
movabs $0x0101010101010101, %r8
|
||||
imul %r8, %rsi # replicate char 8 times
|
||||
|
||||
/* compute head and tail masks */
|
||||
mov %r8, %r10
|
||||
movabs $0x8080808080808080, %r9
|
||||
shl %cl, %r10 # 0x01 where string head is
|
||||
lea (, %rdx, 8), %ecx
|
||||
xor %r8, %r10 # 0x01 where it is not
|
||||
neg %r8 # negate 01..01 so we can use lea
|
||||
mov %r9, %r11
|
||||
xor %rsi, %rax # str ^ c (0x00 where str[i] == c)
|
||||
neg %ecx
|
||||
or %r10, %rax # except before the string
|
||||
shr %cl, %r11 # 0x80 where string tail is
|
||||
|
||||
add $8, %rdi # advance to next 8 bytes
|
||||
cmp %rdx, %rdi # end of buffer reached during head?
|
||||
jae .Ltail # and go to tail-processing code
|
||||
|
||||
/* main loop, unrolled twice */
|
||||
ALIGN_TEXT
|
||||
0: lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01
|
||||
not %rax # ~(str ^ c)
|
||||
and %r9, %rax # ((str^c) - 0x01..01) & ~(str^c)
|
||||
and %rcx, %rax # not including junk bytes
|
||||
jnz .Lmatch
|
||||
|
||||
mov (%rdi), %rax
|
||||
add $8, %rdi
|
||||
xor %rsi, %rax # str ^ c
|
||||
cmp %rdx, %rdi
|
||||
jae .Ltail
|
||||
|
||||
lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01
|
||||
not %rax # ~(str ^ c)
|
||||
and %r9, %rax # ((str^c) - 0x01..01) & ~(str^c)
|
||||
and %rcx, %rax # not including junk bytes
|
||||
jnz .Lmatch
|
||||
|
||||
mov (%rdi), %rax
|
||||
add $8, %rdi
|
||||
xor %rsi, %rax # str ^ c
|
||||
cmp %rdx, %rdi
|
||||
jb 0b
|
||||
|
||||
.Ltail: lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01
|
||||
not %rax # ~(str ^ c)
|
||||
and %r11, %rax # ((str^c) - 0x01..01) & ~(str^c)
|
||||
and %rcx, %rax # not including junk bytes or bytes past buffer
|
||||
jz .Lnomatch
|
||||
|
||||
.Lmatch:
|
||||
tzcnt %rax, %rax # first match
|
||||
shr $3, %eax # scale from bit to byte index
|
||||
lea -8(%rdi, %rax), %rax # pointer to found c
|
||||
ret
|
||||
|
||||
/* no match found */
|
||||
.Lnomatch:
|
||||
xor %eax, %eax # return null pointer
|
||||
ret
|
||||
ARCHEND(__memchr, scalar)
|
||||
|
||||
ARCHENTRY(__memchr, baseline)
|
||||
test %rdx, %rdx # empty input?
|
||||
je .Lnomatchb
|
||||
|
||||
movd %esi, %xmm2
|
||||
mov %edi, %ecx
|
||||
add %rdi, %rdx # pointer to end of buffer
|
||||
and $~0x1f, %rdi # align to 32 bytes
|
||||
movdqa (%rdi), %xmm0 # load first 32 bytes
|
||||
movdqa 16(%rdi), %xmm1
|
||||
|
||||
punpcklbw %xmm2, %xmm2 # c -> cc
|
||||
|
||||
mov $-1, %r9d
|
||||
shl %cl, %r9d # mask with zeroes before the string
|
||||
|
||||
punpcklwd %xmm2, %xmm2 # cc -> cccc
|
||||
|
||||
mov $-1, %r8d
|
||||
xor %ecx, %ecx
|
||||
sub %edx, %ecx # edx = -ecx
|
||||
shr %cl, %r8d # bytes in tail that are part of the buffer
|
||||
|
||||
pshufd $0, %xmm2, %xmm2 # cccc -> cccccccccccccccc
|
||||
|
||||
add $32, %rdi # advance to next 32 bytes
|
||||
mov $-1, %eax
|
||||
cmp %rdx, %rdi # end of buffer reached during head?
|
||||
cmovae %r8d, %eax # if yes, do combined head/tail processing
|
||||
and %r9d, %eax # mask of bytes in head part of string
|
||||
|
||||
/* process head */
|
||||
pcmpeqb %xmm2, %xmm1
|
||||
pcmpeqb %xmm2, %xmm0
|
||||
pmovmskb %xmm1, %esi
|
||||
pmovmskb %xmm0, %ecx
|
||||
shl $16, %esi
|
||||
or %esi, %ecx # locations of matches
|
||||
and %ecx, %eax # any match inside buffer?
|
||||
jnz .Lprecisematchb
|
||||
|
||||
cmp %rdx, %rdi # did the buffer end here?
|
||||
jae .Lnomatchb # if yes we are done
|
||||
|
||||
/* main loop */
|
||||
ALIGN_TEXT
|
||||
0: movdqa (%rdi), %xmm0 # load next string chunk
|
||||
movdqa 16(%rdi), %xmm1
|
||||
add $32, %rdi
|
||||
cmp %rdx, %rdi # ready for main loop?
|
||||
jae .Ltailb
|
||||
|
||||
pcmpeqb %xmm2, %xmm0
|
||||
pcmpeqb %xmm2, %xmm1
|
||||
por %xmm1, %xmm0 # match in either half?
|
||||
pmovmskb %xmm0, %eax
|
||||
test %eax, %eax
|
||||
jz 0b
|
||||
|
||||
.Lmatchb:
|
||||
pcmpeqb -32(%rdi), %xmm2 # redo comparison of first 16 bytes
|
||||
pmovmskb %xmm1, %ecx
|
||||
pmovmskb %xmm2, %eax
|
||||
shl $16, %ecx
|
||||
or %ecx, %eax # location of matches
|
||||
|
||||
.Lprecisematchb:
|
||||
tzcnt %eax, %eax # find location of match
|
||||
lea -32(%rdi, %rax, 1), %rax # point to matching byte
|
||||
ret
|
||||
|
||||
.Ltailb:
|
||||
pcmpeqb %xmm2, %xmm1
|
||||
pcmpeqb %xmm2, %xmm0
|
||||
pmovmskb %xmm1, %edx
|
||||
pmovmskb %xmm0, %eax
|
||||
shl $16, %edx
|
||||
or %edx, %eax # location of matches
|
||||
and %r8d, %eax # mask out matches beyond buffer
|
||||
bsf %eax, %edx # location of match
|
||||
lea -32(%rdi, %rdx, 1), %rdx # pointer to match (if any)
|
||||
cmovnz %rdx, %rax # point to match if present,
|
||||
ret # else null pointer
|
||||
|
||||
.Lnomatchb:
|
||||
xor %eax, %eax # return null pointer
|
||||
ret
|
||||
ARCHEND(__memchr, baseline)
|
||||
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
Loading…
Reference in a new issue