opnsense-src/lib/libc/amd64/string/strcmp.S

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

360 lines
9.9 KiB
ArmAsm
Raw Normal View History

/*-
* Copyright (c) 2023, The FreeBSD Foundation
*
* SPDX-License-Expression: BSD-2-Clause
*
* Portions of this software were developed by Robert Clausecker
* <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
*
* Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcmp.S
* written by J.T. Conklin <jtc@acorntoolworks.com> that was originally
* dedicated to the public domain.
*/
#include <machine/asm.h>
#include <machine/param.h>
#if 0
RCSID("$NetBSD: strcmp.S,v 1.3 2004/07/19 20:04:41 drochner Exp $")
#endif
#include "amd64_archlevel.h"
#define ALIGN_TEXT .p2align 4, 0x90
ARCHFUNCS(strcmp)
ARCHFUNC(strcmp, scalar)
ARCHFUNC(strcmp, baseline)
ENDARCHFUNCS(strcmp)
ARCHENTRY(strcmp, scalar)
/*
* Align s1 to word boundary.
* Consider unrolling loop?
*/
.Ls1align:
testb $7,%dil
je .Ls1aligned
movb (%rdi),%al
incq %rdi
movb (%rsi),%dl
incq %rsi
testb %al,%al
je .Ldone
cmpb %al,%dl
je .Ls1align
jmp .Ldone
/*
* Check whether s2 is aligned to a word boundary. If it is, we
* can compare by words. Otherwise we have to compare by bytes.
*/
.Ls1aligned:
testb $7,%sil
jne .Lbyte_loop
movabsq $0x0101010101010101,%r8
subq $8,%rdi
movabsq $0x8080808080808080,%r9
subq $8,%rsi
ALIGN_TEXT
.Lword_loop:
movq 8(%rdi),%rax
addq $8,%rdi
movq 8(%rsi),%rdx
addq $8,%rsi
cmpq %rax,%rdx
jne .Lbyte_loop
subq %r8,%rdx
notq %rax
andq %rax,%rdx
testq %r9,%rdx
je .Lword_loop
ALIGN_TEXT
.Lbyte_loop:
movb (%rdi),%al
incq %rdi
movb (%rsi),%dl
incq %rsi
testb %al,%al
je .Ldone
cmpb %al,%dl
je .Lbyte_loop
.Ldone:
movzbq %al,%rax
movzbq %dl,%rdx
subq %rdx,%rax
ret
ARCHEND(strcmp, scalar)
ARCHENTRY(strcmp, baseline)
/* check if either string crosses a page in the head */
lea 15(%rdi), %r8d # end of head
lea 15(%rsi), %r9d
mov %edi, %eax
mov %esi, %edx
xor %edi, %r8d # bits that changed between first and last byte
xor %esi, %r9d
and $~0xf, %rdi # align heads to 16 bytes
and $~0xf, %rsi
or %r8d, %r9d # in either RSI or RDI
and $0xf, %eax # offset from alignment
and $0xf, %edx
pxor %xmm1, %xmm1
test $PAGE_SIZE, %r9d # did the page change?
jz 0f # if not, take fast path
/* heads may cross page boundary, avoid unmapped loads */
movdqa (%rdi), %xmm0 # load aligned heads
movdqa (%rsi), %xmm2
mov $-1, %r8d
mov $-1, %r9d
mov %eax, %ecx
shl %cl, %r8d # string head in XMM0
mov %edx, %ecx
shl %cl, %r9d # string head in XMM2
movdqa %xmm0, -40(%rsp) # stash copies of the heads on the stack
movdqa %xmm2, -24(%rsp)
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm1, %xmm2
pmovmskb %xmm0, %r10d
pmovmskb %xmm2, %r11d
test %r8d, %r10d # NUL byte present in first string?
lea -40(%rsp), %r8
cmovz %rdi, %r8
test %r9d, %r11d # NUL byte present in second string?
lea -24(%rsp), %r9
cmovz %rsi, %r9
movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads
movdqu (%r9, %rdx, 1), %xmm4
jmp 1f
0: movdqu (%rdi, %rax, 1), %xmm0 # load true heads
movdqu (%rsi, %rdx, 1), %xmm4
1: pxor %xmm2, %xmm2
pcmpeqb %xmm0, %xmm2 # NUL byte present?
pcmpeqb %xmm0, %xmm4 # which bytes match?
pandn %xmm4, %xmm2 # match and not NUL byte?
pmovmskb %xmm2, %r9d
xor $0xffff, %r9d # mismatch or NUL byte?
jnz .Lhead_mismatch
/* load head and second chunk */
movdqa 16(%rdi), %xmm2 # load second chunks
movdqa 16(%rsi), %xmm3
sub %rdx, %rax # is a&0xf >= b&0xf?
jb .Lswapped # if not, proceed with swapped operands
neg %rax
movdqu 16(%rsi, %rax, 1), %xmm0
sub %rdi, %rsi # express RSI as distance from RDI
lea (%rsi, %rax, 1), %rdx # point RDX to offset in second string
neg %rax
pcmpeqb %xmm3, %xmm1 # ... corresponding to RDI
pcmpeqb %xmm2, %xmm0
pmovmskb %xmm1, %r8d
pmovmskb %xmm0, %r9d
add $16, %rdi
test %r8d, %r8d
jnz .Lnul_found
xor $0xffff, %r9d
jnz .Lmismatch
add $16, %rdi # advance aligned pointers
/*
* During the main loop, the layout of the two strings is something like:
*
* v ------1------ v ------2------ v
* RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
* RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
*
* where v indicates the alignment boundaries and corresponding chunks
* of the strings have the same letters. Chunk A has been checked in
* the previous iteration. This iteration, we first check that string
* RSI doesn't end within region 2, then we compare chunk B between the
* two strings. As RSI is known not to hold a NUL byte in regsions 1
* and 2 at this point, this also ensures that RDI has not ended yet.
*/
ALIGN_TEXT
0: movdqu (%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
pxor %xmm1, %xmm1
pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI?
pcmpeqb (%rdi), %xmm0 # where do the chunks match?
pmovmskb %xmm1, %r8d
pmovmskb %xmm0, %r9d
test %r8d, %r8d
jnz .Lnul_found
xor $0xffff, %r9d # any mismatches?
jnz .Lmismatch
/* main loop unrolled twice */
movdqu 16(%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
pxor %xmm1, %xmm1
pcmpeqb 16(%rdi, %rsi, 1), %xmm1 # end of string in RSI?
pcmpeqb 16(%rdi), %xmm0 # where do the chunks match?
pmovmskb %xmm1, %r8d
pmovmskb %xmm0, %r9d
add $32, %rdi
test %r8d, %r8d
jnz .Lnul_found2
xor $0xffff, %r9d # any mismatches?
jz 0b
sub $16, %rdi # roll back second increment
/* a mismatch has been found between RDX and RSI */
.Lmismatch:
tzcnt %r9d, %r9d # where is the mismatch?
add %rdi, %rdx # turn RDX from offset to pointer
movzbl (%rdx, %r9, 1), %ecx
movzbl (%rdi, %r9, 1), %eax
sub %ecx, %eax # difference of the mismatching chars
ret
/* mismatch in true heads */
.Lhead_mismatch:
tzcnt %r9d, %r9d # where is the mismatch?
add %rax, %rdi # return to true heads
add %rdx, %rsi
movzbl (%rdi, %r9, 1), %eax # mismatching characters
movzbl (%rsi, %r9, 1), %ecx
sub %ecx, %eax
ret
.Lnul_found2:
sub $16, %rdi # roll back second increment
/* a NUL has been found in RSI */
.Lnul_found:
mov %eax, %ecx
mov %r8d, %r10d
shl %cl, %r8w # adjust NUL mask to positions in RDI/RDX
xor $0xffff, %r9d # mask of mismatches
or %r8d, %r9d # NUL bytes also count as mismatches
jnz .Lmismatch
/*
* (RDI) == (RSI) and NUL is past the string.
* Compare (RSI) with the corresponding part
* of the other string until the NUL byte.
*/
movdqu (%rdi, %rax, 1), %xmm0
pcmpeqb (%rdi, %rsi, 1), %xmm0
add %rdi, %rsi # restore RSI pointer
add %rax, %rdi # point RDI to chunk corresponding to (RSI)
pmovmskb %xmm0, %ecx # mask of matches
not %ecx # mask of mismatches
or %r10d, %ecx # mask of mismatches or NUL bytes
tzcnt %ecx, %ecx # location of first mismatch
movzbl (%rdi, %rcx, 1), %eax
movzbl (%rsi, %rcx, 1), %ecx
sub %ecx, %eax
ret
/*
* If (a&0xf) < (b&0xf), we do the same thing but with swapped
* operands. I found that this performs slightly better than
* using conditional moves to do the swap branchless.
*/
.Lswapped:
movdqu 16(%rdi, %rax, 1), %xmm0
sub %rsi, %rdi # express RDI as distance from RSI
lea (%rdi, %rax, 1), %rdx # point RDX to offset in RDI corresponding to RSI
neg %rax # make difference positive
pcmpeqb %xmm2, %xmm1
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm1, %r8d
pmovmskb %xmm0, %r9d
add $16, %rsi # advance aligned pointers
test %r8d, %r8d
jnz .Lnul_founds
xor $0xffff, %r9d
jnz .Lmismatchs
add $16, %rsi
/*
* During the main loop, the layout of the two strings is something like:
*
* v ------1------ v ------2------ v
* RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
* RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
*
* where v indicates the alignment boundaries and corresponding chunks
* of the strings have the same letters. Chunk A has been checked in
* the previous iteration. This iteration, we first check that string
* RSI doesn't end within region 2, then we compare chunk B between the
* two strings. As RSI is known not to hold a NUL byte in regsions 1
* and 2 at this point, this also ensures that RDI has not ended yet.
*/
ALIGN_TEXT
0: movdqu (%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
pxor %xmm1, %xmm1
pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RSI?
pcmpeqb (%rsi), %xmm0 # where do the chunks match?
pmovmskb %xmm1, %r8d
pmovmskb %xmm0, %r9d
test %r8d, %r8d
jnz .Lnul_founds
xor $0xffff, %r9d # any mismatches?
jnz .Lmismatchs
/* main loop unrolled twice */
movdqu 16(%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
pxor %xmm1, %xmm1
pcmpeqb 16(%rsi, %rdi, 1), %xmm1 # end of string in RSI?
pcmpeqb 16(%rsi), %xmm0 # where do the chunks match?
pmovmskb %xmm1, %r8d
pmovmskb %xmm0, %r9d
add $32, %rsi
test %r8d, %r8d
jnz .Lnul_found2s
xor $0xffff, %r9d # any mismatches?
jz 0b
sub $16, %rsi # roll back second increment
/* a mismatch has been found between RDX and RDI */
.Lmismatchs:
tzcnt %r9d, %r9d # where is the mismatch?
add %rsi, %rdx # turn RDX from offset to pointer
movzbl (%rdx, %r9, 1), %eax
movzbl (%rsi, %r9, 1), %ecx
sub %ecx, %eax # difference of the mismatching chars
ret
.Lnul_found2s:
sub $16, %rsi # roll back second increment
/* a NUL has been found in RSI */
.Lnul_founds:
mov %eax, %ecx
mov %r8d, %r10d
shl %cl, %r8w # adjust NUL mask to positions in RDI/RDX
xor $0xffff, %r9d # mask of mismatches
or %r8d, %r9d # NUL bytes also count as mismatches
jnz .Lmismatchs
/*
* (RDI) == (RSI) and NUL is past the string.
* Compare (RSI) with the corresponding part
* of the other string until the NUL byte.
*/
movdqu (%rsi, %rax, 1), %xmm0
pcmpeqb (%rsi, %rdi, 1), %xmm0
add %rsi, %rdi # restore RDI pointer
add %rax, %rsi # point RSI to chunk corresponding to (RDI)
pmovmskb %xmm0, %ecx # mask of matches
not %ecx # mask of mismatches
or %r10d, %ecx # mask of mismatches or NUL bytes
tzcnt %ecx, %ecx # location of first mismatch
movzbl (%rdi, %rcx, 1), %eax
movzbl (%rsi, %rcx, 1), %ecx
sub %ecx, %eax
ret
ARCHEND(strcmp, baseline)
.section .note.GNU-stack,"",%progbits