lib/libc/amd64/string/memcmp.S: add baseline implementation

This changeset adds a baseline implementation of memcmp and bcmp
for amd64. The same code is used for both functions with conditional
code were the behaviour differs (we need more precise output for the
memcmp case).

FreeBSD documents that memcmp returns the difference between the
mismatching characters. Slightly faster code would be possible could
we relax this requirement to the ISO/IEC 9899:1999 requirement of
merely returning a negative/positive integer or zero.

Performance is better than bionic and glibc, except for long strings
were the two are 13% faster. This could be because they use SSE4
ptest which we cannot use in a baseline kernel.

Sponsored by:	The FreeBSD Foundation
Approved by:	mjg
Differential Revision:	https://reviews.freebsd.org/D41442
This commit is contained in:
Robert Clausecker 2023-07-12 15:35:13 +02:00
parent 245fee96c0
commit 8803f01e93

View file

@ -1,9 +1,12 @@
/*-
* Copyright (c) 2018 The FreeBSD Foundation
* Copyright (c) 2018, 2023 The FreeBSD Foundation
*
* This software was developed by Mateusz Guzik <mjg@FreeBSD.org>
* under sponsorship from the FreeBSD Foundation.
*
* Portions of this software were developed by Robert Clausecker
* <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@ -27,6 +30,10 @@
*/
#include <machine/asm.h>
#include <machine/param.h>
#include "amd64_archlevel.h"
/*
* Note: this routine was written with kernel use in mind (read: no simd),
* it is only present in userspace as a temporary measure until something
@ -36,10 +43,15 @@
#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
#ifdef BCMP
ENTRY(bcmp)
#else
ENTRY(memcmp)
#define memcmp bcmp
#endif
ARCHFUNCS(memcmp)
ARCHFUNC(memcmp, scalar)
ARCHFUNC(memcmp, baseline)
ENDARCHFUNCS(memcmp)
ARCHENTRY(memcmp, scalar)
xorl %eax,%eax
10:
cmpq $16,%rdx
@ -157,7 +169,6 @@ ENTRY(memcmp)
1:
leal 1(%eax),%eax
ret
END(bcmp)
#else
/*
* We need to compute the difference between strings.
@ -230,7 +241,165 @@ END(bcmp)
2:
subl %r8d,%eax
ret
END(memcmp)
#endif
ARCHEND(memcmp, scalar)
ARCHENTRY(memcmp, baseline)
cmp $32, %rdx # enough to permit use of the long kernel?
ja .Llong
test %rdx, %rdx # zero bytes buffer?
je .L0
/*
* Compare strings of 1--32 bytes. We want to do this by
* loading into two xmm registers and then comparing. To avoid
* crossing into unmapped pages, we either load 32 bytes from
* the start of the buffer or 32 bytes before its end, depending
* on whether there is a page boundary between the overread area
* or not.
*/
/* check for page boundaries overreads */
lea 31(%rdi), %eax # end of overread
lea 31(%rsi), %r8d
lea -1(%rdi, %rdx, 1), %ecx # last character in buffer
lea -1(%rsi, %rdx, 1), %r9d
xor %ecx, %eax
xor %r9d, %r8d
test $PAGE_SIZE, %eax # are they on different pages?
jz 0f
/* fix up rdi */
movdqu -32(%rdi, %rdx, 1), %xmm0
movdqu -16(%rdi, %rdx, 1), %xmm1
lea -8(%rsp), %rdi # end of replacement buffer
sub %rdx, %rdi # start of replacement buffer
movdqa %xmm0, -40(%rsp) # copy to replacement buffer
movdqa %xmm1, -24(%rsp)
0: test $PAGE_SIZE, %r8d
jz 0f
/* fix up rsi */
movdqu -32(%rsi, %rdx, 1), %xmm0
movdqu -16(%rsi, %rdx, 1), %xmm1
lea -40(%rsp), %rsi # end of replacement buffer
sub %rdx, %rsi # start of replacement buffer
movdqa %xmm0, -72(%rsp) # copy to replacement buffer
movdqa %xmm1, -56(%rsp)
/* load data and compare properly */
0: movdqu 16(%rdi), %xmm1
movdqu 16(%rsi), %xmm3
movdqu (%rdi), %xmm0
movdqu (%rsi), %xmm2
mov %edx, %ecx
mov $-1, %edx
shl %cl, %rdx # ones where the buffer is not
pcmpeqb %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
pmovmskb %xmm1, %ecx
pmovmskb %xmm0, %eax
shl $16, %ecx
or %ecx, %eax # ones where the buffers match
or %edx, %eax # including where the buffer is not
not %eax # ones where there is a mismatch
#ifndef BCMP
bsf %eax, %edx # location of the first mismatch
cmovz %eax, %edx # including if there is no mismatch
movzbl (%rdi, %rdx, 1), %eax # mismatching bytes
movzbl (%rsi, %rdx, 1), %edx
sub %edx, %eax
#endif
ret
/* empty input */
.L0: xor %eax, %eax
ret
/* compare 33+ bytes */
ALIGN_TEXT
.Llong: movdqu (%rdi), %xmm0 # load head
movdqu (%rsi), %xmm2
mov %rdi, %rcx
sub %rdi, %rsi # express rsi as distance from rdi
and $~0xf, %rdi # align rdi to 16 bytes
movdqu 16(%rsi, %rdi, 1), %xmm1
pcmpeqb 16(%rdi), %xmm1 # compare second half of this iteration
add %rcx, %rdx # pointer to last byte in buffer
pcmpeqb %xmm2, %xmm0
pmovmskb %xmm0, %eax
xor $0xffff, %eax # any mismatch?
jne .Lmismatch_head
add $64, %rdi # advance to next iteration
jmp 1f # and get going with the loop
/* process buffer 32 bytes at a time */
ALIGN_TEXT
0: movdqu -32(%rsi, %rdi, 1), %xmm0
movdqu -16(%rsi, %rdi, 1), %xmm1
pcmpeqb -32(%rdi), %xmm0
pcmpeqb -16(%rdi), %xmm1
add $32, %rdi # advance to next iteration
1: pand %xmm0, %xmm1 # 0xff where both halves matched
pmovmskb %xmm1, %eax
cmp $0xffff, %eax # all bytes matched?
jne .Lmismatch
cmp %rdx, %rdi # end of buffer reached?
jb 0b
/* less than 32 bytes left to compare */
movdqu -16(%rdx), %xmm1 # load 32 byte tail through end pointer
movdqu -16(%rdx, %rsi, 1), %xmm3
movdqu -32(%rdx), %xmm0
movdqu -32(%rdx, %rsi, 1), %xmm2
pcmpeqb %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
pmovmskb %xmm1, %ecx
pmovmskb %xmm0, %eax
shl $16, %ecx
or %ecx, %eax # ones where the buffers match
not %eax # ones where there is a mismatch
#ifndef BCMP
bsf %eax, %ecx # location of the first mismatch
cmovz %eax, %ecx # including if there is no mismatch
add %rcx, %rdx # pointer to potential mismatch
movzbl -32(%rdx), %eax # mismatching bytes
movzbl -32(%rdx, %rsi, 1), %edx
sub %edx, %eax
#endif
ret
#ifdef BCMP
.Lmismatch:
mov $1, %eax
.Lmismatch_head:
ret
#else /* memcmp */
.Lmismatch_head:
tzcnt %eax, %eax # location of mismatch
add %rax, %rcx # pointer to mismatch
movzbl (%rcx), %eax # mismatching bytes
movzbl (%rcx, %rsi, 1), %ecx
sub %ecx, %eax
ret
.Lmismatch:
movdqu -48(%rsi, %rdi, 1), %xmm1
pcmpeqb -48(%rdi), %xmm1 # reconstruct xmm1 before PAND
pmovmskb %xmm0, %eax # mismatches in first 16 bytes
pmovmskb %xmm1, %edx # mismatches in second 16 bytes
shl $16, %edx
or %edx, %eax # mismatches in both
not %eax # matches in both
tzcnt %eax, %eax # location of mismatch
add %rax, %rdi # pointer to mismatch
movzbl -64(%rdi), %eax # mismatching bytes
movzbl -64(%rdi, %rsi, 1), %ecx
sub %ecx, %eax
ret
#endif
ARCHEND(memcmp, baseline)
.section .note.GNU-stack,"",%progbits