lib/libc/aarch64/string: add ASIMD-enhanced timingsafe_bcmp implementation

A straightforward port of the amd64 implementation.

Approved by:	security (cperciva)
Reviewed by:	getz, cperciva
Event:		EuroBSDcon 2024
Differential Revision:	https://reviews.freebsd.org/D46757
This commit is contained in:
Robert Clausecker 2024-12-09 10:49:49 +01:00
parent ce6af7a49e
commit f2c98669fc
2 changed files with 114 additions and 0 deletions

View file

@ -31,6 +31,7 @@ MDSRCS+= \
strncat.c \
strlcat.c \
strlen.S \
timingsafe_bcmp.S \
bcopy.c \
bzero.c

View file

@ -0,0 +1,113 @@
/*
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2024 Robert Clausecker
*/
#include <machine/asm.h>
ENTRY(timingsafe_bcmp)
cmp x2, #32 // at least 33 bytes to process?
bhi .Lgt32
cmp x2, #16 // at least 17 bytes to process?
bhi .L1732
cmp x2, #8 // at least 9 bytes to process?
bhi .L0916
cmp x2, #4 // at least 5 bytes to process?
bhi .L0508
cmp x2, #2 // at least 3 bytes to process?
bhi .L0304
cbnz x2, .L0102 // buffer empty?
mov w0, #0 // empty buffer always matches
ret
.L0102: ldrb w3, [x0] // load first bytes
ldrb w4, [x1]
sub x2, x2, #1
ldrb w5, [x0, x2] // load last bytes
ldrb w6, [x1, x2]
eor w3, w3, w4
eor w5, w5, w6
orr w0, w3, w5
ret
.L0304: ldrh w3, [x0] // load first halfwords
ldrh w4, [x1]
sub x2, x2, #2
ldrh w5, [x0, x2] // load last halfwords
ldrh w6, [x1, x2]
eor w3, w3, w4
eor w5, w5, w6
orr w0, w3, w5
ret
.L0508: ldr w3, [x0] // load first words
ldr w4, [x1]
sub x2, x2, #4
ldr w5, [x0, x2] // load last words
ldr w6, [x1, x2]
eor w3, w3, w4
eor w5, w5, w6
orr w0, w3, w5
ret
.L0916: ldr x3, [x0]
ldr x4, [x1]
sub x2, x2, #8
ldr x5, [x0, x2]
ldr x6, [x1, x2]
eor x3, x3, x4
eor x5, x5, x6
orr x0, x3, x5
orr x0, x0, x0, lsr #32 // ensure low 32 bits are nonzero iff mismatch
ret
.L1732: ldr q0, [x0]
ldr q1, [x1]
sub x2, x2, #16
ldr q2, [x0, x2]
ldr q3, [x1, x2]
eor v0.16b, v0.16b, v1.16b
eor v2.16b, v2.16b, v3.16b
orr v0.16b, v0.16b, v2.16b
umaxv s0, v0.4s // get a nonzero word if any
mov w0, v0.s[0]
ret
/* more than 32 bytes: process buffer in a loop */
.Lgt32: ldp q0, q1, [x0], #32
ldp q2, q3, [x1], #32
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v3.16b
orr v4.16b, v0.16b, v1.16b
subs x2, x2, #64 // enough left for another iteration?
bls .Ltail
0: ldp q0, q1, [x0], #32
ldp q2, q3, [x1], #32
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v3.16b
orr v0.16b, v0.16b, v1.16b
orr v4.16b, v4.16b, v0.16b
subs x2, x2, #32
bhi 0b
/* process last 32 bytes */
.Ltail: add x0, x0, x2 // point to the last 32 bytes in the buffer
add x1, x1, x2
ldp q0, q1, [x0]
ldp q2, q3, [x1]
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v3.16b
orr v0.16b, v0.16b, v1.16b
orr v4.16b, v4.16b, v0.16b
umaxv s0, v4.4s // get a nonzero word if any
mov w0, v0.s[0]
ret
END(timingsafe_bcmp)