opnsense-src/lib/libc/amd64/string/stpcpy.S

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

238 lines
5.5 KiB
ArmAsm
Raw Permalink Normal View History

/*-
* Copyright (c) 2023, The FreeBSD Foundation
*
* SPDX-License-Expression: BSD-2-Clause
*
* Portions of this software were developed by Robert Clausecker
* <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
*
* Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S
* written by J.T. Conklin <jtc@acorntoolworks.com> and
* adapted by Guillaume Morin <guillaume@morinfr.org> to implement stpcpy
* that was originally dedicated to the public domain
*/
#include <machine/asm.h>
#include "amd64_archlevel.h"
/* Align the next instruction to a 16-byte (2^4) boundary, padding with 0x90 (nop) bytes. */
#define ALIGN_TEXT .p2align 4, 0x90
/* Export stpcpy as a weak symbol that resolves to __stpcpy. */
.weak stpcpy
.set stpcpy, __stpcpy
/*
 * List of __stpcpy implementations provided by this file: "scalar" and
 * "baseline", each defined below with ARCHENTRY/ARCHEND.
 * NOTE(review): ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS are not defined in this
 * file (presumably in amd64_archlevel.h); how an entry is selected at
 * run time, and whether the order of entries matters, must be confirmed
 * there.
 */
ARCHFUNCS(__stpcpy)
ARCHFUNC(__stpcpy, scalar)
ARCHFUNC(__stpcpy, baseline)
ENDARCHFUNCS(__stpcpy)
/*
 * Scalar stpcpy.
 *
 * Strategy: copy single bytes until the source pointer is 8-byte
 * aligned, then move whole 64-bit words for as long as the word
 * cannot contain a NUL, and finish with up to eight single-byte
 * stores once a word might contain one.
 *
 * Only the source is aligned.  When source and destination are
 * misaligned relative to each other the word stores are unaligned;
 * that is still faster than copying byte-wise throughout, or than
 * the overhead of an implementation suitable for machines with
 * strict alignment requirements.
 *
 * In:   %rdi = destination, %rsi = source
 * Out:  %rax = address of the NUL byte written to the destination
 * Uses: %rcx, %rdx, %r8, %r9, flags
 */
ARCHENTRY(__stpcpy, scalar)
	movabsq	$0x0101010101010101,%r8	/* 0x01 in every byte */
	movabsq	$0x8080808080808080,%r9	/* 0x80 in every byte */

	/*
	 * Head: byte-wise copy until the source is 8-byte aligned.
	 * (Candidate for unrolling.)
	 */
.Lhead:
	testb	$7,%sil
	jz	.Lword
	movb	(%rsi),%dl
	incq	%rsi
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl
	jnz	.Lhead

	/* The NUL was copied while aligning; %rdi is one past it. */
	leaq	-1(%rdi),%rax
	ret

	ALIGN_TEXT
.Lstore:
	movq	%rdx,(%rdi)		/* word had no NUL: store it */
	addq	$8,%rdi
.Lword:
	movq	(%rsi),%rdx
	addq	$8,%rsi
	movq	%rdx,%rcx
	subq	%r8,%rcx		/* subtract 1 from every byte ... */
	testq	%r9,%rcx		/* ... and look for a set top bit */
	jz	.Lstore

	/*
	 * The word may contain a NUL.  The test above has false
	 * positives (a byte of 0x81 or more trips it as well), so the
	 * word in %rdx is written out one byte at a time, lowest byte
	 * first, stopping at the first zero byte.  If all eight bytes
	 * turn out to be non-zero the word loop is resumed.
	 */
	.rept	7			/* bytes 1 through 7 */
	movb	%dl,(%rdi)
	testb	%dl,%dl
	jz	.Lend
	incq	%rdi
	shrq	$8,%rdx
	.endr

	/* byte 8: advance first, step back only if it was the NUL */
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl
	jnz	.Lword
	decq	%rdi
.Lend:
	movq	%rdi,%rax		/* %rdi points at the NUL just stored */
	ret
ARCHEND(__stpcpy, scalar)
/*
 * stpcpy using 16-byte SSE2 operations on %xmm0-%xmm2.
 *
 * The source is examined in aligned 16-byte chunks, beginning with the
 * chunk that contains the first byte of the string.  Strings whose NUL
 * lies in the first or second chunk are copied by the size-classed tail
 * code at .Lrunt/.Lshorty; longer strings go through the main loop.
 *
 * In:   %rdi = destination, %rsi = source
 * Out:  %rax = address of the NUL byte written to the destination
 *
 * Register roles set up below:
 *   %rdx  original destination pointer
 *   %ecx  misalignment of the source (source address & 0xf)
 *   %rsi  source rounded down to 16 bytes, then the current chunk
 *   %rdi  destination minus source, so that (%rsi, %rdi, 1) is the
 *         destination address matching source address %rsi; reused as
 *         the string length in the tail code
 */
ARCHENTRY(__stpcpy, baseline)
mov %esi, %ecx
mov %rdi, %rdx
sub %rsi, %rdi # express destination as distance to source
and $~0xf, %rsi # align source to 16 byte
movdqa (%rsi), %xmm0 # head of string with junk before
pxor %xmm1, %xmm1
and $0xf, %ecx # misalignment in bytes
pcmpeqb %xmm1, %xmm0 # NUL byte present?
pmovmskb %xmm0, %eax
shr %cl, %eax # clear out matches in junk bytes
/* If a bit is set, bsf clears ZF and leaves the offset of the NUL from the string start in %eax. */
bsf %eax, %eax # find match if any
jnz .Lrunt
/*
 * first normal iteration: the head of the string (%xmm2) is written
 * back only if no NUL is found in the second chunk; otherwise .Lshorty
 * takes care of storing it.  %xmm1 is still all zero here.
 */
movdqa 16(%rsi), %xmm0 # 16 bytes of current iteration
movdqu (%rsi, %rcx, 1), %xmm2 # first 16 bytes of the string
pcmpeqb %xmm0, %xmm1 # NUL byte present?
pmovmskb %xmm1, %eax
test %eax, %eax # find match if any
jnz .Lshorty
movdqu %xmm2, (%rdx) # store beginning of string
/*
 * main loop, unrolled twice
 *
 * On entry to 0: %xmm0 holds the chunk at 16(%rsi), which is known to
 * be free of NUL bytes but has not been stored yet.  Each half loads
 * the next chunk, stores the previous one, and then tests the new
 * chunk, so a chunk containing a NUL is never stored by the loop.
 */
ALIGN_TEXT
0: movdqa 32(%rsi), %xmm2 # load current iteration
movdqu %xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
pxor %xmm1, %xmm1
add $32, %rsi
pcmpeqb %xmm2, %xmm1 # NUL byte present?
pmovmskb %xmm1, %eax
test %eax, %eax
jnz 1f
movdqa 16(%rsi), %xmm0 # load current iteration
movdqu %xmm2, (%rsi, %rdi, 1) # write back previous iteration
pxor %xmm1, %xmm1
pcmpeqb %xmm0, %xmm1 # NUL byte present?
pmovmskb %xmm1, %eax
test %eax, %eax
jz 0b
/* end of string after main loop has iterated */
add $16, %rsi # advance rsi to second unrolled half
/*
 * %rsi points to the chunk holding the NUL and %eax is non-zero.
 * The 16 source bytes ending at the NUL are copied with one unaligned
 * load/store pair; they overlap bytes already stored by the loop.
 */
1: tzcnt %eax, %eax # find location of match
# (behaves as bsf on pre-x86-64-v3 CPUs)
add %rsi, %rax # point to NUL byte
movdqu -15(%rax), %xmm0 # last 16 bytes of string
movdqu %xmm0, -15(%rax, %rdi, 1) # copied to destination
add %rdi, %rax # point to destination's NUL byte
ret
/* NUL encountered in second iteration; %xmm2 holds the first 16 bytes of the string */
.Lshorty:
tzcnt %eax, %eax
add $16, %eax # account for length of first iteration
sub %ecx, %eax # but not the parts before the string
/*
 * NUL encountered in first iteration
 *
 * From here on: %eax = offset of the NUL from the string start,
 * %edi = length including the NUL, %rsi = start of the string,
 * %rax = address of the NUL in the destination.
 */
.Lrunt: lea 1(%rax), %edi # string length including NUL byte
add %rcx, %rsi # point to beginning of string
add %rdx, %rax # point to NUL byte
/*
 * transfer 16--32 bytes
 *
 * When this is reached through .Lrunt with a length of exactly 16,
 * %xmm2 was never loaded by this function; the store of the last
 * 16 bytes then covers the same 16 destination bytes and overwrites it.
 */
.L1632: cmp $16, %edi
jb .L0815
movdqu -16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
movdqu %xmm2, (%rdx) # store first 16 bytes
movdqu %xmm0, -15(%rax) # store last 16 bytes
ret
/* transfer 8--15 bytes as two possibly overlapping 8-byte moves */
.L0815: cmp $8, %edi
jb .L0407
mov (%rsi), %rcx # load first 8 bytes
mov -8(%rsi, %rdi, 1), %rdi # load last 8 bytes
mov %rcx, (%rdx) # store to dst
mov %rdi, -7(%rax) # ditto
ret
/* transfer 4--7 bytes as two possibly overlapping 4-byte moves */
.L0407: cmp $4, %edi
jb .L0203
mov (%rsi), %ecx
mov -4(%rsi, %rdi, 1), %edi
mov %ecx, (%rdx)
mov %edi, -3(%rax)
ret
/* transfer 2--3 bytes: first two bytes here, NUL by falling through */
.L0203: cmp $2, %edi
jb .L0101
movzwl (%rsi), %ecx
mov %cx, (%rdx) # store first two bytes
/* empty string, or the end of the 2--3 byte case (last byte is always NUL) */
.L0101: movb $0, (%rax) # store terminating NUL byte
ret
ARCHEND(__stpcpy, baseline)
/* Empty .note.GNU-stack section with no flags: this object does not request an executable stack. */
.section .note.GNU-stack,"",%progbits