opnsense-src/lib/libc/amd64/string/stpcpy.S

/*-
 * Copyright (c) 2023, The FreeBSD Foundation
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Portions of this software were developed by Robert Clausecker
 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 *
 * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S
 * written by J.T. Conklin <jtc@acorntoolworks.com> and
 * adapted by Guillaume Morin <guillaume@morinfr.org> to implement stpcpy
 * that was originally dedicated to the public domain
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

/* Align the next instruction to 16 bytes (2^4), padding with 0x90 (NOP). */
#define ALIGN_TEXT	.p2align 4, 0x90

	/*
	 * Export stpcpy as a weak symbol with the same value as __stpcpy,
	 * then list the two variants of __stpcpy implemented in this file
	 * (scalar and baseline).
	 *
	 * NOTE(review): ARCHFUNCS, ARCHFUNC and ENDARCHFUNCS are not defined
	 * in this file; presumably they come from amd64_archlevel.h and build
	 * the table used to pick a variant -- confirm there.
	 */
	.weak stpcpy
	.set stpcpy, __stpcpy
ARCHFUNCS(__stpcpy)
	ARCHFUNC(__stpcpy, scalar)
	ARCHFUNC(__stpcpy, baseline)
ENDARCHFUNCS(__stpcpy)

/*
 * __stpcpy, scalar variant.
 *
 * In:	%rdi = destination, %rsi = source (NUL-terminated)
 * Out:	%rax = address of the NUL byte written to the destination
 * Uses:	%rcx, %rdx, %r8, %r9
 *
 * The copy runs in three phases:
 *
 *  1. single bytes, until the source pointer is a multiple of 8;
 *  2. whole 8-byte words, for as long as a cheap test says the word
 *     cannot contain a NUL byte;
 *  3. single bytes of the word that failed the test, stopping at the
 *     first NUL byte.
 *
 * Only the source is aligned.  Word stores to the destination may be
 * unaligned when the two pointers are misaligned with respect to each
 * other; this is still faster than copying byte by byte or than an
 * implementation written for strict-alignment machines.
 */

ARCHENTRY(__stpcpy, scalar)
	movabsq	$0x0101010101010101,%r8	/* 0x01 in every byte */
	movabsq	$0x8080808080808080,%r9	/* 0x80 in every byte */

	/*
	 * Phase 1: byte copy until the source is word aligned.
	 * (Candidate for unrolling.)
	 */
.Lhead:
	testb	$7,%sil
	jz	.Lnext_word
	movb	(%rsi),%dl
	movb	%dl,(%rdi)
	incq	%rsi
	incq	%rdi
	testb	%dl,%dl
	jnz	.Lhead
	leaq	-1(%rdi),%rax		/* %rdi is one past the NUL byte */
	ret

	/*
	 * Phase 2: word copy.  A word is stored only after it has passed
	 * the test below, hence the loop is entered at .Lnext_word.
	 */
	ALIGN_TEXT
.Lstore_word:
	movq	%rdx,(%rdi)
	addq	$8,%rdi
.Lnext_word:
	movq	(%rsi),%rdx
	addq	$8,%rsi
	movq	%rdx,%rcx
	subq	%r8,%rcx
	testq	%r9,%rcx		/* (word - 0x01..01) & 0x80..80 */
	jz	.Lstore_word

	/*
	 * Phase 3: the test above fires whenever the word holds a NUL
	 * byte, but it also fires for words holding a byte of value 0x81
	 * or above.  Walk the word one byte at a time, lowest byte first;
	 * if none of the eight bytes is NUL the alarm was false and the
	 * word loop is resumed.
	 */
	.rept	7			/* bytes 1 through 7 of the word */
	movb	%dl,(%rdi)
	testb	%dl,%dl
	jz	.Lfinish
	incq	%rdi
	shrq	$8,%rdx			/* bring the next byte into %dl */
	.endr

	movb	%dl,(%rdi)		/* byte 8 of the word */
	incq	%rdi
	testb	%dl,%dl
	jnz	.Lnext_word		/* false alarm: back to the words */
	decq	%rdi			/* step back onto the NUL byte */

.Lfinish:
	movq	%rdi,%rax
	ret
ARCHEND(__stpcpy, scalar)

/*
 * __stpcpy, baseline variant: copy the NUL-terminated string at %rsi
 * to %rdi using 16-byte SSE2 loads and stores, and return in %rax the
 * address of the NUL byte written to the destination.
 *
 * The search for the NUL byte is done exclusively on aligned 16-byte
 * chunks of the source.  If the NUL byte is found in the first or in
 * the second chunk, the string is copied by one of the size-classed
 * tails (.L1632 to .L0101).  Otherwise the main loop stores one chunk
 * while the following one is being checked, and the end of the string
 * is copied with one final, possibly overlapping, 16-byte move.
 *
 * Register use after the setup code:
 *	%rsi	source address rounded down to a multiple of 16
 *	%ecx	misalignment of the source (source & 15)
 *	%rdx	destination as passed in
 *	%rdi	destination - source, so that (%rsi, %rdi, 1) is the
 *		destination address matching source address %rsi
 */
ARCHENTRY(__stpcpy, baseline)
	mov	%esi, %ecx
	mov	%rdi, %rdx
	sub	%rsi, %rdi		# express destination as distance to source
	and	$~0xf, %rsi		# align source to 16 bytes
	movdqa	(%rsi), %xmm0		# head of string with junk before
	pxor	%xmm1, %xmm1
	and	$0xf, %ecx		# misalignment in bytes
	pcmpeqb	%xmm1, %xmm0		# NUL byte present?
	pmovmskb %xmm0, %eax
	shr	%cl, %eax		# clear out matches in junk bytes
	bsf	%eax, %eax		# find match if any
	/* bsf sets ZF if the mask was zero, i.e. no NUL byte in the head */
	jnz	.Lrunt

	/* first normal iteration: write head back if it succeeds */
	movdqa	16(%rsi), %xmm0		# 16 bytes of current iteration
	movdqu	(%rsi, %rcx, 1), %xmm2	# first 16 bytes of the string
	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# find match if any
	jnz	.Lshorty

	movdqu	%xmm2, (%rdx)		# store beginning of string

	/*
	 * main loop, unrolled twice
	 *
	 * On entry to each pass, %xmm0 holds the chunk at 16(%rsi); it is
	 * known to be free of NUL bytes and has not been stored yet.
	 */
	ALIGN_TEXT
0:	movdqa	32(%rsi), %xmm2		# load current iteration
	movdqu	%xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
	pxor	%xmm1, %xmm1
	add	$32, %rsi
	pcmpeqb	%xmm2, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	1f

	movdqa	16(%rsi), %xmm0		# load current iteration
	movdqu	%xmm2, (%rsi, %rdi, 1)	# write back previous iteration
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jz	0b

	/* end of string after main loop has iterated */
	add	$16, %rsi		# advance rsi to second unrolled half
	/*
	 * Here %rsi points to the chunk holding the NUL byte and %eax is
	 * its nonzero match mask.  The chunk itself has not been stored;
	 * the 16 bytes ending at the NUL byte are copied instead, which
	 * may overlap bytes already written by the loop.
	 */
1:	tzcnt	%eax, %eax		# find location of match
					# (behaves as bsf on pre-x86-64-v3 CPUs)
	add	%rsi, %rax		# point to NUL byte
	movdqu	-15(%rax), %xmm0	# last 16 bytes of string
	movdqu	%xmm0, -15(%rax, %rdi, 1) # copied to destination
	add	%rdi, %rax		# point to NUL byte in destination
	ret

	/* NUL encountered in second iteration */
.Lshorty:
	tzcnt	%eax, %eax
	add	$16, %eax		# account for length of first iteration
	sub	%ecx, %eax		# but not the parts before the string

	/*
	 * NUL encountered in first iteration
	 *
	 * In both cases %eax is now the offset of the NUL byte from the
	 * start of the string.  From here on %rsi is the start of the
	 * string, %edi its length including the NUL byte, and %rax the
	 * address of the NUL byte in the destination (the return value).
	 */
.Lrunt:	lea	1(%rax), %edi		# string length including NUL byte
	add	%rcx, %rsi		# point to beginning of string
	add	%rdx, %rax		# point to NUL byte

	/*
	 * transfer 16--32 bytes
	 *
	 * When this is reached through .Lrunt, %xmm2 has not been loaded
	 * by this function.  That path gets here only with a length of
	 * exactly 16, in which case the second store below rewrites the
	 * very same 16 bytes with the correct data.
	 */
.L1632:	cmp	$16, %edi
	jb	.L0815

	movdqu	-16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
	movdqu	%xmm2, (%rdx)		# store first 16 bytes
	movdqu	%xmm0, -15(%rax)	# store last 16 bytes
	ret

	/* transfer 8--15 bytes */
.L0815:	cmp	$8, %edi
	jb	.L0407

	mov	(%rsi), %rcx		# load first 8 bytes
	mov	-8(%rsi, %rdi, 1), %rdi	# load last 8 bytes
	mov	%rcx, (%rdx)		# store to dst
	mov	%rdi, -7(%rax)		# ditto
	ret

	/* transfer 4--7 bytes */
.L0407:	cmp	$4, %edi
	jb	.L0203

	mov	(%rsi), %ecx		# load first 4 bytes
	mov	-4(%rsi, %rdi, 1), %edi	# load last 4 bytes
	mov	%ecx, (%rdx)		# store to dst
	mov	%edi, -3(%rax)		# ditto
	ret

	/* transfer 2--3 bytes */
.L0203:	cmp	$2, %edi
	jb	.L0101

	movzwl	(%rsi), %ecx
	mov	%cx, (%rdx)		# store first two bytes

	/*
	 * transfer 0 bytes (last byte is always NUL)
	 *
	 * Also the tail of the 2--3 byte case, which falls through to
	 * here to write its final byte.
	 */
.L0101:	movb	$0, (%rax)		# store terminating NUL byte
	ret
ARCHEND(__stpcpy, baseline)

	/* Empty, non-executable note section: marks this object as not needing an executable stack. */
	.section .note.GNU-stack,"",%progbits
lib/libc/amd64/string/stpcpy.S: add baseline implementation This commit adds a baseline implementation of stpcpy(3) for amd64. It performs quite well in comparison to the previous scalar implementation as well as against bionic and glibc (though glibc is faster for very long strings). Fiddle with the Makefile to also have strcpy(3) call into the optimised stpcpy(3) code, fixing an oversight from D9841. Sponsored by: The FreeBSD Foundation Reviewed by: imp ngie emaste Approved by: mjg kib Fixes: D9841 Differential Revision: https://reviews.freebsd.org/D41349 2023-07-05 17:23:33 -04:00			`/*-`
			`* Copyright (c) 2023, The FreeBSD Foundation`
			`*`
			`* SPDX-License-Expression: BSD-2-Clause`
			`*`
			`* Portions of this software were developed by Robert Clausecker`
			`* <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.`
			`*`
			`* Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S`
			`* written by J.T. Conklin <jtc@acorntoolworks.com> and`
			`* adapted by Guillaume Morin <guillaume@morinfr.org> to implement stpcpy`
			`* that was originally dedicated to the public domain`
Add a machine-specific, optimized implementation of strcpy. PR: 73111 Submitted by: Ville-Pertti Keinonen <will@iki.fi> (taken from NetBSD) MFC after: 3 weeks 2005-04-10 01:11:06 -04:00			`*/`

			`#include <machine/asm.h>`
lib/libc/amd64/string/stpcpy.S: add baseline implementation This commit adds a baseline implementation of stpcpy(3) for amd64. It performs quite well in comparison to the previous scalar implementation as well as against bionic and glibc (though glibc is faster for very long strings). Fiddle with the Makefile to also have strcpy(3) call into the optimised stpcpy(3) code, fixing an oversight from D9841. Sponsored by: The FreeBSD Foundation Reviewed by: imp ngie emaste Approved by: mjg kib Fixes: D9841 Differential Revision: https://reviews.freebsd.org/D41349 2023-07-05 17:23:33 -04:00
			`#include "amd64_archlevel.h"`

			`#define ALIGN_TEXT .p2align 4, 0x90`

			`.weak stpcpy`
			`.set stpcpy, __stpcpy`
			`ARCHFUNCS(__stpcpy)`
			`ARCHFUNC(__stpcpy, scalar)`
			`ARCHFUNC(__stpcpy, baseline)`
			`ENDARCHFUNCS(__stpcpy)`

Add a machine-specific, optimized implementation of strcpy. PR: 73111 Submitted by: Ville-Pertti Keinonen <will@iki.fi> (taken from NetBSD) MFC after: 3 weeks 2005-04-10 01:11:06 -04:00			`/*`
Make both stpcpy and strcpy be assembly language implementations on amd64. Submitted by: Guillaume Morin (guillaume at morinfr.org) Reviewed by: kib, jhb Approved by: re (bz) MFC after: 1 month 2011-07-21 12:32:13 -04:00			`* This stpcpy implementation copies a byte at a time until the`
Add a machine-specific, optimized implementation of strcpy. PR: 73111 Submitted by: Ville-Pertti Keinonen <will@iki.fi> (taken from NetBSD) MFC after: 3 weeks 2005-04-10 01:11:06 -04:00			`* source pointer is aligned to a word boundary, it then copies by`
			`* words until it finds a word containing a zero byte, and finally`
			`* copies by bytes until the end of the string is reached.`
			`*`
			`* While this may result in unaligned stores if the source and`
			`* destination pointers are unaligned with respect to each other,`
			`* it is still faster than either byte copies or the overhead of`
			`* an implementation suitable for machines with strict alignment`
			`* requirements.`
			`*/`

lib/libc/amd64/string/stpcpy.S: add baseline implementation This commit adds a baseline implementation of stpcpy(3) for amd64. It performs quite well in comparison to the previous scalar implementation as well as against bionic and glibc (though glibc is faster for very long strings). Fiddle with the Makefile to also have strcpy(3) call into the optimised stpcpy(3) code, fixing an oversight from D9841. Sponsored by: The FreeBSD Foundation Reviewed by: imp ngie emaste Approved by: mjg kib Fixes: D9841 Differential Revision: https://reviews.freebsd.org/D41349 2023-07-05 17:23:33 -04:00			`ARCHENTRY(__stpcpy, scalar)`
Make both stpcpy and strcpy be assembly language implementations on amd64. Submitted by: Guillaume Morin (guillaume at morinfr.org) Reviewed by: kib, jhb Approved by: re (bz) MFC after: 1 month 2011-07-21 12:32:13 -04:00			`movabsq $0x0101010101010101,%r8`
			`movabsq $0x8080808080808080,%r9`
Add a machine-specific, optimized implementation of strcpy. PR: 73111 Submitted by: Ville-Pertti Keinonen <will@iki.fi> (taken from NetBSD) MFC after: 3 weeks 2005-04-10 01:11:06 -04:00
			`/*`
			`* Align source to a word boundary.`
			`* Consider unrolling loop?`
			`*/`
			`.Lalign:`
			`testb $7,%sil`
			`je .Lword_aligned`
			`movb (%rsi),%dl`
			`incq %rsi`
			`movb %dl,(%rdi)`
			`incq %rdi`
			`testb %dl,%dl`
			`jne .Lalign`
Make both stpcpy and strcpy be assembly language implementations on amd64. Submitted by: Guillaume Morin (guillaume at morinfr.org) Reviewed by: kib, jhb Approved by: re (bz) MFC after: 1 month 2011-07-21 12:32:13 -04:00			`movq %rdi,%rax`
			`dec %rax`
Add a machine-specific, optimized implementation of strcpy. PR: 73111 Submitted by: Ville-Pertti Keinonen <will@iki.fi> (taken from NetBSD) MFC after: 3 weeks 2005-04-10 01:11:06 -04:00			`ret`

lib/libc/amd64/string/stpcpy.S: add baseline implementation This commit adds a baseline implementation of stpcpy(3) for amd64. It performs quite well in comparison to the previous scalar implementation as well as against bionic and glibc (though glibc is faster for very long strings). Fiddle with the Makefile to also have strcpy(3) call into the optimised stpcpy(3) code, fixing an oversight from D9841. Sponsored by: The FreeBSD Foundation Reviewed by: imp ngie emaste Approved by: mjg kib Fixes: D9841 Differential Revision: https://reviews.freebsd.org/D41349 2023-07-05 17:23:33 -04:00			`ALIGN_TEXT`
Add a machine-specific, optimized implementation of strcpy. PR: 73111 Submitted by: Ville-Pertti Keinonen <will@iki.fi> (taken from NetBSD) MFC after: 3 weeks 2005-04-10 01:11:06 -04:00			`.Lloop:`
			`movq %rdx,(%rdi)`
			`addq $8,%rdi`
			`.Lword_aligned:`
			`movq (%rsi),%rdx`
			`movq %rdx,%rcx`
			`addq $8,%rsi`
			`subq %r8,%rcx`
			`testq %r9,%rcx`
			`je .Lloop`

			`/*`
			`* In rare cases, the above loop may exit prematurely. We must`
			`* return to the loop if none of the bytes in the word equal 0.`
			`*/`

			`movb %dl,(%rdi)`
			`testb %dl,%dl /* 1st byte == 0? */`
			`je .Ldone`
Make both stpcpy and strcpy be assembly language implementations on amd64. Submitted by: Guillaume Morin (guillaume at morinfr.org) Reviewed by: kib, jhb Approved by: re (bz) MFC after: 1 month 2011-07-21 12:32:13 -04:00			`incq %rdi`
Add a machine-specific, optimized implementation of strcpy. PR: 73111 Submitted by: Ville-Pertti Keinonen <will@iki.fi> (taken from NetBSD) MFC after: 3 weeks 2005-04-10 01:11:06 -04:00
			`shrq $8,%rdx`
			`movb %dl,(%rdi)`
			`testb %dl,%dl /* 2nd byte == 0? */`
			`je .Ldone`
Make both stpcpy and strcpy be assembly language implementations on amd64. Submitted by: Guillaume Morin (guillaume at morinfr.org) Reviewed by: kib, jhb Approved by: re (bz) MFC after: 1 month 2011-07-21 12:32:13 -04:00			`incq %rdi`
Add a machine-specific, optimized implementation of strcpy. PR: 73111 Submitted by: Ville-Pertti Keinonen <will@iki.fi> (taken from NetBSD) MFC after: 3 weeks 2005-04-10 01:11:06 -04:00
			`shrq $8,%rdx`
			`movb %dl,(%rdi)`
			`testb %dl,%dl /* 3rd byte == 0? */`
			`je .Ldone`
Make both stpcpy and strcpy be assembly language implementations on amd64. Submitted by: Guillaume Morin (guillaume at morinfr.org) Reviewed by: kib, jhb Approved by: re (bz) MFC after: 1 month 2011-07-21 12:32:13 -04:00			`incq %rdi`
Add a machine-specific, optimized implementation of strcpy. PR: 73111 Submitted by: Ville-Pertti Keinonen <will@iki.fi> (taken from NetBSD) MFC after: 3 weeks 2005-04-10 01:11:06 -04:00
			`shrq $8,%rdx`
			`movb %dl,(%rdi)`
			`testb %dl,%dl /* 4th byte == 0? */`
			`je .Ldone`
Make both stpcpy and strcpy be assembly language implementations on amd64. Submitted by: Guillaume Morin (guillaume at morinfr.org) Reviewed by: kib, jhb Approved by: re (bz) MFC after: 1 month 2011-07-21 12:32:13 -04:00			`incq %rdi`
Add a machine-specific, optimized implementation of strcpy. PR: 73111 Submitted by: Ville-Pertti Keinonen <will@iki.fi> (taken from NetBSD) MFC after: 3 weeks 2005-04-10 01:11:06 -04:00
			`shrq $8,%rdx`
			`movb %dl,(%rdi)`
			`testb %dl,%dl /* 5th byte == 0? */`
			`je .Ldone`
Make both stpcpy and strcpy be assembly language implementations on amd64. Submitted by: Guillaume Morin (guillaume at morinfr.org) Reviewed by: kib, jhb Approved by: re (bz) MFC after: 1 month 2011-07-21 12:32:13 -04:00			`incq %rdi`
Add a machine-specific, optimized implementation of strcpy. PR: 73111 Submitted by: Ville-Pertti Keinonen <will@iki.fi> (taken from NetBSD) MFC after: 3 weeks 2005-04-10 01:11:06 -04:00
			`shrq $8,%rdx`
			`movb %dl,(%rdi)`
			`testb %dl,%dl /* 6th byte == 0? */`
			`je .Ldone`
Make both stpcpy and strcpy be assembly language implementations on amd64. Submitted by: Guillaume Morin (guillaume at morinfr.org) Reviewed by: kib, jhb Approved by: re (bz) MFC after: 1 month 2011-07-21 12:32:13 -04:00			`incq %rdi`
Add a machine-specific, optimized implementation of strcpy. PR: 73111 Submitted by: Ville-Pertti Keinonen <will@iki.fi> (taken from NetBSD) MFC after: 3 weeks 2005-04-10 01:11:06 -04:00
			`shrq $8,%rdx`
			`movb %dl,(%rdi)`
			`testb %dl,%dl /* 7th byte == 0? */`
			`je .Ldone`
Make both stpcpy and strcpy be assembly language implementations on amd64. Submitted by: Guillaume Morin (guillaume at morinfr.org) Reviewed by: kib, jhb Approved by: re (bz) MFC after: 1 month 2011-07-21 12:32:13 -04:00			`incq %rdi`
Add a machine-specific, optimized implementation of strcpy. PR: 73111 Submitted by: Ville-Pertti Keinonen <will@iki.fi> (taken from NetBSD) MFC after: 3 weeks 2005-04-10 01:11:06 -04:00
			`shrq $8,%rdx`
			`movb %dl,(%rdi)`
			`incq %rdi`
			`testb %dl,%dl /* 8th byte == 0? */`
			`jne .Lword_aligned`
Make both stpcpy and strcpy be assembly language implementations on amd64. Submitted by: Guillaume Morin (guillaume at morinfr.org) Reviewed by: kib, jhb Approved by: re (bz) MFC after: 1 month 2011-07-21 12:32:13 -04:00			`decq %rdi`
Add a machine-specific, optimized implementation of strcpy. PR: 73111 Submitted by: Ville-Pertti Keinonen <will@iki.fi> (taken from NetBSD) MFC after: 3 weeks 2005-04-10 01:11:06 -04:00
			`.Ldone:`
Make both stpcpy and strcpy be assembly language implementations on amd64. Submitted by: Guillaume Morin (guillaume at morinfr.org) Reviewed by: kib, jhb Approved by: re (bz) MFC after: 1 month 2011-07-21 12:32:13 -04:00			`movq %rdi,%rax`
Add a machine-specific, optimized implementation of strcpy. PR: 73111 Submitted by: Ville-Pertti Keinonen <will@iki.fi> (taken from NetBSD) MFC after: 3 weeks 2005-04-10 01:11:06 -04:00			`ret`
lib/libc/amd64/string/stpcpy.S: add baseline implementation This commit adds a baseline implementation of stpcpy(3) for amd64. It performs quite well in comparison to the previous scalar implementation as well as against bionic and glibc (though glibc is faster for very long strings). Fiddle with the Makefile to also have strcpy(3) call into the optimised stpcpy(3) code, fixing an oversight from D9841. Sponsored by: The FreeBSD Foundation Reviewed by: imp ngie emaste Approved by: mjg kib Fixes: D9841 Differential Revision: https://reviews.freebsd.org/D41349 2023-07-05 17:23:33 -04:00			`ARCHEND(__stpcpy, scalar)`

			`ARCHENTRY(__stpcpy, baseline)`
			`mov %esi, %ecx`
			`mov %rdi, %rdx`
			`sub %rsi, %rdi # express destination as distance to surce`
			`and $~0xf, %rsi # align source to 16 byte`
			`movdqa (%rsi), %xmm0 # head of string with junk before`
			`pxor %xmm1, %xmm1`
			`and $0xf, %ecx # misalignment in bytes`
			`pcmpeqb %xmm1, %xmm0 # NUL byte present?`
			`pmovmskb %xmm0, %eax`
			`shr %cl, %eax # clear out matches in junk bytes`
			`bsf %eax, %eax # find match if any`
			`jnz .Lrunt`

			`/* first normal iteration: write head back if it succeeds */`
			`movdqa 16(%rsi), %xmm0 # 16 bytes of current iteration`
			`movdqu (%rsi, %rcx, 1), %xmm2 # first 16 bytes of the string`
			`pcmpeqb %xmm0, %xmm1 # NUL byte present?`
			`pmovmskb %xmm1, %eax`
			`test %eax, %eax # find match if any`
			`jnz .Lshorty`

			`movdqu %xmm2, (%rdx) # store beginning of string`

			`/* main loop, unrolled twice */`
			`ALIGN_TEXT`
			`0: movdqa 32(%rsi), %xmm2 # load current iteraion`
			`movdqu %xmm0, 16(%rsi, %rdi, 1) # write back previous iteraion`
			`pxor %xmm1, %xmm1`
			`add $32, %rsi`
			`pcmpeqb %xmm2, %xmm1 # NUL byte present?`
			`pmovmskb %xmm1, %eax`
			`test %eax, %eax`
			`jnz 1f`

			`movdqa 16(%rsi), %xmm0 # load current iteraion`
			`movdqu %xmm2, (%rsi, %rdi, 1) # write back previous iteraion`
			`pxor %xmm1, %xmm1`
			`pcmpeqb %xmm0, %xmm1 # NUL byte present?`
			`pmovmskb %xmm1, %eax`
			`test %eax, %eax`
			`jz 0b`

			`/* end of string after main loop has iterated */`
			`add $16, %rsi # advance rsi to second unrolled half`
			`1: tzcnt %eax, %eax # find location of match`
			`# (behaves as bsf on pre-x86-64-v3 CPUs)`
			`add %rsi, %rax # point to NUL byte`
			`movdqu -15(%rax), %xmm0 # last 16 bytes of string`
			`movdqu %xmm0, -15(%rax, %rdi, 1) # copied to destination`
			`add %rdi, %rax # point to destination's NUL byte`
			`ret`

			`/* NUL encountered in second iteration */`
			`.Lshorty:`
			`tzcnt %eax, %eax`
			`add $16, %eax # account for length of first iteration`
			`sub %ecx, %eax # but not the parts before the string`

			`/* NUL encountered in first iteration */`
			`.Lrunt: lea 1(%rax), %edi # string length including NUL byte`
			`add %rcx, %rsi # point to beginning of string`
			`add %rdx, %rax # point to NUL byte`

			`/* transfer 16--32 bytes */`
			`.L1632: cmp $16, %edi`
			`jb .L0815`

			`movdqu -16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes`
			`movdqu %xmm2, (%rdx) # store first 16 bytes`
			`movdqu %xmm0, -15(%rax) # store last 16 bytes`
			`ret`

			`/* transfer 8--15 bytes */`
			`.L0815: cmp $8, %edi`
			`jb .L0407`

			`mov (%rsi), %rcx # load first 8 bytes`
			`mov -8(%rsi, %rdi, 1), %rdi # load last 8 bytes`
			`mov %rcx, (%rdx) # store to dst`
			`mov %rdi, -7(%rax) # dito`
			`ret`

			`/* transfer 4--7 bytes */`
			`.L0407: cmp $4, %edi`
			`jb .L0203`

			`mov (%rsi), %ecx`
			`mov -4(%rsi, %rdi, 1), %edi`
			`mov %ecx, (%rdx)`
			`mov %edi, -3(%rax)`
			`ret`

			`/* transfer 2--3 bytes */`
			`.L0203: cmp $2, %edi`
			`jb .L0101`

			`movzwl (%rsi), %ecx`
			`mov %cx, (%rdx) # store first two bytes`

			`/* transfer 0 bytes (last byte is always NUL) */`
			`.L0101: movb $0, (%rax) # store terminating NUL byte`
			`ret`
			`ARCHEND(__stpcpy, baseline)`

Add section .note.GNU-stack for assembly files used by 386 and amd64. 2011-01-07 11:08:40 -05:00			`.section .note.GNU-stack,"",%progbits`