From af366d353b84bdc4e730f0fc563853abc338271c Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 8 Feb 2021 18:01:48 +0100 Subject: [PATCH] amd64: implement strlen in assembly The C variant in libkern performs excessive branching to find the non-zero byte instead of using the bsfq instruction. The same code patched to use it is still slower than the routine implemented here as the compiler keeps neglecting to perform certain optimizations (like using leaq). On top of that the routine is a starting point for copyinstr which operates on words instead of bytes. Tested with glibc test suite. Sample results (calls/s): Haswell: $(perl -e "print 'A' x 3"): stock: 211198039 patched:338626619 asm: 465609618 $(perl -e "print 'A' x 100"): stock: 83151997 patched: 98285919 asm: 120719888 AMD EPYC 7R32: $(perl -e "print 'A' x 3"): stock: 282523617 asm: 491498172 $(perl -e "print 'A' x 100"): stock: 114857172 asm: 112082057 --- sys/amd64/amd64/support.S | 66 +++++++++++++++++++++++++++++++++++++++ sys/conf/files | 1 - sys/conf/files.arm | 1 + sys/conf/files.arm64 | 1 + sys/conf/files.i386 | 1 + sys/conf/files.mips | 1 + sys/conf/files.powerpc | 1 + sys/conf/files.riscv | 1 + 8 files changed, 72 insertions(+), 1 deletion(-) diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index b623fba277d..994c5f15e24 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -697,6 +697,72 @@ ENTRY(fillw) ret END(fillw) +/* + * strlen(string) + * %rdi + * + * Uses the ((x - 0x01....01) & ~x & 0x80....80) trick. + * + * 0x01....01 is replaced with 0 - 0x01....01 so that it can be added + * with leaq. + * + * For a description see either: + * - "Hacker's Delight" by Henry S. Warren, Jr. + * - "Optimizing subroutines in assembly language: An optimization guide for x86 platforms" + * by Agner Fog + * + * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386. 
+ */
+ENTRY(strlen)
+	PUSH_FRAME_POINTER
+	movabsq	$0xfefefefefefefeff,%r8		/* 0 - 0x01....01, added with leaq below */
+	movabsq	$0x8080808080808080,%r9		/* high bit of every byte */
+
+	movq	%rdi,%r10			/* original pointer, subtracted at the end */
+	movq	%rdi,%rcx
+	testb	$7,%dil
+	jz	2f				/* already 8-byte aligned */
+
+	/*
+	 * Handle misaligned reads: align to 8 and fill
+	 * the spurious bytes.
+	 */
+	andq	$~7,%rdi
+	movq	(%rdi),%r11
+	shlq	$3,%rcx				/* low bits of %cl = 8 * (string & 7) */
+	movq	$-1,%rdx
+	shlq	%cl,%rdx
+	notq	%rdx				/* ones over the bytes preceding the string */
+	orq	%rdx,%r11			/* so those bytes can never test as NUL */
+
+	leaq	(%r11,%r8),%rcx			/* x - 0x01....01 */
+	notq	%r11				/* ~x */
+	andq	%r11,%rcx
+	andq	%r9,%rcx
+	jnz	3f
+
+	/*
+	 * Main loop.
+	 */
+	ALIGN_TEXT
+1:
+	leaq	8(%rdi),%rdi
+2:
+	movq	(%rdi),%r11
+	leaq	(%r11,%r8),%rcx			/* x - 0x01....01 */
+	notq	%r11				/* ~x */
+	andq	%r11,%rcx			/* result must land in %rcx, which is tested below */
+	andq	%r9,%rcx			/* non-zero iff this word contains a NUL byte */
+	jz	1b
+3:
+	bsfq	%rcx,%rcx			/* bit index of the lowest marker bit */
+	shrq	$3,%rcx				/* bit index -> byte index within the word */
+	leaq	(%rcx,%rdi),%rax		/* address of the terminating NUL */
+	subq	%r10,%rax			/* length = NUL address - string start */
+	POP_FRAME_POINTER
+	ret
+END(strlen)
+
 /*****************************************************************************/
 /* copyout and fubyte family */
 /*****************************************************************************/
diff --git a/sys/conf/files b/sys/conf/files
index edca1003e90..1abfadb1e8d 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -4085,7 +4085,6 @@ libkern/strdup.c	standard
 libkern/strndup.c	standard
 libkern/strlcat.c	standard
 libkern/strlcpy.c	standard
-libkern/strlen.c	standard
 libkern/strncat.c	standard
 libkern/strncmp.c	standard
 libkern/strncpy.c	standard
diff --git a/sys/conf/files.arm b/sys/conf/files.arm
index eb3a23b5fc2..69986585bdf 100644
--- a/sys/conf/files.arm
+++ b/sys/conf/files.arm
@@ -127,6 +127,7 @@ libkern/lshrdi3.c	standard
 libkern/memcmp.c	standard
 libkern/moddi3.c	standard
 libkern/qdivrem.c	standard
+libkern/strlen.c	standard
 libkern/ucmpdi2.c	standard
 libkern/udivdi3.c	standard
 libkern/umoddi3.c	standard
diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64
index f7003b1048c..42ec3b2787b 100644
--- a/sys/conf/files.arm64
+++ b/sys/conf/files.arm64
@@ -432,6 +432,7 @@ libkern/memcmp.c	standard \
 	compile-with "${NORMAL_C:N-fsanitize*}"
 libkern/memset.c	standard \
 	compile-with "${NORMAL_C:N-fsanitize*}"
+libkern/strlen.c	standard
libkern/arm64/crc32c_armv8.S standard cddl/dev/dtrace/aarch64/dtrace_asm.S optional dtrace compile-with "${DTRACE_S}" cddl/dev/dtrace/aarch64/dtrace_subr.c optional dtrace compile-with "${DTRACE_C}" diff --git a/sys/conf/files.i386 b/sys/conf/files.i386 index 6560ab217d9..1e2ab5f8c52 100644 --- a/sys/conf/files.i386 +++ b/sys/conf/files.i386 @@ -218,6 +218,7 @@ libkern/memcmp.c standard libkern/memset.c standard libkern/moddi3.c standard libkern/qdivrem.c standard +libkern/strlen.c standard libkern/ucmpdi2.c standard libkern/udivdi3.c standard libkern/umoddi3.c standard diff --git a/sys/conf/files.mips b/sys/conf/files.mips index c18f0a5c69b..7ee5b0019bd 100644 --- a/sys/conf/files.mips +++ b/sys/conf/files.mips @@ -66,6 +66,7 @@ libkern/ucmpdi2.c optional mips | mipshf | mipsel | mipselhf libkern/ashldi3.c standard libkern/ashrdi3.c standard libkern/memcmp.c standard +libkern/strlen.c standard # cfe support dev/cfe/cfe_api.c optional cfe diff --git a/sys/conf/files.powerpc b/sys/conf/files.powerpc index 3022fd6f6e3..347abee153d 100644 --- a/sys/conf/files.powerpc +++ b/sys/conf/files.powerpc @@ -129,6 +129,7 @@ libkern/memcmp.c standard libkern/memset.c standard libkern/moddi3.c optional powerpc | powerpcspe libkern/qdivrem.c optional powerpc | powerpcspe +libkern/strlen.c standard libkern/ucmpdi2.c optional powerpc | powerpcspe libkern/udivdi3.c optional powerpc | powerpcspe libkern/umoddi3.c optional powerpc | powerpcspe diff --git a/sys/conf/files.riscv b/sys/conf/files.riscv index 3969528db07..7ecea016b9a 100644 --- a/sys/conf/files.riscv +++ b/sys/conf/files.riscv @@ -29,6 +29,7 @@ libkern/flsl.c standard libkern/flsll.c standard libkern/memcmp.c standard libkern/memset.c standard +libkern/strlen.c standard riscv/riscv/autoconf.c standard riscv/riscv/bus_machdep.c standard riscv/riscv/bus_space_asm.S standard