amd64: revamp memcmp

Borrow the trick from memset and memmove and use scale/index/base addressing to avoid branches. If a mismatch is found, the routine has to calculate the difference. Make sure there are never more than 8 bytes to inspect at that point. This replaces the previous loop, which could operate over up to 16 bytes, with an unrolled list of 8 tests. Speed varies a lot, but this is a net win over the previous routine, with probably a lot more to gain. Validated with the glibc test suite.
2026-05-28 04:12:45 -04:00 · 2020-01-28 17:48:17 +00:00 · 2020-01-28 17:48:17 +00:00 · f0ddecd745
commit f0ddecd745
parent 9945b2dfef
1 changed files with 173 additions and 74 deletions
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -111,92 +111,191 @@ END(sse2_pagezero)
 */
 ENTRY(memcmp)
 	PUSH_FRAME_POINTER
+
+	xorl	%eax,%eax		/* result is 0 unless a mismatch is found */
+10:					/* dispatch on the length in %rdx; re-entered from the 32-byte loop */
 	cmpq	$16,%rdx
-	jae	5f
+	ja	101632f			/* more than 16 bytes */
+
+100816:					/* 8..16 bytes */
+	cmpb	$8,%dl			/* %rdx <= 16 here, so %dl holds the whole length */
+	jl	100408f			/* fewer than 8 bytes */
+	movq	(%rdi),%r8
+	movq	(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	1f			/* mismatch in the first 8 bytes */
+	movq	-8(%rdi,%rdx),%r8	/* last 8 bytes; may overlap the first 8 */
+	movq	-8(%rsi,%rdx),%r9
+	cmpq	%r8,%r9
+	jne	10081608f		/* first 8 matched, so the mismatch is at offset >= 8 */
+	POP_FRAME_POINTER
+	ret
+100408:					/* 4..7 bytes */
+	cmpb	$4,%dl
+	jl	100204f			/* fewer than 4 bytes */
+	movl	(%rsi),%r8d
+	movl	(%rdi),%r9d
+	cmpl	%r8d,%r9d
+	jne	1f
+	movl	-4(%rsi,%rdx),%r8d	/* last 4 bytes; may overlap the first 4 */
+	movl	-4(%rdi,%rdx),%r9d
+	cmpl	%r8d,%r9d
+	jne	1f			/* pointers unchanged: the byte scan starts at offset 0 */
+	POP_FRAME_POINTER
+	ret
+100204:					/* 2..3 bytes */
+	cmpb	$2,%dl
+	jl	100001f			/* fewer than 2 bytes */
+	movzwl	(%rsi),%r8d
+	movzwl	(%rdi),%r9d
+	cmpl	%r8d,%r9d
+	jne	1f
+	movzwl	-2(%rsi,%rdx),%r8d	/* last 2 bytes; may overlap the first 2 */
+	movzwl	-2(%rdi,%rdx),%r9d
+	cmpl	%r8d,%r9d
+	jne	1f			/* pointers unchanged: the byte scan starts at offset 0 */
+	POP_FRAME_POINTER
+	ret
+100001:					/* 0 or 1 byte */
+	cmpb	$1,%dl
+	jl	100000f			/* length 0: nothing to compare */
+	movzbl	(%rdi),%r8d
+	movzbl	(%rsi),%r9d
+	cmpb	%r8b,%r9b
+	jne	1f			/* 1: reloads both bytes and computes the difference */
+100000:					/* no mismatch: %eax is still 0 */
+	POP_FRAME_POINTER
+	ret
+ALIGN_TEXT
+101632:					/* more than 16 bytes */
+	cmpq	$32,%rdx
+	ja	103200f			/* more than 32 bytes: use the loop */
+	movq	(%rdi),%r8
+	movq	(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	1f			/* mismatch in bytes 0..7 */
+	movq	8(%rdi),%r8
+	movq	8(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	 10163208f		/* mismatch in bytes 8..15 */
+	movq	-16(%rdi,%rdx),%r8	/* 8 bytes starting 16 before the end */
+	movq	-16(%rsi,%rdx),%r9
+	cmpq	%r8,%r9
+	jne	10163216f
+	movq	-8(%rdi,%rdx),%r8	/* final 8 bytes */
+	movq	-8(%rsi,%rdx),%r9
+	cmpq	%r8,%r9
+	jne	10163224f
+	POP_FRAME_POINTER
+	ret
+ALIGN_TEXT
+103200:					/* loop: compare 32 bytes per iteration */
+	movq	(%rdi),%r8
+	movq	8(%rdi),%r9
+	subq	(%rsi),%r8		/* zero iff the qwords are equal */
+	subq	8(%rsi),%r9
+	or	%r8,%r9			/* nonzero iff either qword pair differs */
+	jnz	10320000f		/* mismatch in bytes 0..15 of this chunk */
+
+	movq    16(%rdi),%r8
+	movq    24(%rdi),%r9
+	subq    16(%rsi),%r8
+	subq    24(%rsi),%r9
+	or      %r8,%r9
+	jnz     10320016f		/* mismatch in bytes 16..31 of this chunk */
+
+	leaq	32(%rdi),%rdi		/* advance both buffers past the chunk */
+	leaq	32(%rsi),%rsi
+	subq	$32,%rdx
+	cmpq	$32,%rdx
+	jae	103200b			/* at least 32 bytes left: next iteration */
+	cmpb	$0,%dl			/* fewer than 32 left, so %dl holds the whole count */
+	jne	10b			/* 1..31 bytes left: dispatch on the tail length */
+	POP_FRAME_POINTER
+	ret
+
+10320016:				/* step to the second 16-byte half of the chunk */
+	leaq	16(%rdi),%rdi
+	leaq	16(%rsi),%rsi
+10320000:
+/*
+ * A mismatch lies somewhere in a 16-byte range, but the code that
+ * computes the difference inspects at most 8 bytes. Pick the 8-byte
+ * half that contains the mismatch.
+ */
+	movq	(%rdi),%r8
+	movq	(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	1f			/* mismatch is in the first 8 bytes of the range */
+	leaq	8(%rdi),%rdi		/* otherwise it is in the second 8 bytes */
+	leaq	8(%rsi),%rsi
+	jmp	1f
+10163224:				/* point both buffers at their final 8 bytes */
+	leaq	-8(%rdi,%rdx),%rdi
+	leaq	-8(%rsi,%rdx),%rsi
+	jmp	1f
+10163216:				/* point both buffers 16 bytes before the end */
+	leaq	-16(%rdi,%rdx),%rdi
+	leaq	-16(%rsi,%rdx),%rsi
+	jmp	1f
+10163208:
+10081608:				/* first 8 bytes matched: skip them */
+	leaq	8(%rdi),%rdi
+	leaq	8(%rsi),%rsi
+	jmp	1f
+
+/*
+ * Mismatch found within 8 bytes of %rdi/%rsi; locate the differing byte.
+ */
+ALIGN_TEXT
 1:
-	testq	%rdx,%rdx
-	je	3f
-	xorl	%ecx,%ecx
-2:
-	movzbl	(%rdi,%rcx,1),%eax
-	movzbl	(%rsi,%rcx,1),%r8d
+	movzbl	(%rdi),%eax		/* zero-extended byte at offset 0 of each buffer */
+	movzbl	(%rsi),%r8d
 	cmpb	%r8b,%al
-	jne	4f
-	addq    $1,%rcx
-	cmpq    %rcx,%rdx
-	jz	3f
-	movzbl	(%rdi,%rcx,1),%eax
-	movzbl	(%rsi,%rcx,1),%r8d
+	jne	2f			/* first differing byte found */
+
+	movzbl	1(%rdi),%eax		/* offset 1 */
+	movzbl	1(%rsi),%r8d
 	cmpb	%r8b,%al
-	jne	4f
-	addq	$1,%rcx
-	cmpq	%rcx,%rdx
-	jz	3f
-	movzbl	(%rdi,%rcx,1),%eax
-	movzbl	(%rsi,%rcx,1),%r8d
+	jne	2f
+
+	movzbl	2(%rdi),%eax		/* offset 2 */
+	movzbl	2(%rsi),%r8d
 	cmpb	%r8b,%al
-	jne	4f
-	addq	$1,%rcx
-	cmpq	%rcx,%rdx
-	jz	3f
-	movzbl	(%rdi,%rcx,1),%eax
-	movzbl	(%rsi,%rcx,1),%r8d
+	jne	2f
+
+	movzbl	3(%rdi),%eax		/* offset 3 */
+	movzbl	3(%rsi),%r8d
 	cmpb	%r8b,%al
-	jne	4f
-	addq	$1,%rcx
-	cmpq	%rcx,%rdx
-	jne	2b
-3:
+	jne	2f
+
+	movzbl	4(%rdi),%eax		/* offset 4 */
+	movzbl	4(%rsi),%r8d
+	cmpb	%r8b,%al
+	jne	2f
+
+	movzbl	5(%rdi),%eax		/* offset 5 */
+	movzbl	5(%rsi),%r8d
+	cmpb	%r8b,%al
+	jne	2f
+
+	movzbl	6(%rdi),%eax		/* offset 6 */
+	movzbl	6(%rsi),%r8d
+	cmpb	%r8b,%al
+	jne	2f
+
+	movzbl	7(%rdi),%eax		/* offset 7 */
+	movzbl	7(%rsi),%r8d
+	cmpb	%r8b,%al
+	jne	2f
+
 	xorl	%eax,%eax
 	POP_FRAME_POINTER
 	ret
-4:
+2:					/* return byte at (%rdi) minus byte at (%rsi) */
 	subl	%r8d,%eax
 	POP_FRAME_POINTER
 	ret
-5:
-	cmpq	$32,%rdx
-	jae	7f
-6:
-	/*
-	 * 8 bytes
-	 */
-	movq    (%rdi),%r8
-	movq    (%rsi),%r9
-	cmpq    %r8,%r9
-	jne	1b
-	leaq	8(%rdi),%rdi
-	leaq	8(%rsi),%rsi
-	subq	$8,%rdx
-	cmpq	$8,%rdx
-	jae	6b
-	jl	1b
-	jmp	3b
-7:
-	/*
-	 * 32 bytes
-	 */
-	movq    (%rsi),%r8
-	movq    8(%rsi),%r9
-	subq    (%rdi),%r8
-	subq    8(%rdi),%r9
-	or	%r8,%r9
-	jnz	1b
-
-	movq    16(%rsi),%r8
-	movq    24(%rsi),%r9
-	subq    16(%rdi),%r8
-	subq    24(%rdi),%r9
-	or	%r8,%r9
-	jnz	1b
-
-	leaq    32(%rdi),%rdi
-	leaq    32(%rsi),%rsi
-	subq    $32,%rdx
-	cmpq    $32,%rdx
-	jae	7b
-	jnz	1b
-	jmp	3b
 END(memcmp)

 /*