mirror of
https://github.com/postgres/postgres.git
synced 2026-04-10 11:37:33 -04:00
Use AVX2 for calculating page checksums where available
We already rely on autovectorization for computing page checksums, but on x86 we can get a further several-fold performance increase by annotating pg_checksum_block() with a function target attribute for the AVX2 instruction set extension. Not only does that use 256-bit registers, it can also use vector multiplication rather than the vector shifts and adds used in SSE2. Similar to other hardware-specific paths, we set a function pointer on first use. We don't bother to avoid this on platforms without AVX2 since the overhead of indirect calls doesn't matter for multi-kilobyte inputs. However, we do arrange so that only core has the function pointer mechanism. External programs will continue to build a normal static function and don't need to be aware of this. This matters most when using io_uring since in that case the checksum computation is not done in parallel by IO workers. Co-authored-by: Matthew Sterrett <matthewsterrett2@gmail.com> Co-authored-by: Andrew Kim <andrew.kim@intel.com> Reviewed-by: Oleg Tselebrovskiy <o.tselebrovskiy@postgrespro.ru> Tested-by: Ants Aasma <ants.aasma@cybertec.at> Tested-by: Stepan Neretin <slpmcf@gmail.com> (earlier version) Discussion: https://postgr.es/m/CA+vA85_5GTu+HHniSbvvP+8k3=xZO=WE84NPwiKyxztqvpfZ3Q@mail.gmail.com Discussion: https://postgr.es/m/20250911054220.3784-1-root%40ip-172-31-36-228.ec2.internal
This commit is contained in:
parent
c06443063f
commit
5e13b0f240
10 changed files with 219 additions and 35 deletions
|
|
@ -687,6 +687,31 @@ fi
|
|||
undefine([Ac_cachevar])dnl
|
||||
])# PGAC_SSE42_CRC32_INTRINSICS
|
||||
|
||||
# PGAC_AVX2_SUPPORT
|
||||
# ---------------------------
|
||||
# Check if the compiler supports AVX2 as a target, i.e. whether it accepts
# __attribute__((target("avx2"))) on a function definition.  This is a
# compile-only test of a trivial program.
|
||||
#
|
||||
# If AVX2 target attribute is supported, sets pgac_avx2_support=yes;
# otherwise that variable is left untouched.  The yes/no outcome of the
# test is cached in pgac_cv_avx2_support.
|
||||
#
|
||||
# There is deliberately not a guard for __has_attribute here
|
||||
AC_DEFUN([PGAC_AVX2_SUPPORT],
|
||||
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx2_support])])dnl
|
||||
AC_CACHE_CHECK([for AVX2 target attribute support], [Ac_cachevar],
|
||||
[AC_COMPILE_IFELSE([AC_LANG_PROGRAM([
|
||||
__attribute__((target("avx2")))
|
||||
static int avx2_test(void)
|
||||
{
|
||||
return 0;
|
||||
}],
|
||||
[return avx2_test();])],
|
||||
[Ac_cachevar=yes],
|
||||
[Ac_cachevar=no])])
|
||||
if test x"$Ac_cachevar" = x"yes"; then
|
||||
pgac_avx2_support=yes
|
||||
|
||||
fi
|
||||
undefine([Ac_cachevar])dnl
|
||||
])# PGAC_AVX2_SUPPORT
|
||||
|
||||
# PGAC_AVX512_PCLMUL_INTRINSICS
|
||||
# ---------------------------
|
||||
# Check if the compiler supports AVX-512 carryless multiplication
|
||||
|
|
|
|||
44
configure
vendored
44
configure
vendored
|
|
@ -17820,6 +17820,50 @@ $as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h
|
|||
|
||||
fi
|
||||
|
||||
# Check for AVX2 target support
|
||||
#
|
||||
if test x"$host_cpu" = x"x86_64"; then
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 target attribute support" >&5
|
||||
$as_echo_n "checking for AVX2 target attribute support... " >&6; }
|
||||
if ${pgac_cv_avx2_support+:} false; then :
|
||||
$as_echo_n "(cached) " >&6
|
||||
else
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
|
||||
__attribute__((target("avx2")))
|
||||
static int avx2_test(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
int
|
||||
main ()
|
||||
{
|
||||
return avx2_test();
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_c_try_compile "$LINENO"; then :
|
||||
pgac_cv_avx2_support=yes
|
||||
else
|
||||
pgac_cv_avx2_support=no
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
|
||||
fi
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx2_support" >&5
|
||||
$as_echo "$pgac_cv_avx2_support" >&6; }
|
||||
if test x"$pgac_cv_avx2_support" = x"yes"; then
|
||||
pgac_avx2_support=yes
|
||||
fi
|
||||
|
||||
if test x"$pgac_avx2_support" = x"yes"; then
|
||||
|
||||
$as_echo "#define USE_AVX2_WITH_RUNTIME_CHECK 1" >>confdefs.h
|
||||
|
||||
fi
|
||||
fi
|
||||
|
||||
# Check for XSAVE intrinsics
|
||||
#
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv" >&5
|
||||
|
|
|
|||
|
|
@ -2135,6 +2135,15 @@ if test x"$pgac_cv__cpuidex" = x"yes"; then
|
|||
AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.])
|
||||
fi
|
||||
|
||||
# Check for AVX2 target support
|
||||
#
|
||||
if test x"$host_cpu" = x"x86_64"; then
|
||||
PGAC_AVX2_SUPPORT()
|
||||
if test x"$pgac_avx2_support" = x"yes"; then
|
||||
AC_DEFINE(USE_AVX2_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX2 instructions with a runtime check.])
|
||||
fi
|
||||
fi
|
||||
|
||||
# Check for XSAVE intrinsics
|
||||
#
|
||||
PGAC_XSAVE_INTRINSICS()
|
||||
|
|
|
|||
27
meson.build
27
meson.build
|
|
@ -2494,6 +2494,33 @@ int main(void)
|
|||
endif
|
||||
|
||||
|
||||
###############################################################
|
||||
# Check if the compiler supports AVX2 as a target
# (that is, whether a program using __attribute__((target("avx2")))
# on a function builds and links).  The check is only made when
# host_cpu is x86_64; on success USE_AVX2_WITH_RUNTIME_CHECK is set
# to 1 in cdata.
|
||||
# There is deliberately not a guard for __has_attribute here
|
||||
###############################################################
|
||||
|
||||
if host_cpu == 'x86_64'
|
||||
|
||||
prog = '''
|
||||
__attribute__((target("avx2")))
|
||||
static int avx2_test(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
return avx2_test();
|
||||
}
|
||||
'''
|
||||
|
||||
if cc.links(prog, name: 'AVX2 support', args: test_c_args)
|
||||
cdata.set('USE_AVX2_WITH_RUNTIME_CHECK', 1)
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
|
||||
###############################################################
|
||||
# Check for the availability of AVX-512 popcount intrinsics.
|
||||
###############################################################
|
||||
|
|
|
|||
|
|
@ -13,10 +13,52 @@
|
|||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "port/pg_cpu.h"
|
||||
#include "storage/checksum.h"
|
||||
/*
|
||||
* The actual code is in storage/checksum_impl.h. This is done so that
|
||||
* external programs can incorporate the checksum code by #include'ing
|
||||
* that file from the exported Postgres headers. (Compare our CRC code.)
|
||||
* that file from the exported Postgres headers. (Compare our legacy
|
||||
* CRC code in pg_crc.h.)
|
||||
* The PG_CHECKSUM_INTERNAL symbol allows core to use hardware-specific
|
||||
* coding without affecting external programs.
|
||||
*/
|
||||
#define PG_CHECKSUM_INTERNAL
|
||||
#include "storage/checksum_impl.h" /* IWYU pragma: keep */
|
||||
|
||||
|
||||
/*
* Generic block checksum implementation.
*
* The function body is the shared algorithm text from
* storage/checksum_block.inc.c, compiled here without any target attribute.
* pg_checksum_choose() installs this function as the default implementation.
*/
static uint32
|
||||
pg_checksum_block_fallback(const PGChecksummablePage *page)
|
||||
{
|
||||
#include "storage/checksum_block.inc.c"
|
||||
}
|
||||
|
||||
/*
|
||||
* AVX2-optimized block checksum algorithm.
*
* The body is the same shared text from storage/checksum_block.inc.c that
* pg_checksum_block_fallback() uses; the only difference is that this copy
* is compiled with pg_attribute_target("avx2").  It exists only when
* USE_AVX2_WITH_RUNTIME_CHECK is defined, and pg_checksum_choose() selects
* it only after x86_feature_available(PG_AVX2) reports support.
|
||||
*/
|
||||
#ifdef USE_AVX2_WITH_RUNTIME_CHECK
|
||||
pg_attribute_target("avx2")
|
||||
static uint32
|
||||
pg_checksum_block_avx2(const PGChecksummablePage *page)
|
||||
{
|
||||
#include "storage/checksum_block.inc.c"
|
||||
}
|
||||
#endif /* USE_AVX2_WITH_RUNTIME_CHECK */
|
||||
|
||||
/*
|
||||
* Choose the best available checksum implementation.
|
||||
*/
|
||||
static uint32
|
||||
pg_checksum_choose(const PGChecksummablePage *page)
|
||||
{
|
||||
pg_checksum_block = pg_checksum_block_fallback;
|
||||
|
||||
#ifdef USE_AVX2_WITH_RUNTIME_CHECK
|
||||
if (x86_feature_available(PG_AVX2))
|
||||
pg_checksum_block = pg_checksum_block_avx2;
|
||||
#endif
|
||||
|
||||
return pg_checksum_block(page);
|
||||
}
|
||||
|
||||
/*
* Function pointer through which the block checksum is computed.  It starts
* out pointing at pg_checksum_choose(), which overwrites it with the selected
* implementation during the first call.
*/
static uint32 (*pg_checksum_block) (const PGChecksummablePage *page) = pg_checksum_choose;
|
||||
|
|
|
|||
|
|
@ -674,6 +674,9 @@
|
|||
/* Define to 1 to build with assertion checks. (--enable-cassert) */
|
||||
#undef USE_ASSERT_CHECKING
|
||||
|
||||
/* Define to 1 to use AVX2 instructions with a runtime check. */
|
||||
#undef USE_AVX2_WITH_RUNTIME_CHECK
|
||||
|
||||
/* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */
|
||||
#undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
|
||||
|
||||
|
|
|
|||
|
|
@ -24,6 +24,9 @@ typedef enum X86FeatureId
|
|||
PG_SSE4_2,
|
||||
PG_POPCNT,
|
||||
|
||||
/* 256-bit YMM registers */
|
||||
PG_AVX2,
|
||||
|
||||
/* 512-bit ZMM registers */
|
||||
PG_AVX512_BW,
|
||||
PG_AVX512_VL,
|
||||
|
|
|
|||
42
src/include/storage/checksum_block.inc.c
Normal file
42
src/include/storage/checksum_block.inc.c
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* checksum_block.inc.c
|
||||
* Core algorithm for page checksums, semi-private to checksum_impl.h
|
||||
* and checksum.c.
|
||||
*
|
||||
* Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* src/include/storage/checksum_block.inc.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/*
* there is deliberately not an #ifndef CHECKSUM_BLOCK_INC_C here
*
* This file is a bare function body, meant to be #include'd between the
* braces of a function definition, possibly several times in one
* translation unit.  The including function must have a parameter named
* "page" of type const PGChecksummablePage *, and N_SUMS,
* checksumBaseOffsets and CHECKSUM_COMP must already be defined.  The
* fragment ends by returning the uint32 block checksum.
*/
|
||||
|
||||
/* sums[j] is the running partial checksum fed by element j of each row */
uint32 sums[N_SUMS];
|
||||
uint32 result = 0;
|
||||
uint32 i,
|
||||
j;
|
||||
|
||||
/* ensure that the size is compatible with the algorithm */
|
||||
Assert(sizeof(PGChecksummablePage) == BLCKSZ);
|
||||
|
||||
/* initialize partial checksums to their corresponding offsets */
|
||||
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
|
||||
|
||||
/*
* main checksum calculation: walk the page as rows of N_SUMS uint32 values
* and mix element j of every row into sums[j]
*/
|
||||
for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
|
||||
for (j = 0; j < N_SUMS; j++)
|
||||
CHECKSUM_COMP(sums[j], page->data[i][j]);
|
||||
|
||||
/* finally add in two rounds of zeroes for additional mixing */
|
||||
for (i = 0; i < 2; i++)
|
||||
for (j = 0; j < N_SUMS; j++)
|
||||
CHECKSUM_COMP(sums[j], 0);
|
||||
|
||||
/* xor fold partial checksums together */
|
||||
for (i = 0; i < N_SUMS; i++)
|
||||
result ^= sums[i];
|
||||
|
||||
/* the folded value is the checksum of the whole block */
return result;
|
||||
|
|
@ -72,12 +72,13 @@
|
|||
* random segments of page with 0x00, 0xFF and random data all show optimal
|
||||
* 2e-16 false positive rate within margin of error.
|
||||
*
|
||||
* Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer
|
||||
* multiplication instruction. As of 2013 the corresponding instruction is
|
||||
* available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32).
|
||||
* Vectorization requires a compiler to do the vectorization for us. For recent
|
||||
* GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough
|
||||
* to achieve vectorization.
|
||||
* Vectorization of the algorithm works best with a 32bit x 32bit -> 32bit
|
||||
* vector integer multiplication instruction. Examples include x86 AVX2
|
||||
* extensions (vpmulld) and ARM NEON (vmul.i32). Without that, vectorization
|
||||
* is still possible if the compiler can turn multiplication by FNV_PRIME
|
||||
* into a sequence of vectorized shifts and adds. For simplicity we rely
|
||||
* on the compiler to do the vectorization for us. For GCC and clang the
|
||||
* flags -funroll-loops -ftree-vectorize are enough to achieve vectorization.
|
||||
*
|
||||
* The optimal amount of parallelism to use depends on CPU specific instruction
|
||||
* latency, SIMD instruction width, throughput and the amount of registers
|
||||
|
|
@ -89,8 +90,9 @@
|
|||
*
|
||||
* The parallelism number 32 was chosen based on the fact that it is the
|
||||
* largest state that fits into architecturally visible x86 SSE registers while
|
||||
* leaving some free registers for intermediate values. For future processors
|
||||
* with 256bit vector registers this will leave some performance on the table.
|
||||
* leaving some free registers for intermediate values. For processors
|
||||
* with 256-bit vector registers this leaves some performance on the table.
|
||||
*
|
||||
* When vectorization is not available it might be beneficial to restructure
|
||||
* the computation to calculate a subset of the columns at a time and perform
|
||||
* multiple passes to avoid register spilling. This optimization opportunity
|
||||
|
|
@ -142,37 +144,20 @@ do { \
|
|||
* Block checksum algorithm. The page must be adequately aligned
|
||||
* (at least on 4-byte boundary).
|
||||
*/
|
||||
#ifdef PG_CHECKSUM_INTERNAL
|
||||
/* definitions in src/backend/storage/page/checksum.c */
|
||||
static uint32 (*pg_checksum_block) (const PGChecksummablePage *page);
|
||||
|
||||
#else
|
||||
/* static definition for external programs */
|
||||
static uint32
|
||||
pg_checksum_block(const PGChecksummablePage *page)
|
||||
{
|
||||
uint32 sums[N_SUMS];
|
||||
uint32 result = 0;
|
||||
uint32 i,
|
||||
j;
|
||||
|
||||
/* ensure that the size is compatible with the algorithm */
|
||||
Assert(sizeof(PGChecksummablePage) == BLCKSZ);
|
||||
|
||||
/* initialize partial checksums to their corresponding offsets */
|
||||
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
|
||||
|
||||
/* main checksum calculation */
|
||||
for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
|
||||
for (j = 0; j < N_SUMS; j++)
|
||||
CHECKSUM_COMP(sums[j], page->data[i][j]);
|
||||
|
||||
/* finally add in two rounds of zeroes for additional mixing */
|
||||
for (i = 0; i < 2; i++)
|
||||
for (j = 0; j < N_SUMS; j++)
|
||||
CHECKSUM_COMP(sums[j], 0);
|
||||
|
||||
/* xor fold partial checksums together */
|
||||
for (i = 0; i < N_SUMS; i++)
|
||||
result ^= sums[i];
|
||||
|
||||
return result;
|
||||
#include "storage/checksum_block.inc.c"
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Compute the checksum for a Postgres page.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -119,6 +119,10 @@ set_x86_features(void)
|
|||
xcr0_val = _xgetbv(0);
|
||||
#endif
|
||||
|
||||
/* Are YMM registers enabled? */
|
||||
if (mask_available(xcr0_val, XMM | YMM))
|
||||
X86Features[PG_AVX2] = reg[EBX] >> 5 & 1;
|
||||
|
||||
/* Are ZMM registers enabled? */
|
||||
if (mask_available(xcr0_val, XMM | YMM |
|
||||
OPMASK | ZMM0_15 | ZMM16_31))
|
||||
|
|
|
|||
Loading…
Reference in a new issue