Notable upstream pull request merges:
 #17058 -multiple Port AVX2 implementation of aes-gcm from BoringSSL
 #17602 077269bfe Fix Assert in dbuf_undirty, which triggers during
                  usage zap shrink
 #17613 d15143207 ZIL: Make allocations more flexible
 #17618 5061f959d Retire zfs_autoimport_disable kmod option
 #17166 d3c1d27af zdb: better handling for corrupt block pointers
 #17616 1ccae433e Allow vmem_alloc backed multilists
 #17619 e0e60d319 Better pack struct zio_prop
 #17620 152e34822 Silence zstd large allocation warning
 #17622 -multiple ZIL: restore some things lost in "ZIL-crash" review
 #17625 -multiple zvol: cleanup & fixup zvol destruction sequence and
                  locking
 #17631 885d929cf Fix missed assertion update in physical rewrite patch
 #17642 a9410ccbd Make zpool_find_config() report errors
 #17647 30a915efe zfs-send.8: mention combination of -c/-e flags and
                  zstd_compress feature
 #17649 2c877e845 FreeBSD: Set st_rdev to NODEV, not 0, when not a device
                  (already backported)

Obtained from:	OpenZFS
OpenZFS commit:	a9410ccbd9
This commit is contained in:
Martin Matuska 2025-08-20 10:57:58 +02:00
commit 53a2e2635a
55 changed files with 4527 additions and 784 deletions

View file

@ -214,7 +214,8 @@ ZFS_CFLAGS+= -I$S/contrib/openzfs/module/icp/include \
.if ${MACHINE_ARCH} == "amd64"
ZFS_CFLAGS+= -D__x86_64 -DHAVE_SSE2 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 \
-DHAVE_AVX -DHAVE_AVX2 -DHAVE_AVX512F -DHAVE_AVX512VL -DHAVE_AVX512BW
-DHAVE_AVX -DHAVE_AVX2 -DHAVE_AVX512F -DHAVE_AVX512VL -DHAVE_AVX512BW \
-DHAVE_VAES -DHAVE_VPCLMULQDQ
.endif
.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \

View file

@ -109,7 +109,7 @@ case "$OS" in
KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz"
;;
freebsd15-0c)
FreeBSD="15.0-CURRENT"
FreeBSD="15.0-PRERELEASE"
OSNAME="FreeBSD $FreeBSD"
OSv="freebsd14.0"
URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz"

View file

@ -5,12 +5,13 @@
#
# Usage:
#
# qemu-4-build-vm.sh OS [--enable-debug][--dkms][--poweroff]
# [--release][--repo][--tarball]
# qemu-4-build-vm.sh OS [--enable-debug][--dkms][--patch-level NUM]
# [--poweroff][--release][--repo][--tarball]
#
# OS: OS name like 'fedora41'
# --enable-debug: Build RPMs with '--enable-debug' (for testing)
# --dkms: Build DKMS RPMs as well
# --patch-level NUM: Use a custom patch level number for packages.
# --poweroff: Power-off the VM after building
# --release: Build zfs-release*.rpm as well
# --repo: After building everything, copy RPMs into /tmp/repo
@ -21,6 +22,7 @@
ENABLE_DEBUG=""
DKMS=""
PATCH_LEVEL=""
POWEROFF=""
RELEASE=""
REPO=""
@ -35,6 +37,11 @@ while [[ $# -gt 0 ]]; do
DKMS=1
shift
;;
--patch-level)
PATCH_LEVEL=$2
shift
shift
;;
--poweroff)
POWEROFF=1
shift
@ -215,6 +222,10 @@ function rpm_build_and_install() {
run ./autogen.sh
echo "##[endgroup]"
if [ -n "$PATCH_LEVEL" ] ; then
sed -i -E 's/(Release:\s+)1/\1'$PATCH_LEVEL'/g' META
fi
echo "##[group]Configure"
run ./configure --enable-debuginfo $extra
echo "##[endgroup]"
@ -328,7 +339,13 @@ fi
# almalinux9.5
# fedora42
source /etc/os-release
sudo hostname "$ID$VERSION_ID"
if which hostnamectl &> /dev/null ; then
# Fedora 42+ uses hostnamectl
sudo hostnamectl set-hostname "$ID$VERSION_ID"
sudo hostnamectl set-hostname --pretty "$ID$VERSION_ID"
else
sudo hostname "$ID$VERSION_ID"
fi
# save some sysinfo
uname -a > /var/tmp/uname.txt

View file

@ -32,6 +32,11 @@ on:
options:
- "Build RPMs"
- "Test repo"
patch_level:
type: string
required: false
default: ""
description: "(optional) patch level number"
repo_url:
type: string
required: false
@ -78,7 +83,13 @@ jobs:
mkdir -p /tmp/repo
ssh zfs@vm0 '$HOME/zfs/.github/workflows/scripts/qemu-test-repo-vm.sh' ${{ github.event.inputs.repo_url }}
else
.github/workflows/scripts/qemu-4-build.sh --repo --release --dkms --tarball ${{ matrix.os }}
EXTRA=""
if [ -n "${{ github.event.inputs.patch_level }}" ] ; then
EXTRA="--patch-level ${{ github.event.inputs.patch_level }}"
fi
.github/workflows/scripts/qemu-4-build.sh $EXTRA \
--repo --release --dkms --tarball ${{ matrix.os }}
fi
- name: Prepare artifacts

View file

@ -23,6 +23,7 @@
# These maps are making names consistent where they have varied but the email
# address has never changed. In most cases, the full name is in the
# Signed-off-by of a commit with a matching author.
Achill Gilgenast <achill@achill.org>
Ahelenia Ziemiańska <nabijaczleweli@gmail.com>
Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
Alex John <alex@stty.io>
@ -37,6 +38,7 @@ Crag Wang <crag0715@gmail.com>
Damian Szuberski <szuberskidamian@gmail.com>
Daniel Kolesa <daniel@octaforge.org>
Debabrata Banerjee <dbavatar@gmail.com>
Diwakar Kristappagari <diwakar-k@hpe.com>
Finix Yan <yanchongwen@hotmail.com>
Gaurav Kumar <gauravk.18@gmail.com>
Gionatan Danti <g.danti@assyoma.it>
@ -145,6 +147,7 @@ Gaurav Kumar <gauravk.18@gmail.com> <gaurkuma@users.noreply.github.com>
George Gaydarov <git@gg7.io> <gg7@users.noreply.github.com>
Georgy Yakovlev <gyakovlev@gentoo.org> <168902+gyakovlev@users.noreply.github.com>
Gerardwx <gerardw@alum.mit.edu> <Gerardwx@users.noreply.github.com>
Germano Massullo <germano.massullo@gmail.com> <Germano0@users.noreply.github.com>
Gian-Carlo DeFazio <defazio1@llnl.gov> <defaziogiancarlo@users.noreply.github.com>
Giuseppe Di Natale <dinatale2@llnl.gov> <dinatale2@users.noreply.github.com>
Hajo Möller <dasjoe@gmail.com> <dasjoe@users.noreply.github.com>
@ -164,6 +167,7 @@ John Ramsden <johnramsden@riseup.net> <johnramsden@users.noreply.github.com>
Jonathon Fernyhough <jonathon@m2x.dev> <559369+jonathonf@users.noreply.github.com>
Jose Luis Duran <jlduran@gmail.com> <jlduran@users.noreply.github.com>
Justin Hibbits <chmeeedalf@gmail.com> <chmeeedalf@users.noreply.github.com>
Kaitlin Hoang <kthoang@amazon.com> <khoang98@users.noreply.github.com>
Kevin Greene <kevin.greene@delphix.com> <104801862+kxgreene@users.noreply.github.com>
Kevin Jin <lostking2008@hotmail.com> <33590050+jxdking@users.noreply.github.com>
Kevin P. Fleming <kevin@km6g.us> <kpfleming@users.noreply.github.com>

View file

@ -10,6 +10,7 @@ PAST MAINTAINERS:
CONTRIBUTORS:
Aaron Fineman <abyxcos@gmail.com>
Achill Gilgenast <achill@achill.org>
Adam D. Moss <c@yotes.com>
Adam Leventhal <ahl@delphix.com>
Adam Stevko <adam.stevko@gmail.com>
@ -59,6 +60,7 @@ CONTRIBUTORS:
Andreas Buschmann <andreas.buschmann@tech.net.de>
Andreas Dilger <adilger@intel.com>
Andreas Vögele <andreas@andreasvoegele.com>
Andres <a-d-j-i@users.noreply.github.com>
Andrew Barnes <barnes333@gmail.com>
Andrew Hamilton <ahamilto@tjhsst.edu>
Andrew Innes <andrew.c12@gmail.com>
@ -72,6 +74,7 @@ CONTRIBUTORS:
Andrey Prokopenko <job@terem.fr>
Andrey Vesnovaty <andrey.vesnovaty@gmail.com>
Andriy Gapon <avg@freebsd.org>
Andriy Tkachuk <andriy.tkachuk@seagate.com>
Andy Bakun <github@thwartedefforts.org>
Andy Fiddaman <omnios@citrus-it.co.uk>
Aniruddha Shankar <k@191a.net>
@ -120,6 +123,7 @@ CONTRIBUTORS:
Caleb James DeLisle <calebdelisle@lavabit.com>
Cameron Harr <harr1@llnl.gov>
Cao Xuewen <cao.xuewen@zte.com.cn>
Carl George <carlwgeorge@gmail.com>
Carlo Landmeter <clandmeter@gmail.com>
Carlos Alberto Lopez Perez <clopez@igalia.com>
Cedric Maunoury <cedric.maunoury@gmail.com>
@ -200,6 +204,7 @@ CONTRIBUTORS:
Dimitri John Ledkov <xnox@ubuntu.com>
Dimitry Andric <dimitry@andric.com>
Dirkjan Bussink <d.bussink@gmail.com>
Diwakar Kristappagari <diwakar-k@hpe.com>
Dmitry Khasanov <pik4ez@gmail.com>
Dominic Pearson <dsp@technoanimal.net>
Dominik Hassler <hadfl@omniosce.org>
@ -250,6 +255,7 @@ CONTRIBUTORS:
George Wilson <gwilson@delphix.com>
Georgy Yakovlev <ya@sysdump.net>
Gerardwx <gerardw@alum.mit.edu>
Germano Massullo <germano.massullo@gmail.com>
Gian-Carlo DeFazio <defazio1@llnl.gov>
Gionatan Danti <g.danti@assyoma.it>
Giuseppe Di Natale <guss80@gmail.com>
@ -287,6 +293,7 @@ CONTRIBUTORS:
Igor K <igor@dilos.org>
Igor Kozhukhov <ikozhukhov@gmail.com>
Igor Lvovsky <ilvovsky@gmail.com>
Igor Ostapenko <pm@igoro.pro>
ilbsmart <wgqimut@gmail.com>
Ilkka Sovanto <github@ilkka.kapsi.fi>
illiliti <illiliti@protonmail.com>
@ -326,6 +333,7 @@ CONTRIBUTORS:
Jinshan Xiong <jinshan.xiong@intel.com>
Jitendra Patidar <jitendra.patidar@nutanix.com>
JK Dingwall <james@dingwall.me.uk>
Joel Low <joel@joelsplace.sg>
Joe Stein <joe.stein@delphix.com>
John-Mark Gurney <jmg@funkthat.com>
John Albietz <inthecloud247@gmail.com>
@ -374,6 +382,7 @@ CONTRIBUTORS:
Kevin Jin <lostking2008@hotmail.com>
Kevin P. Fleming <kevin@km6g.us>
Kevin Tanguy <kevin.tanguy@ovh.net>
khoang98 <khoang98@users.noreply.github.com>
KireinaHoro <i@jsteward.moe>
Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Kleber Tarcísio <klebertarcisio@yahoo.com.br>
@ -447,6 +456,7 @@ CONTRIBUTORS:
Max Zettlmeißl <max@zettlmeissl.de>
Md Islam <mdnahian@outlook.com>
megari <megari@iki.fi>
Meriel Luna Mittelbach <lunarlambda@gmail.com>
Michael D Labriola <michael.d.labriola@gmail.com>
Michael Franzl <michael@franzl.name>
Michael Gebetsroither <michael@mgeb.org>
@ -494,6 +504,7 @@ CONTRIBUTORS:
Orivej Desh <orivej@gmx.fr>
Pablo Correa Gómez <ablocorrea@hotmail.com>
Palash Gandhi <pbg4930@rit.edu>
Patrick Fasano <patrick@patrickfasano.com>
Patrick Mooney <pmooney@pfmooney.com>
Patrik Greco <sikevux@sikevux.se>
Paul B. Henson <henson@acm.org>
@ -535,6 +546,7 @@ CONTRIBUTORS:
Remy Blank <remy.blank@pobox.com>
renelson <bnelson@nelsonbe.com>
Reno Reckling <e-github@wthack.de>
René Wirnata <rene.wirnata@pandascience.net>
Ricardo M. Correia <ricardo.correia@oracle.com>
Riccardo Schirone <rschirone91@gmail.com>
Richard Allen <belperite@gmail.com>
@ -640,6 +652,7 @@ CONTRIBUTORS:
tleydxdy <shironeko.github@tesaguri.club>
Tobin Harding <me@tobin.cc>
Todd Seidelmann <seidelma@users.noreply.github.com>
Todd Zullinger <tmz@pobox.com>
Tom Caputi <tcaputi@datto.com>
Tom Matthews <tom@axiom-partners.com>
Tomohiro Kusumi <kusumi.tomohiro@gmail.com>

View file

@ -6,5 +6,5 @@ Release: 1
Release-Tags: relext
License: CDDL
Author: OpenZFS
Linux-Maximum: 6.15
Linux-Maximum: 6.16
Linux-Minimum: 4.18

View file

@ -127,6 +127,7 @@ static zfs_range_tree_t *mos_refd_objs;
static spa_t *spa;
static objset_t *os;
static boolean_t kernel_init_done;
static boolean_t corruption_found = B_FALSE;
static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
boolean_t);
@ -250,6 +251,7 @@ sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
&e->svbr_blk, B_TRUE);
(void) printf("\tERROR: %d unmatched FREE(s): %s\n",
e->svbr_refcnt, blkbuf);
corruption_found = B_TRUE;
}
zfs_btree_destroy(&sv->sv_pair);
@ -405,6 +407,7 @@ verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,
(u_longlong_t)DVA_GET_ASIZE(&found->svb_dva),
(u_longlong_t)found->svb_allocated_txg,
(u_longlong_t)txg);
corruption_found = B_TRUE;
}
}
}
@ -426,6 +429,7 @@ metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)
(u_longlong_t)txg, (u_longlong_t)offset,
(u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
(u_longlong_t)mv->mv_msid);
corruption_found = B_TRUE;
} else {
zfs_range_tree_add(mv->mv_allocated,
offset, size);
@ -439,6 +443,7 @@ metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)
(u_longlong_t)txg, (u_longlong_t)offset,
(u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
(u_longlong_t)mv->mv_msid);
corruption_found = B_TRUE;
} else {
zfs_range_tree_remove(mv->mv_allocated,
offset, size);
@ -526,6 +531,7 @@ mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
(u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
(u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
(u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
corruption_found = B_TRUE;
continue;
}
@ -542,6 +548,7 @@ mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
(u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
(u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
(u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
corruption_found = B_TRUE;
continue;
}
@ -655,6 +662,7 @@ livelist_metaslab_validate(spa_t *spa)
}
(void) printf("ERROR: Found livelist blocks marked as allocated "
"for indirect vdevs:\n");
corruption_found = B_TRUE;
zfs_btree_index_t *where = NULL;
sublivelist_verify_block_t *svb;
@ -827,7 +835,7 @@ usage(void)
(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
"to make only that option verbose\n");
(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
zdb_exit(1);
zdb_exit(2);
}
static void
@ -2583,19 +2591,17 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
}
}
static void
static u_longlong_t
print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb,
const dnode_phys_t *dnp)
{
char blkbuf[BP_SPRINTF_LEN];
u_longlong_t offset;
int l;
if (!BP_IS_EMBEDDED(bp)) {
ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
}
offset = (u_longlong_t)blkid2offset(dnp, bp, zb);
(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
(void) printf("%16llx ", offset);
ASSERT(zb->zb_level >= 0);
@ -2610,19 +2616,38 @@ print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb,
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE);
if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD)
snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp);
(void) printf("%s\n", blkbuf);
(void) printf("%s", blkbuf);
if (!BP_IS_EMBEDDED(bp)) {
if (BP_GET_TYPE(bp) != dnp->dn_type) {
(void) printf(" (ERROR: Block pointer type "
"(%llu) does not match dnode type (%hhu))",
BP_GET_TYPE(bp), dnp->dn_type);
corruption_found = B_TRUE;
}
if (BP_GET_LEVEL(bp) != zb->zb_level) {
(void) printf(" (ERROR: Block pointer level "
"(%llu) does not match bookmark level (%ld))",
BP_GET_LEVEL(bp), zb->zb_level);
corruption_found = B_TRUE;
}
}
(void) printf("\n");
return (offset);
}
static int
visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
blkptr_t *bp, const zbookmark_phys_t *zb)
{
u_longlong_t offset;
int err = 0;
if (BP_GET_BIRTH(bp) == 0)
return (0);
print_indirect(spa, bp, zb, dnp);
offset = print_indirect(spa, bp, zb, dnp);
if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
arc_flags_t flags = ARC_FLAG_WAIT;
@ -2652,8 +2677,15 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
break;
fill += BP_GET_FILL(cbp);
}
if (!err)
ASSERT3U(fill, ==, BP_GET_FILL(bp));
if (!err) {
if (fill != BP_GET_FILL(bp)) {
(void) printf("%16llx: Block pointer "
"fill (%llu) does not match calculated "
"value (%lu)\n", offset, BP_GET_FILL(bp),
fill);
corruption_found = B_TRUE;
}
}
arc_buf_destroy(buf, &buf);
}
@ -2909,6 +2941,7 @@ dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
(void) printf("ERROR %u while trying to open "
"subobj id %llu\n",
error, (u_longlong_t)subobj);
corruption_found = B_TRUE;
continue;
}
dump_full_bpobj(&subbpo, "subobj", indent + 1);
@ -3088,6 +3121,7 @@ bpobj_count_refd(bpobj_t *bpo)
(void) printf("ERROR %u while trying to open "
"subobj id %llu\n",
error, (u_longlong_t)subobj);
corruption_found = B_TRUE;
continue;
}
bpobj_count_refd(&subbpo);
@ -9634,7 +9668,7 @@ main(int argc, char **argv)
} else if (objset_str && !zdb_numeric(objset_str + 1) &&
dump_opt['N']) {
printf("Supply a numeric objset ID with -N\n");
error = 1;
error = 2;
goto fini;
}
} else {
@ -9936,5 +9970,8 @@ fini:
if (kernel_init_done)
kernel_fini();
if (corruption_found && error == 0)
error = 3;
return (error);
}

View file

@ -84,6 +84,8 @@ AC_DEFUN([ZFS_AC_KERNEL_MKDIR], [
AC_DEFINE(HAVE_IOPS_MKDIR_DENTRY, 1,
[iops->mkdir() returns struct dentry*])
],[
AC_MSG_RESULT(no)
dnl #
dnl # 6.3 API change
dnl # mkdir() takes struct mnt_idmap * as the first arg

View file

@ -24,6 +24,8 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD], [
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AES
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_VAES
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_VPCLMULQDQ
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVE
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVEOPT
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVES
@ -446,6 +448,48 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE], [
])
])
dnl #
dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_VAES
dnl #
dnl # Build-time probe: try to compile and link a minimal program whose
dnl # main function contains a single inline-asm "vaesenc" instruction
dnl # operating on ymm registers. On success the result is reported as
dnl # "yes" and HAVE_VAES is defined to 1; otherwise "no" is reported and
dnl # nothing is defined. The test program is only linked, never executed.
dnl #
AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_VAES], [
AC_MSG_CHECKING([whether host toolchain supports VAES])
AC_LINK_IFELSE([AC_LANG_SOURCE([
[
int main()
{
__asm__ __volatile__("vaesenc %ymm0, %ymm1, %ymm0");
return (0);
}
]])], [
AC_MSG_RESULT([yes])
AC_DEFINE([HAVE_VAES], 1, [Define if host toolchain supports VAES])
], [
AC_MSG_RESULT([no])
])
])
dnl #
dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_VPCLMULQDQ
dnl #
dnl # Build-time probe: try to compile and link a minimal program whose
dnl # main function contains a single inline-asm "vpclmulqdq" instruction
dnl # on ymm registers with an immediate operand of 0. On success the
dnl # result is reported as "yes" and HAVE_VPCLMULQDQ is defined to 1;
dnl # otherwise "no" is reported and nothing is defined. The test program
dnl # is only linked, never executed.
dnl #
AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_VPCLMULQDQ], [
AC_MSG_CHECKING([whether host toolchain supports VPCLMULQDQ])
AC_LINK_IFELSE([AC_LANG_SOURCE([
[
int main()
{
__asm__ __volatile__("vpclmulqdq %0, %%ymm4, %%ymm3, %%ymm5" :: "i"(0));
return (0);
}
]])], [
AC_MSG_RESULT([yes])
AC_DEFINE([HAVE_VPCLMULQDQ], 1, [Define if host toolchain supports VPCLMULQDQ])
], [
AC_MSG_RESULT([no])
])
])
dnl #
dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVE
dnl #

View file

@ -100,8 +100,8 @@ Depends: ${misc:Depends}, ${shlibs:Depends}
# The libcurl4 is loaded through dlopen("libcurl.so.4").
# https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=988521
Recommends: libcurl4
Breaks: libzfs2, libzfs4, libzfs4linux, libzfs6linux
Replaces: libzfs2, libzfs4, libzfs4linux, libzfs6linux
Breaks: libzfs2, libzfs4, libzfs4linux, libzfs6linux, openzfs-libzfs4
Replaces: libzfs2, libzfs4, libzfs4linux, libzfs6linux, openzfs-libzfs4
Conflicts: libzfs6linux
Description: OpenZFS filesystem library for Linux - general support
OpenZFS is a storage platform that encompasses the functionality of
@ -128,8 +128,8 @@ Package: openzfs-libzpool6
Section: contrib/libs
Architecture: linux-any
Depends: ${misc:Depends}, ${shlibs:Depends}
Breaks: libzpool2, libzpool5, libzpool5linux, libzpool6linux
Replaces: libzpool2, libzpool5, libzpool5linux, libzpool6linux
Breaks: libzpool2, libzpool5, libzpool6linux
Replaces: libzpool2, libzpool5, libzpool6linux
Conflicts: libzpool6linux
Description: OpenZFS pool library for Linux
OpenZFS is a storage platform that encompasses the functionality of

View file

@ -0,0 +1,253 @@
BoringSSL is a fork of OpenSSL. As such, large parts of it fall under OpenSSL
licensing. Files that are completely new have a Google copyright and an ISC
license. This license is reproduced at the bottom of this file.
Contributors to BoringSSL are required to follow the CLA rules for Chromium:
https://cla.developers.google.com/clas
Files in third_party/ have their own licenses, as described therein. The MIT
license, for third_party/fiat, which, unlike other third_party directories, is
compiled into non-test libraries, is included below.
The OpenSSL toolkit stays under a dual license, i.e. both the conditions of the
OpenSSL License and the original SSLeay license apply to the toolkit. See below
for the actual license texts. Actually both licenses are BSD-style Open Source
licenses. In case of any license issues related to OpenSSL please contact
openssl-core@openssl.org.
The following are Google-internal bug numbers where explicit permission from
some authors is recorded for use of their work. (This is purely for our own
record keeping.)
27287199
27287880
27287883
263291445
OpenSSL License
---------------
/* ====================================================================
* Copyright (c) 1998-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
Original SSLeay License
-----------------------
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
ISC license used for completely new code in BoringSSL:
/* Copyright 2015 The BoringSSL Authors
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
The code in third_party/fiat carries the MIT license:
Copyright (c) 2015-2016 the fiat-crypto authors (see
https://github.com/mit-plv/fiat-crypto/blob/master/AUTHORS).
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Licenses for support code
-------------------------
Parts of the TLS test suite are under the Go license. This code is not included
in BoringSSL (i.e. libcrypto and libssl) when compiled, however, so
distributing code linked against BoringSSL does not trigger this license:
Copyright (c) 2009 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
BoringSSL uses the Chromium test infrastructure to run a continuous build,
trybots etc. The scripts which manage this, and the script for generating build
metadata, are under the Chromium license. Distributing code linked against
BoringSSL does not trigger this license.
Copyright 2015 The Chromium Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View file

@ -0,0 +1,11 @@
This directory contains the original BoringSSL [1] GCM x86-64 assembly
files [2].
The assembler files were then further modified to fit the ICP conventions.
The main purpose of including these files (and the original ones) here is to
serve as a reference if upstream changes need to be applied to the files
included and modified in the ICP.
[1] https://github.com/google/boringssl
[2] https://github.com/google/boringssl/blob/d5440dd2c2c500ac2d3bba4afec47a054b4d99ae/gen/bcm/aes-gcm-avx2-x86_64-linux.S

File diff suppressed because it is too large Load diff

View file

@ -979,7 +979,8 @@ mountroot()
touch /run/zfs_unlock_complete
if [ -e /run/zfs_unlock_complete_notify ]; then
read -r < /run/zfs_unlock_complete_notify
# shellcheck disable=SC2034
read -r zfs_unlock_complete_notify < /run/zfs_unlock_complete_notify
fi
# ------------

View file

@ -597,6 +597,32 @@ zfs_movbe_available(void)
#endif
}
/*
 * Report whether the boot CPU advertises the VAES feature.
 *
 * When the X86_FEATURE_VAES feature bit is not defined at compile time
 * the answer is always B_FALSE; otherwise the result of boot_cpu_has()
 * is normalized to 0/1 before being returned.
 */
static inline boolean_t
zfs_vaes_available(void)
{
	boolean_t have_vaes = B_FALSE;

#ifdef X86_FEATURE_VAES
	have_vaes = !!boot_cpu_has(X86_FEATURE_VAES);
#endif
	return (have_vaes);
}
/*
 * Report whether the boot CPU advertises the VPCLMULQDQ feature.
 *
 * When the X86_FEATURE_VPCLMULQDQ feature bit is not defined at compile
 * time the answer is always B_FALSE; otherwise the result of
 * boot_cpu_has() is normalized to 0/1 before being returned.
 */
static inline boolean_t
zfs_vpclmulqdq_available(void)
{
	boolean_t have_vpclmulqdq = B_FALSE;

#ifdef X86_FEATURE_VPCLMULQDQ
	have_vpclmulqdq = !!boot_cpu_has(X86_FEATURE_VPCLMULQDQ);
#endif
	return (have_vpclmulqdq);
}
/*
* Check if SHA_NI instruction set is available
*/

View file

@ -139,18 +139,18 @@
#define ZCW_TP_STRUCT_ENTRY \
__field(lwb_t *, zcw_lwb) \
__field(boolean_t, zcw_done) \
__field(int, zcw_zio_error) \
__field(int, zcw_error) \
#define ZCW_TP_FAST_ASSIGN \
__entry->zcw_lwb = zcw->zcw_lwb; \
__entry->zcw_done = zcw->zcw_done; \
__entry->zcw_zio_error = zcw->zcw_zio_error;
__entry->zcw_error = zcw->zcw_error;
#define ZCW_TP_PRINTK_FMT \
"zcw { lwb %p done %u error %u }"
#define ZCW_TP_PRINTK_ARGS \
__entry->zcw_lwb, __entry->zcw_done, __entry->zcw_zio_error
__entry->zcw_lwb, __entry->zcw_done, __entry->zcw_error
/*
* Generic support for two argument tracepoints of the form:

View file

@ -880,7 +880,6 @@ extern kcondvar_t spa_namespace_cv;
#define SPA_CONFIG_UPDATE_VDEVS 1
extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t);
extern void spa_config_load(void);
extern int spa_all_configs(uint64_t *generation, nvlist_t **pools);
extern void spa_config_set(spa_t *spa, nvlist_t *config);
extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
@ -1244,7 +1243,6 @@ extern void vdev_mirror_stat_fini(void);
/* Initialization and termination */
extern void spa_init(spa_mode_t mode);
extern void spa_fini(void);
extern void spa_boot_init(void *);
/* properties */
extern int spa_prop_set(spa_t *spa, nvlist_t *nvp);

View file

@ -41,8 +41,8 @@ extern "C" {
*
* An lwb will start out in the "new" state, and transition to the "opened"
* state via a call to zil_lwb_write_open() on first itx assignment. When
* transitioning from "new" to "opened" the zilog's "zl_issuer_lock" must be
* held.
* transitioning from "new" to "opened" the zilog's "zl_issuer_lock" and
* LWB's "lwb_lock" must be held.
*
* After the lwb is "opened", it can be assigned number of itxs and transition
* into the "closed" state via zil_lwb_write_close() when full or on timeout.
@ -100,16 +100,22 @@ typedef enum {
* holding the "zl_issuer_lock". After the lwb is issued, the zilog's
* "zl_lock" is used to protect the lwb against concurrent access.
*/
/* Bit flags describing an lwb; each value occupies a distinct bit. */
typedef enum {
	LWB_FLAG_SLIM = 0x01,		/* log block has slim format */
	LWB_FLAG_SLOG = 0x02,		/* lwb_blk is on SLOG device */
	LWB_FLAG_CRASHED = 0x04,	/* lwb is on the crash list */
} lwb_flag_t;
typedef struct lwb {
zilog_t *lwb_zilog; /* back pointer to log struct */
blkptr_t lwb_blk; /* on disk address of this log blk */
boolean_t lwb_slim; /* log block has slim format */
boolean_t lwb_slog; /* lwb_blk is on SLOG device */
lwb_flag_t lwb_flags; /* extra info about this lwb */
int lwb_error; /* log block allocation error */
int lwb_nmax; /* max bytes in the buffer */
int lwb_nused; /* # used bytes in buffer */
int lwb_nfilled; /* # filled bytes in buffer */
int lwb_sz; /* size of block and buffer */
int lwb_min_sz; /* min size for range allocation */
lwb_state_t lwb_state; /* the state of this lwb */
char *lwb_buf; /* log write buffer */
zio_t *lwb_child_zio; /* parent zio for children */
@ -124,7 +130,7 @@ typedef struct lwb {
list_t lwb_itxs; /* list of itx's */
list_t lwb_waiters; /* list of zil_commit_waiter's */
avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */
kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */
kmutex_t lwb_lock; /* protects lwb_vdev_tree and size */
} lwb_t;
/*
@ -149,7 +155,7 @@ typedef struct zil_commit_waiter {
list_node_t zcw_node; /* linkage in lwb_t:lwb_waiter list */
lwb_t *zcw_lwb; /* back pointer to lwb when linked */
boolean_t zcw_done; /* B_TRUE when "done", else B_FALSE */
int zcw_zio_error; /* contains the zio io_error value */
int zcw_error; /* result to return from zil_commit() */
} zil_commit_waiter_t;
/*

View file

@ -360,26 +360,26 @@ struct zbookmark_err_phys {
(zb)->zb_blkid == ZB_ROOT_BLKID)
typedef struct zio_prop {
enum zio_checksum zp_checksum;
enum zio_compress zp_compress;
enum zio_checksum zp_checksum:8;
enum zio_compress zp_compress:8;
uint8_t zp_complevel;
uint8_t zp_level;
uint8_t zp_copies;
uint8_t zp_gang_copies;
dmu_object_type_t zp_type;
boolean_t zp_dedup;
boolean_t zp_dedup_verify;
boolean_t zp_nopwrite;
boolean_t zp_brtwrite;
boolean_t zp_encrypt;
boolean_t zp_byteorder;
boolean_t zp_direct_write;
boolean_t zp_rewrite;
dmu_object_type_t zp_type:8;
dmu_object_type_t zp_storage_type:8;
boolean_t zp_dedup:1;
boolean_t zp_dedup_verify:1;
boolean_t zp_nopwrite:1;
boolean_t zp_brtwrite:1;
boolean_t zp_encrypt:1;
boolean_t zp_byteorder:1;
boolean_t zp_direct_write:1;
boolean_t zp_rewrite:1;
uint32_t zp_zpl_smallblk;
uint8_t zp_salt[ZIO_DATA_SALT_LEN];
uint8_t zp_iv[ZIO_DATA_IV_LEN];
uint8_t zp_mac[ZIO_DATA_MAC_LEN];
uint32_t zp_zpl_smallblk;
dmu_object_type_t zp_storage_type;
} zio_prop_t;
typedef struct zio_cksum_report zio_cksum_report_t;
@ -622,7 +622,8 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
const blkptr_t *bp, zio_flag_t flags);
extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg,
blkptr_t *new_bp, uint64_t size, boolean_t *slog);
blkptr_t *new_bp, uint64_t min_size, uint64_t max_size, boolean_t *slog,
boolean_t allow_larger);
extern void zio_flush(zio_t *zio, vdev_t *vd);
extern void zio_shrink(zio_t *zio, uint64_t size);

View file

@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
* Copyright (c) 2024, Klara, Inc.
* Copyright (c) 2024, 2025, Klara, Inc.
*/
#ifndef _SYS_ZVOL_IMPL_H
@ -56,6 +56,7 @@ typedef struct zvol_state {
atomic_t zv_suspend_ref; /* refcount for suspend */
krwlock_t zv_suspend_lock; /* suspend lock */
kcondvar_t zv_removing_cv; /* ready to remove minor */
list_node_t zv_remove_node; /* node on removal list */
struct zvol_state_os *zv_zso; /* private platform state */
boolean_t zv_threading; /* volthreading property */
} zvol_state_t;
@ -135,7 +136,7 @@ int zvol_os_rename_minor(zvol_state_t *zv, const char *newname);
int zvol_os_create_minor(const char *name);
int zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize);
boolean_t zvol_os_is_zvol(const char *path);
void zvol_os_clear_private(zvol_state_t *zv);
void zvol_os_remove_minor(zvol_state_t *zv);
void zvol_os_set_disk_ro(zvol_state_t *zv, int flags);
void zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity);

View file

@ -69,6 +69,7 @@ nodist_libicp_la_SOURCES += \
module/icp/asm-x86_64/aes/aes_aesni.S \
module/icp/asm-x86_64/modes/gcm_pclmulqdq.S \
module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S \
module/icp/asm-x86_64/modes/aesni-gcm-avx2-vaes.S \
module/icp/asm-x86_64/modes/ghash-x86_64.S \
module/icp/asm-x86_64/sha2/sha256-x86_64.S \
module/icp/asm-x86_64/sha2/sha512-x86_64.S \

View file

@ -102,7 +102,9 @@ typedef enum cpuid_inst_sets {
AES,
PCLMULQDQ,
MOVBE,
SHA_NI
SHA_NI,
VAES,
VPCLMULQDQ
} cpuid_inst_sets_t;
/*
@ -127,6 +129,8 @@ typedef struct cpuid_feature_desc {
#define _AES_BIT (1U << 25)
#define _PCLMULQDQ_BIT (1U << 1)
#define _MOVBE_BIT (1U << 22)
#define _VAES_BIT (1U << 9)
#define _VPCLMULQDQ_BIT (1U << 10)
#define _SHA_NI_BIT (1U << 29)
/*
@ -157,6 +161,8 @@ static const cpuid_feature_desc_t cpuid_features[] = {
[PCLMULQDQ] = {1U, 0U, _PCLMULQDQ_BIT, ECX },
[MOVBE] = {1U, 0U, _MOVBE_BIT, ECX },
[SHA_NI] = {7U, 0U, _SHA_NI_BIT, EBX },
[VAES] = {7U, 0U, _VAES_BIT, ECX },
[VPCLMULQDQ] = {7U, 0U, _VPCLMULQDQ_BIT, ECX },
};
/*
@ -231,6 +237,8 @@ CPUID_FEATURE_CHECK(aes, AES);
CPUID_FEATURE_CHECK(pclmulqdq, PCLMULQDQ);
CPUID_FEATURE_CHECK(movbe, MOVBE);
CPUID_FEATURE_CHECK(shani, SHA_NI);
CPUID_FEATURE_CHECK(vaes, VAES);
CPUID_FEATURE_CHECK(vpclmulqdq, VPCLMULQDQ);
/*
* Detect register set support
@ -381,6 +389,24 @@ zfs_shani_available(void)
return (__cpuid_has_shani());
}
/*
 * Report whether the VAES instruction set is available, using the
 * CPUID-based __cpuid_has_vaes() check.
 */
static inline boolean_t
zfs_vaes_available(void)
{
	const boolean_t has_vaes = __cpuid_has_vaes();

	return (has_vaes);
}
/*
 * Report whether the VPCLMULQDQ instruction set is available, using the
 * CPUID-based __cpuid_has_vpclmulqdq() check.
 */
static inline boolean_t
zfs_vpclmulqdq_available(void)
{
	const boolean_t has_vpclmulqdq = __cpuid_has_vpclmulqdq();

	return (has_vpclmulqdq);
}
/*
* AVX-512 family of instruction sets:
*

View file

@ -38,6 +38,7 @@
#include <sys/processor.h>
#include <sys/rrwlock.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/stat.h>
#include <sys/systeminfo.h>
#include <sys/time.h>
@ -811,6 +812,79 @@ umem_out_of_memory(void)
return (0);
}
/*
 * Load the pool configurations stored in the cache file and add a spa_t
 * for every listed pool that is not already present in the namespace.
 *
 * The file named by spa_config_path is tried first, then ZPOOL_CACHE_BOOT.
 * Every failure (no cache file, attribute or read error, nvlist that does
 * not unpack) is silently ignored and leaves the namespace unchanged.
 */
static void
spa_config_load(void)
{
	void *buf = NULL;
	nvlist_t *nvlist, *child;
	nvpair_t *nvpair;
	char *pathname;
	zfs_file_t *fp;
	zfs_file_attr_t zfa;
	uint64_t fsize;
	int err;

	/*
	 * Open the configuration file.
	 */
	pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	(void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path);

	err = zfs_file_open(pathname, O_RDONLY, 0, &fp);
	if (err)
		err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp);

	kmem_free(pathname, MAXPATHLEN);

	if (err)
		return;

	if (zfs_file_getattr(fp, &zfa))
		goto out;

	fsize = zfa.zfa_size;

	/*
	 * An empty cache file cannot contain a packed nvlist; bail out
	 * before making a zero-sized allocation.
	 */
	if (fsize == 0)
		goto out;

	buf = kmem_alloc(fsize, KM_SLEEP);

	/*
	 * Read the nvlist from the file.  zfs_file_read() returns 0 on
	 * success and an errno value on failure, so any non-zero result
	 * (not just a negative one) means the buffer must not be used.
	 */
	if (zfs_file_read(fp, buf, fsize, NULL) != 0)
		goto out;

	/*
	 * Unpack the nvlist.
	 */
	if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
		goto out;

	/*
	 * Iterate over all elements in the nvlist, creating a new spa_t for
	 * each one with the specified configuration.
	 */
	mutex_enter(&spa_namespace_lock);
	nvpair = NULL;
	while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
		/* Only nvlist-valued pairs are treated as pool configs. */
		if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
			continue;

		child = fnvpair_value_nvlist(nvpair);

		/* Skip names that spa_lookup() already finds. */
		if (spa_lookup(nvpair_name(nvpair)) != NULL)
			continue;
		(void) spa_add(nvpair_name(nvpair), child, NULL);
	}
	mutex_exit(&spa_namespace_lock);

	nvlist_free(nvlist);

out:
	/* buf is still NULL if we got here before the allocation. */
	if (buf != NULL)
		kmem_free(buf, fsize);

	zfs_file_close(fp);
}
void
kernel_init(int mode)
{
@ -835,6 +909,7 @@ kernel_init(int mode)
zstd_init();
spa_init((spa_mode_t)mode);
spa_config_load();
fletcher_4_init();

View file

@ -1903,30 +1903,43 @@ zpool_find_config(libpc_handle_t *hdl, const char *target, nvlist_t **configp,
*sepp = '\0';
pools = zpool_search_import(hdl, args);
if (pools != NULL) {
nvpair_t *elem = NULL;
while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
VERIFY0(nvpair_value_nvlist(elem, &config));
if (pool_match(config, targetdup)) {
count++;
if (match != NULL) {
/* multiple matches found */
continue;
} else {
match = fnvlist_dup(config);
}
}
}
fnvlist_free(pools);
if (pools == NULL) {
zutil_error_aux(hdl, dgettext(TEXT_DOMAIN, "no pools found"));
(void) zutil_error_fmt(hdl, LPC_UNKNOWN, dgettext(TEXT_DOMAIN,
"failed to find config for pool '%s'"), targetdup);
free(targetdup);
return (ENOENT);
}
nvpair_t *elem = NULL;
while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
VERIFY0(nvpair_value_nvlist(elem, &config));
if (pool_match(config, targetdup)) {
count++;
if (match != NULL) {
/* multiple matches found */
continue;
} else {
match = fnvlist_dup(config);
}
}
}
fnvlist_free(pools);
if (count == 0) {
zutil_error_aux(hdl, dgettext(TEXT_DOMAIN,
"no matching pools"));
(void) zutil_error_fmt(hdl, LPC_UNKNOWN, dgettext(TEXT_DOMAIN,
"failed to find config for pool '%s'"), targetdup);
free(targetdup);
return (ENOENT);
}
if (count > 1) {
zutil_error_aux(hdl, dgettext(TEXT_DOMAIN,
"more than one matching pool"));
(void) zutil_error_fmt(hdl, LPC_UNKNOWN, dgettext(TEXT_DOMAIN,
"failed to find config for pool '%s'"), targetdup);
free(targetdup);
fnvlist_free(match);
return (EINVAL);

View file

@ -941,10 +941,6 @@ The target number of bytes the ARC should leave as free memory on the system.
If zero, equivalent to the bigger of
.Sy 512 KiB No and Sy all_system_memory/64 .
.
.It Sy zfs_autoimport_disable Ns = Ns Sy 1 Ns | Ns 0 Pq int
Disable pool import at module load by ignoring the cache file
.Pq Sy spa_config_path .
.
.It Sy zfs_checksum_events_per_second Ns = Ns Sy 20 Ns /s Pq uint
Rate limit checksum events to this many per second.
Note that this should not be set below the ZED thresholds

View file

@ -15,7 +15,7 @@
.\" Copyright (c) 2017 Lawrence Livermore National Security, LLC.
.\" Copyright (c) 2017 Intel Corporation.
.\"
.Dd October 27, 2024
.Dd April 23, 2025
.Dt ZDB 8
.Os
.
@ -531,6 +531,18 @@ option, with more occurrences enabling more verbosity.
If no options are specified, all information about the named pool will be
displayed at default verbosity.
.
.Sh EXIT STATUS
The
.Nm
utility exits
.Sy 0
on success,
.Sy 1
if a fatal error occurs,
.Sy 2
if invalid command line options were specified, or
.Sy 3
if on-disk corruption was detected, but was not fatal.
.Sh EXAMPLES
.Ss Example 1 : No Display the configuration of imported pool Ar rpool
.Bd -literal

View file

@ -173,8 +173,10 @@ The receiving system must have the
feature enabled.
If the
.Sy lz4_compress
feature is active on the sending system, then the receiving system must have
that feature enabled as well.
or
.Sy zstd_compress
features are active on the sending system, then the receiving system must have
the corresponding features enabled as well.
Datasets that are sent with this flag may not be
received as an encrypted dataset, since encrypted datasets cannot use the
.Sy embedded_data
@ -201,8 +203,10 @@ property for details
.Pc .
If the
.Sy lz4_compress
feature is active on the sending system, then the receiving system must have
that feature enabled as well.
or
.Sy zstd_compress
features are active on the sending system, then the receiving system must have
the corresponding features enabled as well.
If the
.Sy large_blocks
feature is enabled on the sending system but the
@ -357,8 +361,10 @@ property for details
.Pc .
If the
.Sy lz4_compress
feature is active on the sending system, then the receiving system must have
that feature enabled as well.
or
.Sy zstd_compress
features are active on the sending system, then the receiving system must have
the corresponding features enabled as well.
If the
.Sy large_blocks
feature is enabled on the sending system but the
@ -400,8 +406,10 @@ The receiving system must have the
feature enabled.
If the
.Sy lz4_compress
feature is active on the sending system, then the receiving system must have
that feature enabled as well.
or
.Sy zstd_compress
features are active on the sending system, then the receiving system must have
the corresponding features enabled as well.
Datasets that are sent with this flag may not be received as an encrypted
dataset,
since encrypted datasets cannot use the

View file

@ -135,6 +135,7 @@ ICP_OBJS_X86_64 := \
asm-x86_64/sha2/sha256-x86_64.o \
asm-x86_64/sha2/sha512-x86_64.o \
asm-x86_64/modes/aesni-gcm-x86_64.o \
asm-x86_64/modes/aesni-gcm-avx2-vaes.o \
asm-x86_64/modes/gcm_pclmulqdq.o \
asm-x86_64/modes/ghash-x86_64.o

View file

@ -46,6 +46,9 @@
#define IMPL_CYCLE (UINT32_MAX-1)
#ifdef CAN_USE_GCM_ASM
#define IMPL_AVX (UINT32_MAX-2)
#if CAN_USE_GCM_ASM >= 2
#define IMPL_AVX2 (UINT32_MAX-3)
#endif
#endif
#define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
static uint32_t icp_gcm_impl = IMPL_FASTEST;
@ -56,17 +59,16 @@ static uint32_t user_sel_impl = IMPL_FASTEST;
boolean_t gcm_avx_can_use_movbe = B_FALSE;
/*
* Whether to use the optimized openssl gcm and ghash implementations.
* Set to true if module parameter icp_gcm_impl == "avx".
*/
static boolean_t gcm_use_avx = B_FALSE;
#define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx)
static gcm_impl gcm_impl_used = GCM_IMPL_GENERIC;
#define GCM_IMPL_USED (*(volatile gcm_impl *)&gcm_impl_used)
extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);
static inline boolean_t gcm_avx_will_work(void);
static inline void gcm_set_avx(boolean_t);
static inline boolean_t gcm_toggle_avx(void);
static inline size_t gcm_simd_get_htab_size(boolean_t);
static inline boolean_t gcm_avx2_will_work(void);
static inline void gcm_use_impl(gcm_impl impl);
static inline gcm_impl gcm_toggle_impl(void);
static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
crypto_data_t *, size_t);
@ -89,7 +91,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
if (ctx->gcm_use_avx == B_TRUE)
if (ctx->impl != GCM_IMPL_GENERIC)
return (gcm_mode_encrypt_contiguous_blocks_avx(
ctx, data, length, out, block_size));
#endif
@ -208,7 +210,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
{
(void) copy_block;
#ifdef CAN_USE_GCM_ASM
if (ctx->gcm_use_avx == B_TRUE)
if (ctx->impl != GCM_IMPL_GENERIC)
return (gcm_encrypt_final_avx(ctx, out, block_size));
#endif
@ -374,7 +376,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
if (ctx->gcm_use_avx == B_TRUE)
if (ctx->impl != GCM_IMPL_GENERIC)
return (gcm_decrypt_final_avx(ctx, out, block_size));
#endif
@ -631,23 +633,23 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;
if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
gcm_ctx->impl = GCM_IMPL_USED;
} else {
/*
* Handle the "cycle" implementation by creating avx and
* non-avx contexts alternately.
* Handle the "cycle" implementation by creating different
* contexts, one per implementation.
*/
gcm_ctx->gcm_use_avx = gcm_toggle_avx();
gcm_ctx->impl = gcm_toggle_impl();
/* The avx impl. doesn't handle byte swapped key schedules. */
if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
gcm_ctx->gcm_use_avx = B_FALSE;
/* The AVX impl. doesn't handle byte swapped key schedules. */
if (needs_bswap == B_TRUE) {
gcm_ctx->impl = GCM_IMPL_GENERIC;
}
/*
* If this is a GCM context, use the MOVBE and the BSWAP
* If this is an AVX context, use the MOVBE and the BSWAP
* variants alternately.
*/
if (gcm_ctx->gcm_use_avx == B_TRUE &&
if (gcm_ctx->impl == GCM_IMPL_AVX &&
zfs_movbe_available() == B_TRUE) {
(void) atomic_toggle_boolean_nv(
(volatile boolean_t *)&gcm_avx_can_use_movbe);
@ -658,12 +660,13 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
* still they could be created by the aes generic implementation.
* Make sure not to use them since we'll corrupt data if we do.
*/
if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
gcm_ctx->gcm_use_avx = B_FALSE;
if (gcm_ctx->impl != GCM_IMPL_GENERIC && needs_bswap == B_TRUE) {
gcm_ctx->impl = GCM_IMPL_GENERIC;
cmn_err_once(CE_WARN,
"ICP: Can't use the aes generic or cycle implementations "
"in combination with the gcm avx implementation!");
"in combination with the gcm avx or avx2-vaes "
"implementation!");
cmn_err_once(CE_WARN,
"ICP: Falling back to a compatible implementation, "
"aes-gcm performance will likely be degraded.");
@ -672,36 +675,20 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
"restore performance.");
}
/* Allocate Htab memory as needed. */
if (gcm_ctx->gcm_use_avx == B_TRUE) {
size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
if (htab_len == 0) {
return (CRYPTO_MECHANISM_PARAM_INVALID);
}
gcm_ctx->gcm_htab_len = htab_len;
gcm_ctx->gcm_Htable =
kmem_alloc(htab_len, KM_SLEEP);
if (gcm_ctx->gcm_Htable == NULL) {
return (CRYPTO_HOST_MEMORY);
}
/*
* AVX implementations use Htable with sizes depending on
* implementation.
*/
if (gcm_ctx->impl != GCM_IMPL_GENERIC) {
rv = gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
block_size);
}
/* Avx and non avx context initialization differs from here on. */
if (gcm_ctx->gcm_use_avx == B_FALSE) {
else
#endif /* ifdef CAN_USE_GCM_ASM */
if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
rv = CRYPTO_MECHANISM_PARAM_INVALID;
}
#ifdef CAN_USE_GCM_ASM
} else {
if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
block_size) != CRYPTO_SUCCESS) {
rv = CRYPTO_MECHANISM_PARAM_INVALID;
}
if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
rv = CRYPTO_MECHANISM_PARAM_INVALID;
}
#endif /* ifdef CAN_USE_GCM_ASM */
return (rv);
}
@ -767,6 +754,9 @@ gcm_impl_get_ops(void)
break;
#ifdef CAN_USE_GCM_ASM
case IMPL_AVX:
#if CAN_USE_GCM_ASM >= 2
case IMPL_AVX2:
#endif
/*
* Make sure that we return a valid implementation while
* switching to the avx implementation since there still
@ -828,6 +818,13 @@ gcm_impl_init(void)
* Use the avx implementation if it's available and the implementation
* hasn't changed from its default value of fastest on module load.
*/
#if CAN_USE_GCM_ASM >= 2
if (gcm_avx2_will_work()) {
if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
gcm_use_impl(GCM_IMPL_AVX2);
}
} else
#endif
if (gcm_avx_will_work()) {
#ifdef HAVE_MOVBE
if (zfs_movbe_available() == B_TRUE) {
@ -835,7 +832,7 @@ gcm_impl_init(void)
}
#endif
if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
gcm_set_avx(B_TRUE);
gcm_use_impl(GCM_IMPL_AVX);
}
}
#endif
@ -852,6 +849,7 @@ static const struct {
{ "fastest", IMPL_FASTEST },
#ifdef CAN_USE_GCM_ASM
	{ "avx",	IMPL_AVX },
#if CAN_USE_GCM_ASM >= 2
	/* IMPL_AVX2 is only defined when CAN_USE_GCM_ASM >= 2. */
	{ "avx2-vaes",	IMPL_AVX2 },
#endif
#endif
};
@ -887,7 +885,13 @@ gcm_impl_set(const char *val)
/* Check mandatory options */
for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
#if CAN_USE_GCM_ASM >= 2
/* Ignore avx2 implementation if it won't work. */
if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
!gcm_avx2_will_work()) {
continue;
}
#endif
if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
continue;
}
@ -915,11 +919,17 @@ gcm_impl_set(const char *val)
* Use the avx implementation if available and the requested one is
* avx or fastest.
*/
#if CAN_USE_GCM_ASM >= 2
if (gcm_avx2_will_work() == B_TRUE &&
(impl == IMPL_AVX2 || impl == IMPL_FASTEST)) {
gcm_use_impl(GCM_IMPL_AVX2);
} else
#endif
if (gcm_avx_will_work() == B_TRUE &&
(impl == IMPL_AVX || impl == IMPL_FASTEST)) {
gcm_set_avx(B_TRUE);
gcm_use_impl(GCM_IMPL_AVX);
} else {
gcm_set_avx(B_FALSE);
gcm_use_impl(GCM_IMPL_GENERIC);
}
#endif
@ -952,6 +962,12 @@ icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
/* Ignore avx implementation if it won't work. */
#if CAN_USE_GCM_ASM >= 2
if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
!gcm_avx2_will_work()) {
continue;
}
#endif
if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
continue;
}
@ -993,9 +1009,6 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
/* Clear the FPU registers since they hold sensitive internal state. */
#define clear_fpu_regs() clear_fpu_regs_avx()
#define GHASH_AVX(ctx, in, len) \
gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
in, len)
#define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
@ -1010,20 +1023,77 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
static uint32_t gcm_avx_chunk_size =
((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
/*
 * 128-bit value used by the GCM code, held as two 64-bit halves.
 * The definition is copied from include/crypto/modes.h.  The name u128
 * is avoided because kernel sources already define it.
 */
typedef struct {
	uint64_t hi;
	uint64_t lo;
} uint128_t;
extern void ASMABI clear_fpu_regs_avx(void);
extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
const uint32_t pt[4], uint32_t ct[4]);
extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI gcm_init_vpclmulqdq_avx2(uint128_t Htable[16],
const uint64_t H[2]);
#endif
extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
const uint8_t *in, size_t len);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI gcm_ghash_vpclmulqdq_avx2(uint64_t ghash[2],
const uint64_t *Htable, const uint8_t *in, size_t len);
#endif
/*
 * Feed len bytes at in into the context's GHASH state, calling the
 * assembler routine that matches ctx->impl.  Any implementation other
 * than GCM_IMPL_AVX (or GCM_IMPL_AVX2 when built in) trips a VERIFY.
 */
static inline void GHASH_AVX(gcm_ctx_t *ctx, const uint8_t *in, size_t len)
{
	const uint64_t *htab = (const uint64_t *)ctx->gcm_Htable;

#if CAN_USE_GCM_ASM >= 2
	if (ctx->impl == GCM_IMPL_AVX2) {
		gcm_ghash_vpclmulqdq_avx2(ctx->gcm_ghash, htab, in, len);
		return;
	}
#endif
	if (ctx->impl == GCM_IMPL_AVX) {
		gcm_ghash_avx(ctx->gcm_ghash, htab, in, len);
		return;
	}

	VERIFY(B_FALSE);
}
typedef size_t ASMABI aesni_gcm_encrypt_impl(const uint8_t *, uint8_t *,
size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
const void *, uint64_t *, uint64_t *);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI aes_gcm_enc_update_vaes_avx2(const uint8_t *in,
uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
const uint128_t Htable[16], uint8_t Xi[16]);
#endif
typedef size_t ASMABI aesni_gcm_decrypt_impl(const uint8_t *, uint8_t *,
size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
const void *, uint64_t *, uint64_t *);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI aes_gcm_dec_update_vaes_avx2(const uint8_t *in,
uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
const uint128_t Htable[16], uint8_t Xi[16]);
#endif
static inline boolean_t
gcm_avx2_will_work(void)
{
return (kfpu_allowed() &&
zfs_avx2_available() && zfs_vaes_available() &&
zfs_vpclmulqdq_available());
}
static inline boolean_t
gcm_avx_will_work(void)
@ -1035,33 +1105,67 @@ gcm_avx_will_work(void)
}
static inline void
gcm_set_avx(boolean_t val)
gcm_use_impl(gcm_impl impl)
{
if (gcm_avx_will_work() == B_TRUE) {
atomic_swap_32(&gcm_use_avx, val);
switch (impl) {
#if CAN_USE_GCM_ASM >= 2
case GCM_IMPL_AVX2:
if (gcm_avx2_will_work() == B_TRUE) {
atomic_swap_32(&gcm_impl_used, impl);
return;
}
zfs_fallthrough;
#endif
case GCM_IMPL_AVX:
	/*
	 * This arm is also reached by fall-through from the
	 * GCM_IMPL_AVX2 arm when AVX2 won't work, so store
	 * GCM_IMPL_AVX explicitly rather than the requested impl.
	 */
	if (gcm_avx_will_work() == B_TRUE) {
		atomic_swap_32(&gcm_impl_used, GCM_IMPL_AVX);
		return;
	}
	zfs_fallthrough;
default:
atomic_swap_32(&gcm_impl_used, GCM_IMPL_GENERIC);
}
}
static inline boolean_t
gcm_toggle_avx(void)
gcm_impl_will_work(gcm_impl impl)
{
if (gcm_avx_will_work() == B_TRUE) {
return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
} else {
return (B_FALSE);
switch (impl) {
#if CAN_USE_GCM_ASM >= 2
case GCM_IMPL_AVX2:
return (gcm_avx2_will_work());
#endif
case GCM_IMPL_AVX:
return (gcm_avx_will_work());
default:
return (B_TRUE);
}
}
static inline size_t
gcm_simd_get_htab_size(boolean_t simd_mode)
static inline gcm_impl
gcm_toggle_impl(void)
{
switch (simd_mode) {
case B_TRUE:
return (2 * 6 * 2 * sizeof (uint64_t));
gcm_impl current_impl, new_impl;
do { /* handle races */
current_impl = atomic_load_32(&gcm_impl_used);
new_impl = current_impl;
while (B_TRUE) { /* handle incompatible implementations */
new_impl = (new_impl + 1) % GCM_IMPL_MAX;
if (gcm_impl_will_work(new_impl)) {
break;
}
}
default:
return (0);
}
} while (atomic_cas_32(&gcm_impl_used, current_impl, new_impl) !=
current_impl);
return (new_impl);
}
@ -1077,6 +1181,50 @@ gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
}
/*
 * Adapter giving aesni_gcm_encrypt() the aesni_gcm_encrypt_impl signature.
 * The Htable argument is accepted but not forwarded.
 */
static size_t aesni_gcm_encrypt_avx(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	size_t nbytes;

	(void) Htable;
	nbytes = aesni_gcm_encrypt(in, out, len, key, iv, Xip);
	return (nbytes);
}
#if CAN_USE_GCM_ASM >= 2
// kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four
// bits of a |size_t|.
// This is from boringssl/crypto/fipsmodule/aes/gcm.cc.inc
static const size_t kSizeTWithoutLower4Bits = (size_t)-16;
/* The following CRYPTO methods are from boringssl/crypto/internal.h */
/* Reverse the byte order of a 32-bit value. */
static inline uint32_t CRYPTO_bswap4(uint32_t x) {
	return (((x & 0x000000ffU) << 24) |
	    ((x & 0x0000ff00U) << 8) |
	    ((x & 0x00ff0000U) >> 8) |
	    ((x & 0xff000000U) >> 24));
}
/*
 * Copy 32 bits from in (via memcpy, so no alignment is required of the
 * pointer) and return them byte-swapped with CRYPTO_bswap4().
 */
static inline uint32_t CRYPTO_load_u32_be(const void *in) {
	uint32_t raw;

	memcpy(&raw, in, sizeof (raw));
	raw = CRYPTO_bswap4(raw);
	return (raw);
}
/*
 * Byte-swap v with CRYPTO_bswap4() and copy the resulting 32 bits to out
 * (via memcpy, so no alignment is required of the pointer).
 */
static inline void CRYPTO_store_u32_be(void *out, uint32_t v) {
	const uint32_t swapped = CRYPTO_bswap4(v);

	memcpy(out, &swapped, sizeof (swapped));
}
/*
 * Adapter giving aes_gcm_enc_update_vaes_avx2() the aesni_gcm_encrypt_impl
 * signature.  len is rounded down to a multiple of 16 bytes, the 32-bit
 * counter held in bytes 12..15 of iv is advanced by the number of 16-byte
 * blocks handed to the assembler routine, and the rounded length is
 * returned.
 */
static size_t aesni_gcm_encrypt_avx2(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	uint8_t *ctr_block = (uint8_t *)iv;
	const size_t nbytes = len & kSizeTWithoutLower4Bits;
	uint32_t ctr;

	aes_gcm_enc_update_vaes_avx2(in, out, nbytes, key, ctr_block,
	    (const uint128_t *)Htable, (uint8_t *)Xip);

	/* The counter wraps modulo 2^32. */
	ctr = CRYPTO_load_u32_be(&ctr_block[12]);
	ctr = (uint32_t)(ctr + nbytes / 16);
	CRYPTO_store_u32_be(&ctr_block[12], ctr);

	return (nbytes);
}
#endif /* if CAN_USE_GCM_ASM >= 2 */
/*
* Encrypt multiple blocks of data in GCM mode.
* This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
@ -1091,8 +1239,15 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
size_t done = 0;
uint8_t *datap = (uint8_t *)data;
size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
aesni_gcm_encrypt_impl *encrypt_blocks =
#if CAN_USE_GCM_ASM >= 2
ctx->impl == GCM_IMPL_AVX2 ?
aesni_gcm_encrypt_avx2 :
#endif
aesni_gcm_encrypt_avx;
const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
uint64_t *ghash = ctx->gcm_ghash;
uint64_t *htable = ctx->gcm_Htable;
uint64_t *cb = ctx->gcm_cb;
uint8_t *ct_buf = NULL;
uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
@ -1156,8 +1311,8 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
/* Do the bulk encryption in chunk_size blocks. */
for (; bleft >= chunk_size; bleft -= chunk_size) {
kfpu_begin();
done = aesni_gcm_encrypt(
datap, ct_buf, chunk_size, key, cb, ghash);
done = encrypt_blocks(
datap, ct_buf, chunk_size, key, cb, htable, ghash);
clear_fpu_regs();
kfpu_end();
@ -1180,7 +1335,8 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
/* Bulk encrypt the remaining data. */
kfpu_begin();
if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
done = encrypt_blocks(datap, ct_buf, bleft, key, cb, htable,
ghash);
if (done == 0) {
rv = CRYPTO_FAILED;
goto out;
@ -1293,6 +1449,29 @@ gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
return (CRYPTO_SUCCESS);
}
/*
 * Adapter giving aesni_gcm_decrypt() the aesni_gcm_decrypt_impl signature.
 * The Htable argument is accepted but not forwarded.
 */
static size_t aesni_gcm_decrypt_avx(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	size_t nbytes;

	(void) Htable;
	nbytes = aesni_gcm_decrypt(in, out, len, key, iv, Xip);
	return (nbytes);
}
#if CAN_USE_GCM_ASM >= 2
/*
 * Adapter giving aes_gcm_dec_update_vaes_avx2() the aesni_gcm_decrypt_impl
 * signature.  len is rounded down to a multiple of 16 bytes, the 32-bit
 * counter held in bytes 12..15 of iv is advanced by the number of 16-byte
 * blocks handed to the assembler routine, and the rounded length is
 * returned.
 */
static size_t aesni_gcm_decrypt_avx2(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	uint8_t *ctr_block = (uint8_t *)iv;
	const size_t nbytes = len & kSizeTWithoutLower4Bits;
	uint32_t ctr;

	aes_gcm_dec_update_vaes_avx2(in, out, nbytes, key, ctr_block,
	    (const uint128_t *)Htable, (uint8_t *)Xip);

	/* The counter wraps modulo 2^32. */
	ctr = CRYPTO_load_u32_be(&ctr_block[12]);
	ctr = (uint32_t)(ctr + nbytes / 16);
	CRYPTO_store_u32_be(&ctr_block[12], ctr);

	return (nbytes);
}
#endif /* if CAN_USE_GCM_ASM >= 2 */
/*
* Finalize decryption: We just have accumulated crypto text, so now we
* decrypt it here inplace.
@ -1306,10 +1485,17 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
B_FALSE);
size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
aesni_gcm_decrypt_impl *decrypt_blocks =
#if CAN_USE_GCM_ASM >= 2
ctx->impl == GCM_IMPL_AVX2 ?
aesni_gcm_decrypt_avx2 :
#endif
aesni_gcm_decrypt_avx;
size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
uint8_t *datap = ctx->gcm_pt_buf;
const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
uint32_t *cb = (uint32_t *)ctx->gcm_cb;
uint64_t *htable = ctx->gcm_Htable;
uint64_t *ghash = ctx->gcm_ghash;
uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
int rv = CRYPTO_SUCCESS;
@ -1322,8 +1508,8 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
*/
for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
kfpu_begin();
done = aesni_gcm_decrypt(datap, datap, chunk_size,
(const void *)key, ctx->gcm_cb, ghash);
done = decrypt_blocks(datap, datap, chunk_size,
(const void *)key, ctx->gcm_cb, htable, ghash);
clear_fpu_regs();
kfpu_end();
if (done != chunk_size) {
@ -1334,8 +1520,8 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
/* Decrypt remainder, which is less than chunk size, in one go. */
kfpu_begin();
if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
done = aesni_gcm_decrypt(datap, datap, bleft,
(const void *)key, ctx->gcm_cb, ghash);
done = decrypt_blocks(datap, datap, bleft,
(const void *)key, ctx->gcm_cb, htable, ghash);
if (done == 0) {
clear_fpu_regs();
kfpu_end();
@ -1424,13 +1610,42 @@ gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
B_FALSE);
size_t htab_len = 0;
#if CAN_USE_GCM_ASM >= 2
if (ctx->impl == GCM_IMPL_AVX2) {
/*
* BoringSSL's API specifies uint128_t[16] for htab; but only
* uint128_t[12] are used.
* See https://github.com/google/boringssl/blob/
* 813840dd094f9e9c1b00a7368aa25e656554221f1/crypto/fipsmodule/
* modes/asm/aes-gcm-avx2-x86_64.pl#L198-L200
*/
htab_len = (2 * 8 * sizeof (uint128_t));
} else
#endif /* CAN_USE_GCM_ASM >= 2 */
{
htab_len = (2 * 6 * sizeof (uint128_t));
}
ctx->gcm_Htable = kmem_alloc(htab_len, KM_SLEEP);
if (ctx->gcm_Htable == NULL) {
return (CRYPTO_HOST_MEMORY);
}
/* Init H (encrypt zero block) and create the initial counter block. */
memset(H, 0, sizeof (ctx->gcm_H));
kfpu_begin();
aes_encrypt_intel(keysched, aes_rounds,
(const uint32_t *)H, (uint32_t *)H);
gcm_init_htab_avx(ctx->gcm_Htable, H);
#if CAN_USE_GCM_ASM >= 2
if (ctx->impl == GCM_IMPL_AVX2) {
gcm_init_vpclmulqdq_avx2((uint128_t *)ctx->gcm_Htable, H);
} else
#endif /* if CAN_USE_GCM_ASM >= 2 */
{
gcm_init_htab_avx(ctx->gcm_Htable, H);
}
if (iv_len == 12) {
memcpy(cb, iv, 12);

View file

@ -171,7 +171,7 @@ gcm_clear_ctx(gcm_ctx_t *ctx)
explicit_memset(ctx->gcm_remainder, 0, sizeof (ctx->gcm_remainder));
explicit_memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
#if defined(CAN_USE_GCM_ASM)
if (ctx->gcm_use_avx == B_TRUE) {
if (ctx->impl != GCM_IMPL_GENERIC) {
ASSERT3P(ctx->gcm_Htable, !=, NULL);
explicit_memset(ctx->gcm_Htable, 0, ctx->gcm_htab_len);
kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len);

View file

@ -0,0 +1,253 @@
BoringSSL is a fork of OpenSSL. As such, large parts of it fall under OpenSSL
licensing. Files that are completely new have a Google copyright and an ISC
license. This license is reproduced at the bottom of this file.
Contributors to BoringSSL are required to follow the CLA rules for Chromium:
https://cla.developers.google.com/clas
Files in third_party/ have their own licenses, as described therein. The MIT
license, for third_party/fiat, which, unlike other third_party directories, is
compiled into non-test libraries, is included below.
The OpenSSL toolkit stays under a dual license, i.e. both the conditions of the
OpenSSL License and the original SSLeay license apply to the toolkit. See below
for the actual license texts. Actually both licenses are BSD-style Open Source
licenses. In case of any license issues related to OpenSSL please contact
openssl-core@openssl.org.
The following are Google-internal bug numbers where explicit permission from
some authors is recorded for use of their work. (This is purely for our own
record keeping.)
27287199
27287880
27287883
263291445
OpenSSL License
---------------
/* ====================================================================
* Copyright (c) 1998-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
Original SSLeay License
-----------------------
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
ISC license used for completely new code in BoringSSL:
/* Copyright 2015 The BoringSSL Authors
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
The code in third_party/fiat carries the MIT license:
Copyright (c) 2015-2016 the fiat-crypto authors (see
https://github.com/mit-plv/fiat-crypto/blob/master/AUTHORS).
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Licenses for support code
-------------------------
Parts of the TLS test suite are under the Go license. This code is not included
in BoringSSL (i.e. libcrypto and libssl) when compiled, however, so
distributing code linked against BoringSSL does not trigger this license:
Copyright (c) 2009 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
BoringSSL uses the Chromium test infrastructure to run a continuous build,
trybots etc. The scripts which manage this, and the script for generating build
metadata, are under the Chromium license. Distributing code linked against
BoringSSL does not trigger this license.
Copyright 2015 The Chromium Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View file

@ -0,0 +1 @@
PORTIONS OF AES GCM and GHASH FUNCTIONALITY

File diff suppressed because it is too large Load diff

View file

@ -42,7 +42,7 @@ extern "C" {
*/
#if defined(__x86_64__) && defined(HAVE_AVX) && \
defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
#define CAN_USE_GCM_ASM
#define CAN_USE_GCM_ASM (HAVE_VAES && HAVE_VPCLMULQDQ ? 2 : 1)
extern boolean_t gcm_avx_can_use_movbe;
#endif
@ -129,6 +129,15 @@ typedef struct ccm_ctx {
#define ccm_copy_to ccm_common.cc_copy_to
#define ccm_flags ccm_common.cc_flags
#ifdef CAN_USE_GCM_ASM
/*
* Selects which GCM implementation a context uses. Only declared when
* CAN_USE_GCM_ASM is defined; the value is kept in the `impl` member of
* struct gcm_ctx.
*/
typedef enum gcm_impl {
/* Explicitly 0. Presumably the portable, non-assembly path -- confirm. */
GCM_IMPL_GENERIC = 0,
/* AVX implementation. */
GCM_IMPL_AVX,
/* AVX2 implementation. */
GCM_IMPL_AVX2,
/* NOTE(review): looks like an end-of-list sentinel/count -- confirm. */
GCM_IMPL_MAX,
} gcm_impl;
#endif
/*
* gcm_tag_len: Length of authentication tag.
*
@ -174,7 +183,7 @@ typedef struct gcm_ctx {
uint64_t gcm_len_a_len_c[2];
uint8_t *gcm_pt_buf;
#ifdef CAN_USE_GCM_ASM
boolean_t gcm_use_avx;
enum gcm_impl impl;
#endif
} gcm_ctx_t;

View file

@ -112,7 +112,6 @@ static int zfs__fini(void);
static void zfs_shutdown(void *, int);
static eventhandler_tag zfs_shutdown_event_tag;
static eventhandler_tag zfs_mountroot_event_tag;
#define ZFS_MIN_KSTACK_PAGES 4
@ -311,9 +310,6 @@ zfs_modevent(module_t mod, int type, void *unused __unused)
zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
shutdown_post_sync, zfs_shutdown, NULL,
SHUTDOWN_PRI_FIRST);
zfs_mountroot_event_tag = EVENTHANDLER_REGISTER(
mountroot, spa_boot_init, NULL,
SI_ORDER_ANY);
}
return (err);
case MOD_UNLOAD:
@ -322,9 +318,6 @@ zfs_modevent(module_t mod, int type, void *unused __unused)
if (zfs_shutdown_event_tag != NULL)
EVENTHANDLER_DEREGISTER(shutdown_post_sync,
zfs_shutdown_event_tag);
if (zfs_mountroot_event_tag != NULL)
EVENTHANDLER_DEREGISTER(mountroot,
zfs_mountroot_event_tag);
}
return (err);
case MOD_SHUTDOWN:

View file

@ -31,7 +31,7 @@
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright (c) 2024, Klara, Inc.
* Copyright (c) 2024, 2025, Klara, Inc.
*/
/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
@ -196,7 +196,6 @@ DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
@ -226,25 +225,14 @@ zvol_geom_open(struct g_provider *pp, int flag, int count)
}
retry:
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
/*
* Obtain a copy of private under zvol_state_lock to make sure either
* the result of zvol free code setting private to NULL is observed,
* or the zv is protected from being freed because of the positive
* zv_open_count.
*/
zv = pp->private;
if (zv == NULL) {
rw_exit(&zvol_state_lock);
err = SET_ERROR(ENXIO);
goto out_locked;
}
zv = atomic_load_ptr(&pp->private);
if (zv == NULL)
return (SET_ERROR(ENXIO));
mutex_enter(&zv->zv_state_lock);
if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
rw_exit(&zvol_state_lock);
err = SET_ERROR(ENXIO);
goto out_zv_locked;
goto out_locked;
}
ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
@ -257,8 +245,24 @@ retry:
drop_suspend = B_TRUE;
if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
mutex_exit(&zv->zv_state_lock);
/*
* Removal may happen while the locks are down, so
* we can't trust zv any longer; we have to start over.
*/
zv = atomic_load_ptr(&pp->private);
if (zv == NULL)
return (SET_ERROR(ENXIO));
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
if (zv->zv_zso->zso_dying ||
zv->zv_flags & ZVOL_REMOVING) {
err = SET_ERROR(ENXIO);
goto out_locked;
}
/* Check to see if zv_suspend_lock is needed. */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
@ -266,7 +270,6 @@ retry:
}
}
}
rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@ -294,7 +297,7 @@ retry:
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
if (err)
goto out_zv_locked;
goto out_locked;
pp->mediasize = zv->zv_volsize;
pp->stripeoffset = 0;
pp->stripesize = zv->zv_volblocksize;
@ -329,9 +332,8 @@ out_opened:
zvol_last_close(zv);
wakeup(zv);
}
out_zv_locked:
mutex_exit(&zv->zv_state_lock);
out_locked:
mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
return (err);
@ -345,12 +347,9 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
boolean_t drop_suspend = B_TRUE;
int new_open_count;
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
zv = pp->private;
if (zv == NULL) {
rw_exit(&zvol_state_lock);
zv = atomic_load_ptr(&pp->private);
if (zv == NULL)
return (SET_ERROR(ENXIO));
}
mutex_enter(&zv->zv_state_lock);
if (zv->zv_flags & ZVOL_EXCL) {
@ -377,6 +376,15 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
/*
* Unlike in zvol_geom_open(), we don't check if
* removal started here, because we might be one of the
* openers that needs to be thrown out! If we're the
* last, we need to call zvol_last_close() below to
* finish cleanup. So, no special treatment for us.
*/
/* Check to see if zv_suspend_lock is needed. */
new_open_count = zv->zv_open_count - count;
if (new_open_count != 0) {
@ -387,7 +395,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
} else {
drop_suspend = B_FALSE;
}
rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@ -408,20 +415,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
return (0);
}
/*
* Tear down the GEOM provider that backs a GEOM-mode zvol.
*
* Forgets the provider pointer kept in the zvol's OS-specific state and
* withers the provider's geom with ENXIO. The zvol must be in
* ZFS_VOLMODE_GEOM mode.
*/
static void
zvol_geom_destroy(zvol_state_t *zv)
{
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
/* Capture the provider first; zsg_provider is cleared below. */
struct g_provider *pp = zsg->zsg_provider;
ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
/* Presumably asserts that the GEOM topology lock is held -- confirm. */
g_topology_assert();
zsg->zsg_provider = NULL;
g_wither_geom(pp->geom, ENXIO);
}
void
zvol_wait_close(zvol_state_t *zv)
{
@ -454,7 +447,7 @@ zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
pp->name, acr, acw, ace));
if (pp->private == NULL) {
if (atomic_load_ptr(&pp->private) == NULL) {
if (acr <= 0 && acw <= 0 && ace <= 0)
return (0);
return (pp->error);
@ -921,25 +914,14 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
boolean_t drop_suspend = B_FALSE;
retry:
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
/*
* Obtain a copy of si_drv2 under zvol_state_lock to make sure either
* the result of zvol free code setting si_drv2 to NULL is observed,
* or the zv is protected from being freed because of the positive
* zv_open_count.
*/
zv = dev->si_drv2;
if (zv == NULL) {
rw_exit(&zvol_state_lock);
err = SET_ERROR(ENXIO);
goto out_locked;
}
zv = atomic_load_ptr(&dev->si_drv2);
if (zv == NULL)
return (SET_ERROR(ENXIO));
mutex_enter(&zv->zv_state_lock);
if (zv->zv_zso->zso_dying) {
rw_exit(&zvol_state_lock);
if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
err = SET_ERROR(ENXIO);
goto out_zv_locked;
goto out_locked;
}
ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
@ -954,6 +936,13 @@ retry:
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
/* Removal started while locks were down. */
err = SET_ERROR(ENXIO);
goto out_locked;
}
/* Check to see if zv_suspend_lock is needed. */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
@ -961,7 +950,6 @@ retry:
}
}
}
rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@ -989,7 +977,7 @@ retry:
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
if (err)
goto out_zv_locked;
goto out_locked;
}
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@ -1016,9 +1004,8 @@ out_opened:
zvol_last_close(zv);
wakeup(zv);
}
out_zv_locked:
mutex_exit(&zv->zv_state_lock);
out_locked:
mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
return (err);
@ -1030,12 +1017,9 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
zvol_state_t *zv;
boolean_t drop_suspend = B_TRUE;
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
zv = dev->si_drv2;
if (zv == NULL) {
rw_exit(&zvol_state_lock);
zv = atomic_load_ptr(&dev->si_drv2);
if (zv == NULL)
return (SET_ERROR(ENXIO));
}
mutex_enter(&zv->zv_state_lock);
if (zv->zv_flags & ZVOL_EXCL) {
@ -1060,6 +1044,15 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
/*
* Unlike in zvol_cdev_open(), we don't check if
* removal started here, because we might be one of the
* openers that needs to be thrown out! If we're the
* last, we need to call zvol_last_close() below to
* finish cleanup. So, no special treatment for us.
*/
/* Check to see if zv_suspend_lock is needed. */
if (zv->zv_open_count != 1) {
rw_exit(&zv->zv_suspend_lock);
@ -1069,7 +1062,6 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
} else {
drop_suspend = B_FALSE;
}
rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@ -1101,7 +1093,8 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
int error;
boolean_t sync;
zv = dev->si_drv2;
zv = atomic_load_ptr(&dev->si_drv2);
ASSERT3P(zv, !=, NULL);
error = 0;
KASSERT(zv->zv_open_count > 0,
@ -1162,6 +1155,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
*(off_t *)data = 0;
break;
case DIOCGATTR: {
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
spa_t *spa = dmu_objset_spa(zv->zv_objset);
struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
uint64_t refd, avail, usedobjs, availobjs;
@ -1186,6 +1180,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
arg->value.off = refd / DEV_BSIZE;
} else
error = SET_ERROR(ENOIOCTL);
rw_exit(&zv->zv_suspend_lock);
break;
}
case FIOSEEKHOLE:
@ -1196,10 +1191,12 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
hole = (cmd == FIOSEEKHOLE);
noff = *off;
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
RL_READER);
error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
zfs_rangelock_exit(lr);
rw_exit(&zv->zv_suspend_lock);
*off = noff;
break;
}
@ -1400,42 +1397,65 @@ zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize,
* Remove minor node for the specified volume.
*/
void
zvol_os_free(zvol_state_t *zv)
zvol_os_remove_minor(zvol_state_t *zv)
{
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
ASSERT0(zv->zv_open_count);
ASSERT0(atomic_read(&zv->zv_suspend_ref));
ASSERT(zv->zv_flags & ZVOL_REMOVING);
ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
rw_destroy(&zv->zv_suspend_lock);
zfs_rangelock_fini(&zv->zv_rangelock);
struct zvol_state_os *zso = zv->zv_zso;
zv->zv_zso = NULL;
if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
struct g_provider *pp __maybe_unused = zsg->zsg_provider;
ASSERT0P(pp->private);
struct zvol_state_geom *zsg = &zso->zso_geom;
struct g_provider *pp = zsg->zsg_provider;
atomic_store_ptr(&pp->private, NULL);
mutex_exit(&zv->zv_state_lock);
g_topology_lock();
zvol_geom_destroy(zv);
g_wither_geom(pp->geom, ENXIO);
g_topology_unlock();
} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
struct zvol_state_dev *zsd = &zso->zso_dev;
struct cdev *dev = zsd->zsd_cdev;
if (dev != NULL)
atomic_store_ptr(&dev->si_drv2, NULL);
mutex_exit(&zv->zv_state_lock);
if (dev != NULL) {
ASSERT0P(dev->si_drv2);
destroy_dev(dev);
knlist_clear(&zsd->zsd_selinfo.si_note, 0);
knlist_destroy(&zsd->zsd_selinfo.si_note);
}
}
kmem_free(zso, sizeof (struct zvol_state_os));
mutex_enter(&zv->zv_state_lock);
}
void
zvol_os_free(zvol_state_t *zv)
{
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
ASSERT0(zv->zv_open_count);
ASSERT0P(zv->zv_zso);
ASSERT0P(zv->zv_objset);
ASSERT0P(zv->zv_zilog);
ASSERT0P(zv->zv_dn);
ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
rw_destroy(&zv->zv_suspend_lock);
zfs_rangelock_fini(&zv->zv_rangelock);
mutex_destroy(&zv->zv_state_lock);
cv_destroy(&zv->zv_removing_cv);
dataset_kstats_destroy(&zv->zv_kstat);
kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (zvol_state_t));
zvol_minors--;
}
@ -1538,28 +1558,6 @@ out_doi:
return (error);
}
/*
* Sever the back-pointer from the OS device node to this zvol_state_t.
*
* For GEOM-mode volumes this NULLs the provider's `private` field; for
* DEV-mode volumes it NULLs the cdev's `si_drv2` field (when a cdev exists).
* Other volmodes are left untouched. zvol_state_lock must be held.
*/
void
zvol_os_clear_private(zvol_state_t *zv)
{
ASSERT(RW_LOCK_HELD(&zvol_state_lock));
if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
struct g_provider *pp = zsg->zsg_provider;
/* Idempotent: a second call finds the pointer already NULL. */
if (pp->private == NULL) /* already cleared */
return;
pp->private = NULL;
/* Only checked on the path that actually cleared the pointer. */
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
struct cdev *dev = zsd->zsd_cdev;
/* zsd_cdev may be NULL, in which case there is nothing to clear. */
if (dev != NULL)
dev->si_drv2 = NULL;
}
}
int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{

View file

@ -22,7 +22,7 @@
/*
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
* Copyright (c) 2024, Klara, Inc.
* Copyright (c) 2024, 2025, Klara, Inc.
*/
#include <sys/dataset_kstats.h>
@ -679,28 +679,19 @@ zvol_open(struct block_device *bdev, fmode_t flag)
retry:
#endif
rw_enter(&zvol_state_lock, RW_READER);
/*
* Obtain a copy of private_data under the zvol_state_lock to make
* sure that either the result of zvol free code path setting
* disk->private_data to NULL is observed, or zvol_os_free()
* is not called on this zv because of the positive zv_open_count.
*/
#ifdef HAVE_BLK_MODE_T
zv = disk->private_data;
zv = atomic_load_ptr(&disk->private_data);
#else
zv = bdev->bd_disk->private_data;
zv = atomic_load_ptr(&bdev->bd_disk->private_data);
#endif
if (zv == NULL) {
rw_exit(&zvol_state_lock);
return (-SET_ERROR(ENXIO));
}
mutex_enter(&zv->zv_state_lock);
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
mutex_exit(&zv->zv_state_lock);
rw_exit(&zvol_state_lock);
return (-SET_ERROR(ENXIO));
}
@ -712,8 +703,28 @@ retry:
if (zv->zv_open_count == 0) {
if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
mutex_exit(&zv->zv_state_lock);
/*
* Removal may happen while the locks are down, so
* we can't trust zv any longer; we have to start over.
*/
#ifdef HAVE_BLK_MODE_T
zv = atomic_load_ptr(&disk->private_data);
#else
zv = atomic_load_ptr(&bdev->bd_disk->private_data);
#endif
if (zv == NULL)
return (-SET_ERROR(ENXIO));
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
mutex_exit(&zv->zv_state_lock);
rw_exit(&zv->zv_suspend_lock);
return (-SET_ERROR(ENXIO));
}
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
@ -724,7 +735,6 @@ retry:
drop_suspend = B_TRUE;
}
}
rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@ -821,11 +831,11 @@ zvol_release(struct gendisk *disk, fmode_t unused)
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
(void) unused;
#endif
zvol_state_t *zv;
boolean_t drop_suspend = B_TRUE;
rw_enter(&zvol_state_lock, RW_READER);
zv = disk->private_data;
zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
if (zv == NULL)
return;
mutex_enter(&zv->zv_state_lock);
ASSERT3U(zv->zv_open_count, >, 0);
@ -839,6 +849,15 @@ zvol_release(struct gendisk *disk, fmode_t unused)
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
/*
* Unlike in zvol_open(), we don't check if removal
* started here, because we might be one of the openers
* that needs to be thrown out! If we're the last, we
* need to call zvol_last_close() below to finish
* cleanup. So, no special treatment for us.
*/
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 1) {
rw_exit(&zv->zv_suspend_lock);
@ -848,7 +867,6 @@ zvol_release(struct gendisk *disk, fmode_t unused)
} else {
drop_suspend = B_FALSE;
}
rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@ -868,9 +886,10 @@ static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
zvol_state_t *zv = bdev->bd_disk->private_data;
int error = 0;
zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
switch (cmd) {
@ -923,9 +942,8 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
unsigned int mask = 0;
rw_enter(&zvol_state_lock, RW_READER);
zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
zvol_state_t *zv = disk->private_data;
if (zv != NULL) {
mutex_enter(&zv->zv_state_lock);
mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
@ -933,17 +951,14 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing)
mutex_exit(&zv->zv_state_lock);
}
rw_exit(&zvol_state_lock);
return (mask);
}
static int
zvol_revalidate_disk(struct gendisk *disk)
{
rw_enter(&zvol_state_lock, RW_READER);
zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
zvol_state_t *zv = disk->private_data;
if (zv != NULL) {
mutex_enter(&zv->zv_state_lock);
set_capacity(zv->zv_zso->zvo_disk,
@ -951,8 +966,6 @@ zvol_revalidate_disk(struct gendisk *disk)
mutex_exit(&zv->zv_state_lock);
}
rw_exit(&zvol_state_lock);
return (0);
}
@ -971,16 +984,6 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
return (0);
}
/*
* Sever the back-pointer from the zvol's disk to its zvol_state_t by
* setting the disk's private_data to NULL. This is a plain store; the
* function itself takes no locks and performs no NULL checks on zv_zso
* or zvo_disk.
*/
void
zvol_os_clear_private(zvol_state_t *zv)
{
/*
* Cleared while holding zvol_state_lock as a writer
* which will prevent zvol_open() from opening it.
*/
zv->zv_zso->zvo_disk->private_data = NULL;
}
/*
* Provide a simple virtual geometry for legacy compatibility. For devices
* smaller than 1 MiB a small head and sector count is used to allow very
@ -990,9 +993,10 @@ zvol_os_clear_private(zvol_state_t *zv)
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
zvol_state_t *zv = bdev->bd_disk->private_data;
sector_t sectors;
zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
sectors = get_capacity(zv->zv_zso->zvo_disk);
@ -1417,17 +1421,50 @@ out_kmem:
return (ret);
}
/*
* Cleanup then free a zvol_state_t which was created by zvol_alloc().
* At this time, the structure is not opened by anyone, is taken off
* the zvol_state_list, and has its private data set to NULL.
* The zvol_state_lock is dropped.
*
* This function may take many milliseconds to complete (e.g. we've seen
* it take over 256ms), due to the calls to "blk_cleanup_queue" and
* "del_gendisk". Thus, consumers need to be careful to account for this
* latency when calling this function.
*/
/*
* Detach the OS-specific state (zv->zv_zso) from a zvol and release it:
* the gendisk, the blk-mq tag set when one was used, the minor ID, and
* the zvol_state_os allocation itself.
*
* The caller must hold zv->zv_state_lock. The lock is dropped around the
* disk teardown and reacquired before returning, so it is held again on
* exit. On return zv->zv_zso is NULL.
*/
void
zvol_os_remove_minor(zvol_state_t *zv)
{
/*
* Preconditions: state lock held, no openers, no suspend references,
* and the zvol is already flagged as being removed.
*/
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
ASSERT0(zv->zv_open_count);
ASSERT0(atomic_read(&zv->zv_suspend_ref));
ASSERT(zv->zv_flags & ZVOL_REMOVING);
/*
* Unhook the OS state from zv while the lock is still held; all of
* the teardown below goes through the local zso pointer only.
*/
struct zvol_state_os *zso = zv->zv_zso;
zv->zv_zso = NULL;
/* Clearing private_data will make new callers return immediately. */
atomic_store_ptr(&zso->zvo_disk->private_data, NULL);
/*
* Drop the state lock before calling del_gendisk(). There may be
* callers waiting to acquire it, but del_gendisk() will block until
* they exit, which would deadlock.
*/
mutex_exit(&zv->zv_state_lock);
del_gendisk(zso->zvo_disk);
/*
* Release the disk. Which call(s) are used is selected at build time
* from the kernel feature macros below; only the final #else path
* also cleans up the separately tracked request queue (zvo_queue).
*/
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
#if defined(HAVE_BLK_CLEANUP_DISK)
blk_cleanup_disk(zso->zvo_disk);
#else
put_disk(zso->zvo_disk);
#endif
#else
blk_cleanup_queue(zso->zvo_queue);
put_disk(zso->zvo_disk);
#endif
/* The tag set is only freed when this zvol was using blk-mq. */
if (zso->use_blk_mq)
blk_mq_free_tag_set(&zso->tag_set);
/* Hand the ID derived from the device minor back to zvol_ida. */
ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);
kmem_free(zso, sizeof (struct zvol_state_os));
/* Reacquire the state lock so it is held on return, as it was on entry. */
mutex_enter(&zv->zv_state_lock);
}
void
zvol_os_free(zvol_state_t *zv)
{
@ -1435,35 +1472,19 @@ zvol_os_free(zvol_state_t *zv)
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
ASSERT0(zv->zv_open_count);
ASSERT0P(zv->zv_zso->zvo_disk->private_data);
ASSERT0P(zv->zv_zso);
ASSERT0P(zv->zv_objset);
ASSERT0P(zv->zv_zilog);
ASSERT0P(zv->zv_dn);
rw_destroy(&zv->zv_suspend_lock);
zfs_rangelock_fini(&zv->zv_rangelock);
del_gendisk(zv->zv_zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
#if defined(HAVE_BLK_CLEANUP_DISK)
blk_cleanup_disk(zv->zv_zso->zvo_disk);
#else
put_disk(zv->zv_zso->zvo_disk);
#endif
#else
blk_cleanup_queue(zv->zv_zso->zvo_queue);
put_disk(zv->zv_zso->zvo_disk);
#endif
if (zv->zv_zso->use_blk_mq)
blk_mq_free_tag_set(&zv->zv_zso->tag_set);
ida_simple_remove(&zvol_ida,
MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
cv_destroy(&zv->zv_removing_cv);
mutex_destroy(&zv->zv_state_lock);
dataset_kstats_destroy(&zv->zv_kstat);
kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
kmem_free(zv, sizeof (zvol_state_t));
}

View file

@ -118,6 +118,10 @@ simd_stat_kstat_data(char *buf, size_t size, void *data)
"pclmulqdq", zfs_pclmulqdq_available());
off += SIMD_STAT_PRINT(simd_stat_kstat_payload,
"movbe", zfs_movbe_available());
off += SIMD_STAT_PRINT(simd_stat_kstat_payload,
"vaes", zfs_vaes_available());
off += SIMD_STAT_PRINT(simd_stat_kstat_payload,
"vpclmulqdq", zfs_vpclmulqdq_available());
off += SIMD_STAT_PRINT(simd_stat_kstat_payload,
"osxsave", boot_cpu_has(X86_FEATURE_OSXSAVE));

View file

@ -2557,12 +2557,13 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
/*
* Due to our use of dn_nlevels below, this can only be called
* in open context, unless we are operating on the MOS.
* From syncing context, dn_nlevels may be different from the
* dn_nlevels used when dbuf was dirtied.
* in open context, unless we are operating on the MOS or it's
* a special object. From syncing context, dn_nlevels may be
* different from the dn_nlevels used when dbuf was dirtied.
*/
ASSERT(db->db_objset ==
dmu_objset_pool(db->db_objset)->dp_meta_objset ||
DMU_OBJECT_IS_SPECIAL(db->db.db_object) ||
txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT0(db->db_level);

View file

@ -81,7 +81,7 @@ multilist_create_impl(multilist_t *ml, size_t size, size_t offset,
ml->ml_num_sublists = num;
ml->ml_index_func = index_func;
ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
ml->ml_sublists = vmem_zalloc(sizeof (multilist_sublist_t) *
ml->ml_num_sublists, KM_SLEEP);
ASSERT3P(ml->ml_sublists, !=, NULL);
@ -134,7 +134,7 @@ multilist_destroy(multilist_t *ml)
}
ASSERT3P(ml->ml_sublists, !=, NULL);
kmem_free(ml->ml_sublists,
vmem_free(ml->ml_sublists,
sizeof (multilist_sublist_t) * ml->ml_num_sublists);
ml->ml_num_sublists = 0;

View file

@ -48,18 +48,17 @@
/*
* Pool configuration repository.
*
* Pool configuration is stored as a packed nvlist on the filesystem. By
* default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot
* (when the ZFS module is loaded). Pools can also have the 'cachefile'
* property set that allows them to be stored in an alternate location until
* the control of external software.
* Pool configuration is stored as a packed nvlist on the filesystem. When
* pools are imported they are added to the /etc/zfs/zpool.cache file and
* removed from it when exported. For each cache file, we have a single nvlist
* which holds all the configuration information. Pools can also have the
* 'cachefile' property set which allows this config to be stored in an
* alternate location under the control of external software.
*
* For each cache file, we have a single nvlist which holds all the
* configuration information. When the module loads, we read this information
* from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is
* maintained independently in spa.c. Whenever the namespace is modified, or
* the configuration of a pool is changed, we call spa_write_cachefile(), which
* walks through all the active pools and writes the configuration to disk.
* The kernel independently maintains an AVL tree of imported pools. See the
* "SPA locking" comment in spa.c. Whenever a pool configuration is modified
* we call spa_write_cachefile() which walks through all the active pools and
* writes the updated configuration to the /etc/zfs/zpool.cache file.
*/
static uint64_t spa_config_generation = 1;
@ -69,94 +68,6 @@ static uint64_t spa_config_generation = 1;
* userland pools when doing testing.
*/
char *spa_config_path = (char *)ZPOOL_CACHE;
#ifdef _KERNEL
static int zfs_autoimport_disable = B_TRUE;
#endif
/*
* Called when the module is first loaded, this routine loads the configuration
* file into the SPA namespace. It does not actually open or load the pools; it
* only populates the namespace.
*/
void
spa_config_load(void)
{
void *buf = NULL;
nvlist_t *nvlist, *child;
nvpair_t *nvpair;
char *pathname;
zfs_file_t *fp;
zfs_file_attr_t zfa;
uint64_t fsize;
int err;
/*
* In kernel builds the whole load is skipped when
* zfs_autoimport_disable is non-zero.
*/
#ifdef _KERNEL
if (zfs_autoimport_disable)
return;
#endif
/*
* Open the configuration file.
*/
pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
(void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path);
err = zfs_file_open(pathname, O_RDONLY, 0, &fp);
/* On FreeBSD, retry with ZPOOL_CACHE_BOOT if the first open failed. */
#ifdef __FreeBSD__
if (err)
err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp);
#endif
kmem_free(pathname, MAXPATHLEN);
/*
* Nothing was opened, so return directly instead of going through
* the "out" label, which would close fp.
*/
if (err)
return;
if (zfs_file_getattr(fp, &zfa))
goto out;
/*
* fsize is only assigned here; the cleanup at "out" reads it solely
* when buf is non-NULL, and buf is allocated right after this point.
*/
fsize = zfa.zfa_size;
buf = kmem_alloc(fsize, KM_SLEEP);
/*
* Read the nvlist from the file.
*/
if (zfs_file_read(fp, buf, fsize, NULL) < 0)
goto out;
/*
* Unpack the nvlist.
*/
if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
goto out;
/*
* Iterate over all elements in the nvlist, creating a new spa_t for
* each one with the specified configuration.
*/
mutex_enter(&spa_namespace_lock);
nvpair = NULL;
while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
/* Entries that are not nested nvlists are ignored. */
if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
continue;
child = fnvpair_value_nvlist(nvpair);
/* Skip names for which spa_lookup() already finds an entry. */
if (spa_lookup(nvpair_name(nvpair)) != NULL)
continue;
(void) spa_add(nvpair_name(nvpair), child, NULL);
}
mutex_exit(&spa_namespace_lock);
nvlist_free(nvlist);
/* Common exit: free the read buffer if one was allocated, close the file. */
out:
if (buf != NULL)
kmem_free(buf, fsize);
zfs_file_close(fp);
}
static int
spa_config_remove(spa_config_dirent_t *dp)
@ -623,7 +534,6 @@ spa_config_update(spa_t *spa, int what)
spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
}
EXPORT_SYMBOL(spa_config_load);
EXPORT_SYMBOL(spa_all_configs);
EXPORT_SYMBOL(spa_config_set);
EXPORT_SYMBOL(spa_config_generate);
@ -634,8 +544,3 @@ EXPORT_SYMBOL(spa_config_update);
ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD,
"SPA config file (/etc/zfs/zpool.cache)");
#endif
#ifdef _KERNEL
ZFS_MODULE_PARAM(zfs, zfs_, autoimport_disable, INT, ZMOD_RW,
"Disable pool import at module load");
#endif

View file

@ -2547,13 +2547,6 @@ spa_name_compare(const void *a1, const void *a2)
return (TREE_ISIGN(s));
}
/*
* Thin wrapper around spa_config_load(). The single argument is ignored;
* it is explicitly voided below to mark it as intentionally unused.
*/
void
spa_boot_init(void *unused)
{
(void) unused;
spa_config_load();
}
void
spa_init(spa_mode_t mode)
{
@ -2607,7 +2600,6 @@ spa_init(spa_mode_t mode)
chksum_init();
zpool_prop_init();
zpool_feature_init();
spa_config_load();
vdev_prop_init();
l2arc_start();
scan_init();

View file

@ -819,34 +819,37 @@ zil_lwb_vdev_compare(const void *x1, const void *x2)
* we choose them here and later make the block allocation match.
*/
static lwb_t *
zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
uint64_t txg, lwb_state_t state)
zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, int min_sz, int sz,
boolean_t slog, uint64_t txg)
{
lwb_t *lwb;
lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
lwb->lwb_flags = 0;
lwb->lwb_zilog = zilog;
if (bp) {
lwb->lwb_blk = *bp;
lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2);
if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2)
lwb->lwb_flags |= LWB_FLAG_SLIM;
sz = BP_GET_LSIZE(bp);
lwb->lwb_min_sz = sz;
} else {
BP_ZERO(&lwb->lwb_blk);
lwb->lwb_slim = (spa_version(zilog->zl_spa) >=
SPA_VERSION_SLIM_ZIL);
if (spa_version(zilog->zl_spa) >= SPA_VERSION_SLIM_ZIL)
lwb->lwb_flags |= LWB_FLAG_SLIM;
lwb->lwb_min_sz = min_sz;
}
lwb->lwb_slog = slog;
if (slog)
lwb->lwb_flags |= LWB_FLAG_SLOG;
lwb->lwb_error = 0;
if (lwb->lwb_slim) {
lwb->lwb_nmax = sz;
lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
} else {
lwb->lwb_nmax = sz - sizeof (zil_chain_t);
lwb->lwb_nused = lwb->lwb_nfilled = 0;
}
/*
* Buffer allocation and capacity setup will be done in
* zil_lwb_write_open() when the LWB is opened for ITX assignment.
*/
lwb->lwb_nmax = lwb->lwb_nused = lwb->lwb_nfilled = 0;
lwb->lwb_sz = sz;
lwb->lwb_state = state;
lwb->lwb_buf = zio_buf_alloc(sz);
lwb->lwb_buf = NULL;
lwb->lwb_state = LWB_STATE_NEW;
lwb->lwb_child_zio = NULL;
lwb->lwb_write_zio = NULL;
lwb->lwb_root_zio = NULL;
@ -857,8 +860,6 @@ zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
mutex_enter(&zilog->zl_lock);
list_insert_tail(&zilog->zl_lwb_list, lwb);
if (state != LWB_STATE_NEW)
zilog->zl_last_lwb_opened = lwb;
mutex_exit(&zilog->zl_lock);
return (lwb);
@ -878,7 +879,7 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
VERIFY(list_is_empty(&lwb->lwb_itxs));
VERIFY(list_is_empty(&lwb->lwb_waiters));
ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
ASSERT(!MUTEX_HELD(&lwb->lwb_lock));
/*
* Clear the zilog's field to indicate this lwb is no longer
@ -1019,7 +1020,7 @@ zil_create(zilog_t *zilog)
}
error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
ZIL_MIN_BLKSZ, &slog);
ZIL_MIN_BLKSZ, ZIL_MIN_BLKSZ, &slog, B_TRUE);
if (error == 0)
zil_init_log_chain(zilog, &blk);
}
@ -1028,7 +1029,7 @@ zil_create(zilog_t *zilog)
* Allocate a log write block (lwb) for the first log block.
*/
if (error == 0)
lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW);
lwb = zil_alloc_lwb(zilog, &blk, 0, 0, slog, txg);
/*
* If we just allocated the first log block, commit our transaction
@ -1324,10 +1325,12 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
* zil_commit() is racing with spa_sync().
*/
static void
zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
zil_commit_waiter_done(zil_commit_waiter_t *zcw, int err)
{
mutex_enter(&zcw->zcw_lock);
ASSERT3B(zcw->zcw_done, ==, B_FALSE);
zcw->zcw_lwb = NULL;
zcw->zcw_error = err;
zcw->zcw_done = B_TRUE;
cv_broadcast(&zcw->zcw_cv);
mutex_exit(&zcw->zcw_lock);
@ -1389,7 +1392,7 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
if (zil_nocacheflush)
return;
mutex_enter(&lwb->lwb_vdev_lock);
mutex_enter(&lwb->lwb_lock);
for (i = 0; i < ndvas; i++) {
zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
if (avl_find(t, &zvsearch, &where) == NULL) {
@ -1398,7 +1401,7 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
avl_insert(t, zv, where);
}
}
mutex_exit(&lwb->lwb_vdev_lock);
mutex_exit(&lwb->lwb_lock);
}
static void
@ -1415,12 +1418,12 @@ zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
/*
* While 'lwb' is at a point in its lifetime where lwb_vdev_tree does
* not need the protection of lwb_vdev_lock (it will only be modified
* not need the protection of lwb_lock (it will only be modified
* while holding zilog->zl_lock) as its writes and those of its
* children have all completed. The younger 'nlwb' may be waiting on
* future writes to additional vdevs.
*/
mutex_enter(&nlwb->lwb_vdev_lock);
mutex_enter(&nlwb->lwb_lock);
/*
* Tear down the 'lwb' vdev tree, ensuring that entries which do not
* exist in 'nlwb' are moved to it, freeing any would-be duplicates.
@ -1434,7 +1437,7 @@ zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
kmem_free(zv, sizeof (*zv));
}
}
mutex_exit(&nlwb->lwb_vdev_lock);
mutex_exit(&nlwb->lwb_lock);
}
void
@ -1491,10 +1494,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
zil_itx_destroy(itx, 0);
while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
mutex_enter(&zcw->zcw_lock);
ASSERT3P(zcw->zcw_lwb, ==, lwb);
zcw->zcw_lwb = NULL;
/*
* We expect any ZIO errors from child ZIOs to have been
* propagated "up" to this specific LWB's root ZIO, in
@ -1509,14 +1508,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
* errors not being handled correctly here. See the
* comment above the call to "zio_flush" for details.
*/
zcw->zcw_zio_error = zio->io_error;
ASSERT3B(zcw->zcw_done, ==, B_FALSE);
zcw->zcw_done = B_TRUE;
cv_broadcast(&zcw->zcw_cv);
mutex_exit(&zcw->zcw_lock);
zil_commit_waiter_done(zcw, zio->io_error);
}
uint64_t txg = lwb->lwb_issued_txg;
@ -1588,7 +1580,7 @@ zil_lwb_write_done(zio_t *zio)
avl_tree_t *t = &lwb->lwb_vdev_tree;
void *cookie = NULL;
zil_vdev_node_t *zv;
lwb_t *nlwb;
lwb_t *nlwb = NULL;
ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
@ -1608,9 +1600,11 @@ zil_lwb_write_done(zio_t *zio)
* its write ZIO a parent this ZIO. In such case we can not defer
* our flushes or below may be a race between the done callbacks.
*/
nlwb = list_next(&zilog->zl_lwb_list, lwb);
if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED)
nlwb = NULL;
if (!(lwb->lwb_flags & LWB_FLAG_CRASHED)) {
nlwb = list_next(&zilog->zl_lwb_list, lwb);
if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED)
nlwb = NULL;
}
mutex_exit(&zilog->zl_lock);
if (avl_numnodes(t) == 0)
@ -1624,12 +1618,17 @@ zil_lwb_write_done(zio_t *zio)
* written out.
*
* Additionally, we don't perform any further error handling at
* this point (e.g. setting "zcw_zio_error" appropriately), as
* we expect that to occur in "zil_lwb_flush_vdevs_done" (thus,
* we expect any error seen here, to have been propagated to
* that function).
* this point (e.g. setting "zcw_error" appropriately), as we
* expect that to occur in "zil_lwb_flush_vdevs_done" (thus, we
* expect any error seen here, to have been propagated to that
* function).
*
* Note that we treat a "crashed" LWB as though it was in error,
* even if it did appear to succeed, because we've already
* signaled error and cleaned up waiters and committers in
* zil_crash(); we just want to clean up and get out of here.
*/
if (zio->io_error != 0) {
if (zio->io_error != 0 || (lwb->lwb_flags & LWB_FLAG_CRASHED)) {
while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
kmem_free(zv, sizeof (*zv));
return;
@ -1742,10 +1741,26 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
return;
}
mutex_enter(&lwb->lwb_lock);
mutex_enter(&zilog->zl_lock);
lwb->lwb_state = LWB_STATE_OPENED;
zilog->zl_last_lwb_opened = lwb;
mutex_exit(&zilog->zl_lock);
mutex_exit(&lwb->lwb_lock);
/*
* Allocate buffer and set up LWB capacities.
*/
ASSERT0P(lwb->lwb_buf);
ASSERT3U(lwb->lwb_sz, >, 0);
lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
if (lwb->lwb_flags & LWB_FLAG_SLIM) {
lwb->lwb_nmax = lwb->lwb_sz;
lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
} else {
lwb->lwb_nmax = lwb->lwb_sz - sizeof (zil_chain_t);
lwb->lwb_nused = lwb->lwb_nfilled = 0;
}
}
/*
@ -1762,6 +1777,8 @@ static uint_t
zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
{
uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t);
uint_t waste = zil_max_waste_space(zilog);
waste = MAX(waste, zilog->zl_cur_max);
if (size <= md) {
/*
@ -1772,9 +1789,10 @@ zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
} else if (size > 8 * md) {
/*
* Big bursts use maximum blocks. The first block size
* is hard to predict, but it does not really matter.
* is hard to predict, but we need at least enough space
* to make reasonable progress.
*/
*minsize = 0;
*minsize = waste;
return (md);
}
@ -1787,57 +1805,52 @@ zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
uint_t s = size;
uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t));
uint_t chunk = DIV_ROUND_UP(s, n);
uint_t waste = zil_max_waste_space(zilog);
waste = MAX(waste, zilog->zl_cur_max);
if (chunk <= md - waste) {
*minsize = MAX(s - (md - waste) * (n - 1), waste);
return (chunk);
} else {
*minsize = 0;
*minsize = waste;
return (md);
}
}
/*
* Try to predict next block size based on previous history. Make prediction
* sufficient for 7 of 8 previous bursts. Don't try to save if the saving is
* less then 50%, extra writes may cost more, but we don't want single spike
* to badly affect our predictions.
* sufficient for 7 of 8 previous bursts, but don't try to save if the saving
* is less than 50%. Extra writes may cost more, but we don't want a single
* spike to badly affect our predictions.
*/
static uint_t
zil_lwb_predict(zilog_t *zilog)
static void
zil_lwb_predict(zilog_t *zilog, uint64_t *min_predict, uint64_t *max_predict)
{
uint_t m, o;
uint_t m1 = 0, m2 = 0, o;
/* If we are in the middle of a burst, take it into account also. */
if (zilog->zl_cur_size > 0) {
o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m);
} else {
/* If we are in the middle of a burst, take it as another data point. */
if (zilog->zl_cur_size > 0)
o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m1);
else
o = UINT_MAX;
m = 0;
}
/* Find minimum optimal size. We don't need to go below that. */
for (int i = 0; i < ZIL_BURSTS; i++)
o = MIN(o, zilog->zl_prev_opt[i]);
/* Find two biggest minimal first block sizes above the optimal. */
uint_t m1 = MAX(m, o), m2 = o;
/* Find two largest minimal first block sizes. */
for (int i = 0; i < ZIL_BURSTS; i++) {
m = zilog->zl_prev_min[i];
if (m >= m1) {
uint_t cur = zilog->zl_prev_min[i];
if (cur >= m1) {
m2 = m1;
m1 = m;
} else if (m > m2) {
m2 = m;
m1 = cur;
} else if (cur > m2) {
m2 = cur;
}
}
/*
* If second minimum size gives 50% saving -- use it. It may cost us
* one additional write later, but the space saving is just too big.
*/
return ((m1 < m2 * 2) ? m1 : m2);
/* Minimum should guarantee progress in most cases. */
*min_predict = (m1 < m2 * 2) ? m1 : m2;
/* Maximum doesn't need to go below the minimum optimal size. */
for (int i = 0; i < ZIL_BURSTS; i++)
o = MIN(o, zilog->zl_prev_opt[i]);
m1 = MAX(m1, o);
m2 = MAX(m2, o);
*max_predict = (m1 < m2 * 2) ? m1 : m2;
}
/*
@ -1845,12 +1858,13 @@ zil_lwb_predict(zilog_t *zilog)
* Has to be called under zl_issuer_lock to chain more lwbs.
*/
static lwb_t *
zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb)
{
uint64_t blksz, plan, plan2;
uint64_t minbs, maxbs;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
membar_producer();
lwb->lwb_state = LWB_STATE_CLOSED;
/*
@ -1875,27 +1889,34 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
* Try to predict what can it be and plan for the worst case.
*/
uint_t m;
plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m);
maxbs = zil_lwb_plan(zilog, zilog->zl_cur_left, &m);
minbs = m;
if (zilog->zl_parallel) {
plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left +
zil_lwb_predict(zilog), &m);
if (plan < plan2)
plan = plan2;
uint64_t minp, maxp;
zil_lwb_predict(zilog, &minp, &maxp);
maxp = zil_lwb_plan(zilog, zilog->zl_cur_left + maxp,
&m);
if (maxbs < maxp)
maxbs = maxp;
}
} else {
/*
* The previous burst is done and we can only predict what
* will come next.
*/
plan = zil_lwb_predict(zilog);
zil_lwb_predict(zilog, &minbs, &maxbs);
}
blksz = plan + sizeof (zil_chain_t);
blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t);
blksz = MIN(blksz, zilog->zl_max_block_size);
DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz,
uint64_t, plan);
return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state));
minbs += sizeof (zil_chain_t);
maxbs += sizeof (zil_chain_t);
minbs = P2ROUNDUP_TYPED(minbs, ZIL_MIN_BLKSZ, uint64_t);
maxbs = P2ROUNDUP_TYPED(maxbs, ZIL_MIN_BLKSZ, uint64_t);
maxbs = MIN(maxbs, zilog->zl_max_block_size);
minbs = MIN(minbs, maxbs);
DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, minbs,
uint64_t, maxbs);
return (zil_alloc_lwb(zilog, NULL, minbs, maxbs, 0, 0));
}
/*
@ -1944,14 +1965,16 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
mutex_exit(&zilog->zl_lock);
next_lwb:
if (lwb->lwb_slim)
if (lwb->lwb_flags & LWB_FLAG_SLIM)
zilc = (zil_chain_t *)lwb->lwb_buf;
else
zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax);
int wsz = lwb->lwb_sz;
uint64_t alloc_size = BP_GET_LSIZE(&lwb->lwb_blk);
int wsz = alloc_size;
if (lwb->lwb_error == 0) {
abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz);
if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk)
if (!(lwb->lwb_flags & LWB_FLAG_SLOG) ||
zilog->zl_cur_size <= zil_slog_bulk)
prio = ZIO_PRIORITY_SYNC_WRITE;
else
prio = ZIO_PRIORITY_ASYNC_WRITE;
@ -1959,16 +1982,17 @@ next_lwb:
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0,
&lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done,
&lwb->lwb_blk, lwb_abd, alloc_size, zil_lwb_write_done,
lwb, prio, ZIO_FLAG_CANFAIL, &zb);
zil_lwb_add_block(lwb, &lwb->lwb_blk);
if (lwb->lwb_slim) {
if (lwb->lwb_flags & LWB_FLAG_SLIM) {
/* For Slim ZIL only write what is used. */
wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ,
int);
ASSERT3S(wsz, <=, lwb->lwb_sz);
zio_shrink(lwb->lwb_write_zio, wsz);
ASSERT3S(wsz, <=, alloc_size);
if (wsz < alloc_size)
zio_shrink(lwb->lwb_write_zio, wsz);
wsz = lwb->lwb_write_zio->io_size;
}
memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
@ -2004,13 +2028,53 @@ next_lwb:
BP_ZERO(bp);
error = lwb->lwb_error;
if (error == 0) {
error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz,
&slog);
/*
* Allocation flexibility depends on LWB state:
* if NEW: allow range allocation and larger sizes;
* if OPENED: use fixed predetermined allocation size;
* if CLOSED + Slim: allocate precisely for actual usage.
*/
boolean_t flexible = (nlwb->lwb_state == LWB_STATE_NEW);
if (flexible) {
/* We need to prevent opening till we update lwb_sz. */
mutex_enter(&nlwb->lwb_lock);
flexible = (nlwb->lwb_state == LWB_STATE_NEW);
if (!flexible)
mutex_exit(&nlwb->lwb_lock); /* We lost. */
}
boolean_t closed_slim = (nlwb->lwb_state == LWB_STATE_CLOSED &&
(lwb->lwb_flags & LWB_FLAG_SLIM));
uint64_t min_size, max_size;
if (closed_slim) {
/* This transition is racy, but only one way. */
membar_consumer();
min_size = max_size = P2ROUNDUP_TYPED(nlwb->lwb_nused,
ZIL_MIN_BLKSZ, uint64_t);
} else if (flexible) {
min_size = nlwb->lwb_min_sz;
max_size = nlwb->lwb_sz;
} else {
min_size = max_size = nlwb->lwb_sz;
}
error = zio_alloc_zil(spa, zilog->zl_os, txg, bp,
min_size, max_size, &slog, flexible);
if (error == 0) {
if (closed_slim)
ASSERT3U(BP_GET_LSIZE(bp), ==, max_size);
else if (flexible)
nlwb->lwb_sz = BP_GET_LSIZE(bp);
else
ASSERT3U(BP_GET_LSIZE(bp), ==, nlwb->lwb_sz);
}
if (flexible)
mutex_exit(&nlwb->lwb_lock);
}
if (error == 0) {
ASSERT3U(BP_GET_BIRTH(bp), ==, txg);
BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 :
ZIO_CHECKSUM_ZILOG);
BP_SET_CHECKSUM(bp, (nlwb->lwb_flags & LWB_FLAG_SLIM) ?
ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
bp->blk_cksum = lwb->lwb_blk.blk_cksum;
bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
}
@ -2039,14 +2103,15 @@ next_lwb:
if (nlwb) {
nlwb->lwb_blk = *bp;
nlwb->lwb_error = error;
nlwb->lwb_slog = slog;
if (slog)
nlwb->lwb_flags |= LWB_FLAG_SLOG;
nlwb->lwb_alloc_txg = txg;
if (nlwb->lwb_state != LWB_STATE_READY)
nlwb = NULL;
}
mutex_exit(&zilog->zl_lock);
if (lwb->lwb_slog) {
if (lwb->lwb_flags & LWB_FLAG_SLOG) {
ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
lwb->lwb_nused);
@ -2220,7 +2285,6 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT3P(lwb, !=, NULL);
ASSERT3P(lwb->lwb_buf, !=, NULL);
zil_lwb_write_open(zilog, lwb);
@ -2262,9 +2326,10 @@ cont:
(dlen % max_log_data == 0 ||
lwb_sp < reclen + dlen % max_log_data))) {
list_insert_tail(ilwbs, lwb);
lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED);
lwb = zil_lwb_write_close(zilog, lwb);
if (lwb == NULL)
return (NULL);
zil_lwb_write_open(zilog, lwb);
lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
}
@ -2554,7 +2619,7 @@ zil_itxg_clean(void *arg)
* called) we will hit this case.
*/
if (itx->itx_lr.lrc_txtype == TX_COMMIT)
zil_commit_waiter_skip(itx->itx_private);
zil_commit_waiter_done(itx->itx_private, 0);
zil_itx_destroy(itx, 0);
}
@ -2742,6 +2807,7 @@ zil_crash_clean(zilog_t *zilog, uint64_t synced_txg)
}
/* This LWB is from the past, so we can clean it up now. */
ASSERT(lwb->lwb_flags & LWB_FLAG_CRASHED);
list_remove(&zilog->zl_lwb_crash_list, lwb);
if (lwb->lwb_buf != NULL)
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
@ -2981,7 +3047,7 @@ zil_prune_commit_list(zilog_t *zilog)
* never any itx's for it to wait on), so it's
* safe to skip this waiter and mark it done.
*/
zil_commit_waiter_skip(itx->itx_private);
zil_commit_waiter_done(itx->itx_private, 0);
} else {
zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
}
@ -3212,15 +3278,21 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
* "next" lwb on-disk. When this happens, we must stall
* the ZIL write pipeline; see the comment within
* zil_commit_writer_stall() for more details.
*
* ESHUTDOWN has to be handled carefully here. If we get it,
* then the pool suspended and zil_crash() was called, so we
* need to stop trying and just get an error back to the
* callers.
*/
int err = 0;
while ((lwb = list_remove_head(ilwbs)) != NULL) {
err = zil_lwb_write_issue(zilog, lwb);
if (err != 0)
break;
if (err == 0)
err = zil_lwb_write_issue(zilog, lwb);
}
if (err == 0)
if (err != ESHUTDOWN)
err = zil_commit_writer_stall(zilog);
if (err == ESHUTDOWN)
err = SET_ERROR(EIO);
/*
* Additionally, we have to signal and mark the "nolwb"
@ -3230,7 +3302,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
*/
zil_commit_waiter_t *zcw;
while ((zcw = list_remove_head(&nolwb_waiters)) != NULL)
zil_commit_waiter_skip(zcw);
zil_commit_waiter_done(zcw, err);
/*
* And finally, we have to destroy the itx's that
@ -3238,7 +3310,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
* the itx's callback if one exists for the itx.
*/
while ((itx = list_remove_head(&nolwb_itxs)) != NULL)
zil_itx_destroy(itx, 0);
zil_itx_destroy(itx, err);
} else {
ASSERT(list_is_empty(&nolwb_waiters));
ASSERT3P(lwb, !=, NULL);
@ -3292,17 +3364,17 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
(!zilog->zl_parallel || zilog->zl_suspend > 0)) {
zil_burst_done(zilog);
list_insert_tail(ilwbs, lwb);
lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
lwb = zil_lwb_write_close(zilog, lwb);
if (lwb == NULL) {
int err = 0;
while ((lwb =
list_remove_head(ilwbs)) != NULL) {
err = zil_lwb_write_issue(zilog, lwb);
if (err != 0)
break;
if (err == 0)
err = zil_lwb_write_issue(
zilog, lwb);
}
if (err == 0)
zil_commit_writer_stall(zilog);
if (err != ESHUTDOWN)
(void) zil_commit_writer_stall(zilog);
}
}
}
@ -3470,7 +3542,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* hasn't been issued.
*/
zil_burst_done(zilog);
lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
lwb_t *nlwb = zil_lwb_write_close(zilog, lwb);
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
@ -3546,7 +3618,7 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
* commit itxs. When this occurs, the commit waiters linked
* off of these commit itxs will not be committed to an
* lwb. Additionally, these commit waiters will not be
* marked done until zil_commit_waiter_skip() is called via
* marked done until zil_commit_waiter_done() is called via
* zil_itxg_clean().
*
* Thus, it's possible for this commit waiter (i.e. the
@ -3624,7 +3696,7 @@ zil_alloc_commit_waiter(void)
list_link_init(&zcw->zcw_node);
zcw->zcw_lwb = NULL;
zcw->zcw_done = B_FALSE;
zcw->zcw_zio_error = 0;
zcw->zcw_error = 0;
return (zcw);
}
@ -3728,6 +3800,9 @@ zil_crash(zilog_t *zilog)
*/
for (lwb_t *lwb = list_head(&zilog->zl_lwb_crash_list); lwb != NULL;
lwb = list_next(&zilog->zl_lwb_crash_list, lwb)) {
ASSERT(!(lwb->lwb_flags & LWB_FLAG_CRASHED));
lwb->lwb_flags |= LWB_FLAG_CRASHED;
itx_t *itx;
while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
zil_itx_destroy(itx, EIO);
@ -3736,7 +3811,7 @@ zil_crash(zilog_t *zilog)
while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
mutex_enter(&zcw->zcw_lock);
zcw->zcw_lwb = NULL;
zcw->zcw_zio_error = EIO;
zcw->zcw_error = EIO;
zcw->zcw_done = B_TRUE;
cv_broadcast(&zcw->zcw_cv);
mutex_exit(&zcw->zcw_lock);
@ -4014,7 +4089,7 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid)
zil_commit_waiter(zilog, zcw);
int err = 0;
if (zcw->zcw_zio_error != 0) {
if (zcw->zcw_error != 0) {
/*
* If there was an error writing out the ZIL blocks that
* this thread is waiting on, then we fallback to
@ -4149,7 +4224,7 @@ zil_lwb_cons(void *vbuf, void *unused, int kmflag)
offsetof(zil_commit_waiter_t, zcw_node));
avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&lwb->lwb_lock, NULL, MUTEX_DEFAULT, NULL);
return (0);
}
@ -4158,7 +4233,7 @@ zil_lwb_dest(void *vbuf, void *unused)
{
(void) unused;
lwb_t *lwb = vbuf;
mutex_destroy(&lwb->lwb_vdev_lock);
mutex_destroy(&lwb->lwb_lock);
avl_destroy(&lwb->lwb_vdev_tree);
list_destroy(&lwb->lwb_waiters);
list_destroy(&lwb->lwb_itxs);
@ -4381,7 +4456,7 @@ zil_close(zilog_t *zilog)
if (lwb != NULL) {
ASSERT(list_is_empty(&zilog->zl_lwb_list));
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW);
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
ASSERT0P(lwb->lwb_buf);
zil_free_lwb(zilog, lwb);
}
mutex_exit(&zilog->zl_lock);
@ -4472,16 +4547,16 @@ zil_suspend(const char *osname, void **cookiep)
cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
mutex_exit(&zilog->zl_lock);
if (cookiep == NULL)
if (zilog->zl_restart_txg > 0) {
/* ZIL crashed while we were waiting. */
zil_resume(os);
error = SET_ERROR(EBUSY);
} else if (cookiep == NULL)
zil_resume(os);
else
*cookiep = os;
if (zilog->zl_restart_txg > 0)
/* ZIL crashed while we were waiting. */
return (SET_ERROR(EBUSY));
return (0);
return (error);
}
/*

View file

@ -4434,12 +4434,15 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
*/
int
zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
uint64_t size, boolean_t *slog)
uint64_t min_size, uint64_t max_size, boolean_t *slog,
boolean_t allow_larger)
{
int error;
zio_alloc_list_t io_alloc_list;
uint64_t alloc_size = 0;
ASSERT(txg > spa_syncing_txg(spa));
ASSERT3U(min_size, <=, max_size);
metaslab_trace_init(&io_alloc_list);
@ -4448,7 +4451,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
* Fill in the obvious ones before calling into metaslab_alloc().
*/
BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
BP_SET_PSIZE(new_bp, size);
BP_SET_PSIZE(new_bp, max_size);
BP_SET_LEVEL(new_bp, 0);
/*
@ -4463,43 +4466,51 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
ZIOSTAT_BUMP(ziostat_total_allocations);
/* Try log class (dedicated slog devices) first */
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
txg, NULL, flags, &io_alloc_list, allocator, NULL);
error = metaslab_alloc_range(spa, spa_log_class(spa), min_size,
max_size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
NULL, &alloc_size);
*slog = (error == 0);
/* Try special_embedded_log class (reserved on special vdevs) */
if (error != 0) {
error = metaslab_alloc(spa, spa_special_embedded_log_class(spa),
size, new_bp, 1, txg, NULL, flags, &io_alloc_list,
allocator, NULL);
error = metaslab_alloc_range(spa,
spa_special_embedded_log_class(spa), min_size, max_size,
new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
NULL, &alloc_size);
}
/* Try special class (general special vdev allocation) */
if (error != 0) {
error = metaslab_alloc(spa, spa_special_class(spa), size,
new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
NULL);
error = metaslab_alloc_range(spa, spa_special_class(spa),
min_size, max_size, new_bp, 1, txg, NULL, flags,
&io_alloc_list, allocator, NULL, &alloc_size);
}
/* Try embedded_log class (reserved on normal vdevs) */
if (error != 0) {
error = metaslab_alloc(spa, spa_embedded_log_class(spa), size,
new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
NULL);
error = metaslab_alloc_range(spa, spa_embedded_log_class(spa),
min_size, max_size, new_bp, 1, txg, NULL, flags,
&io_alloc_list, allocator, NULL, &alloc_size);
}
/* Finally fall back to normal class */
if (error != 0) {
ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks);
error = metaslab_alloc(spa, spa_normal_class(spa), size,
new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
NULL);
error = metaslab_alloc_range(spa, spa_normal_class(spa),
min_size, max_size, new_bp, 1, txg, NULL, flags,
&io_alloc_list, allocator, NULL, &alloc_size);
}
metaslab_trace_fini(&io_alloc_list);
if (error == 0) {
BP_SET_LSIZE(new_bp, size);
BP_SET_PSIZE(new_bp, size);
if (!allow_larger)
alloc_size = MIN(alloc_size, max_size);
else if (max_size <= SPA_OLD_MAXBLOCKSIZE)
alloc_size = MIN(alloc_size, SPA_OLD_MAXBLOCKSIZE);
alloc_size = P2ALIGN_TYPED(alloc_size, ZIL_MIN_BLKSZ, uint64_t);
BP_SET_LSIZE(new_bp, alloc_size);
BP_SET_PSIZE(new_bp, alloc_size);
BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
BP_SET_CHECKSUM(new_bp,
spa_version(spa) >= SPA_VERSION_SLIM_ZIL
@ -4527,8 +4538,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
}
} else {
zfs_dbgmsg("%s: zil block allocation failure: "
"size %llu, error %d", spa_name(spa), (u_longlong_t)size,
error);
"min_size %llu, max_size %llu, error %d", spa_name(spa),
(u_longlong_t)min_size, (u_longlong_t)max_size, error);
}
return (error);

View file

@ -38,25 +38,36 @@
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
* Copyright (c) 2024, Klara, Inc.
* Copyright (c) 2024, 2025, Klara, Inc.
*/
/*
* Note on locking of zvol state structures.
*
* These structures are used to maintain internal state used to emulate block
* devices on top of zvols. In particular, management of device minor number
* operations - create, remove, rename, and set_snapdev - involves access to
* these structures. The zvol_state_lock is primarily used to protect the
* zvol_state_list. The zv->zv_state_lock is used to protect the contents
* of the zvol_state_t structures, as well as to make sure that when the
* time comes to remove the structure from the list, it is not in use, and
* therefore, it can be taken off zvol_state_list and freed.
* zvol_state_t represents the connection between a single dataset
* (DMU_OST_ZVOL) and the device "minor" (some OS-specific representation of a
* "disk" or "device" or "volume", eg, a /dev/zdXX node, a GEOM object, etc).
*
* The zv_suspend_lock was introduced to allow for suspending I/O to a zvol,
* e.g. for the duration of receive and rollback operations. This lock can be
* held for significant periods of time. Given that it is undesirable to hold
* mutexes for long periods of time, the following lock ordering applies:
* The global zvol_state_lock is used to protect access to zvol_state_list and
* zvol_htable, which are the primary way to obtain a zvol_state_t from a name.
* It should not be used for anything that is not name-related, and you should
* avoid sleeping or waiting while it's held. See zvol_find_by_name(),
* zvol_insert(), zvol_remove().
*
* The zv_state_lock is used to protect the contents of the associated
* zvol_state_t. Most of the zvol_state_t is dedicated to control and
* configuration; almost none of it is needed for data operations (that is,
* read, write, flush) so this lock is rarely taken during general IO. It
* should be released quickly; you should avoid sleeping or waiting while it's
* held.
*
* zv_suspend_lock is used to suspend IO/data operations to a zvol. The read
* half should be held for the duration of an IO operation. The write half
* should be taken when something needs to wait for IO to complete and then
* block further IO, eg for the duration of receive and rollback operations.
* This lock can be held for long periods of time.
*
* Thus, the following lock ordering applies:
* - take zvol_state_lock if necessary, to protect zvol_state_list
* - take zv_suspend_lock if necessary, by the code path in question
* - take zv_state_lock to protect zvol_state_t
@ -67,9 +78,8 @@
* these operations are serialized per pool. Consequently, we can be certain
* that for a given zvol, there is only one operation at a time in progress.
* That is why one can be sure that first, zvol_state_t for a given zvol is
* allocated and placed on zvol_state_list, and then other minor operations
* for this zvol are going to proceed in the order of issue.
*
* allocated and placed on zvol_state_list, and then other minor operations for
* this zvol are going to proceed in the order of issue.
*/
#include <sys/dataset_kstats.h>
@ -1570,184 +1580,156 @@ zvol_create_minors_impl(zvol_task_t *task)
}
/*
* Remove minors for specified dataset including children and snapshots.
* Remove minors for specified dataset and, optionally, its children and
* snapshots.
*/
/*
* Remove the minor for a given zvol. This will do it all:
* - flag the zvol for removal, so new requests are rejected
* - wait until outstanding requests are completed
* - remove it from lists
* - free it
* It's also usable as a taskq task, and smells nice too.
*/
static void
zvol_remove_minor_task(void *arg)
{
zvol_state_t *zv = (zvol_state_t *)arg;
ASSERT(!RW_LOCK_HELD(&zvol_state_lock));
ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
mutex_enter(&zv->zv_state_lock);
while (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) {
zv->zv_flags |= ZVOL_REMOVING;
cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock);
}
mutex_exit(&zv->zv_state_lock);
rw_enter(&zvol_state_lock, RW_WRITER);
mutex_enter(&zv->zv_state_lock);
zvol_remove(zv);
zvol_os_clear_private(zv);
mutex_exit(&zv->zv_state_lock);
rw_exit(&zvol_state_lock);
zvol_os_free(zv);
}
static void
zvol_free_task(void *arg)
{
zvol_os_free(arg);
}
static void
zvol_remove_minors_impl(zvol_task_t *task)
{
zvol_state_t *zv, *zv_next;
const char *name = task ? task->zt_name1 : NULL;
int namelen = ((name) ? strlen(name) : 0);
taskqid_t t;
list_t delay_list, free_list;
boolean_t children = task ? !!task->zt_value : B_TRUE;
if (zvol_inhibit_dev)
return;
list_create(&delay_list, sizeof (zvol_state_t),
offsetof(zvol_state_t, zv_next));
list_create(&free_list, sizeof (zvol_state_t),
offsetof(zvol_state_t, zv_next));
/*
* We collect up zvols that we want to remove on a separate list, so
* that we don't have to hold zvol_state_lock for the whole time.
*
* We can't remove them from the global lists until we're completely
* done with them, because that would make them appear to ZFS-side ops
* that they don't exist, and the name might be reused, which can't be
* good.
*/
list_t remove_list;
list_create(&remove_list, sizeof (zvol_state_t),
offsetof(zvol_state_t, zv_remove_node));
rw_enter(&zvol_state_lock, RW_WRITER);
rw_enter(&zvol_state_lock, RW_READER);
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
mutex_enter(&zv->zv_state_lock);
if (zv->zv_flags & ZVOL_REMOVING) {
/* Another thread is handling shutdown, skip it. */
mutex_exit(&zv->zv_state_lock);
continue;
}
/*
* This zvol should be removed if:
* - no name was offered (ie removing all at shutdown); or
* - name matches exactly; or
* - we were asked to remove children, and
* - the start of the name matches, and
* - there is a '/' immediately after the matched name; or
* - there is a '@' immediately after the matched name
*/
if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
(strncmp(zv->zv_name, name, namelen) == 0 &&
(children && strncmp(zv->zv_name, name, namelen) == 0 &&
(zv->zv_name[namelen] == '/' ||
zv->zv_name[namelen] == '@'))) {
/*
* By holding zv_state_lock here, we guarantee that no
* one is currently using this zv
*/
/*
* If in use, try to throw everyone off and try again
* later.
* Matched, so mark it for removal. We want to take the
* write half of the suspend lock to make sure that
* the zvol is not suspended, and give any data ops
* a chance to finish.
*/
if (zv->zv_open_count > 0 ||
atomic_read(&zv->zv_suspend_ref)) {
zv->zv_flags |= ZVOL_REMOVING;
t = taskq_dispatch(
zv->zv_objset->os_spa->spa_zvol_taskq,
zvol_remove_minor_task, zv, TQ_SLEEP);
if (t == TASKQID_INVALID) {
/*
* Couldn't create the task, so we'll
* do it in place once the loop is
* finished.
*/
list_insert_head(&delay_list, zv);
}
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, RW_WRITER);
mutex_enter(&zv->zv_state_lock);
if (zv->zv_flags & ZVOL_REMOVING) {
/* Another thread has taken it, let them. */
mutex_exit(&zv->zv_state_lock);
rw_exit(&zv->zv_suspend_lock);
continue;
}
zvol_remove(zv);
/*
* Cleared while holding zvol_state_lock as a writer
* which will prevent zvol_open() from opening it.
* Mark it and unlock. New entries will see the flag
* and return ENXIO.
*/
zvol_os_clear_private(zv);
/* Drop zv_state_lock before zvol_free() */
zv->zv_flags |= ZVOL_REMOVING;
mutex_exit(&zv->zv_state_lock);
rw_exit(&zv->zv_suspend_lock);
/* Try parallel zv_free, if failed do it in place */
t = taskq_dispatch(system_taskq, zvol_free_task, zv,
TQ_SLEEP);
if (t == TASKQID_INVALID)
list_insert_head(&free_list, zv);
} else {
/* Put it on the list for the next stage. */
list_insert_head(&remove_list, zv);
} else
mutex_exit(&zv->zv_state_lock);
}
}
rw_exit(&zvol_state_lock);
/* Wait for zvols that we couldn't create a remove task for */
while ((zv = list_remove_head(&delay_list)) != NULL)
zvol_remove_minor_task(zv);
/* Didn't match any, nothing to do! */
if (list_is_empty(&remove_list)) {
if (task)
task->zt_error = SET_ERROR(ENOENT);
return;
}
/* Free any that we couldn't free in parallel earlier */
while ((zv = list_remove_head(&free_list)) != NULL)
/* Actually shut them all down. */
for (zv = list_head(&remove_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&remove_list, zv);
mutex_enter(&zv->zv_state_lock);
/*
* Still open or suspended, just wait. This can happen if, for
* example, we managed to acquire zv_state_lock in the moments
* where zvol_open() or zvol_release() are trading locks to
* call zvol_first_open() or zvol_last_close().
*/
while (zv->zv_open_count > 0 ||
atomic_read(&zv->zv_suspend_ref))
cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock);
/*
* No users, shut down the OS side. This may not remove the
* minor from view immediately, depending on the kernel
* specifics, but it will ensure that it is unusable and that
* this zvol_state_t can never again be reached from an OS-side
* operation.
*/
zvol_os_remove_minor(zv);
mutex_exit(&zv->zv_state_lock);
/* Remove it from the name lookup lists */
rw_enter(&zvol_state_lock, RW_WRITER);
zvol_remove(zv);
rw_exit(&zvol_state_lock);
}
/*
* Our own references on remove_list are the last ones; free them and
* we're done.
*/
while ((zv = list_remove_head(&remove_list)) != NULL)
zvol_os_free(zv);
list_destroy(&remove_list);
}
/* Remove minor for this specific volume only */
static int
zvol_remove_minor_impl(const char *name)
{
zvol_state_t *zv = NULL, *zv_next;
if (zvol_inhibit_dev)
return (0);
rw_enter(&zvol_state_lock, RW_WRITER);
zvol_task_t task;
memset(&task, 0, sizeof (zvol_task_t));
strlcpy(task.zt_name1, name, sizeof (task.zt_name1));
task.zt_value = B_FALSE;
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
zvol_remove_minors_impl(&task);
mutex_enter(&zv->zv_state_lock);
if (strcmp(zv->zv_name, name) == 0)
/* Found, leave the the loop with zv_lock held */
break;
mutex_exit(&zv->zv_state_lock);
}
if (zv == NULL) {
rw_exit(&zvol_state_lock);
return (SET_ERROR(ENOENT));
}
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) {
/*
* In use, so try to throw everyone off, then wait
* until finished.
*/
zv->zv_flags |= ZVOL_REMOVING;
mutex_exit(&zv->zv_state_lock);
rw_exit(&zvol_state_lock);
zvol_remove_minor_task(zv);
return (0);
}
zvol_remove(zv);
zvol_os_clear_private(zv);
mutex_exit(&zv->zv_state_lock);
rw_exit(&zvol_state_lock);
zvol_os_free(zv);
return (0);
return (task.zt_error);
}
/*
@ -2067,6 +2049,7 @@ zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
task->zt_op = ZVOL_ASYNC_REMOVE_MINORS;
strlcpy(task->zt_name1, name, sizeof (task->zt_name1));
task->zt_value = B_TRUE;
id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
if ((async == B_FALSE) && (id != TASKQID_INVALID))
taskq_wait_id(spa->spa_zvol_taskq, id);
@ -2188,14 +2171,6 @@ zvol_fini_impl(void)
zvol_remove_minors_impl(NULL);
/*
* The call to "zvol_remove_minors_impl" may dispatch entries to
* the system_taskq, but it doesn't wait for those entries to
* complete before it returns. Thus, we must wait for all of the
* removals to finish, before we can continue.
*/
taskq_wait_outstanding(system_taskq, 0);
kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
list_destroy(&zvol_state_list);
rw_destroy(&zvol_state_lock);

View file

@ -876,9 +876,9 @@ static void __init
zstd_mempool_init(void)
{
zstd_mempool_cctx =
kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
vmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
zstd_mempool_dctx =
kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
vmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
for (int i = 0; i < ZSTD_POOL_MAX; i++) {
mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
@ -924,8 +924,8 @@ zstd_mempool_deinit(void)
release_pool(&zstd_mempool_dctx[i]);
}
kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
vmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
vmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
zstd_mempool_dctx = NULL;
zstd_mempool_cctx = NULL;
}

View file

@ -190,6 +190,7 @@ my @path_license_tags = (
['BSD-2-Clause OR GPL-2.0-only', 'CDDL-1.0'],
'module/icp' => ['Apache-2.0', 'CDDL-1.0'],
'contrib/icp' => ['Apache-2.0', 'CDDL-1.0'],
# Python bindings are always Apache-2.0
'contrib/pyzfs' => ['Apache-2.0'],

View file

@ -1093,7 +1093,7 @@ tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse',
tags = ['functional', 'zvol', 'zvol_misc']
[tests/functional/zvol/zvol_stress]
tests = ['zvol_stress']
tests = ['zvol_stress', 'zvol_stress_destroy']
tags = ['functional', 'zvol', 'zvol_stress']
[tests/functional/zvol/zvol_swap]

View file

@ -529,6 +529,8 @@ static const char *aes_gcm_impl[][2] = {
{ "aesni", "pclmulqdq" },
{ "x86_64", "avx" },
{ "aesni", "avx" },
{ "x86_64", "avx2" },
{ "aesni", "avx2" },
};
/* signature of function to call after setting implementation params */

View file

@ -2244,6 +2244,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/zvol/zvol_stress/cleanup.ksh \
functional/zvol/zvol_stress/setup.ksh \
functional/zvol/zvol_stress/zvol_stress.ksh \
functional/zvol/zvol_stress/zvol_stress_destroy.ksh \
functional/zvol/zvol_swap/cleanup.ksh \
functional/zvol/zvol_swap/setup.ksh \
functional/zvol/zvol_swap/zvol_swap_001_pos.ksh \

View file

@ -0,0 +1,66 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2025, Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
verify_runnable "global"
typeset -i nzvols=1000
typeset -i parallel=$(( $(get_num_cpus) * 2 ))
# Exit handler (registered via log_onexit below): destroy every zvol that
# `zfs list -t vol` still reports, one at a time.
function cleanup {
for zvol in $(zfs list -Ho name -t vol) ; do
# log_must_busy is a test-suite helper; presumably it retries while the
# dataset is busy and fails the test otherwise -- confirm in libtest.shlib.
log_must_busy zfs destroy $zvol
done
}
log_onexit cleanup
log_assert "stress test concurrent zvol create/destroy"
# Repeatedly destroy all currently listed zvols for as long as the command
# given in $1 keeps succeeding.
#   $1 - command line run after each pass; the loop ends as soon as it
#        returns non-zero. At least one destroy pass always runs first.
function destroy_zvols_until {
typeset cond=$1
while true ; do
# Capture the volume listing into $zvols. NOTE(review): -d '' is presumably
# there so read consumes all lines rather than just the first -- confirm
# against the ksh in use.
IFS='' zfs list -Ho name -t vol | read -r -d '' zvols
if [[ -n $zvols ]] ; then
# One `zfs destroy` per name, up to $parallel at a time. The exit status
# is not checked here.
echo $zvols | xargs -n 1 -P $parallel zfs destroy
fi
# $cond is expanded unquoted and executed as a command.
if ! $cond ; then
break
fi
done
}
# Background creator: issue $nzvols `zfs create -s -V 1G` commands, up to
# $parallel at a time, and remember the subshell's pid.
( seq $nzvols | \
xargs -P $parallel -I % zfs create -s -V 1G $TESTPOOL/testvol% ) &
cpid=$!
sleep 1
# Keep destroying while the creator subshell is still alive (kill -0
# succeeds), then do one final pass ("false" stops after one iteration).
destroy_zvols_until "kill -0 $cpid"
destroy_zvols_until "false"
log_pass "stress test done"

View file

@ -40,7 +40,8 @@ CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS \
.if ${MACHINE_ARCH} == "amd64"
# amd64: define the HAVE_* feature macros for each SIMD extension listed.
# Every entry needs its own -D prefix to be passed as a macro definition.
CFLAGS+= -D__x86_64 -DHAVE_SSE2 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 \
-DHAVE_AVX -DHAVE_AVX2 -DHAVE_AVX512F -DHAVE_AVX512VL -DHAVE_AVX512BW
-DHAVE_AVX -DHAVE_AVX2 -DHAVE_AVX512F -DHAVE_AVX512VL -DHAVE_AVX512BW \
-DHAVE_VAES -DHAVE_VPCLMULQDQ
.endif
.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \

View file

@ -704,6 +704,11 @@
/* iops->setattr() takes struct user_namespace* */
/* #undef HAVE_USERNS_IOPS_SETATTR */
#ifdef __amd64__
/* Define if host toolchain supports VAES */
#define HAVE_VAES 1
#endif
/* fops->clone_file_range() is available */
/* #undef HAVE_VFS_CLONE_FILE_RANGE */
@ -743,6 +748,11 @@
/* __vmalloc page flags exists */
/* #undef HAVE_VMALLOC_PAGE_KERNEL */
#ifdef __amd64__
/* Define if host toolchain supports VPCLMULQDQ */
#define HAVE_VPCLMULQDQ 1
#endif
/* int (*writepage_t)() takes struct folio* */
/* #undef HAVE_WRITEPAGE_T_FOLIO */
@ -830,7 +840,7 @@
/* #undef ZFS_DEVICE_MINOR */
/* Define the project alias string. */
#define ZFS_META_ALIAS "zfs-2.3.99-539-FreeBSD_g1d0b94c4e"
#define ZFS_META_ALIAS "zfs-2.3.99-571-FreeBSD_ga9410ccbd"
/* Define the project author. */
#define ZFS_META_AUTHOR "OpenZFS"
@ -839,7 +849,7 @@
/* #undef ZFS_META_DATA */
/* Define the maximum compatible kernel version. */
#define ZFS_META_KVER_MAX "6.15"
#define ZFS_META_KVER_MAX "6.16"
/* Define the minimum compatible kernel version. */
#define ZFS_META_KVER_MIN "4.18"
@ -860,7 +870,7 @@
#define ZFS_META_NAME "zfs"
/* Define the project release. */
#define ZFS_META_RELEASE "539-FreeBSD_g1d0b94c4e"
#define ZFS_META_RELEASE "571-FreeBSD_ga9410ccbd"
/* Define the project version. */
#define ZFS_META_VERSION "2.3.99"

View file

@ -1 +1 @@
#define ZFS_META_GITREV "zfs-2.3.99-539-g1d0b94c4e"
#define ZFS_META_GITREV "zfs-2.3.99-571-ga9410ccbd"