git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/commitdiff
UBUNTU: SAUCE: (noup) Update spl to 0.6.5.3-0ubuntu1, zfs to 0.6.5.3-0ubuntu1
author Tim Gardner <tim.gardner@canonical.com>
Wed, 28 Oct 2015 14:59:32 +0000 (08:59 -0600)
committer Tim Gardner <tim.gardner@canonical.com>
Fri, 26 Feb 2016 02:44:02 +0000 (19:44 -0700)
Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
289 files changed:
spl/META
spl/Makefile.am
spl/Makefile.in
spl/cmd/Makefile.in
spl/config/config.guess
spl/config/config.sub
spl/config/rpm.am
spl/config/spl-build.m4
spl/configure
spl/dkms.conf
spl/include/Makefile.in
spl/include/fs/Makefile.in
spl/include/linux/Makefile.in
spl/include/linux/rwsem_compat.h
spl/include/rpc/Makefile.in
spl/include/sharefs/Makefile.in
spl/include/sys/Makefile.am
spl/include/sys/Makefile.in
spl/include/sys/condvar.h
spl/include/sys/debug.h
spl/include/sys/fm/Makefile.in
spl/include/sys/fs/Makefile.in
spl/include/sys/kmem_cache.h
spl/include/sys/rwlock.h
spl/include/sys/sysevent/Makefile.in
spl/include/sys/sysmacros.h
spl/include/sys/taskq.h
spl/include/sys/uio.h
spl/include/sys/user.h [new file with mode: 0644]
spl/include/sys/vmem.h
spl/include/sys/vmsystm.h
spl/include/sys/vnode.h
spl/include/util/Makefile.am
spl/include/util/Makefile.in
spl/include/vm/Makefile.in
spl/lib/Makefile.in
spl/man/Makefile.in
spl/man/man1/Makefile.in
spl/man/man5/Makefile.in
spl/man/man5/spl-module-parameters.5
spl/module/spl/Makefile.in
spl/module/spl/spl-condvar.c
spl/module/spl/spl-kmem-cache.c
spl/module/spl/spl-proc.c
spl/module/spl/spl-taskq.c
spl/module/spl/spl-tsd.c
spl/module/spl/spl-vnode.c
spl/module/splat/Makefile.in
spl/module/splat/splat-atomic.c
spl/module/splat/splat-internal.h
spl/module/splat/splat-kmem.c
spl/module/splat/splat-mutex.c
spl/module/splat/splat-rwlock.c
spl/module/splat/splat-taskq.c
spl/module/splat/splat-thread.c
spl/module/splat/splat-vnode.c
spl/rpm/Makefile.in
spl/rpm/generic/Makefile.in
spl/rpm/generic/spl-dkms.spec.in
spl/rpm/generic/spl-kmod.spec.in
spl/rpm/generic/spl.spec.in
spl/rpm/redhat/Makefile.in
spl/rpm/redhat/spl-dkms.spec.in
spl/rpm/redhat/spl.spec.in
spl/scripts/Makefile.am
spl/scripts/Makefile.in
spl/spl_config.h.in
zfs/COPYRIGHT
zfs/META
zfs/Makefile.am
zfs/Makefile.in
zfs/aclocal.m4
zfs/config/Rules.am
zfs/config/always-no-bool-compare.m4 [new file with mode: 0644]
zfs/config/config.guess
zfs/config/config.sub
zfs/config/deb.am
zfs/config/kernel-bdi-setup-and-register.m4
zfs/config/kernel-bio-failfast.m4
zfs/config/kernel-bio-rw-barrier.m4 [new file with mode: 0644]
zfs/config/kernel-bio-rw-discard.m4 [new file with mode: 0644]
zfs/config/kernel-bio-rw-syncio.m4 [deleted file]
zfs/config/kernel-blk-end-request.m4 [deleted file]
zfs/config/kernel-blk-fetch-request.m4 [deleted file]
zfs/config/kernel-blk-queue-discard.m4 [deleted file]
zfs/config/kernel-blk-queue-io-opt.m4 [deleted file]
zfs/config/kernel-blk-queue-nonrot.m4 [deleted file]
zfs/config/kernel-blk-queue-physical-block-size.m4 [deleted file]
zfs/config/kernel-blk-requeue-request.m4 [deleted file]
zfs/config/kernel-blk-rq-bytes.m4 [deleted file]
zfs/config/kernel-blk-rq-pos.m4 [deleted file]
zfs/config/kernel-blk-rq-sectors.m4 [deleted file]
zfs/config/kernel-current_bio_tail.m4 [new file with mode: 0644]
zfs/config/kernel-follow-down-one.m4 [new file with mode: 0644]
zfs/config/kernel-generic_io_acct.m4 [new file with mode: 0644]
zfs/config/kernel-kmap-atomic-args.m4 [new file with mode: 0644]
zfs/config/kernel-mk-request-fn.m4 [new file with mode: 0644]
zfs/config/kernel-rq-for-each_segment.m4 [deleted file]
zfs/config/kernel-rq-is_sync.m4 [deleted file]
zfs/config/kernel.m4
zfs/config/mount-helper.m4
zfs/config/rpm.am
zfs/config/zfs-build.m4
zfs/configure
zfs/configure.ac
zfs/contrib/Makefile.am
zfs/contrib/Makefile.in
zfs/contrib/bash_completion.d/Makefile.in
zfs/contrib/dracut/90zfs/Makefile.am [new file with mode: 0644]
zfs/contrib/dracut/90zfs/Makefile.in [new file with mode: 0644]
zfs/contrib/dracut/90zfs/export-zfs.sh.in [new file with mode: 0755]
zfs/contrib/dracut/90zfs/module-setup.sh.in [new file with mode: 0755]
zfs/contrib/dracut/90zfs/mount-zfs.sh.in [new file with mode: 0755]
zfs/contrib/dracut/90zfs/parse-zfs.sh.in [new file with mode: 0755]
zfs/contrib/dracut/90zfs/zfs-lib.sh.in [new file with mode: 0755]
zfs/contrib/dracut/Makefile.am [new file with mode: 0644]
zfs/contrib/dracut/Makefile.in [new file with mode: 0644]
zfs/contrib/dracut/README.dracut.markdown [new file with mode: 0644]
zfs/contrib/initramfs/Makefile.am [new file with mode: 0644]
zfs/contrib/initramfs/Makefile.in [new file with mode: 0644]
zfs/contrib/initramfs/README.initramfs.markdown [new file with mode: 0644]
zfs/contrib/initramfs/conf-hooks.d/zfs [new file with mode: 0644]
zfs/contrib/initramfs/hooks/zfs [new file with mode: 0755]
zfs/contrib/initramfs/scripts/zfs [new file with mode: 0644]
zfs/dkms.conf
zfs/dracut/90zfs/Makefile.in [deleted file]
zfs/dracut/90zfs/export-zfs.sh.in [deleted file]
zfs/dracut/90zfs/module-setup.sh.in [deleted file]
zfs/dracut/90zfs/mount-zfs.sh.in [deleted file]
zfs/dracut/90zfs/parse-zfs.sh.in [deleted file]
zfs/dracut/Makefile.in [deleted file]
zfs/etc/init.d/zfs-functions.in [new file with mode: 0644]
zfs/etc/init.d/zfs-import.in [new file with mode: 0644]
zfs/etc/init.d/zfs-mount.in [new file with mode: 0644]
zfs/etc/init.d/zfs-share.in [new file with mode: 0644]
zfs/etc/init.d/zfs-zed.in [new file with mode: 0644]
zfs/etc/init.d/zfs.fedora.in [deleted file]
zfs/etc/init.d/zfs.gentoo.in [deleted file]
zfs/etc/init.d/zfs.in [new file with mode: 0644]
zfs/etc/init.d/zfs.lsb.in [deleted file]
zfs/etc/init.d/zfs.lunar.in [deleted file]
zfs/etc/init.d/zfs.redhat.in [deleted file]
zfs/include/Makefile.in
zfs/include/libzfs.h
zfs/include/libzfs_core.h
zfs/include/linux/Makefile.am
zfs/include/linux/Makefile.in
zfs/include/linux/blkdev_compat.h
zfs/include/linux/kmap_compat.h [new file with mode: 0644]
zfs/include/linux/vfs_compat.h
zfs/include/sys/Makefile.am
zfs/include/sys/Makefile.in
zfs/include/sys/arc.h
zfs/include/sys/arc_impl.h
zfs/include/sys/avl.h
zfs/include/sys/bpobj.h
zfs/include/sys/dbuf.h
zfs/include/sys/dmu.h
zfs/include/sys/dmu_objset.h
zfs/include/sys/dmu_send.h
zfs/include/sys/dnode.h
zfs/include/sys/dsl_dataset.h
zfs/include/sys/dsl_dir.h
zfs/include/sys/dsl_pool.h
zfs/include/sys/dsl_synctask.h
zfs/include/sys/fm/Makefile.in
zfs/include/sys/fm/fs/Makefile.in
zfs/include/sys/fm/fs/zfs.h
zfs/include/sys/fs/Makefile.in
zfs/include/sys/fs/zfs.h
zfs/include/sys/mntent.h [new file with mode: 0644]
zfs/include/sys/multilist.h [new file with mode: 0644]
zfs/include/sys/rrwlock.h
zfs/include/sys/sa.h
zfs/include/sys/sa_impl.h
zfs/include/sys/spa.h
zfs/include/sys/spa_impl.h
zfs/include/sys/trace_acl.h
zfs/include/sys/trace_arc.h
zfs/include/sys/trace_dbgmsg.h
zfs/include/sys/trace_dbuf.h
zfs/include/sys/trace_dmu.h
zfs/include/sys/trace_dnode.h
zfs/include/sys/trace_multilist.h [new file with mode: 0644]
zfs/include/sys/trace_txg.h
zfs/include/sys/trace_zil.h
zfs/include/sys/trace_zrlock.h
zfs/include/sys/uberblock.h
zfs/include/sys/vdev.h
zfs/include/sys/vdev_file.h
zfs/include/sys/vdev_impl.h
zfs/include/sys/zap.h
zfs/include/sys/zap_impl.h
zfs/include/sys/zap_leaf.h
zfs/include/sys/zfs_context.h
zfs/include/sys/zfs_ctldir.h
zfs/include/sys/zfs_debug.h
zfs/include/sys/zfs_ioctl.h
zfs/include/sys/zfs_sa.h
zfs/include/sys/zfs_vfsops.h
zfs/include/sys/zfs_znode.h
zfs/include/sys/zil.h
zfs/include/sys/zil_impl.h
zfs/include/sys/zio.h
zfs/include/sys/zpl.h
zfs/include/sys/zvol.h
zfs/include/zfeature_common.h
zfs/module/Makefile.in
zfs/module/avl/Makefile.in
zfs/module/avl/avl.c
zfs/module/nvpair/Makefile.in
zfs/module/unicode/Makefile.in
zfs/module/zcommon/Makefile.in
zfs/module/zcommon/zfs_prop.c
zfs/module/zcommon/zfs_uio.c
zfs/module/zcommon/zpool_prop.c
zfs/module/zfs/Makefile.in
zfs/module/zfs/arc.c
zfs/module/zfs/bpobj.c
zfs/module/zfs/bptree.c
zfs/module/zfs/dbuf.c
zfs/module/zfs/dbuf_stats.c
zfs/module/zfs/ddt.c
zfs/module/zfs/dmu.c
zfs/module/zfs/dmu_diff.c
zfs/module/zfs/dmu_objset.c
zfs/module/zfs/dmu_send.c
zfs/module/zfs/dmu_traverse.c
zfs/module/zfs/dmu_tx.c
zfs/module/zfs/dnode.c
zfs/module/zfs/dnode_sync.c
zfs/module/zfs/dsl_bookmark.c
zfs/module/zfs/dsl_dataset.c
zfs/module/zfs/dsl_deadlist.c
zfs/module/zfs/dsl_deleg.c
zfs/module/zfs/dsl_destroy.c
zfs/module/zfs/dsl_dir.c
zfs/module/zfs/dsl_pool.c
zfs/module/zfs/dsl_prop.c
zfs/module/zfs/dsl_scan.c
zfs/module/zfs/dsl_synctask.c
zfs/module/zfs/dsl_userhold.c
zfs/module/zfs/fm.c
zfs/module/zfs/metaslab.c
zfs/module/zfs/multilist.c [new file with mode: 0644]
zfs/module/zfs/range_tree.c
zfs/module/zfs/rrwlock.c
zfs/module/zfs/sa.c
zfs/module/zfs/spa.c
zfs/module/zfs/spa_config.c
zfs/module/zfs/spa_history.c
zfs/module/zfs/spa_misc.c
zfs/module/zfs/spa_stats.c
zfs/module/zfs/trace.c
zfs/module/zfs/txg.c
zfs/module/zfs/uberblock.c
zfs/module/zfs/vdev.c
zfs/module/zfs/vdev_disk.c
zfs/module/zfs/vdev_file.c
zfs/module/zfs/vdev_mirror.c
zfs/module/zfs/vdev_missing.c
zfs/module/zfs/vdev_queue.c
zfs/module/zfs/vdev_raidz.c
zfs/module/zfs/zap.c
zfs/module/zfs/zap_leaf.c
zfs/module/zfs/zap_micro.c
zfs/module/zfs/zfeature_common.c
zfs/module/zfs/zfs_ctldir.c
zfs/module/zfs/zfs_debug.c
zfs/module/zfs/zfs_ioctl.c
zfs/module/zfs/zfs_log.c
zfs/module/zfs/zfs_onexit.c
zfs/module/zfs/zfs_sa.c
zfs/module/zfs/zfs_vfsops.c
zfs/module/zfs/zfs_vnops.c
zfs/module/zfs/zfs_znode.c
zfs/module/zfs/zil.c
zfs/module/zfs/zio.c
zfs/module/zfs/zio_compress.c
zfs/module/zfs/zio_inject.c
zfs/module/zfs/zpl_ctldir.c
zfs/module/zfs/zpl_export.c
zfs/module/zfs/zpl_file.c
zfs/module/zfs/zpl_inode.c
zfs/module/zfs/zpl_super.c
zfs/module/zfs/zrlock.c
zfs/module/zfs/zvol.c
zfs/module/zpios/Makefile.in
zfs/zfs_config.h.in

index b21badb8f636ad913551e3f418d3b2eee0a01ce3..9d2f946368cc429155de134ea22e425d9c0fa408 100644 (file)
--- a/spl/META
+++ b/spl/META
@@ -1,7 +1,7 @@
 Meta:         1
 Name:         spl
 Branch:       1.0
-Version:      0.6.4.2
+Version:      0.6.5.3
 Release:      0ubuntu1
 Release-Tags: relext
 License:      GPL
index 89af931ae4fa7f732ef5aa6720473ab87d3b3c74..4977448fda95ca7acf5baf63a031f5b726fb7c35 100644 (file)
@@ -1,9 +1,9 @@
 
 ACLOCAL_AMFLAGS = -I config
 
-include $(top_srcdir)/config/rpm.am
-include $(top_srcdir)/config/deb.am
-include $(top_srcdir)/config/tgz.am
+include config/rpm.am
+include config/deb.am
+include config/tgz.am
 
 SUBDIRS = include rpm
 if CONFIG_USER
@@ -40,11 +40,11 @@ dist-hook:
                $(distdir)/META
 
 ctags:
-       $(RM) $(top_srcdir)/tags
+       $(RM) tags
        find $(top_srcdir) -name .git -prune -o -name '*.[hc]' | xargs ctags
 
 etags:
-       $(RM) $(top_srcdir)/TAGS
+       $(RM) TAGS
        find $(top_srcdir) -name .pc -prune -o -name '*.[hc]' | xargs etags -a
 
 tags: ctags etags
index 8e613bd1e5aef69de9958029cbec86940cba768a..22c2e46f55e77b3e3b078be16757ae924c385361 100644 (file)
@@ -235,18 +235,17 @@ ETAGS = etags
 CTAGS = ctags
 CSCOPE = cscope
 DIST_SUBDIRS = include rpm lib cmd man scripts module
-am__DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/spl.release.in \
-       $(srcdir)/spl_config.h.in $(top_srcdir)/config/compile \
-       $(top_srcdir)/config/config.guess \
-       $(top_srcdir)/config/config.sub $(top_srcdir)/config/deb.am \
+am__DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/config/deb.am \
+       $(srcdir)/config/rpm.am $(srcdir)/config/tgz.am \
+       $(srcdir)/spl.release.in $(srcdir)/spl_config.h.in \
+       $(top_srcdir)/config/compile $(top_srcdir)/config/config.guess \
+       $(top_srcdir)/config/config.sub \
        $(top_srcdir)/config/install-sh $(top_srcdir)/config/ltmain.sh \
-       $(top_srcdir)/config/missing $(top_srcdir)/config/rpm.am \
-       $(top_srcdir)/config/tgz.am $(top_srcdir)/module/Makefile.in \
+       $(top_srcdir)/config/missing $(top_srcdir)/module/Makefile.in \
        $(top_srcdir)/module/spl/Makefile.in \
        $(top_srcdir)/module/splat/Makefile.in AUTHORS COPYING \
        config/compile config/config.guess config/config.sub \
-       config/depcomp config/install-sh config/ltmain.sh \
-       config/missing
+       config/install-sh config/ltmain.sh config/missing
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 distdir = $(PACKAGE)-$(VERSION)
 top_distdir = $(distdir)
@@ -444,6 +443,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
@@ -471,7 +471,7 @@ all: spl_config.h
 .SUFFIXES:
 am--refresh: Makefile
        @:
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/config/rpm.am $(top_srcdir)/config/deb.am $(top_srcdir)/config/tgz.am $(am__configure_deps)
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(srcdir)/config/rpm.am $(srcdir)/config/deb.am $(srcdir)/config/tgz.am $(am__configure_deps)
        @for dep in $?; do \
          case '$(am__configure_deps)' in \
            *$$dep*) \
@@ -493,7 +493,7 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
            echo ' cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)'; \
            cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe);; \
        esac;
-$(top_srcdir)/config/rpm.am $(top_srcdir)/config/deb.am $(top_srcdir)/config/tgz.am $(am__empty):
+$(srcdir)/config/rpm.am $(srcdir)/config/deb.am $(srcdir)/config/tgz.am $(am__empty):
 
 $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
        $(SHELL) ./config.status --recheck
@@ -1050,7 +1050,7 @@ rpm-local:
        mkdir -p $(rpmbuild)/SPECS && \
        cp ${RPM_SPEC_DIR}/$(rpmspec) $(rpmbuild)/SPECS && \
        mkdir -p $(rpmbuild)/SOURCES && \
-       cp scripts/kmodtool $(rpmbuild)/SOURCES && \
+       cp $(top_srcdir)/scripts/kmodtool $(rpmbuild)/SOURCES && \
        cp $(distdir).tar.gz $(rpmbuild)/SOURCES)
 
 srpm-common: dist
@@ -1163,11 +1163,11 @@ dist-hook:
                $(distdir)/META
 
 ctags:
-       $(RM) $(top_srcdir)/tags
+       $(RM) tags
        find $(top_srcdir) -name .git -prune -o -name '*.[hc]' | xargs ctags
 
 etags:
-       $(RM) $(top_srcdir)/TAGS
+       $(RM) TAGS
        find $(top_srcdir) -name .pc -prune -o -name '*.[hc]' | xargs etags -a
 
 tags: ctags etags
index 5e41ea2aad5577885d090ad08cd1e5d517a96b18..3d48e14cad53f8f18e4354531038571348901f1f 100644 (file)
@@ -343,6 +343,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
index 1f5c50c0d1529d50b94dc3533ca72a47f0fa5849..16592509d49e52301080275c3647a66349c33795 100755 (executable)
@@ -1,8 +1,8 @@
 #! /bin/sh
 # Attempt to guess a canonical system name.
-#   Copyright 1992-2014 Free Software Foundation, Inc.
+#   Copyright 1992-2015 Free Software Foundation, Inc.
 
-timestamp='2014-03-23'
+timestamp='2015-08-20'
 
 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
@@ -24,12 +24,12 @@ timestamp='2014-03-23'
 # program.  This Exception is an additional permission under section 7
 # of the GNU General Public License, version 3 ("GPLv3").
 #
-# Originally written by Per Bothner.
+# Originally written by Per Bothner; maintained since 2000 by Ben Elliston.
 #
 # You can get the latest version of this script from:
 # http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
 #
-# Please send patches with a ChangeLog entry to config-patches@gnu.org.
+# Please send patches to <config-patches@gnu.org>.
 
 
 me=`echo "$0" | sed -e 's,.*/,,'`
@@ -50,7 +50,7 @@ version="\
 GNU config.guess ($timestamp)
 
 Originally written by Per Bothner.
-Copyright 1992-2014 Free Software Foundation, Inc.
+Copyright 1992-2015 Free Software Foundation, Inc.
 
 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -168,20 +168,27 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
        # Note: NetBSD doesn't particularly care about the vendor
        # portion of the name.  We always set it to "unknown".
        sysctl="sysctl -n hw.machine_arch"
-       UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
-           /usr/sbin/$sysctl 2>/dev/null || echo unknown)`
+       UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \
+           /sbin/$sysctl 2>/dev/null || \
+           /usr/sbin/$sysctl 2>/dev/null || \
+           echo unknown)`
        case "${UNAME_MACHINE_ARCH}" in
            armeb) machine=armeb-unknown ;;
            arm*) machine=arm-unknown ;;
            sh3el) machine=shl-unknown ;;
            sh3eb) machine=sh-unknown ;;
            sh5el) machine=sh5le-unknown ;;
+           earmv*)
+               arch=`echo ${UNAME_MACHINE_ARCH} | sed -e 's,^e\(armv[0-9]\).*$,\1,'`
+               endian=`echo ${UNAME_MACHINE_ARCH} | sed -ne 's,^.*\(eb\)$,\1,p'`
+               machine=${arch}${endian}-unknown
+               ;;
            *) machine=${UNAME_MACHINE_ARCH}-unknown ;;
        esac
        # The Operating System including object format, if it has switched
        # to ELF recently, or will in the future.
        case "${UNAME_MACHINE_ARCH}" in
-           arm*|i386|m68k|ns32k|sh3*|sparc|vax)
+           arm*|earm*|i386|m68k|ns32k|sh3*|sparc|vax)
                eval $set_cc_for_build
                if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
                        | grep -q __ELF__
@@ -197,6 +204,13 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
                os=netbsd
                ;;
        esac
+       # Determine ABI tags.
+       case "${UNAME_MACHINE_ARCH}" in
+           earm*)
+               expr='s/^earmv[0-9]/-eabi/;s/eb$//'
+               abi=`echo ${UNAME_MACHINE_ARCH} | sed -e "$expr"`
+               ;;
+       esac
        # The OS release
        # Debian GNU/NetBSD machines have a different userland, and
        # thus, need a distinct triplet. However, they do not need
@@ -207,13 +221,13 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
                release='-gnu'
                ;;
            *)
-               release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+               release=`echo ${UNAME_RELEASE} | sed -e 's/[-_].*//' | cut -d. -f1,2`
                ;;
        esac
        # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
        # contains redundant information, the shorter form:
        # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
-       echo "${machine}-${os}${release}"
+       echo "${machine}-${os}${release}${abi}"
        exit ;;
     *:Bitrig:*:*)
        UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'`
@@ -235,6 +249,9 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
     *:MirBSD:*:*)
        echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
        exit ;;
+    *:Sortix:*:*)
+       echo ${UNAME_MACHINE}-unknown-sortix
+       exit ;;
     alpha:OSF1:*:*)
        case $UNAME_RELEASE in
        *4.0)
@@ -579,8 +596,9 @@ EOF
        else
                IBM_ARCH=powerpc
        fi
-       if [ -x /usr/bin/oslevel ] ; then
-               IBM_REV=`/usr/bin/oslevel`
+       if [ -x /usr/bin/lslpp ] ; then
+               IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc |
+                          awk -F: '{ print $3 }' | sed s/[0-9]*$/0/`
        else
                IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
        fi
@@ -932,6 +950,9 @@ EOF
     crisv32:Linux:*:*)
        echo ${UNAME_MACHINE}-axis-linux-${LIBC}
        exit ;;
+    e2k:Linux:*:*)
+       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+       exit ;;
     frv:Linux:*:*)
        echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
        exit ;;
@@ -1020,7 +1041,7 @@ EOF
        echo ${UNAME_MACHINE}-dec-linux-${LIBC}
        exit ;;
     x86_64:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+       echo ${UNAME_MACHINE}-pc-linux-${LIBC}
        exit ;;
     xtensa*:Linux:*:*)
        echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
index bba4efb80574987fcf6d85c71e68e55bfeb48ba2..1acc966a33bf509f7c50f87d7678fbb813089ca6 100755 (executable)
@@ -1,8 +1,8 @@
 #! /bin/sh
 # Configuration validation subroutine script.
-#   Copyright 1992-2014 Free Software Foundation, Inc.
+#   Copyright 1992-2015 Free Software Foundation, Inc.
 
-timestamp='2014-09-11'
+timestamp='2015-08-20'
 
 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
@@ -25,7 +25,7 @@ timestamp='2014-09-11'
 # of the GNU General Public License, version 3 ("GPLv3").
 
 
-# Please send patches with a ChangeLog entry to config-patches@gnu.org.
+# Please send patches to <config-patches@gnu.org>.
 #
 # Configuration subroutine to validate and canonicalize a configuration type.
 # Supply the specified configuration type as an argument.
@@ -68,7 +68,7 @@ Report bugs and patches to <config-patches@gnu.org>."
 version="\
 GNU config.sub ($timestamp)
 
-Copyright 1992-2014 Free Software Foundation, Inc.
+Copyright 1992-2015 Free Software Foundation, Inc.
 
 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -117,7 +117,7 @@ maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
 case $maybe_os in
   nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \
   linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \
-  knetbsd*-gnu* | netbsd*-gnu* | \
+  knetbsd*-gnu* | netbsd*-gnu* | netbsd*-eabi* | \
   kopensolaris*-gnu* | \
   storm-chaos* | os2-emx* | rtmk-nova*)
     os=-$maybe_os
@@ -255,12 +255,13 @@ case $basic_machine in
        | arc | arceb \
        | arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \
        | avr | avr32 \
+       | ba \
        | be32 | be64 \
        | bfin \
        | c4x | c8051 | clipper \
        | d10v | d30v | dlx | dsp16xx \
-       | epiphany \
-       | fido | fr30 | frv \
+       | e2k | epiphany \
+       | fido | fr30 | frv | ft32 \
        | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
        | hexagon \
        | i370 | i860 | i960 | ia64 \
@@ -305,7 +306,7 @@ case $basic_machine in
        | riscv32 | riscv64 \
        | rl78 | rx \
        | score \
-       | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
+       | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[234]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
        | sh64 | sh64le \
        | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \
        | sparcv8 | sparcv9 | sparcv9b | sparcv9v \
@@ -313,6 +314,7 @@ case $basic_machine in
        | tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \
        | ubicom32 \
        | v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \
+       | visium \
        | we32k \
        | x86 | xc16x | xstormy16 | xtensa \
        | z8k | z80)
@@ -327,6 +329,9 @@ case $basic_machine in
        c6x)
                basic_machine=tic6x-unknown
                ;;
+       leon|leon[3-9])
+               basic_machine=sparc-$basic_machine
+               ;;
        m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | nvptx | picochip)
                basic_machine=$basic_machine-unknown
                os=-none
@@ -372,12 +377,13 @@ case $basic_machine in
        | alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \
        | arm-*  | armbe-* | armle-* | armeb-* | armv*-* \
        | avr-* | avr32-* \
+       | ba-* \
        | be32-* | be64-* \
        | bfin-* | bs2000-* \
        | c[123]* | c30-* | [cjt]90-* | c4x-* \
        | c8051-* | clipper-* | craynv-* | cydra-* \
        | d10v-* | d30v-* | dlx-* \
-       | elxsi-* \
+       | e2k-* | elxsi-* \
        | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \
        | h8300-* | h8500-* \
        | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
@@ -424,12 +430,13 @@ case $basic_machine in
        | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
        | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \
        | pyramid-* \
+       | riscv32-* | riscv64-* \
        | rl78-* | romp-* | rs6000-* | rx-* \
        | sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
        | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
        | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
        | sparclite-* \
-       | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx?-* \
+       | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx*-* \
        | tahoe-* \
        | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
        | tile*-* \
@@ -437,6 +444,7 @@ case $basic_machine in
        | ubicom32-* \
        | v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \
        | vax-* \
+       | visium-* \
        | we32k-* \
        | x86-* | x86_64-* | xc16x-* | xps100-* \
        | xstormy16-* | xtensa*-* \
@@ -513,6 +521,9 @@ case $basic_machine in
                basic_machine=i386-pc
                os=-aros
                ;;
+        asmjs)
+               basic_machine=asmjs-unknown
+               ;;
        aux)
                basic_machine=m68k-apple
                os=-aux
@@ -774,6 +785,9 @@ case $basic_machine in
                basic_machine=m68k-isi
                os=-sysv
                ;;
+       leon-*|leon[3-9]-*)
+               basic_machine=sparc-`echo $basic_machine | sed 's/-.*//'`
+               ;;
        m68knommu)
                basic_machine=m68k-unknown
                os=-linux
@@ -1365,7 +1379,7 @@ case $os in
              | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \
              | -sym* | -kopensolaris* | -plan9* \
              | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
-             | -aos* | -aros* \
+             | -aos* | -aros* | -cloudabi* | -sortix* \
              | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
              | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
              | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
index 311c754d429d4dbc00eb35e5fc312dc42abf7db4..51a20b3e6a10d736b3e6422f3faafbb1c93560db 100644 (file)
@@ -51,7 +51,7 @@ rpm-local:
        mkdir -p $(rpmbuild)/SPECS && \
        cp ${RPM_SPEC_DIR}/$(rpmspec) $(rpmbuild)/SPECS && \
        mkdir -p $(rpmbuild)/SOURCES && \
-       cp scripts/kmodtool $(rpmbuild)/SOURCES && \
+       cp $(top_srcdir)/scripts/kmodtool $(rpmbuild)/SOURCES && \
        cp $(distdir).tar.gz $(rpmbuild)/SOURCES)
 
 srpm-common: dist
index 3bfc1e2325e25242d28d0ac43c6b96db8b2da513..daa9eb714897d090efb6492ed2311b243d151b19 100644 (file)
@@ -33,7 +33,6 @@ AC_DEFUN([SPL_AC_CONFIG_KERNEL], [
        SPL_AC_FS_STRUCT_SPINLOCK
        SPL_AC_KUIDGID_T
        SPL_AC_PUT_TASK_STRUCT
-       SPL_AC_EXPORTED_RWSEM_IS_LOCKED
        SPL_AC_KERNEL_FALLOCATE
        SPL_AC_CONFIG_ZLIB_INFLATE
        SPL_AC_CONFIG_ZLIB_DEFLATE
@@ -453,15 +452,14 @@ dnl #
 dnl # Enabled by default it provides a minimal level of memory tracking.
 dnl # A total count of bytes allocated is kept for each alloc and free.
 dnl # Then at module unload time a report to the console will be printed
-dnl # if memory was leaked.  Additionally, /proc/spl/kmem/slab will exist
-dnl # and provide an easy way to inspect the kmem based slab.
+dnl # if memory was leaked.
 dnl #
 AC_DEFUN([SPL_AC_DEBUG_KMEM], [
        AC_ARG_ENABLE([debug-kmem],
                [AS_HELP_STRING([--enable-debug-kmem],
-               [Enable basic kmem accounting @<:@default=yes@:>@])],
+               [Enable basic kmem accounting @<:@default=no@:>@])],
                [],
-               [enable_debug_kmem=yes])
+               [enable_debug_kmem=no])
 
        AS_IF([test "x$enable_debug_kmem" = xyes],
        [
@@ -1201,27 +1199,6 @@ AC_DEFUN([SPL_AC_KERNEL_FALLOCATE], [
        SPL_AC_PAX_KERNEL_FILE_FALLOCATE
 ])
 
-dnl #
-dnl # 2.6.33 API change. Also backported in RHEL5 as of 2.6.18-190.el5.
-dnl # Earlier versions of rwsem_is_locked() were inline and had a race
-dnl # condition.  The fixed version is exported as a symbol.  The race
-dnl # condition is fixed by acquiring sem->wait_lock, so we must not
-dnl # call that version while holding sem->wait_lock.
-dnl #
-AC_DEFUN([SPL_AC_EXPORTED_RWSEM_IS_LOCKED],
-       [AC_MSG_CHECKING([whether rwsem_is_locked() acquires sem->wait_lock])
-       SPL_LINUX_TRY_COMPILE_SYMBOL([
-               #include <linux/rwsem.h>
-               int rwsem_is_locked(struct rw_semaphore *sem) { return 0; }
-       ], [], [rwsem_is_locked], [lib/rwsem-spinlock.c], [
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(RWSEM_IS_LOCKED_TAKES_WAIT_LOCK, 1,
-                         [rwsem_is_locked() acquires sem->wait_lock])
-       ], [
-               AC_MSG_RESULT(no)
-       ])
-])
-
 dnl #
 dnl # zlib inflate compat,
 dnl # Verify the kernel has CONFIG_ZLIB_INFLATE support enabled.
index ded5512ab43afbaae0ad92205e59ef172cedc90a..10322d9ca93461065e70062b0857f0d3e7867c02 100755 (executable)
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for spl 0.6.4.2.
+# Generated by GNU Autoconf 2.69 for spl 0.6.5.3.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -587,8 +587,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='spl'
 PACKAGE_TARNAME='spl'
-PACKAGE_VERSION='0.6.4.2'
-PACKAGE_STRING='spl 0.6.4.2'
+PACKAGE_VERSION='0.6.5.3'
+PACKAGE_STRING='spl 0.6.5.3'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
 
@@ -784,6 +784,7 @@ infodir
 docdir
 oldincludedir
 includedir
+runstatedir
 localstatedir
 sharedstatedir
 sysconfdir
@@ -873,6 +874,7 @@ datadir='${datarootdir}'
 sysconfdir='${prefix}/etc'
 sharedstatedir='${prefix}/com'
 localstatedir='${prefix}/var'
+runstatedir='${localstatedir}/run'
 includedir='${prefix}/include'
 oldincludedir='/usr/include'
 docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
@@ -1125,6 +1127,15 @@ do
   | -silent | --silent | --silen | --sile | --sil)
     silent=yes ;;
 
+  -runstatedir | --runstatedir | --runstatedi | --runstated \
+  | --runstate | --runstat | --runsta | --runst | --runs \
+  | --run | --ru | --r)
+    ac_prev=runstatedir ;;
+  -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
+  | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
+  | --run=* | --ru=* | --r=*)
+    runstatedir=$ac_optarg ;;
+
   -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
     ac_prev=sbindir ;;
   -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
@@ -1262,7 +1273,7 @@ fi
 for ac_var in  exec_prefix prefix bindir sbindir libexecdir datarootdir \
                datadir sysconfdir sharedstatedir localstatedir includedir \
                oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
-               libdir localedir mandir
+               libdir localedir mandir runstatedir
 do
   eval ac_val=\$$ac_var
   # Remove trailing slashes.
@@ -1375,7 +1386,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures spl 0.6.4.2 to adapt to many kinds of systems.
+\`configure' configures spl 0.6.5.3 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1415,6 +1426,7 @@ Fine tuning of the installation directories:
   --sysconfdir=DIR        read-only single-machine data [PREFIX/etc]
   --sharedstatedir=DIR    modifiable architecture-independent data [PREFIX/com]
   --localstatedir=DIR     modifiable single-machine data [PREFIX/var]
+  --runstatedir=DIR       modifiable per-process data [LOCALSTATEDIR/run]
   --libdir=DIR            object code libraries [EPREFIX/lib]
   --includedir=DIR        C header files [PREFIX/include]
   --oldincludedir=DIR     C header files for non-gcc [/usr/include]
@@ -1446,7 +1458,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of spl 0.6.4.2:";;
+     short | recursive ) echo "Configuration of spl 0.6.5.3:";;
    esac
   cat <<\_ACEOF
 
@@ -1471,7 +1483,7 @@ Optional Features:
   --enable-linux-builtin  Configure for builtin in-tree kernel modules
                           [default=no]
   --enable-debug          Enable generic debug support [default=no]
-  --enable-debug-kmem     Enable basic kmem accounting [default=yes]
+  --enable-debug-kmem     Enable basic kmem accounting [default=no]
   --enable-debug-kmem-tracking
                           Enable detailed kmem tracking [default=no]
   --enable-atomic-spinlocks
@@ -1566,7 +1578,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-spl configure 0.6.4.2
+spl configure 0.6.5.3
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -1844,7 +1856,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by spl $as_me 0.6.4.2, which was
+It was created by spl $as_me 0.6.5.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -2977,7 +2989,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='spl'
- VERSION='0.6.4.2'
+ VERSION='0.6.5.3'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -12062,7 +12074,7 @@ $as_echo "$enable_debug" >&6; }
 if test "${enable_debug_kmem+set}" = set; then :
   enableval=$enable_debug_kmem;
 else
-  enable_debug_kmem=yes
+  enable_debug_kmem=no
 fi
 
 
@@ -13544,103 +13556,6 @@ $as_echo "#define HAVE_PUT_TASK_STRUCT 1" >>confdefs.h
        fi
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether rwsem_is_locked() acquires sem->wait_lock" >&5
-$as_echo_n "checking whether rwsem_is_locked() acquires sem->wait_lock... " >&6; }
-
-
-
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-               #include <linux/rwsem.h>
-               int rwsem_is_locked(struct rw_semaphore *sem) { return 0; }
-
-int
-main (void)
-{
-
-  ;
-  return 0;
-}
-
-_ACEOF
-
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror-implicit-function-declaration $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  rc=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
-
-
-fi
-       rm -Rf build
-
-
-       if test $rc -ne 0; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-       else
-               if test "x$enable_linux_builtin" != xyes; then
-
-       grep -q -E '[[:space:]]rwsem_is_locked[[:space:]]' \
-               $LINUX_OBJ/Module*.symvers 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in lib/rwsem-spinlock.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(rwsem_is_locked)" \
-                               "$LINUX_OBJ/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
-
-               fi
-               if test $rc -ne 0; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-               else :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define RWSEM_IS_LOCKED_TAKES_WAIT_LOCK 1" >>confdefs.h
-
-
-               fi
-       fi
-
-
 
 
        { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->fallocate() exists" >&5
@@ -14729,7 +14644,7 @@ $as_echo "$enable_debug" >&6; }
 if test "${enable_debug_kmem+set}" = set; then :
   enableval=$enable_debug_kmem;
 else
-  enable_debug_kmem=yes
+  enable_debug_kmem=no
 fi
 
 
@@ -16211,103 +16126,6 @@ $as_echo "#define HAVE_PUT_TASK_STRUCT 1" >>confdefs.h
        fi
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether rwsem_is_locked() acquires sem->wait_lock" >&5
-$as_echo_n "checking whether rwsem_is_locked() acquires sem->wait_lock... " >&6; }
-
-
-
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-               #include <linux/rwsem.h>
-               int rwsem_is_locked(struct rw_semaphore *sem) { return 0; }
-
-int
-main (void)
-{
-
-  ;
-  return 0;
-}
-
-_ACEOF
-
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror-implicit-function-declaration $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  rc=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
-
-
-fi
-       rm -Rf build
-
-
-       if test $rc -ne 0; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-       else
-               if test "x$enable_linux_builtin" != xyes; then
-
-       grep -q -E '[[:space:]]rwsem_is_locked[[:space:]]' \
-               $LINUX_OBJ/Module*.symvers 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in lib/rwsem-spinlock.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(rwsem_is_locked)" \
-                               "$LINUX_OBJ/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
-
-               fi
-               if test $rc -ne 0; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-               else :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define RWSEM_IS_LOCKED_TAKES_WAIT_LOCK 1" >>confdefs.h
-
-
-               fi
-       fi
-
-
 
 
        { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->fallocate() exists" >&5
@@ -17782,7 +17600,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by spl $as_me 0.6.4.2, which was
+This file was extended by spl $as_me 0.6.5.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -17848,7 +17666,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-spl config.status 0.6.4.2
+spl config.status 0.6.5.3
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
index 85e50930ce66e18879fd2d01c264231994c6acfa..90809cf011514d93acfffd53e3c6ca64138d81e4 100644 (file)
@@ -1,6 +1,6 @@
 AUTOINSTALL="yes"
 PACKAGE_NAME="spl"
-PACKAGE_VERSION="0.6.4.2"
+PACKAGE_VERSION="0.6.5.3"
 PRE_BUILD="configure
   --prefix=/usr
   --with-config=kernel
index 0ad774faf983bdb57e4a8803b94a09f16bd71f0a..83050a4083c43996bf61ee442699df8ac7b34a64 100644 (file)
@@ -375,6 +375,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
index c862d9e4604b20a7ca913438c7dc07134e900d5f..da6b8893075da82e594bad3a7ce55b2f17a3ebbe 100644 (file)
@@ -331,6 +331,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
index 2cb0b7d3ab523587c7da1868ecfd73bfc02f07ef..5735014845aacf296559fd6a981fbdf529db29ec 100644 (file)
@@ -341,6 +341,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
index 80f348e4c25abbe6556970090fc38975376275b8..5841d7c286978011df049ecbc2520edb249bdf7a 100644 (file)
 #define spl_rwsem_trylock_irqsave(lk, fl)    spin_trylock_irqsave(lk, fl)
 #endif /* RWSEM_SPINLOCK_IS_RAW */
 
-/*
- * Prior to Linux 2.6.33 there existed a race condition in rwsem_is_locked().
- * The semaphore's activity was checked outside of the wait_lock which
- * could result in some readers getting the incorrect activity value.
- *
- * When a kernel without this fix is detected the SPL takes responsibility
- * for acquiring the wait_lock to avoid this race.
- */
-#if defined(RWSEM_IS_LOCKED_TAKES_WAIT_LOCK)
 #define spl_rwsem_is_locked(rwsem)           rwsem_is_locked(rwsem)
-#else
-static inline int
-spl_rwsem_is_locked(struct rw_semaphore *rwsem)
-{
-       unsigned long flags;
-       int rc = 1;
-
-       if (spl_rwsem_trylock_irqsave(&rwsem->wait_lock, flags)) {
-               rc = rwsem_is_locked(rwsem);
-               spl_rwsem_unlock_irqrestore(&rwsem->wait_lock, flags);
-       }
-
-       return (rc);
-}
-#endif /* RWSEM_IS_LOCKED_TAKES_WAIT_LOCK */
 
 #endif /* _SPL_RWSEM_COMPAT_H */
index f898083f6c76795b0a59d20976dc538d09b831d2..2bdc296d0ff8bc617ca8cd3ea96d99e9977820d8 100644 (file)
@@ -332,6 +332,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
index afa15deee31f4f0281206833af9da2a353c7d477..b677378f8620e0ec17dd686302be740adf9f00b9 100644 (file)
@@ -331,6 +331,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
index f9e883fd41ab31e9a57e981ce078a32a0ef12775..73c4a84217829d204bb0b42d4fb9a21d21a28f1d 100644 (file)
@@ -91,6 +91,7 @@ KERNEL_H = \
        $(top_srcdir)/include/sys/u8_textprep.h \
        $(top_srcdir)/include/sys/uio.h \
        $(top_srcdir)/include/sys/unistd.h \
+       $(top_srcdir)/include/sys/user.h \
        $(top_srcdir)/include/sys/va_list.h \
        $(top_srcdir)/include/sys/varargs.h \
        $(top_srcdir)/include/sys/vfs.h \
index 3fc95686f3dd1e186123e5eb7197b47b39645085..bd9037d9a1a50dad359a081f39b935cb4ab00c8f 100644 (file)
@@ -221,6 +221,7 @@ am__kernel_HEADERS_DIST = $(top_srcdir)/include/sys/acl.h \
        $(top_srcdir)/include/sys/u8_textprep.h \
        $(top_srcdir)/include/sys/uio.h \
        $(top_srcdir)/include/sys/unistd.h \
+       $(top_srcdir)/include/sys/user.h \
        $(top_srcdir)/include/sys/va_list.h \
        $(top_srcdir)/include/sys/varargs.h \
        $(top_srcdir)/include/sys/vfs.h \
@@ -469,6 +470,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
@@ -572,6 +574,7 @@ KERNEL_H = \
        $(top_srcdir)/include/sys/u8_textprep.h \
        $(top_srcdir)/include/sys/uio.h \
        $(top_srcdir)/include/sys/unistd.h \
+       $(top_srcdir)/include/sys/user.h \
        $(top_srcdir)/include/sys/va_list.h \
        $(top_srcdir)/include/sys/varargs.h \
        $(top_srcdir)/include/sys/vfs.h \
index c9f2bea12ceeb2bc92e0a5c3c0b692fdbe23c218..efcf0dda2769b2c903636932bc0ca0f9b4da0c7d 100644 (file)
@@ -1,4 +1,4 @@
-/*****************************************************************************\
+/*
  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
  *  Copyright (C) 2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *
  *  You should have received a copy of the GNU General Public License along
  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
-\*****************************************************************************/
+ */
 
 #ifndef _SPL_CONDVAR_H
-#define _SPL_CONDVAR_H
+#define        _SPL_CONDVAR_H
 
 #include <linux/module.h>
 #include <linux/wait.h>
@@ -36,8 +36,8 @@
  * The kcondvar_t struct is protected by mutex taken externally before
  * calling any of the wait/signal funs, and passed into the wait funs.
  */
-#define CV_MAGIC                       0x346545f4
-#define CV_DESTROY                     0x346545f5
+#define        CV_MAGIC                        0x346545f4
+#define        CV_DESTROY                      0x346545f5
 
 typedef struct {
        int cv_magic;
@@ -48,30 +48,30 @@ typedef struct {
        kmutex_t *cv_mutex;
 } kcondvar_t;
 
-typedef enum { CV_DEFAULT=0, CV_DRIVER } kcv_type_t;
+typedef enum { CV_DEFAULT = 0, CV_DRIVER } kcv_type_t;
 
-extern void __cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg);
-extern void __cv_destroy(kcondvar_t *cvp);
-extern void __cv_wait(kcondvar_t *cvp, kmutex_t *mp);
-extern void __cv_wait_io(kcondvar_t *cvp, kmutex_t *mp);
-extern void __cv_wait_interruptible(kcondvar_t *cvp, kmutex_t *mp);
-extern clock_t __cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time);
-extern clock_t __cv_timedwait_interruptible(kcondvar_t *cvp, kmutex_t *mp,
-       clock_t exp_time);
-extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp,
-       hrtime_t tim, hrtime_t res, int flag);
-extern void __cv_signal(kcondvar_t *cvp);
-extern void __cv_broadcast(kcondvar_t *cvp);
+extern void __cv_init(kcondvar_t *, char *, kcv_type_t, void *);
+extern void __cv_destroy(kcondvar_t *);
+extern void __cv_wait(kcondvar_t *, kmutex_t *);
+extern void __cv_wait_io(kcondvar_t *, kmutex_t *);
+extern void __cv_wait_sig(kcondvar_t *, kmutex_t *);
+extern clock_t __cv_timedwait(kcondvar_t *, kmutex_t *, clock_t);
+extern clock_t __cv_timedwait_sig(kcondvar_t *, kmutex_t *, clock_t);
+extern clock_t cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t,
+    hrtime_t res, int flag);
+extern void __cv_signal(kcondvar_t *);
+extern void __cv_broadcast(kcondvar_t *c);
 
-#define cv_init(cvp, name, type, arg)          __cv_init(cvp, name, type, arg)
-#define cv_destroy(cvp)                                __cv_destroy(cvp)
-#define cv_wait(cvp, mp)                       __cv_wait(cvp, mp)
-#define cv_wait_io(cvp, mp)                    __cv_wait_io(cvp, mp)
-#define cv_wait_interruptible(cvp, mp)         __cv_wait_interruptible(cvp,mp)
-#define cv_timedwait(cvp, mp, t)               __cv_timedwait(cvp, mp, t)
-#define cv_timedwait_interruptible(cvp, mp, t)                                \
-       __cv_timedwait_interruptible(cvp, mp, t)
-#define cv_signal(cvp)                         __cv_signal(cvp)
-#define cv_broadcast(cvp)                      __cv_broadcast(cvp)
+#define        cv_init(cvp, name, type, arg)           __cv_init(cvp, name, type, arg)
+#define        cv_destroy(cvp)                         __cv_destroy(cvp)
+#define        cv_wait(cvp, mp)                        __cv_wait(cvp, mp)
+#define        cv_wait_io(cvp, mp)                     __cv_wait_io(cvp, mp)
+#define        cv_wait_sig(cvp, mp)                    __cv_wait_sig(cvp, mp)
+#define        cv_wait_interruptible(cvp, mp)          cv_wait_sig(cvp, mp)
+#define        cv_timedwait(cvp, mp, t)                __cv_timedwait(cvp, mp, t)
+#define        cv_timedwait_sig(cvp, mp, t)            __cv_timedwait_sig(cvp, mp, t)
+#define        cv_timedwait_interruptible(cvp, mp, t)  cv_timedwait_sig(cvp, mp, t)
+#define        cv_signal(cvp)                          __cv_signal(cvp)
+#define        cv_broadcast(cvp)                       __cv_broadcast(cvp)
 
 #endif /* _SPL_CONDVAR_H */
index cae2d49e44c41b52a007b410cb1494e50c7809f3..a37740036446a26776e08f679c60cb5f945e36db 100644 (file)
@@ -92,6 +92,8 @@ void spl_dumpstack(void);
 #define        ASSERT3U(x,y,z)         ((void)0)
 #define        ASSERT3P(x,y,z)         ((void)0)
 #define        ASSERT0(x)              ((void)0)
+#define        IMPLY(A, B)             ((void)0)
+#define        EQUIV(A, B)             ((void)0)
 
 /*
  * Debugging enabled (--enable-debug)
@@ -105,6 +107,14 @@ void spl_dumpstack(void);
 #define        ASSERT3U(x,y,z)         VERIFY3U(x, y, z)
 #define        ASSERT3P(x,y,z)         VERIFY3P(x, y, z)
 #define        ASSERT0(x)              VERIFY0(x)
+#define        IMPLY(A, B) \
+       ((void)(((!(A)) || (B)) || \
+           spl_panic(__FILE__, __FUNCTION__, __LINE__, \
+           "(" #A ") implies (" #B ")")))
+#define        EQUIV(A, B) \
+       ((void)((!!(A) == !!(B)) || \
+           spl_panic(__FILE__, __FUNCTION__, __LINE__, \
+           "(" #A ") is equivalent to (" #B ")")))
 
 #endif /* NDEBUG */
 
index 73b5a2e9872e55c246f6b7c546825b81b0dbe0ba..6ab63127bd8b675940979958662e95ec384fbefd 100644 (file)
@@ -332,6 +332,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
index 6bea431eaf10622b50a85d3e9adc648b11fd2735..03ef94599503d8d19005c7ed28a303f7984d6c68 100644 (file)
@@ -331,6 +331,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
index 75a0a55b7d1f18bf6a8870b1a5b17f4a98df405a..e971c2b0d9522679b5aff50223f5c188cd044fbb 100644 (file)
@@ -170,7 +170,7 @@ typedef struct spl_kmem_cache {
        uint32_t                skc_magic;      /* Sanity magic */
        uint32_t                skc_name_size;  /* Name length */
        char                    *skc_name;      /* Name string */
-       spl_kmem_magazine_t     *skc_mag[NR_CPUS]; /* Per-CPU warm cache */
+       spl_kmem_magazine_t     **skc_mag;      /* Per-CPU warm cache */
        uint32_t                skc_mag_size;   /* Magazine size */
        uint32_t                skc_mag_refill; /* Magazine refill count */
        spl_kmem_ctor_t         skc_ctor;       /* Constructor */
index 9dfbfe5451a50d07db46200bd8b05478f79e2afb..7064e8f1f2018a162ceae6097bf7b80a49948fdd 100644 (file)
@@ -83,15 +83,13 @@ rw_owner(krwlock_t *rwp)
 static inline int
 RW_READ_HELD(krwlock_t *rwp)
 {
-       return (spl_rwsem_is_locked(SEM(rwp)) &&
-               rw_owner(rwp) == NULL);
+       return (spl_rwsem_is_locked(SEM(rwp)) && rw_owner(rwp) == NULL);
 }
 
 static inline int
 RW_WRITE_HELD(krwlock_t *rwp)
 {
-       return (spl_rwsem_is_locked(SEM(rwp)) &&
-               rw_owner(rwp) == current);
+       return (spl_rwsem_is_locked(SEM(rwp)) && rw_owner(rwp) == current);
 }
 
 static inline int
index b920f65a40c652dd91946d6cc4b613d1f96f57fc..6c32550418da8e724abeebbd89a62a57b6874770 100644 (file)
@@ -332,6 +332,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
index c56d7e12a6203051abf79213ac88e49d14f55cac..4dc7cd8585619a77d4405ab15724bb30769e3ce6 100644 (file)
@@ -78,6 +78,7 @@
 #define proc_pageout                   NULL
 #define curproc                                current
 #define max_ncpus                      num_possible_cpus()
+#define boot_ncpus                     num_online_cpus()
 #define CPU_SEQID                      smp_processor_id()
 #define _NOTE(x)
 #define is_system_labeled()            0
@@ -92,8 +93,9 @@
  *
  * Treat shim tasks as SCHED_NORMAL tasks
  */
-#define minclsyspri                    (MAX_RT_PRIO)
-#define maxclsyspri                    (MAX_PRIO-1)
+#define minclsyspri                    (MAX_PRIO-1)
+#define maxclsyspri                    (MAX_RT_PRIO)
+#define defclsyspri                    (DEFAULT_PRIO)
 
 #ifndef NICE_TO_PRIO
 #define NICE_TO_PRIO(nice)             (MAX_RT_PRIO + (nice) + 20)
index 7b44e8b8ae42a675514bfc71a7c6879940f14d01..a43a86da651470d42ffc7858164a620e7d516146 100644 (file)
@@ -40,6 +40,7 @@
 #define TASKQ_DYNAMIC          0x00000004
 #define TASKQ_THREADS_CPU_PCT  0x00000008
 #define TASKQ_DC_BATCH         0x00000010
+#define TASKQ_ACTIVE           0x80000000
 
 /*
  * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as
@@ -53,7 +54,6 @@
 #define TQ_NOALLOC             0x02000000
 #define TQ_NEW                 0x04000000
 #define TQ_FRONT               0x08000000
-#define TQ_ACTIVE              0x80000000
 
 typedef unsigned long taskqid_t;
 typedef void (task_func_t)(void *);
@@ -61,11 +61,13 @@ typedef void (task_func_t)(void *);
 typedef struct taskq {
        spinlock_t              tq_lock;       /* protects taskq_t */
        unsigned long           tq_lock_flags; /* interrupt state */
-       const char              *tq_name;      /* taskq name */
+       char                    *tq_name;      /* taskq name */
        struct list_head        tq_thread_list;/* list of all threads */
        struct list_head        tq_active_list;/* list of active threads */
        int                     tq_nactive;    /* # of active threads */
-       int                     tq_nthreads;   /* # of total threads */
+       int                     tq_nthreads;   /* # of existing threads */
+       int                     tq_nspawn;     /* # of threads being spawned */
+       int                     tq_maxthreads; /* # of threads maximum */
        int                     tq_pri;        /* priority */
        int                     tq_minalloc;   /* min task_t pool size */
        int                     tq_maxalloc;   /* max task_t pool size */
@@ -119,7 +121,7 @@ extern void taskq_init_ent(taskq_ent_t *);
 extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
 extern void taskq_destroy(taskq_t *);
 extern void taskq_wait_id(taskq_t *, taskqid_t);
-extern void taskq_wait_all(taskq_t *, taskqid_t);
+extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
 extern void taskq_wait(taskq_t *);
 extern int taskq_cancel_id(taskq_t *, taskqid_t);
 extern int taskq_member(taskq_t *, void *);
index 25c5f4a01807bc72406d004305a9e8c7bb094066..404c03774aea46698653dc2b5d1783b91a67238d 100644 (file)
@@ -1,6 +1,7 @@
 /*****************************************************************************\
  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
  *  Copyright (C) 2007 The Regents of the University of California.
+ *  Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
  *  UCRL-CODE-235197
@@ -26,6 +27,7 @@
 #define _SPL_UIO_H
 
 #include <linux/uio.h>
+#include <linux/blkdev.h>
 #include <asm/uaccess.h>
 #include <sys/types.h>
 
@@ -40,10 +42,14 @@ typedef enum uio_seg {
        UIO_USERSPACE = 0,
        UIO_SYSSPACE =  1,
        UIO_USERISPACE= 2,
+       UIO_BVEC =      3,
 } uio_seg_t;
 
 typedef struct uio {
-       struct iovec    *uio_iov;
+       union {
+               const struct iovec      *uio_iov;
+               const struct bio_vec    *uio_bvec;
+       };
        int             uio_iovcnt;
        offset_t        uio_loffset;
        uio_seg_t       uio_segflg;
@@ -51,6 +57,7 @@ typedef struct uio {
        uint16_t        uio_extflg;
        offset_t        uio_limit;
        ssize_t         uio_resid;
+       size_t          uio_skip;
 } uio_t;
 
 typedef struct aio_req {
diff --git a/spl/include/sys/user.h b/spl/include/sys/user.h
new file mode 100644 (file)
index 0000000..ebbe8f6
--- /dev/null
@@ -0,0 +1,42 @@
+/*****************************************************************************\
+ *  Copyright (C) 2015 Cluster Inc.
+ *  Produced at ClusterHQ Inc (cf, DISCLAIMER).
+ *  Written by Richard Yao <richard.yao@clusterhq.com>.
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+\*****************************************************************************/
+
+#ifndef _SPL_USER_H
+#define _SPL_USER_H
+
+/*
+ * We have uf_info_t for areleasef(). We implement areleasef() using a global
+ * linked list of all open file descriptors with the task structs referenced,
+ * so accessing the correct descriptor from areleasef() only requires knowing
+ * about the Linux task_struct. Since this is internal to our compatibility
+ * layer, we make it an opaque type.
+ *
+ * XXX: If the descriptor changes under us, we would get an incorrect
+ * reference.
+ */
+
+struct uf_info;			/* opaque forward declaration */
+typedef struct uf_info uf_info_t;
+
+#define P_FINFO(x) ((uf_info_t *)(x))	/* cast applies to the whole argument */
+
+#endif /* _SPL_USER_H */
index 8aadc9d03b300e8f611261ab8f798b8c276af986..eb482805206190de4fb2c041c92720c82e10479e 100644 (file)
@@ -98,6 +98,7 @@ extern void *spl_vmalloc(unsigned long size, gfp_t lflags, pgprot_t prot);
 #define        vmem_alloc(sz, fl)      spl_vmem_alloc((sz), (fl), __func__, __LINE__)
 #define        vmem_zalloc(sz, fl)     spl_vmem_zalloc((sz), (fl), __func__, __LINE__)
 #define        vmem_free(ptr, sz)      spl_vmem_free((ptr), (sz))
+#define        vmem_qcache_reap(ptr)   ((void)0)
 
 extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line);
 extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line);
index 2fa169523ddfd0efa6a2768301bd707108681240..9d334fe0a16e7bcf04bbdb61be80eaedf596f651 100644 (file)
 
 #define        membar_producer()               smp_wmb()
 #define        physmem                         totalram_pages
-#define        freemem                         nr_free_pages()
+#define        freemem                 (nr_free_pages() + \
+                               global_page_state(NR_INACTIVE_FILE) + \
+                               global_page_state(NR_INACTIVE_ANON) + \
+                               global_page_state(NR_SLAB_RECLAIMABLE))
 
 #define        xcopyin(from, to, size)         copy_from_user(to, from, size)
 #define        xcopyout(from, to, size)        copy_to_user(to, from, size)
index 07a34493837b4356f5f847b7606c2753a0e06e8e..0b857d384ba572a540263e59b830c4d8d96fa754 100644 (file)
@@ -40,6 +40,7 @@
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/uio.h>
+#include <sys/user.h>
 #include <sys/sunldi.h>
 
 /*
@@ -184,6 +185,7 @@ extern int vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag,
     offset_t offset, void *x6, void *x7);
 extern file_t *vn_getf(int fd);
 extern void vn_releasef(int fd);
+extern void vn_areleasef(int fd, uf_info_t *fip);
 extern int vn_set_pwd(const char *filename);
 
 int spl_vn_init(void);
@@ -198,6 +200,7 @@ void spl_vn_fini(void);
 #define vn_is_readonly(vp)                     0
 #define getf                                   vn_getf
 #define releasef                               vn_releasef
+#define areleasef                              vn_areleasef
 
 extern vnode_t *rootdir;
 
index b721b5099cf7eb86b777a108ae3bfb8050491787..e2bf09fb10d356c98d6b29eab6509c4a5538990b 100644 (file)
@@ -2,7 +2,7 @@ COMMON_H =
 
 KERNEL_H = \
        $(top_srcdir)/include/util/qsort.h \
-        $(top_srcdir)/include/util/sscanf.h
+       $(top_srcdir)/include/util/sscanf.h
 
 USER_H =
 
index 4870196c63e1f1dee63e4c417599ef171d15b992..765bfb0745beca9f66190602c2a169cc829bf6f5 100644 (file)
@@ -332,6 +332,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
@@ -347,7 +348,7 @@ top_srcdir = @top_srcdir@
 COMMON_H = 
 KERNEL_H = \
        $(top_srcdir)/include/util/qsort.h \
-        $(top_srcdir)/include/util/sscanf.h
+       $(top_srcdir)/include/util/sscanf.h
 
 USER_H = 
 EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H)
index 7614c7f15cf61da03aca89f15280b31cfc96364d..fdb3314ea10e33336298ed7632e0aa0d8f8fa276 100644 (file)
@@ -333,6 +333,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
index 311496c8a6a13174c55ff4721672ce48245cc782..26d813e2413d739447c6221395343a373a0ef5df 100644 (file)
@@ -338,6 +338,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
index c7121db2520c53b21a64a5ce30da77ba6cf36f48..9bca3d483b6dad6e73def2d2ae9ae50db8f14a18 100644 (file)
@@ -341,6 +341,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
index 6106228a3f9c6cf0d5fc7ad86dd14ca610d010ab..e7cfa319c5a7e2d89b0cef58e96379bbfa595806 100644 (file)
@@ -312,6 +312,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
index f0825d8dde7308eeca06fd092d18d490b4261c8c..255b02ca6bf6aebdba343082c925e567e6e51622 100644 (file)
@@ -312,6 +312,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
index 3e7e877fbbbc86becc7221499702cb658b6911c0..acdd5b658ff84486bcdb3a63bb83f126d7f342ad 100644 (file)
@@ -249,3 +249,52 @@ where a thread should run.
 .sp
 Default value: \fB0\fR
 .RE
+
+.sp
+.ne 2
+.na
+\fBspl_taskq_thread_dynamic\fR (int)
+.ad
+.RS 12n
+Allow dynamic taskqs.  When enabled taskqs which set the TASKQ_DYNAMIC flag
+will by default create only a single thread.  New threads will be created on
+demand up to a maximum allowed number to facilitate the completion of
+outstanding tasks.  Threads which are no longer needed will be promptly
+destroyed.  By default this behavior is disabled; it can be enabled by
+setting this parameter to 1.
+.sp
+Default value: \fB0\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_taskq_thread_priority\fR (int)
+.ad
+.RS 12n
+Allow newly created taskq threads to set a non-default scheduler priority.
+When enabled the priority specified when a taskq is created will be applied
+to all threads created by that taskq.  When disabled all threads will use
+the default Linux kernel thread priority.  By default, this behavior is
+enabled.
+.sp
+Default value: \fB1\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_taskq_thread_sequential\fR (int)
+.ad
+.RS 12n
+The number of items a taskq worker thread must handle without interruption
+before requesting a new worker thread be spawned.  This is used to control
+how quickly taskqs ramp up the number of threads processing the queue.
+Because Linux thread creation and destruction are relatively inexpensive a
+small default value has been selected.  This means that normally threads will
+be created aggressively which is desirable.  Increasing this value will
+result in a slower thread creation rate which may be preferable for some
+configurations.
+.sp
+Default value: \fB4\fR
+.RE
index d1742448deb82e2ad9a92ccf1204b422969eea49..a1f1ab82385c94f3a4087bf3f478b8862e73c7a8 100644 (file)
@@ -1,27 +1,30 @@
 # Makefile.in for spl kernel module
 
+src = @abs_top_srcdir@/module/spl
+obj = @abs_builddir@
+
 MODULE := spl
 EXTRA_CFLAGS = $(SPL_MODULE_CFLAGS) @KERNELCPPFLAGS@
 
 # Solaris porting layer module
 obj-$(CONFIG_SPL) := $(MODULE).o
 
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-proc.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-kmem.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-kmem-cache.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-vmem.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-thread.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-taskq.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-rwlock.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-vnode.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-err.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-kobj.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-generic.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-atomic.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-mutex.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-kstat.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-condvar.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-xdr.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-cred.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-tsd.o
-$(MODULE)-objs += @top_srcdir@/module/spl/spl-zlib.o
+$(MODULE)-objs += spl-proc.o
+$(MODULE)-objs += spl-kmem.o
+$(MODULE)-objs += spl-kmem-cache.o
+$(MODULE)-objs += spl-vmem.o
+$(MODULE)-objs += spl-thread.o
+$(MODULE)-objs += spl-taskq.o
+$(MODULE)-objs += spl-rwlock.o
+$(MODULE)-objs += spl-vnode.o
+$(MODULE)-objs += spl-err.o
+$(MODULE)-objs += spl-kobj.o
+$(MODULE)-objs += spl-generic.o
+$(MODULE)-objs += spl-atomic.o
+$(MODULE)-objs += spl-mutex.o
+$(MODULE)-objs += spl-kstat.o
+$(MODULE)-objs += spl-condvar.o
+$(MODULE)-objs += spl-xdr.o
+$(MODULE)-objs += spl-cred.o
+$(MODULE)-objs += spl-tsd.o
+$(MODULE)-objs += spl-zlib.o
index cebb8f2b10e88eca23179ad495b6cae4a23bfd93..c3467a56e6ae2ceff81f8b10aff9f41e61c50656 100644 (file)
@@ -1,4 +1,4 @@
-/*****************************************************************************\
+/*
  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
  *  Copyright (C) 2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
@@ -20,9 +20,9 @@
  *
  *  You should have received a copy of the GNU General Public License along
  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
- *****************************************************************************
+ *
  *  Solaris Porting Layer (SPL) Credential Implementation.
-\*****************************************************************************/
+ */
 
 #include <sys/condvar.h>
 #include <sys/time.h>
@@ -50,10 +50,10 @@ cv_destroy_wakeup(kcondvar_t *cvp)
        if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) {
                ASSERT(cvp->cv_mutex == NULL);
                ASSERT(!waitqueue_active(&cvp->cv_event));
-               return 1;
+               return (1);
        }
 
-       return 0;
+       return (0);
 }
 
 void
@@ -82,7 +82,7 @@ cv_wait_common(kcondvar_t *cvp, kmutex_t *mp, int state, int io)
        DEFINE_WAIT(wait);
 
        ASSERT(cvp);
-        ASSERT(mp);
+       ASSERT(mp);
        ASSERT(cvp->cv_magic == CV_MAGIC);
        ASSERT(mutex_owned(mp));
        atomic_inc(&cvp->cv_refs);
@@ -96,9 +96,11 @@ cv_wait_common(kcondvar_t *cvp, kmutex_t *mp, int state, int io)
        prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
        atomic_inc(&cvp->cv_waiters);
 
-       /* Mutex should be dropped after prepare_to_wait() this
+       /*
+        * Mutex should be dropped after prepare_to_wait() this
         * ensures we're linked in to the waiters list and avoids the
-        * race where 'cvp->cv_waiters > 0' but the list is empty. */
+        * race where 'cvp->cv_waiters > 0' but the list is empty.
+        */
        mutex_exit(mp);
        if (io)
                io_schedule();
@@ -124,11 +126,11 @@ __cv_wait(kcondvar_t *cvp, kmutex_t *mp)
 EXPORT_SYMBOL(__cv_wait);
 
 void
-__cv_wait_interruptible(kcondvar_t *cvp, kmutex_t *mp)
+__cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp)
 {
        cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0);
 }
-EXPORT_SYMBOL(__cv_wait_interruptible);
+EXPORT_SYMBOL(__cv_wait_sig);
 
 void
 __cv_wait_io(kcondvar_t *cvp, kmutex_t *mp)
@@ -137,18 +139,19 @@ __cv_wait_io(kcondvar_t *cvp, kmutex_t *mp)
 }
 EXPORT_SYMBOL(__cv_wait_io);
 
-/* 'expire_time' argument is an absolute wall clock time in jiffies.
+/*
+ * 'expire_time' argument is an absolute wall clock time in jiffies.
  * Return value is time left (expire_time - now) or -1 if timeout occurred.
  */
 static clock_t
-__cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp,
-                     clock_t expire_time, int state)
+__cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp, clock_t expire_time,
+    int state)
 {
        DEFINE_WAIT(wait);
        clock_t time_left;
 
        ASSERT(cvp);
-        ASSERT(mp);
+       ASSERT(mp);
        ASSERT(cvp->cv_magic == CV_MAGIC);
        ASSERT(mutex_owned(mp));
        atomic_inc(&cvp->cv_refs);
@@ -169,9 +172,11 @@ __cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp,
        prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
        atomic_inc(&cvp->cv_waiters);
 
-       /* Mutex should be dropped after prepare_to_wait() this
+       /*
+        * Mutex should be dropped after prepare_to_wait() this
         * ensures we're linked in to the waiters list and avoids the
-        * race where 'cvp->cv_waiters > 0' but the list is empty. */
+        * race where 'cvp->cv_waiters > 0' but the list is empty.
+        */
        mutex_exit(mp);
        time_left = schedule_timeout(time_left);
        mutex_enter(mp);
@@ -191,24 +196,24 @@ __cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp,
 clock_t
 __cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
 {
-       return __cv_timedwait_common(cvp, mp, exp_time, TASK_UNINTERRUPTIBLE);
+       return (__cv_timedwait_common(cvp, mp, exp_time, TASK_UNINTERRUPTIBLE));
 }
 EXPORT_SYMBOL(__cv_timedwait);
 
 clock_t
-__cv_timedwait_interruptible(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+__cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
 {
-       return __cv_timedwait_common(cvp, mp, exp_time, TASK_INTERRUPTIBLE);
+       return (__cv_timedwait_common(cvp, mp, exp_time, TASK_INTERRUPTIBLE));
 }
-EXPORT_SYMBOL(__cv_timedwait_interruptible);
+EXPORT_SYMBOL(__cv_timedwait_sig);
 
 /*
- *'expire_time' argument is an absolute clock time in nanoseconds.
+ * 'expire_time' argument is an absolute clock time in nanoseconds.
  * Return value is time left (expire_time - now) or -1 if timeout occurred.
  */
 static clock_t
-__cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp,
-                    hrtime_t expire_time, int state)
+__cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time,
+    int state)
 {
        DEFINE_WAIT(wait);
        hrtime_t time_left, now;
@@ -237,12 +242,16 @@ __cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp,
        prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
        atomic_inc(&cvp->cv_waiters);
 
-       /* Mutex should be dropped after prepare_to_wait() this
+       /*
+        * Mutex should be dropped after prepare_to_wait() this
         * ensures we're linked in to the waiters list and avoids the
-        * race where 'cvp->cv_waiters > 0' but the list is empty. */
+        * race where 'cvp->cv_waiters > 0' but the list is empty.
+        */
        mutex_exit(mp);
-       /* Allow a 100 us range to give kernel an opportunity to coalesce
-        * interrupts */
+       /*
+        * Allow a 100 us range to give kernel an opportunity to coalesce
+        * interrupts
+        */
        usleep_range(time_left_us, time_left_us + 100);
        mutex_enter(mp);
 
@@ -263,8 +272,8 @@ __cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp,
  * Compatibility wrapper for the cv_timedwait_hires() Illumos interface.
  */
 clock_t
-cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
-                  hrtime_t res, int flag)
+cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res,
+    int flag)
 {
        if (res > 1) {
                /*
@@ -278,7 +287,7 @@ cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
        if (!(flag & CALLOUT_FLAG_ABSOLUTE))
                tim += gethrtime();
 
-       return __cv_timedwait_hires(cvp, mp, tim, TASK_UNINTERRUPTIBLE);
+       return (__cv_timedwait_hires(cvp, mp, tim, TASK_UNINTERRUPTIBLE));
 }
 EXPORT_SYMBOL(cv_timedwait_hires);
 
@@ -289,10 +298,12 @@ __cv_signal(kcondvar_t *cvp)
        ASSERT(cvp->cv_magic == CV_MAGIC);
        atomic_inc(&cvp->cv_refs);
 
-       /* All waiters are added with WQ_FLAG_EXCLUSIVE so only one
+       /*
+        * All waiters are added with WQ_FLAG_EXCLUSIVE so only one
         * waiter will be set runable with each call to wake_up().
         * Additionally wake_up() holds a spin_lock assoicated with
-        * the wait queue to ensure we don't race waking up processes. */
+        * the wait queue to ensure we don't race waking up processes.
+        */
        if (atomic_read(&cvp->cv_waiters) > 0)
                wake_up(&cvp->cv_event);
 
@@ -307,8 +318,10 @@ __cv_broadcast(kcondvar_t *cvp)
        ASSERT(cvp->cv_magic == CV_MAGIC);
        atomic_inc(&cvp->cv_refs);
 
-       /* Wake_up_all() will wake up all waiters even those which
-        * have the WQ_FLAG_EXCLUSIVE flag set. */
+       /*
+        * Wake_up_all() will wake up all waiters even those which
+        * have the WQ_FLAG_EXCLUSIVE flag set.
+        */
        if (atomic_read(&cvp->cv_waiters) > 0)
                wake_up_all(&cvp->cv_event);
 
index cd3e543ba08ebed0e3d75c13f38d9d558b9fe8f3..5a8493fe4c3c73515d0af77a2e6a06d48523a66c 100644 (file)
@@ -805,15 +805,18 @@ spl_magazine_create(spl_kmem_cache_t *skc)
        if (skc->skc_flags & KMC_NOMAGAZINE)
                return (0);
 
+       skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
+           num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
        skc->skc_mag_size = spl_magazine_size(skc);
        skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
 
-       for_each_online_cpu(i) {
+       for_each_possible_cpu(i) {
                skc->skc_mag[i] = spl_magazine_alloc(skc, i);
                if (!skc->skc_mag[i]) {
                        for (i--; i >= 0; i--)
                                spl_magazine_free(skc->skc_mag[i]);
 
+                       kfree(skc->skc_mag);
                        return (-ENOMEM);
                }
        }
@@ -833,11 +836,13 @@ spl_magazine_destroy(spl_kmem_cache_t *skc)
        if (skc->skc_flags & KMC_NOMAGAZINE)
                return;
 
-       for_each_online_cpu(i) {
+       for_each_possible_cpu(i) {
                skm = skc->skc_mag[i];
                spl_cache_flush(skc, skm, skm->skm_avail);
                spl_magazine_free(skm);
        }
+
+       kfree(skc->skc_mag);
 }
 
 /*
@@ -880,12 +885,6 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
 
        might_sleep();
 
-       /*
-        * Allocate memory for a new cache and initialize it.  Unfortunately,
-        * this usually ends up being a large allocation of ~32k because
-        * we need to allocate enough memory for the worst case number of
-        * cpus in the magazine, skc_mag[NR_CPUS].
-        */
        skc = kzalloc(sizeof (*skc), lflags);
        if (skc == NULL)
                return (NULL);
@@ -986,13 +985,23 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
                if (rc)
                        goto out;
        } else {
+               unsigned long slabflags = 0;
+
                if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) {
                        rc = EINVAL;
                        goto out;
                }
 
+#if defined(SLAB_USERCOPY)
+               /*
+                * Required for PAX-enabled kernels if the slab is to be
+                * used for copying between user and kernel space.
+                */
+               slabflags |= SLAB_USERCOPY;
+#endif
+
                skc->skc_linux_cache = kmem_cache_create(
-                   skc->skc_name, size, align, 0, NULL);
+                   skc->skc_name, size, align, slabflags, NULL);
                if (skc->skc_linux_cache == NULL) {
                        rc = ENOMEM;
                        goto out;
@@ -1403,8 +1412,6 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
        ASSERT(skc->skc_magic == SKC_MAGIC);
        ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
 
-       atomic_inc(&skc->skc_ref);
-
        /*
         * Allocate directly from a Linux slab.  All optimizations are left
         * to the underlying cache we only need to guarantee that KM_SLEEP
@@ -1457,8 +1464,6 @@ ret:
                        prefetchw(obj);
        }
 
-       atomic_dec(&skc->skc_ref);
-
        return (obj);
 }
 EXPORT_SYMBOL(spl_kmem_cache_alloc);
@@ -1479,7 +1484,6 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
 
        ASSERT(skc->skc_magic == SKC_MAGIC);
        ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
-       atomic_inc(&skc->skc_ref);
 
        /*
         * Run the destructor
@@ -1492,7 +1496,7 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
         */
        if (skc->skc_flags & KMC_SLAB) {
                kmem_cache_free(skc->skc_linux_cache, obj);
-               goto out;
+               return;
        }
 
        /*
@@ -1507,7 +1511,7 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
                spin_unlock(&skc->skc_lock);
 
                if (do_emergency && (spl_emergency_free(skc, obj) == 0))
-                       goto out;
+                       return;
        }
 
        local_irq_save(flags);
@@ -1538,8 +1542,6 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
 
        if (do_reclaim)
                spl_slab_reclaim(skc);
-out:
-       atomic_dec(&skc->skc_ref);
 }
 EXPORT_SYMBOL(spl_kmem_cache_free);
 
@@ -1725,7 +1727,9 @@ spl_kmem_cache_init(void)
        init_rwsem(&spl_kmem_cache_sem);
        INIT_LIST_HEAD(&spl_kmem_cache_list);
        spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
-           spl_kmem_cache_kmem_threads, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
+           spl_kmem_cache_kmem_threads, maxclsyspri,
+           spl_kmem_cache_kmem_threads * 8, INT_MAX,
+           TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
        spl_register_shrinker(&spl_kmem_cache_shrinker);
 
        return (0);
index a434ef54fd34f12c08984629e751c9b56c5eaef6..eb00505d6ee8240205276ac63a843a50d8c4db8e 100644 (file)
@@ -42,17 +42,13 @@ typedef struct ctl_table __no_const spl_ctl_table;
 typedef struct ctl_table spl_ctl_table;
 #endif
 
-#ifdef DEBUG_KMEM
 static unsigned long table_min = 0;
 static unsigned long table_max = ~0;
-#endif
 
 static struct ctl_table_header *spl_header = NULL;
 static struct proc_dir_entry *proc_spl = NULL;
-#ifdef DEBUG_KMEM
 static struct proc_dir_entry *proc_spl_kmem = NULL;
 static struct proc_dir_entry *proc_spl_kmem_slab = NULL;
-#endif /* DEBUG_KMEM */
 struct proc_dir_entry *proc_spl_kstat = NULL;
 
 static int
@@ -135,6 +131,7 @@ proc_domemused(struct ctl_table *table, int write,
 
         return (rc);
 }
+#endif /* DEBUG_KMEM */
 
 static int
 proc_doslab(struct ctl_table *table, int write,
@@ -182,7 +179,6 @@ proc_doslab(struct ctl_table *table, int write,
 
         return (rc);
 }
-#endif /* DEBUG_KMEM */
 
 static int
 proc_dohostid(struct ctl_table *table, int write,
@@ -219,7 +215,6 @@ proc_dohostid(struct ctl_table *table, int write,
         return (rc);
 }
 
-#ifdef DEBUG_KMEM
 static void
 slab_seq_show_headers(struct seq_file *f)
 {
@@ -329,10 +324,9 @@ static struct file_operations proc_slab_operations = {
         .llseek         = seq_lseek,
         .release        = seq_release,
 };
-#endif /* DEBUG_KMEM */
 
-#ifdef DEBUG_KMEM
 static struct ctl_table spl_kmem_table[] = {
+#ifdef DEBUG_KMEM
         {
                 .procname = "kmem_used",
                 .data     = &kmem_alloc_used,
@@ -353,6 +347,7 @@ static struct ctl_table spl_kmem_table[] = {
                 .mode     = 0444,
                 .proc_handler = &proc_doulongvec_minmax,
         },
+#endif /* DEBUG_KMEM */
         {
                 .procname = "slab_kmem_total",
                .data     = (void *)(KMC_KMEM | KMC_TOTAL),
@@ -409,7 +404,6 @@ static struct ctl_table spl_kmem_table[] = {
         },
        {0},
 };
-#endif /* DEBUG_KMEM */
 
 static struct ctl_table spl_kstat_table[] = {
        {0},
@@ -433,13 +427,11 @@ static struct ctl_table spl_table[] = {
                 .mode     = 0644,
                 .proc_handler = &proc_dohostid,
         },
-#ifdef DEBUG_KMEM
        {
                .procname = "kmem",
                .mode     = 0555,
                .child    = spl_kmem_table,
        },
-#endif
        {
                .procname = "kstat",
                .mode     = 0555,
@@ -484,7 +476,6 @@ spl_proc_init(void)
                goto out;
        }
 
-#ifdef DEBUG_KMEM
         proc_spl_kmem = proc_mkdir("kmem", proc_spl);
         if (proc_spl_kmem == NULL) {
                 rc = -EUNATCH;
@@ -498,8 +489,6 @@ spl_proc_init(void)
                goto out;
        }
 
-#endif /* DEBUG_KMEM */
-
         proc_spl_kstat = proc_mkdir("kstat", proc_spl);
         if (proc_spl_kstat == NULL) {
                 rc = -EUNATCH;
@@ -508,10 +497,8 @@ spl_proc_init(void)
 out:
        if (rc) {
                remove_proc_entry("kstat", proc_spl);
-#ifdef DEBUG_KMEM
                remove_proc_entry("slab", proc_spl_kmem);
                remove_proc_entry("kmem", proc_spl);
-#endif
                remove_proc_entry("spl", NULL);
                unregister_sysctl_table(spl_header);
        }
@@ -523,10 +510,8 @@ void
 spl_proc_fini(void)
 {
        remove_proc_entry("kstat", proc_spl);
-#ifdef DEBUG_KMEM
         remove_proc_entry("slab", proc_spl_kmem);
        remove_proc_entry("kmem", proc_spl);
-#endif
        remove_proc_entry("spl", NULL);
 
         ASSERT(spl_header != NULL);
index 951298d9fa7fc74f7bbff0a7742b7174ed77ed2f..89dc1bfcacd8ac7921a6c2ce99cf72f4dee71f82 100644 (file)
@@ -31,10 +31,29 @@ int spl_taskq_thread_bind = 0;
 module_param(spl_taskq_thread_bind, int, 0644);
 MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
 
+
+int spl_taskq_thread_dynamic = 0;
+module_param(spl_taskq_thread_dynamic, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads");
+
+int spl_taskq_thread_priority = 1;
+module_param(spl_taskq_thread_priority, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_priority,
+    "Allow non-default priority for taskq threads");
+
+int spl_taskq_thread_sequential = 4;
+module_param(spl_taskq_thread_sequential, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_sequential,
+    "Create new taskq threads after N sequential tasks");
+
 /* Global system-wide dynamic task queue available for all consumers */
 taskq_t *system_taskq;
 EXPORT_SYMBOL(system_taskq);
 
+/* Private dedicated taskq for creating new taskq threads on demand. */
+static taskq_t *dynamic_taskq;
+static taskq_thread_t *taskq_thread_create(taskq_t *);
+
 static int
 task_km_flags(uint_t flags)
 {
@@ -327,6 +346,33 @@ taskq_find(taskq_t *tq, taskqid_t id, int *active)
        return (NULL);
 }
 
+/*
+ * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and
+ * taskq_wait() functions below.
+ *
+ * Taskq waiting is accomplished by tracking the lowest outstanding task
+ * id and the next available task id.  As tasks are dispatched they are
+ * added to the tail of the pending, priority, or delay lists.  As worker
+ * threads become available the tasks are removed from the heads of these
+ * lists and linked to the worker threads.  This ensures the lists are
+ * kept sorted by lowest to highest task id.
+ *
+ * Therefore the lowest outstanding task id can be quickly determined by
+ * checking the head item from all of these lists.  This value is stored
+ * with the taskq as the lowest id.  It only needs to be recalculated when
+ * either the task with the current lowest id completes or is canceled.
+ *
+ * By blocking until the lowest task id exceeds the passed task id the
+ * taskq_wait_outstanding() function can be easily implemented.  Similarly,
+ * by blocking until the lowest task id matches the next task id taskq_wait()
+ * can be implemented.
+ *
+ * Callers should be aware that when there are multiple worker threads it
+ * is possible for larger task ids to complete before smaller ones.  Also
+ * when the taskq contains delay tasks with small task ids callers may
+ * block for a considerable length of time waiting for them to expire and
+ * execute.
+ */
 static int
 taskq_wait_id_check(taskq_t *tq, taskqid_t id)
 {
@@ -351,34 +397,8 @@ taskq_wait_id(taskq_t *tq, taskqid_t id)
 }
 EXPORT_SYMBOL(taskq_wait_id);
 
-/*
- * The taskq_wait() function will block until all previously submitted
- * tasks have been completed.  A previously submitted task is defined as
- * a task with a lower task id than the current task queue id.  Note that
- * all task id's are assigned monotonically at dispatch time.
- *
- * Waiting for all previous tasks to complete is accomplished by tracking
- * the lowest outstanding task id.  As tasks are dispatched they are added
- * added to the tail of the pending, priority, or delay lists.  And as
- * worker threads become available the tasks are removed from the heads
- * of these lists and linked to the worker threads.  This ensures the
- * lists are kept in lowest to highest task id order.
- *
- * Therefore the lowest outstanding task id can be quickly determined by
- * checking the head item from all of these lists.  This value is stored
- * with the task queue as the lowest id.  It only needs to be recalculated
- * when either the task with the current lowest id completes or is canceled.
- *
- * By blocking until the lowest task id exceeds the current task id when
- * the function was called we ensure all previous tasks have completed.
- *
- * NOTE: When there are multiple worked threads it is possible for larger
- * task ids to complete before smaller ones.  Conversely when the task
- * queue contains delay tasks with small task ids, you may block for a
- * considerable length of time waiting for them to expire and execute.
- */
 static int
-taskq_wait_check(taskq_t *tq, taskqid_t id)
+taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id)
 {
        int rc;
 
@@ -389,45 +409,76 @@ taskq_wait_check(taskq_t *tq, taskqid_t id)
        return (rc);
 }
 
+/*
+ * The taskq_wait_outstanding() function will block until all tasks with a
+ * lower taskqid than the passed 'id' have been completed.  Note that all
+ * task id's are assigned monotonically at dispatch time.  Zero may be
+ * passed for the id to indicate all tasks dispatched up to this point,
+ * but not after, should be waited for.
+ */
 void
-taskq_wait_all(taskq_t *tq, taskqid_t id)
+taskq_wait_outstanding(taskq_t *tq, taskqid_t id)
 {
-       wait_event(tq->tq_wait_waitq, taskq_wait_check(tq, id));
+       wait_event(tq->tq_wait_waitq,
+           taskq_wait_outstanding_check(tq, id ? id : tq->tq_next_id - 1));
 }
-EXPORT_SYMBOL(taskq_wait_all);
+EXPORT_SYMBOL(taskq_wait_outstanding);
 
-void
-taskq_wait(taskq_t *tq)
+static int
+taskq_wait_check(taskq_t *tq)
 {
-       taskqid_t id;
-
-       ASSERT(tq);
+       int rc;
 
-       /* Wait for the largest outstanding taskqid */
        spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
-       id = tq->tq_next_id - 1;
+       rc = (tq->tq_lowest_id == tq->tq_next_id);
        spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
 
-       taskq_wait_all(tq, id);
+       return (rc);
+}
+
+/*
+ * The taskq_wait() function will block until the taskq is empty.
+ * This means that if a taskq re-dispatches work to itself taskq_wait()
+ * callers will block indefinitely.
+ */
+void
+taskq_wait(taskq_t *tq)
+{
+       wait_event(tq->tq_wait_waitq, taskq_wait_check(tq));
 }
 EXPORT_SYMBOL(taskq_wait);
 
-int
-taskq_member(taskq_t *tq, void *t)
+static int
+taskq_member_impl(taskq_t *tq, void *t)
 {
        struct list_head *l;
        taskq_thread_t *tqt;
+       int found = 0;
 
        ASSERT(tq);
        ASSERT(t);
+       ASSERT(spin_is_locked(&tq->tq_lock));
 
        list_for_each(l, &tq->tq_thread_list) {
                tqt = list_entry(l, taskq_thread_t, tqt_thread_list);
-               if (tqt->tqt_thread == (struct task_struct *)t)
-                       return (1);
+               if (tqt->tqt_thread == (struct task_struct *)t) {
+                       found = 1;
+                       break;
+               }
        }
+       return (found);
+}
 
-       return (0);
+int
+taskq_member(taskq_t *tq, void *t)
+{
+       int found;
+
+       spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
+       found = taskq_member_impl(tq, t);
+       spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+
+       return (found);
 }
 EXPORT_SYMBOL(taskq_member);
 
@@ -487,6 +538,8 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id)
 }
 EXPORT_SYMBOL(taskq_cancel_id);
 
+static int taskq_thread_spawn(taskq_t *tq, int seq_tasks);
+
 taskqid_t
 taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
 {
@@ -499,7 +552,7 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
        spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
 
        /* Taskq being destroyed and all tasks drained */
-       if (!(tq->tq_flags & TQ_ACTIVE))
+       if (!(tq->tq_flags & TASKQ_ACTIVE))
                goto out;
 
        /* Do not queue the task unless there is idle thread for it */
@@ -533,6 +586,11 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
 
        wake_up(&tq->tq_work_waitq);
 out:
+       /* Spawn additional taskq threads if required. */
+       if (tq->tq_nactive == tq->tq_nthreads &&
+           taskq_member_impl(tq, current))
+               (void) taskq_thread_spawn(tq, spl_taskq_thread_sequential + 1);
+
        spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
        return (rc);
 }
@@ -551,7 +609,7 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
        spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
 
        /* Taskq being destroyed and all tasks drained */
-       if (!(tq->tq_flags & TQ_ACTIVE))
+       if (!(tq->tq_flags & TASKQ_ACTIVE))
                goto out;
 
        if ((t = task_alloc(tq, flags)) == NULL)
@@ -576,6 +634,10 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
 
        spin_unlock(&t->tqent_lock);
 out:
+       /* Spawn additional taskq threads if required. */
+       if (tq->tq_nactive == tq->tq_nthreads &&
+           taskq_member_impl(tq, current))
+               (void) taskq_thread_spawn(tq, spl_taskq_thread_sequential + 1);
        spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
        return (rc);
 }
@@ -587,12 +649,11 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
 {
        ASSERT(tq);
        ASSERT(func);
-       ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC));
 
        spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
 
        /* Taskq being destroyed and all tasks drained */
-       if (!(tq->tq_flags & TQ_ACTIVE)) {
+       if (!(tq->tq_flags & TASKQ_ACTIVE)) {
                t->tqent_id = 0;
                goto out;
        }
@@ -621,6 +682,10 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
 
        wake_up(&tq->tq_work_waitq);
 out:
+       /* Spawn additional taskq threads if required. */
+       if (tq->tq_nactive == tq->tq_nthreads &&
+           taskq_member_impl(tq, current))
+               (void) taskq_thread_spawn(tq, spl_taskq_thread_sequential + 1);
        spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
 }
 EXPORT_SYMBOL(taskq_dispatch_ent);
@@ -647,6 +712,97 @@ taskq_init_ent(taskq_ent_t *t)
 }
 EXPORT_SYMBOL(taskq_init_ent);
 
+/*
+ * Return the next pending task, preference is given to tasks on the
+ * priority list which were dispatched with TQ_FRONT.
+ */
+static taskq_ent_t *
+taskq_next_ent(taskq_t *tq)
+{
+       struct list_head *list;
+
+       ASSERT(spin_is_locked(&tq->tq_lock));
+
+       if (!list_empty(&tq->tq_prio_list))
+               list = &tq->tq_prio_list;
+       else if (!list_empty(&tq->tq_pend_list))
+               list = &tq->tq_pend_list;
+       else
+               return (NULL);
+
+       return (list_entry(list->next, taskq_ent_t, tqent_list));
+}
+
+/*
+ * Spawns a new thread for the specified taskq.
+ */
+static void
+taskq_thread_spawn_task(void *arg)
+{
+       taskq_t *tq = (taskq_t *)arg;
+
+       (void) taskq_thread_create(tq);
+
+       spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
+       tq->tq_nspawn--;
+       spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+}
+
+/*
+ * Spawn additional threads for dynamic taskqs (TASKQ_DYNAMIC) when the
+ * current number of threads is insufficient to handle the pending tasks.
+ * These new threads must be created by the dedicated dynamic_taskq to avoid
+ * deadlocks between thread creation and memory reclaim.  The system_taskq
+ * which is also a dynamic taskq cannot be safely used for this.
+ */
+static int
+taskq_thread_spawn(taskq_t *tq, int seq_tasks)
+{
+       int spawning = 0;
+
+       if (!(tq->tq_flags & TASKQ_DYNAMIC))
+               return (0);
+
+       if ((seq_tasks > spl_taskq_thread_sequential) &&
+           (tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) &&
+           (tq->tq_flags & TASKQ_ACTIVE)) {
+               spawning = (++tq->tq_nspawn);
+               taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task,
+                   tq, TQ_NOSLEEP);
+       }
+
+       return (spawning);
+}
+
+/*
+ * Threads in a dynamic taskq should only exit once it has been completely
+ * drained and no other threads are actively servicing tasks.  This prevents
+ * threads from being created and destroyed more than is required.
+ *
+ * The first thread in the thread list is treated as the primary thread.
+ * There is nothing special about the primary thread but in order to avoid
+ * all the taskq pids from changing we opt to make it long running.
+ */
+static int
+taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt)
+{
+       ASSERT(spin_is_locked(&tq->tq_lock));
+
+       if (!(tq->tq_flags & TASKQ_DYNAMIC))
+               return (0);
+
+       if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t,
+           tqt_thread_list) == tqt)
+               return (0);
+
+       return
+           ((tq->tq_nspawn == 0) &&    /* No threads are being spawned */
+           (tq->tq_nactive == 0) &&    /* No threads are handling tasks */
+           (tq->tq_nthreads > 1) &&    /* More than 1 thread is running */
+           (!taskq_next_ent(tq)) &&    /* There are no pending tasks */
+           (spl_taskq_thread_dynamic));/* Dynamic taskqs are allowed */
+}
+
 static int
 taskq_thread(void *args)
 {
@@ -655,18 +811,28 @@ taskq_thread(void *args)
        taskq_thread_t *tqt = args;
        taskq_t *tq;
        taskq_ent_t *t;
-       struct list_head *pend_list;
+       int seq_tasks = 0;
 
        ASSERT(tqt);
        tq = tqt->tqt_tq;
        current->flags |= PF_NOFREEZE;
 
+       #if defined(PF_MEMALLOC_NOIO)
+       (void) memalloc_noio_save();
+       #endif
+
        sigfillset(&blocked);
        sigprocmask(SIG_BLOCK, &blocked, NULL);
        flush_signals(current);
 
        spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
+
+       /* Immediately exit if more threads than allowed were created. */
+       if (tq->tq_nthreads >= tq->tq_maxthreads)
+               goto error;
+
        tq->tq_nthreads++;
+       list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list);
        wake_up(&tq->tq_wait_waitq);
        set_current_state(TASK_INTERRUPTIBLE);
 
@@ -674,25 +840,25 @@ taskq_thread(void *args)
 
                if (list_empty(&tq->tq_pend_list) &&
                    list_empty(&tq->tq_prio_list)) {
+
+                       if (taskq_thread_should_stop(tq, tqt)) {
+                               wake_up_all(&tq->tq_wait_waitq);
+                               break;
+                       }
+
                        add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
                        spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+
                        schedule();
+                       seq_tasks = 0;
+
                        spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
                        remove_wait_queue(&tq->tq_work_waitq, &wait);
                } else {
                        __set_current_state(TASK_RUNNING);
                }
 
-
-               if (!list_empty(&tq->tq_prio_list))
-                       pend_list = &tq->tq_prio_list;
-               else if (!list_empty(&tq->tq_pend_list))
-                       pend_list = &tq->tq_pend_list;
-               else
-                       pend_list = NULL;
-
-               if (pend_list) {
-                       t = list_entry(pend_list->next,taskq_ent_t,tqent_list);
+               if ((t = taskq_next_ent(tq)) != NULL) {
                        list_del_init(&t->tqent_list);
 
                        /* In order to support recursively dispatching a
@@ -721,8 +887,7 @@ taskq_thread(void *args)
                        tqt->tqt_task = NULL;
 
                        /* For prealloc'd tasks, we don't free anything. */
-                       if ((tq->tq_flags & TASKQ_DYNAMIC) ||
-                           !(tqt->tqt_flags & TQENT_FLAG_PREALLOC))
+                       if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC))
                                task_done(tq, t);
 
                        /* When the current lowest outstanding taskqid is
@@ -732,9 +897,16 @@ taskq_thread(void *args)
                                ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id);
                        }
 
+                       /* Spawn additional taskq threads if required. */
+                       if (taskq_thread_spawn(tq, ++seq_tasks))
+                               seq_tasks = 0;
+
                        tqt->tqt_id = 0;
                        tqt->tqt_flags = 0;
                        wake_up_all(&tq->tq_wait_waitq);
+               } else {
+                       if (taskq_thread_should_stop(tq, tqt))
+                               break;
                }
 
                set_current_state(TASK_INTERRUPTIBLE);
@@ -744,27 +916,57 @@ taskq_thread(void *args)
        __set_current_state(TASK_RUNNING);
        tq->tq_nthreads--;
        list_del_init(&tqt->tqt_thread_list);
-       kmem_free(tqt, sizeof(taskq_thread_t));
-
+error:
+       kmem_free(tqt, sizeof (taskq_thread_t));
        spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
 
        return (0);
 }
 
+static taskq_thread_t *
+taskq_thread_create(taskq_t *tq)
+{
+       static int last_used_cpu = 0;
+       taskq_thread_t *tqt;
+
+       tqt = kmem_alloc(sizeof (*tqt), KM_PUSHPAGE);
+       INIT_LIST_HEAD(&tqt->tqt_thread_list);
+       INIT_LIST_HEAD(&tqt->tqt_active_list);
+       tqt->tqt_tq = tq;
+       tqt->tqt_id = 0;
+
+       tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt,
+           "%s", tq->tq_name);
+       if (tqt->tqt_thread == NULL) {
+               kmem_free(tqt, sizeof (taskq_thread_t));
+               return (NULL);
+       }
+
+       if (spl_taskq_thread_bind) {
+               last_used_cpu = (last_used_cpu + 1) % num_online_cpus();
+               kthread_bind(tqt->tqt_thread, last_used_cpu);
+       }
+
+       if (spl_taskq_thread_priority)
+               set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(tq->tq_pri));
+
+       wake_up_process(tqt->tqt_thread);
+
+       return (tqt);
+}
+
 taskq_t *
 taskq_create(const char *name, int nthreads, pri_t pri,
     int minalloc, int maxalloc, uint_t flags)
 {
-       static int last_used_cpu = 0;
        taskq_t *tq;
        taskq_thread_t *tqt;
-       int rc = 0, i, j = 0;
+       int count = 0, rc = 0, i;
 
        ASSERT(name != NULL);
-       ASSERT(pri <= maxclsyspri);
        ASSERT(minalloc >= 0);
        ASSERT(maxalloc <= INT_MAX);
-       ASSERT(!(flags & (TASKQ_CPR_SAFE | TASKQ_DYNAMIC))); /* Unsupported */
+       ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */
 
        /* Scale the number of threads using nthreads as a percentage */
        if (flags & TASKQ_THREADS_CPU_PCT) {
@@ -775,24 +977,25 @@ taskq_create(const char *name, int nthreads, pri_t pri,
                nthreads = MAX((num_online_cpus() * nthreads) / 100, 1);
        }
 
-       tq = kmem_alloc(sizeof(*tq), KM_PUSHPAGE);
+       tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE);
        if (tq == NULL)
                return (NULL);
 
        spin_lock_init(&tq->tq_lock);
-       spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
        INIT_LIST_HEAD(&tq->tq_thread_list);
        INIT_LIST_HEAD(&tq->tq_active_list);
-       tq->tq_name      = name;
-       tq->tq_nactive   = 0;
-       tq->tq_nthreads  = 0;
-       tq->tq_pri       = pri;
-       tq->tq_minalloc  = minalloc;
-       tq->tq_maxalloc  = maxalloc;
-       tq->tq_nalloc    = 0;
-       tq->tq_flags     = (flags | TQ_ACTIVE);
-       tq->tq_next_id   = 1;
-       tq->tq_lowest_id = 1;
+       tq->tq_name       = strdup(name);
+       tq->tq_nactive    = 0;
+       tq->tq_nthreads   = 0;
+       tq->tq_nspawn     = 0;
+       tq->tq_maxthreads = nthreads;
+       tq->tq_pri        = pri;
+       tq->tq_minalloc   = minalloc;
+       tq->tq_maxalloc   = maxalloc;
+       tq->tq_nalloc     = 0;
+       tq->tq_flags      = (flags | TASKQ_ACTIVE);
+       tq->tq_next_id    = 1;
+       tq->tq_lowest_id  = 1;
        INIT_LIST_HEAD(&tq->tq_free_list);
        INIT_LIST_HEAD(&tq->tq_pend_list);
        INIT_LIST_HEAD(&tq->tq_prio_list);
@@ -800,38 +1003,28 @@ taskq_create(const char *name, int nthreads, pri_t pri,
        init_waitqueue_head(&tq->tq_work_waitq);
        init_waitqueue_head(&tq->tq_wait_waitq);
 
-       if (flags & TASKQ_PREPOPULATE)
+       if (flags & TASKQ_PREPOPULATE) {
+               spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
+
                for (i = 0; i < minalloc; i++)
                        task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW));
 
-       spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+               spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+       }
+
+       if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic)
+               nthreads = 1;
 
        for (i = 0; i < nthreads; i++) {
-               tqt = kmem_alloc(sizeof(*tqt), KM_PUSHPAGE);
-               INIT_LIST_HEAD(&tqt->tqt_thread_list);
-               INIT_LIST_HEAD(&tqt->tqt_active_list);
-               tqt->tqt_tq = tq;
-               tqt->tqt_id = 0;
-
-               tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt,
-                   "%s/%d", name, i);
-               if (tqt->tqt_thread) {
-                       list_add(&tqt->tqt_thread_list, &tq->tq_thread_list);
-                       if (spl_taskq_thread_bind) {
-                               last_used_cpu = (last_used_cpu + 1) % num_online_cpus();
-                               kthread_bind(tqt->tqt_thread, last_used_cpu);
-                       }
-                       set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(pri));
-                       wake_up_process(tqt->tqt_thread);
-                       j++;
-               } else {
-                       kmem_free(tqt, sizeof(taskq_thread_t));
+               tqt = taskq_thread_create(tq);
+               if (tqt == NULL)
                        rc = 1;
-               }
+               else
+                       count++;
        }
 
        /* Wait for all threads to be started before potential destroy */
-       wait_event(tq->tq_wait_waitq, tq->tq_nthreads == j);
+       wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count);
 
        if (rc) {
                taskq_destroy(tq);
@@ -851,10 +1044,16 @@ taskq_destroy(taskq_t *tq)
 
        ASSERT(tq);
        spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
-       tq->tq_flags &= ~TQ_ACTIVE;
+       tq->tq_flags &= ~TASKQ_ACTIVE;
        spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
 
-       /* TQ_ACTIVE cleared prevents new tasks being added to pending */
+       /*
+        * When TASKQ_ACTIVE is clear, new tasks may not be added, nor may
+        * new worker threads be spawned for a dynamic taskq.
+        */
+       if (dynamic_taskq != NULL)
+               taskq_wait_outstanding(dynamic_taskq, 0);
+
        taskq_wait(tq);
 
        spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
@@ -867,7 +1066,7 @@ taskq_destroy(taskq_t *tq)
         */
        while (!list_empty(&tq->tq_thread_list)) {
                tqt = list_entry(tq->tq_thread_list.next,
-                                taskq_thread_t, tqt_thread_list);
+                   taskq_thread_t, tqt_thread_list);
                thread = tqt->tqt_thread;
                spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
 
@@ -885,8 +1084,9 @@ taskq_destroy(taskq_t *tq)
                task_free(tq, t);
        }
 
-       ASSERT(tq->tq_nthreads == 0);
-       ASSERT(tq->tq_nalloc == 0);
+       ASSERT0(tq->tq_nthreads);
+       ASSERT0(tq->tq_nalloc);
+       ASSERT0(tq->tq_nspawn);
        ASSERT(list_empty(&tq->tq_thread_list));
        ASSERT(list_empty(&tq->tq_active_list));
        ASSERT(list_empty(&tq->tq_free_list));
@@ -896,25 +1096,35 @@ taskq_destroy(taskq_t *tq)
 
        spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
 
-       kmem_free(tq, sizeof(taskq_t));
+       strfree(tq->tq_name);
+       kmem_free(tq, sizeof (taskq_t));
 }
 EXPORT_SYMBOL(taskq_destroy);
 
 int
 spl_taskq_init(void)
 {
-       /* Solaris creates a dynamic taskq of up to 64 threads, however in
-        * a Linux environment 1 thread per-core is usually about right */
-       system_taskq = taskq_create("spl_system_taskq", num_online_cpus(),
-                                   minclsyspri, 4, 512, TASKQ_PREPOPULATE);
+       system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64),
+           maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
        if (system_taskq == NULL)
                return (1);
 
+       dynamic_taskq = taskq_create("spl_dynamic_taskq", 1,
+           maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE);
+       if (dynamic_taskq == NULL) {
+               taskq_destroy(system_taskq);
+               return (1);
+       }
+
        return (0);
 }
 
 void
 spl_taskq_fini(void)
 {
+       taskq_destroy(dynamic_taskq);
+       dynamic_taskq = NULL;
+
        taskq_destroy(system_taskq);
+       system_taskq = NULL;
 }
index 9a0987527b2705d9bd5a1b0f9be2a8add67ffda6..4d0800e5a11f5d4b570c79d46dd33ccb3f3de1cf 100644 (file)
@@ -1,4 +1,4 @@
-/*****************************************************************************\
+/*
  *  Copyright (C) 2010 Lawrence Livermore National Security, LLC.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
@@ -19,7 +19,8 @@
  *
  *  You should have received a copy of the GNU General Public License along
  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
- *****************************************************************************
+ *
+ *
  *  Solaris Porting Layer (SPL) Thread Specific Data Implementation.
  *
  *  Thread specific data has implemented using a hash table, this avoids
@@ -56,7 +57,7 @@
  *  so if your using the Solaris thread API you should not need to call
  *  tsd_exit() directly.
  *
-\*****************************************************************************/
+ */
 
 #include <sys/kmem.h>
 #include <sys/thread.h>
@@ -136,7 +137,7 @@ tsd_hash_dtor(struct hlist_head *work)
                if (entry->he_dtor && entry->he_pid != DTOR_PID)
                        entry->he_dtor(entry->he_value);
 
-               kmem_free(entry, sizeof(tsd_hash_entry_t));
+               kmem_free(entry, sizeof (tsd_hash_entry_t));
        }
 }
 
@@ -163,7 +164,7 @@ tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value)
        ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL);
 
        /* New entry allocate structure, set value, and add to hash */
-       entry = kmem_alloc(sizeof(tsd_hash_entry_t), KM_PUSHPAGE);
+       entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
        if (entry == NULL)
                return (ENOMEM);
 
@@ -222,7 +223,7 @@ tsd_hash_add_key(tsd_hash_table_t *table, uint_t *keyp, dtor_func_t dtor)
        ASSERT3P(table, !=, NULL);
 
        /* Allocate entry to be used as a destructor for this key */
-       entry = kmem_alloc(sizeof(tsd_hash_entry_t), KM_PUSHPAGE);
+       entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
        if (entry == NULL)
                return (ENOMEM);
 
@@ -280,7 +281,7 @@ tsd_hash_add_pid(tsd_hash_table_t *table, pid_t pid)
        ulong_t hash;
 
        /* Allocate entry to be used as the process reference */
-       entry = kmem_alloc(sizeof(tsd_hash_entry_t), KM_PUSHPAGE);
+       entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
        if (entry == NULL)
                return (ENOMEM);
 
@@ -333,13 +334,13 @@ tsd_hash_table_init(uint_t bits)
        tsd_hash_table_t *table;
        int hash, size = (1 << bits);
 
-       table = kmem_zalloc(sizeof(tsd_hash_table_t), KM_SLEEP);
+       table = kmem_zalloc(sizeof (tsd_hash_table_t), KM_SLEEP);
        if (table == NULL)
                return (NULL);
 
-       table->ht_bins = kmem_zalloc(sizeof(tsd_hash_bin_t) * size, KM_SLEEP);
+       table->ht_bins = kmem_zalloc(sizeof (tsd_hash_bin_t) * size, KM_SLEEP);
        if (table->ht_bins == NULL) {
-               kmem_free(table, sizeof(tsd_hash_table_t));
+               kmem_free(table, sizeof (tsd_hash_table_t));
                return (NULL);
        }
 
@@ -376,9 +377,9 @@ tsd_hash_table_fini(tsd_hash_table_t *table)
        for (i = 0, size = (1 << table->ht_bits); i < size; i++) {
                bin = &table->ht_bins[i];
                spin_lock(&bin->hb_lock);
-               while (!hlist_empty(&bin->hb_head)) {
+               while (!hlist_empty(&bin->hb_head)) {
                        entry = hlist_entry(bin->hb_head.first,
-                                           tsd_hash_entry_t, he_list);
+                           tsd_hash_entry_t, he_list);
                        tsd_hash_del(table, entry);
                        hlist_add_head(&entry->he_list, &work);
                }
@@ -387,8 +388,62 @@ tsd_hash_table_fini(tsd_hash_table_t *table)
        spin_unlock(&table->ht_lock);
 
        tsd_hash_dtor(&work);
-       kmem_free(table->ht_bins, sizeof(tsd_hash_bin_t)*(1<<table->ht_bits));
-       kmem_free(table, sizeof(tsd_hash_table_t));
+       kmem_free(table->ht_bins, sizeof (tsd_hash_bin_t)*(1<<table->ht_bits));
+       kmem_free(table, sizeof (tsd_hash_table_t));
+}
+
+/*
+ * tsd_remove_entry - remove a tsd entry for this thread
+ * @entry: entry to remove
+ *
+ * Remove the thread specific data @entry for this thread.
+ * If this is the last entry for this thread, also remove the PID entry.
+ */
+static void
+tsd_remove_entry(tsd_hash_entry_t *entry)
+{
+       HLIST_HEAD(work);
+       tsd_hash_table_t *table;
+       tsd_hash_entry_t *pid_entry;
+       tsd_hash_bin_t *pid_entry_bin, *entry_bin;
+       ulong_t hash;
+
+       table = tsd_hash_table;
+       ASSERT3P(table, !=, NULL);
+       ASSERT3P(entry, !=, NULL);
+
+       spin_lock(&table->ht_lock);
+
+       hash = hash_long((ulong_t)entry->he_key *
+           (ulong_t)entry->he_pid, table->ht_bits);
+       entry_bin = &table->ht_bins[hash];
+
+       /* save the possible pid_entry */
+       pid_entry = list_entry(entry->he_pid_list.next, tsd_hash_entry_t,
+           he_pid_list);
+
+       /* remove entry */
+       spin_lock(&entry_bin->hb_lock);
+       tsd_hash_del(table, entry);
+       hlist_add_head(&entry->he_list, &work);
+       spin_unlock(&entry_bin->hb_lock);
+
+       /* if it really is the PID entry and is now empty, remove it too */
+       if (pid_entry->he_key == PID_KEY &&
+           list_empty(&pid_entry->he_pid_list)) {
+               hash = hash_long((ulong_t)pid_entry->he_key *
+                   (ulong_t)pid_entry->he_pid, table->ht_bits);
+               pid_entry_bin = &table->ht_bins[hash];
+
+               spin_lock(&pid_entry_bin->hb_lock);
+               tsd_hash_del(table, pid_entry);
+               hlist_add_head(&pid_entry->he_list, &work);
+               spin_unlock(&pid_entry_bin->hb_lock);
+       }
+
+       spin_unlock(&table->ht_lock);
+
+       tsd_hash_dtor(&work);
 }
 
 /*
@@ -409,6 +464,8 @@ tsd_set(uint_t key, void *value)
        tsd_hash_entry_t *entry;
        pid_t pid;
        int rc;
+       /* mark remove if value is NULL */
+       boolean_t remove = (value == NULL);
 
        table = tsd_hash_table;
        pid = curthread->pid;
@@ -421,9 +478,16 @@ tsd_set(uint_t key, void *value)
        entry = tsd_hash_search(table, key, pid);
        if (entry) {
                entry->he_value = value;
+               /* remove the entry */
+               if (remove)
+                       tsd_remove_entry(entry);
                return (0);
        }
 
+       /* don't create entry if value is NULL */
+       if (remove)
+               return (0);
+
        /* Add a process entry to the hash if not yet exists */
        entry = tsd_hash_search(table, PID_KEY, pid);
        if (entry == NULL) {
@@ -482,7 +546,7 @@ tsd_create(uint_t *keyp, dtor_func_t dtor)
        if (*keyp)
                return;
 
-       (void)tsd_hash_add_key(tsd_hash_table, keyp, dtor);
+       (void) tsd_hash_add_key(tsd_hash_table, keyp, dtor);
 }
 EXPORT_SYMBOL(tsd_create);
 
@@ -519,14 +583,14 @@ tsd_destroy(uint_t *keyp)
         * DTOR_PID entry.  They are removed from the hash table and
         * linked in to a private working list to be destroyed.
         */
-        while (!list_empty(&dtor_entry->he_key_list)) {
+       while (!list_empty(&dtor_entry->he_key_list)) {
                entry = list_entry(dtor_entry->he_key_list.next,
-                                  tsd_hash_entry_t, he_key_list);
+                   tsd_hash_entry_t, he_key_list);
                ASSERT3U(dtor_entry->he_key, ==, entry->he_key);
                ASSERT3P(dtor_entry->he_dtor, ==, entry->he_dtor);
 
                hash = hash_long((ulong_t)entry->he_key *
-                    (ulong_t)entry->he_pid, table->ht_bits);
+                   (ulong_t)entry->he_pid, table->ht_bits);
                entry_bin = &table->ht_bins[hash];
 
                spin_lock(&entry_bin->hb_lock);
@@ -583,9 +647,9 @@ tsd_exit(void)
         * linked in to a private working list to be destroyed.
         */
 
-        while (!list_empty(&pid_entry->he_pid_list)) {
+       while (!list_empty(&pid_entry->he_pid_list)) {
                entry = list_entry(pid_entry->he_pid_list.next,
-                                  tsd_hash_entry_t, he_pid_list);
+                   tsd_hash_entry_t, he_pid_list);
                ASSERT3U(pid_entry->he_pid, ==, entry->he_pid);
 
                hash = hash_long((ulong_t)entry->he_key *
index 1e26b8e29a89e736f82e26a2a87e5606326bc889..ab9830d1867a04ea6577d534110d6f41a229b85c 100644 (file)
@@ -623,14 +623,14 @@ EXPORT_SYMBOL(vn_space);
 
 /* Function must be called while holding the vn_file_lock */
 static file_t *
-file_find(int fd)
+file_find(int fd, struct task_struct *task)
 {
         file_t *fp;
 
        ASSERT(spin_is_locked(&vn_file_lock));
 
         list_for_each_entry(fp, &vn_file_list,  f_list) {
-               if (fd == fp->f_fd && fp->f_task == current) {
+               if (fd == fp->f_fd && fp->f_task == task) {
                        ASSERT(atomic_read(&fp->f_ref) != 0);
                         return fp;
                }
@@ -648,10 +648,13 @@ vn_getf(int fd)
        vnode_t *vp;
        int rc = 0;
 
+       if (fd < 0)
+               return (NULL);
+
        /* Already open just take an extra reference */
        spin_lock(&vn_file_lock);
 
-       fp = file_find(fd);
+       fp = file_find(fd, current);
        if (fp) {
                atomic_inc(&fp->f_ref);
                spin_unlock(&vn_file_lock);
@@ -730,11 +733,22 @@ static void releasef_locked(file_t *fp)
 
 void
 vn_releasef(int fd)
+{
+       areleasef(fd, P_FINFO(current));
+}
+EXPORT_SYMBOL(releasef);
+
+void
+vn_areleasef(int fd, uf_info_t *fip)
 {
        file_t *fp;
+       struct task_struct *task = (struct task_struct *)fip;
+
+       if (fd < 0)
+               return;
 
        spin_lock(&vn_file_lock);
-       fp = file_find(fd);
+       fp = file_find(fd, task);
        if (fp) {
                atomic_dec(&fp->f_ref);
                if (atomic_read(&fp->f_ref) > 0) {
@@ -749,7 +763,8 @@ vn_releasef(int fd)
 
        return;
 } /* releasef() */
-EXPORT_SYMBOL(releasef);
+EXPORT_SYMBOL(areleasef);
+
 
 static void
 #ifdef HAVE_SET_FS_PWD_WITH_CONST
index f4065196b64e337f11a6403e107f554308c4b88d..680f28492b46b75fce8e2cecdc93c192cb609016 100644 (file)
@@ -1,25 +1,28 @@
 # Makefile.in for splat kernel module
 
+src = @abs_top_srcdir@/module/splat
+obj = @abs_builddir@
+
 MODULE := splat
 EXTRA_CFLAGS = $(SPL_MODULE_CFLAGS) @KERNELCPPFLAGS@
 
 # Solaris Porting LAyer Tests
 obj-$(CONFIG_SPL) := $(MODULE).o
 
-$(MODULE)-objs += @top_srcdir@/module/splat/splat-ctl.o
-$(MODULE)-objs += @top_srcdir@/module/splat/splat-kmem.o
-$(MODULE)-objs += @top_srcdir@/module/splat/splat-taskq.o
-$(MODULE)-objs += @top_srcdir@/module/splat/splat-random.o
-$(MODULE)-objs += @top_srcdir@/module/splat/splat-mutex.o
-$(MODULE)-objs += @top_srcdir@/module/splat/splat-condvar.o
-$(MODULE)-objs += @top_srcdir@/module/splat/splat-thread.o
-$(MODULE)-objs += @top_srcdir@/module/splat/splat-rwlock.o
-$(MODULE)-objs += @top_srcdir@/module/splat/splat-time.o
-$(MODULE)-objs += @top_srcdir@/module/splat/splat-vnode.o
-$(MODULE)-objs += @top_srcdir@/module/splat/splat-kobj.o
-$(MODULE)-objs += @top_srcdir@/module/splat/splat-atomic.o
-$(MODULE)-objs += @top_srcdir@/module/splat/splat-list.o
-$(MODULE)-objs += @top_srcdir@/module/splat/splat-generic.o
-$(MODULE)-objs += @top_srcdir@/module/splat/splat-cred.o
-$(MODULE)-objs += @top_srcdir@/module/splat/splat-zlib.o
-$(MODULE)-objs += @top_srcdir@/module/splat/splat-linux.o
+$(MODULE)-objs += splat-ctl.o
+$(MODULE)-objs += splat-kmem.o
+$(MODULE)-objs += splat-taskq.o
+$(MODULE)-objs += splat-random.o
+$(MODULE)-objs += splat-mutex.o
+$(MODULE)-objs += splat-condvar.o
+$(MODULE)-objs += splat-thread.o
+$(MODULE)-objs += splat-rwlock.o
+$(MODULE)-objs += splat-time.o
+$(MODULE)-objs += splat-vnode.o
+$(MODULE)-objs += splat-kobj.o
+$(MODULE)-objs += splat-atomic.o
+$(MODULE)-objs += splat-list.o
+$(MODULE)-objs += splat-generic.o
+$(MODULE)-objs += splat-cred.o
+$(MODULE)-objs += splat-zlib.o
+$(MODULE)-objs += splat-linux.o
index e94f42f00b15da2ea1cf7799aa6e3fd294ab1510..999f4f0587676429107c56f1d7aff1ad042a8eca 100644 (file)
@@ -156,7 +156,7 @@ splat_atomic_test1(struct file *file, void *arg)
 
                thr = (kthread_t *)thread_create(NULL, 0, splat_atomic_work,
                                                 &ap, 0, &p0, TS_RUN,
-                                                minclsyspri);
+                                                defclsyspri);
                if (thr == NULL) {
                        rc = -ESRCH;
                        mutex_exit(&ap.ap_lock);
index 832132696d885d7f48a8a11f9708db521b442ce3..d00af90fa748d69f5d93cc06ac249ef6ccf29d59 100644 (file)
@@ -28,6 +28,7 @@
 #include "splat-ctl.h"
 #include <sys/mutex.h>
 #include <linux/file_compat.h>
+#include <linux/version.h>
 
 #define SPLAT_SUBSYSTEM_INIT(type)                                      \
 ({      splat_subsystem_t *_sub_;                                       \
index cd0000bae67138c3031d645bd155962238d62bf2..b3fd1a84dc871a4d95deaafcf9c88dfd6986860d 100644 (file)
@@ -739,7 +739,7 @@ splat_kmem_cache_thread_test(struct file *file, void *arg, char *name,
        for (i = 0; i < SPLAT_KMEM_THREADS; i++) {
                thr = thread_create(NULL, 0,
                                    splat_kmem_cache_test_thread,
-                                   kcp, 0, &p0, TS_RUN, minclsyspri);
+                                   kcp, 0, &p0, TS_RUN, defclsyspri);
                if (thr == NULL) {
                        rc = -ESRCH;
                        goto out_cache;
index 909d730cb014453e1bdca80d041ceabc6f3e5896..86bef8ee31be6a2cb2dd26166a6a7a4e08a4cf52 100644 (file)
@@ -87,7 +87,7 @@ splat_mutex_test1(struct file *file, void *arg)
         if (mp == NULL)
                 return -ENOMEM;
 
-        tq = taskq_create(SPLAT_MUTEX_TEST_TASKQ, 1, maxclsyspri,
+        tq = taskq_create(SPLAT_MUTEX_TEST_TASKQ, 1, defclsyspri,
                           50, INT_MAX, TASKQ_PREPOPULATE);
         if (tq == NULL) {
                 rc = -ENOMEM;
@@ -196,7 +196,7 @@ splat_mutex_test2(struct file *file, void *arg)
 
         /* Create several threads allowing tasks to race with each other */
         tq = taskq_create(SPLAT_MUTEX_TEST_TASKQ, num_online_cpus(),
-                          maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
+                          defclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
         if (tq == NULL) {
                 rc = -ENOMEM;
                 goto out;
@@ -266,7 +266,7 @@ splat_mutex_test3(struct file *file, void *arg)
         mp.mp_file = file;
         mutex_init(&mp.mp_mtx, SPLAT_MUTEX_TEST_NAME, MUTEX_DEFAULT, NULL);
 
-        if ((tq = taskq_create(SPLAT_MUTEX_TEST_NAME, 1, maxclsyspri,
+        if ((tq = taskq_create(SPLAT_MUTEX_TEST_NAME, 1, defclsyspri,
                                50, INT_MAX, TASKQ_PREPOPULATE)) == NULL) {
                 splat_vprint(file, SPLAT_MUTEX_TEST3_NAME, "Taskq '%s' "
                              "create failed\n", SPLAT_MUTEX_TEST3_NAME);
index 6c623792e645eedb420be3416adcd7382825f388..284f77370d37dc245dd2982ac947e362bce12347 100644 (file)
@@ -327,7 +327,7 @@ splat_rwlock_test2(struct file *file, void *arg)
 
        /* Create several threads allowing tasks to race with each other */
        tq = taskq_create(SPLAT_RWLOCK_TEST_TASKQ, num_online_cpus(),
-                         maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
+                         defclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
        if (tq == NULL) {
                rc = -ENOMEM;
                goto out;
@@ -500,7 +500,7 @@ splat_rwlock_test4(struct file *file, void *arg)
        if (rwp == NULL)
                return -ENOMEM;
 
-       tq = taskq_create(SPLAT_RWLOCK_TEST_TASKQ, 1, maxclsyspri,
+       tq = taskq_create(SPLAT_RWLOCK_TEST_TASKQ, 1, defclsyspri,
                          50, INT_MAX, TASKQ_PREPOPULATE);
        if (tq == NULL) {
                rc = -ENOMEM;
index 2787bf401b0748c6eefa1d64334eceb86ef4c0d1..8f06f413d5bc95d60cc9fc906a86ff193218713e 100644 (file)
@@ -28,6 +28,7 @@
 #include <sys/vmem.h>
 #include <sys/random.h>
 #include <sys/taskq.h>
+#include <sys/time.h>
 #include <sys/timer.h>
 #include <linux/delay.h>
 #include "splat-internal.h"
 #define SPLAT_TASKQ_TEST10_NAME                "cancel"
 #define SPLAT_TASKQ_TEST10_DESC                "Cancel task execution"
 
+#define SPLAT_TASKQ_TEST11_ID          0x020b
+#define SPLAT_TASKQ_TEST11_NAME                "dynamic"
+#define SPLAT_TASKQ_TEST11_DESC                "Dynamic task queue thread creation"
+
 #define SPLAT_TASKQ_ORDER_MAX          8
 #define SPLAT_TASKQ_DEPTH_MAX          16
 
@@ -129,7 +134,7 @@ splat_taskq_test1_impl(struct file *file, void *arg, boolean_t prealloc)
                     "Taskq '%s' creating (%s dispatch)\n",
                     SPLAT_TASKQ_TEST1_NAME,
                     prealloc ? "prealloc" : "dynamic");
-       if ((tq = taskq_create(SPLAT_TASKQ_TEST1_NAME, 1, maxclsyspri,
+       if ((tq = taskq_create(SPLAT_TASKQ_TEST1_NAME, 1, defclsyspri,
                               50, INT_MAX, TASKQ_PREPOPULATE)) == NULL) {
                splat_vprint(file, SPLAT_TASKQ_TEST1_NAME,
                           "Taskq '%s' create failed\n",
@@ -264,7 +269,7 @@ splat_taskq_test2_impl(struct file *file, void *arg, boolean_t prealloc) {
                             prealloc ? "prealloc" : "dynamic");
                if ((tq[i] = taskq_create(SPLAT_TASKQ_TEST2_NAME,
                                          TEST2_THREADS_PER_TASKQ,
-                                         maxclsyspri, 50, INT_MAX,
+                                         defclsyspri, 50, INT_MAX,
                                          TASKQ_PREPOPULATE)) == NULL) {
                        splat_vprint(file, SPLAT_TASKQ_TEST2_NAME,
                                   "Taskq '%s/%d' create failed\n",
@@ -489,7 +494,7 @@ splat_taskq_test4_common(struct file *file, void *arg, int minalloc,
                     SPLAT_TASKQ_TEST4_NAME,
                     prealloc ? "prealloc" : "dynamic",
                     minalloc, maxalloc, nr_tasks);
-       if ((tq = taskq_create(SPLAT_TASKQ_TEST4_NAME, 1, maxclsyspri,
+       if ((tq = taskq_create(SPLAT_TASKQ_TEST4_NAME, 1, defclsyspri,
                               minalloc, maxalloc, TASKQ_PREPOPULATE)) == NULL) {
                splat_vprint(file, SPLAT_TASKQ_TEST4_NAME,
                             "Taskq '%s' create failed\n",
@@ -588,10 +593,10 @@ splat_taskq_test4(struct file *file, void *arg)
  * next pending task as soon as it completes its current task.  This
  * means that tasks do not strictly complete in order in which they
  * were dispatched (increasing task id).  This is fine but we need to
- * verify that taskq_wait_all() blocks until the passed task id and all
- * lower task ids complete.  We do this by dispatching the following
+ * verify taskq_wait_outstanding() blocks until the passed task id and
+ * all lower task ids complete.  We do this by dispatching the following
  * specific sequence of tasks each of which block for N time units.
- * We then use taskq_wait_all() to unblock at specific task id and
+ * We then use taskq_wait_outstanding() to unblock at specific task id and
  * verify the only the expected task ids have completed and in the
  * correct order.  The two cases of interest are:
  *
@@ -602,17 +607,17 @@ splat_taskq_test4(struct file *file, void *arg)
  *
  * The following table shows each task id and how they will be
  * scheduled.  Each rows represent one time unit and each column
- * one of the three worker threads.  The places taskq_wait_all()
+ * one of the three worker threads.  The places taskq_wait_outstanding()
  * must unblock for a specific id are identified as well as the
  * task ids which must have completed and their order.
  *
- *       +-----+       <--- taskq_wait_all(tq, 8) unblocks
+ *       +-----+       <--- taskq_wait_outstanding(tq, 8) unblocks
  *       |     |            Required Completion Order: 1,2,4,5,3,8,6,7
  * +-----+     |
  * |     |     |
  * |     |     +-----+
  * |     |     |  8  |
- * |     |     +-----+ <--- taskq_wait_all(tq, 3) unblocks
+ * |     |     +-----+ <--- taskq_wait_outstanding(tq, 3) unblocks
  * |     |  7  |     |      Required Completion Order: 1,2,4,5,3
  * |     +-----+     |
  * |  6  |     |     |
@@ -707,7 +712,7 @@ splat_taskq_test5_impl(struct file *file, void *arg, boolean_t prealloc)
                     "Taskq '%s' creating (%s dispatch)\n",
                     SPLAT_TASKQ_TEST5_NAME,
                     prealloc ? "prealloc" : "dynamic");
-       if ((tq = taskq_create(SPLAT_TASKQ_TEST5_NAME, 3, maxclsyspri,
+       if ((tq = taskq_create(SPLAT_TASKQ_TEST5_NAME, 3, defclsyspri,
                               50, INT_MAX, TASKQ_PREPOPULATE)) == NULL) {
                splat_vprint(file, SPLAT_TASKQ_TEST5_NAME,
                             "Taskq '%s' create failed\n",
@@ -755,13 +760,13 @@ splat_taskq_test5_impl(struct file *file, void *arg, boolean_t prealloc)
 
        splat_vprint(file, SPLAT_TASKQ_TEST5_NAME, "Taskq '%s' "
                     "waiting for taskqid %d completion\n", tq_arg.name, 3);
-       taskq_wait_all(tq, 3);
+       taskq_wait_outstanding(tq, 3);
        if ((rc = splat_taskq_test_order(&tq_arg, order1)))
                goto out;
 
        splat_vprint(file, SPLAT_TASKQ_TEST5_NAME, "Taskq '%s' "
                     "waiting for taskqid %d completion\n", tq_arg.name, 8);
-       taskq_wait_all(tq, 8);
+       taskq_wait_outstanding(tq, 8);
        rc = splat_taskq_test_order(&tq_arg, order2);
 
 out:
@@ -868,7 +873,7 @@ splat_taskq_test6_impl(struct file *file, void *arg, boolean_t prealloc)
                     "Taskq '%s' creating (%s dispatch)\n",
                     SPLAT_TASKQ_TEST6_NAME,
                     prealloc ? "prealloc" : "dynamic");
-       if ((tq = taskq_create(SPLAT_TASKQ_TEST6_NAME, 3, maxclsyspri,
+       if ((tq = taskq_create(SPLAT_TASKQ_TEST6_NAME, 3, defclsyspri,
                               50, INT_MAX, TASKQ_PREPOPULATE)) == NULL) {
                splat_vprint(file, SPLAT_TASKQ_TEST6_NAME,
                             "Taskq '%s' create failed\n",
@@ -923,7 +928,7 @@ splat_taskq_test6_impl(struct file *file, void *arg, boolean_t prealloc)
        splat_vprint(file, SPLAT_TASKQ_TEST6_NAME, "Taskq '%s' "
                     "waiting for taskqid %d completion\n", tq_arg.name,
                     SPLAT_TASKQ_ORDER_MAX);
-       taskq_wait_all(tq, SPLAT_TASKQ_ORDER_MAX);
+       taskq_wait_outstanding(tq, SPLAT_TASKQ_ORDER_MAX);
        rc = splat_taskq_test_order(&tq_arg, order);
 
 out:
@@ -1000,7 +1005,7 @@ splat_taskq_test7_impl(struct file *file, void *arg, boolean_t prealloc)
                     "Taskq '%s' creating (%s dispatch)\n",
                     SPLAT_TASKQ_TEST7_NAME,
                     prealloc ? "prealloc" :  "dynamic");
-       if ((tq = taskq_create(SPLAT_TASKQ_TEST7_NAME, 1, maxclsyspri,
+       if ((tq = taskq_create(SPLAT_TASKQ_TEST7_NAME, 1, defclsyspri,
                               50, INT_MAX, TASKQ_PREPOPULATE)) == NULL) {
                splat_vprint(file, SPLAT_TASKQ_TEST7_NAME,
                             "Taskq '%s' create failed\n",
@@ -1030,7 +1035,7 @@ splat_taskq_test7_impl(struct file *file, void *arg, boolean_t prealloc)
        if (tq_arg->flag == 0) {
                splat_vprint(file, SPLAT_TASKQ_TEST7_NAME,
                             "Taskq '%s' waiting\n", tq_arg->name);
-               taskq_wait_all(tq, SPLAT_TASKQ_DEPTH_MAX);
+               taskq_wait_outstanding(tq, SPLAT_TASKQ_DEPTH_MAX);
        }
 
        error = (tq_arg->depth == SPLAT_TASKQ_DEPTH_MAX ? 0 : -EINVAL);
@@ -1052,21 +1057,15 @@ splat_taskq_test7(struct file *file, void *arg)
 
        rc = splat_taskq_test7_impl(file, arg, B_FALSE);
        if (rc)
-               return rc;
+               return (rc);
 
        rc = splat_taskq_test7_impl(file, arg, B_TRUE);
 
-       return rc;
+       return (rc);
 }
 
-/*
- * Create a taskq with 100 threads and dispatch a huge number of trivial
- * tasks to generate contention on tq->tq_lock.  This test should always
- * pass.  The purpose is to provide a benchmark for measuring the
- * effectiveness of taskq optimizations.
- */
 static void
-splat_taskq_test8_func(void *arg)
+splat_taskq_throughput_func(void *arg)
 {
        splat_taskq_arg_t *tq_arg = (splat_taskq_arg_t *)arg;
        ASSERT(tq_arg);
@@ -1074,98 +1073,105 @@ splat_taskq_test8_func(void *arg)
        atomic_inc(tq_arg->count);
 }
 
-#define TEST8_NUM_TASKS                        0x20000
-#define TEST8_THREADS_PER_TASKQ                100
-
 static int
-splat_taskq_test8_common(struct file *file, void *arg, int minalloc,
-                         int maxalloc)
+splat_taskq_throughput(struct file *file, void *arg, const char *name,
+    int nthreads, int minalloc, int maxalloc, int flags, int tasks,
+    struct timespec *delta)
 {
        taskq_t *tq;
        taskqid_t id;
        splat_taskq_arg_t tq_arg;
        taskq_ent_t **tqes;
        atomic_t count;
+       struct timespec start, stop;
        int i, j, rc = 0;
 
-       tqes = vmalloc(sizeof(*tqes) * TEST8_NUM_TASKS);
+       tqes = vmalloc(sizeof (*tqes) * tasks);
        if (tqes == NULL)
-               return -ENOMEM;
-       memset(tqes, 0, sizeof(*tqes) * TEST8_NUM_TASKS);
-
-       splat_vprint(file, SPLAT_TASKQ_TEST8_NAME,
-                    "Taskq '%s' creating (%d/%d/%d)\n",
-                    SPLAT_TASKQ_TEST8_NAME,
-                    minalloc, maxalloc, TEST8_NUM_TASKS);
-       if ((tq = taskq_create(SPLAT_TASKQ_TEST8_NAME, TEST8_THREADS_PER_TASKQ,
-                              maxclsyspri, minalloc, maxalloc,
-                              TASKQ_PREPOPULATE)) == NULL) {
-               splat_vprint(file, SPLAT_TASKQ_TEST8_NAME,
-                            "Taskq '%s' create failed\n",
-                            SPLAT_TASKQ_TEST8_NAME);
+               return (-ENOMEM);
+
+       memset(tqes, 0, sizeof (*tqes) * tasks);
+
+       splat_vprint(file, name, "Taskq '%s' creating (%d/%d/%d/%d)\n",
+           name, nthreads, minalloc, maxalloc, tasks);
+       if ((tq = taskq_create(name, nthreads, defclsyspri,
+           minalloc, maxalloc, flags)) == NULL) {
+               splat_vprint(file, name, "Taskq '%s' create failed\n", name);
                rc = -EINVAL;
                goto out_free;
        }
 
        tq_arg.file = file;
-       tq_arg.name = SPLAT_TASKQ_TEST8_NAME;
+       tq_arg.name = name;
        tq_arg.count = &count;
        atomic_set(tq_arg.count, 0);
 
-       for (i = 0; i < TEST8_NUM_TASKS; i++) {
-               tqes[i] = kmalloc(sizeof(taskq_ent_t), GFP_KERNEL);
+       getnstimeofday(&start);
+
+       for (i = 0; i < tasks; i++) {
+               tqes[i] = kmalloc(sizeof (taskq_ent_t), GFP_KERNEL);
                if (tqes[i] == NULL) {
                        rc = -ENOMEM;
                        goto out;
                }
-               taskq_init_ent(tqes[i]);
-
-               taskq_dispatch_ent(tq, splat_taskq_test8_func,
-                                  &tq_arg, TQ_SLEEP, tqes[i]);
 
+               taskq_init_ent(tqes[i]);
+               taskq_dispatch_ent(tq, splat_taskq_throughput_func,
+                   &tq_arg, TQ_SLEEP, tqes[i]);
                id = tqes[i]->tqent_id;
 
                if (id == 0) {
-                       splat_vprint(file, SPLAT_TASKQ_TEST8_NAME,
-                               "Taskq '%s' function '%s' dispatch "
-                               "%d failed\n", tq_arg.name,
-                               sym2str(splat_taskq_test8_func), i);
-                               rc = -EINVAL;
-                               goto out;
+                       splat_vprint(file, name, "Taskq '%s' function '%s' "
+                           "dispatch %d failed\n", tq_arg.name,
+                           sym2str(splat_taskq_throughput_func), i);
+                       rc = -EINVAL;
+                       goto out;
                }
        }
 
-       splat_vprint(file, SPLAT_TASKQ_TEST8_NAME, "Taskq '%s' "
-                    "waiting for %d dispatches\n", tq_arg.name,
-                    TEST8_NUM_TASKS);
+       splat_vprint(file, name, "Taskq '%s' waiting for %d dispatches\n",
+           tq_arg.name, tasks);
+
        taskq_wait(tq);
-       splat_vprint(file, SPLAT_TASKQ_TEST8_NAME, "Taskq '%s' "
-                    "%d/%d dispatches finished\n", tq_arg.name,
-                    atomic_read(tq_arg.count), TEST8_NUM_TASKS);
 
-       if (atomic_read(tq_arg.count) != TEST8_NUM_TASKS)
+       if (delta != NULL) {
+               getnstimeofday(&stop);
+               *delta = timespec_sub(stop, start);
+       }
+
+       splat_vprint(file, name, "Taskq '%s' %d/%d dispatches finished\n",
+           tq_arg.name, atomic_read(tq_arg.count), tasks);
+
+       if (atomic_read(tq_arg.count) != tasks)
                rc = -ERANGE;
 
 out:
-       splat_vprint(file, SPLAT_TASKQ_TEST8_NAME, "Taskq '%s' destroying\n",
-                  tq_arg.name);
+       splat_vprint(file, name, "Taskq '%s' destroying\n", tq_arg.name);
        taskq_destroy(tq);
 out_free:
-       for (j = 0; j < TEST8_NUM_TASKS && tqes[j] != NULL; j++)
+       for (j = 0; j < tasks && tqes[j] != NULL; j++)
                kfree(tqes[j]);
+
        vfree(tqes);
 
-       return rc;
+       return (rc);
 }
 
+/*
+ * Create a taskq with 100 threads and dispatch a huge number of trivial
+ * tasks to generate contention on tq->tq_lock.  This test should always
+ * pass.  The purpose is to provide a benchmark for measuring the
+ * effectiveness of taskq optimizations.
+ */
+#define        TEST8_NUM_TASKS                 0x20000
+#define        TEST8_THREADS_PER_TASKQ         100
+
 static int
 splat_taskq_test8(struct file *file, void *arg)
 {
-       int rc;
-
-       rc = splat_taskq_test8_common(file, arg, 1, 100);
-
-       return rc;
+       return (splat_taskq_throughput(file, arg,
+           SPLAT_TASKQ_TEST8_NAME, TEST8_THREADS_PER_TASKQ,
+           1, INT_MAX, TASKQ_PREPOPULATE, TEST8_NUM_TASKS, NULL));
 }
 
 /*
@@ -1197,7 +1203,7 @@ splat_taskq_test9(struct file *file, void *arg)
        splat_vprint(file, SPLAT_TASKQ_TEST9_NAME,
            "Taskq '%s' creating (%s dispatch) (%d/%d/%d)\n",
            SPLAT_TASKQ_TEST9_NAME, "delay", minalloc, maxalloc, nr_tasks);
-       if ((tq = taskq_create(SPLAT_TASKQ_TEST9_NAME, 3, maxclsyspri,
+       if ((tq = taskq_create(SPLAT_TASKQ_TEST9_NAME, 3, defclsyspri,
            minalloc, maxalloc, TASKQ_PREPOPULATE)) == NULL) {
                splat_vprint(file, SPLAT_TASKQ_TEST9_NAME,
                    "Taskq '%s' create failed\n", SPLAT_TASKQ_TEST9_NAME);
@@ -1297,7 +1303,7 @@ splat_taskq_test10(struct file *file, void *arg)
        splat_vprint(file, SPLAT_TASKQ_TEST10_NAME,
            "Taskq '%s' creating (%s dispatch) (%d/%d/%d)\n",
            SPLAT_TASKQ_TEST10_NAME, "delay", minalloc, maxalloc, nr_tasks);
-       if ((tq = taskq_create(SPLAT_TASKQ_TEST10_NAME, 3, maxclsyspri,
+       if ((tq = taskq_create(SPLAT_TASKQ_TEST10_NAME, 3, defclsyspri,
            minalloc, maxalloc, TASKQ_PREPOPULATE)) == NULL) {
                splat_vprint(file, SPLAT_TASKQ_TEST10_NAME,
                    "Taskq '%s' create failed\n", SPLAT_TASKQ_TEST10_NAME);
@@ -1433,6 +1439,46 @@ out_free:
        return rc;
 }
 
+/*
+ * Create a dynamic taskq with 100 threads and dispatch a huge number of
+ * trivial tasks.  This will cause the taskq to grow quickly to its max
+ * thread count.  This test should always pass.  The purpose is to provide
+ * a benchmark for measuring the performance of dynamic taskqs.
+ */
+#define        TEST11_NUM_TASKS                        100000
+#define        TEST11_THREADS_PER_TASKQ                100
+
+static int
+splat_taskq_test11(struct file *file, void *arg)
+{
+       struct timespec normal, dynamic;
+       int error;
+
+       error = splat_taskq_throughput(file, arg, SPLAT_TASKQ_TEST11_NAME,
+           TEST11_THREADS_PER_TASKQ, 1, INT_MAX,
+           TASKQ_PREPOPULATE, TEST11_NUM_TASKS, &normal);
+       if (error)
+               return (error);
+
+       error = splat_taskq_throughput(file, arg, SPLAT_TASKQ_TEST11_NAME,
+           TEST11_THREADS_PER_TASKQ, 1, INT_MAX,
+           TASKQ_PREPOPULATE | TASKQ_DYNAMIC, TEST11_NUM_TASKS, &dynamic);
+       if (error)
+               return (error);
+
+       splat_vprint(file, SPLAT_TASKQ_TEST11_NAME,
+           "Timing taskq_wait(): normal=%ld.%09lds, dynamic=%ld.%09lds\n",
+           normal.tv_sec, normal.tv_nsec,
+           dynamic.tv_sec, dynamic.tv_nsec);
+
+       /* A 10x increase in runtime is used to indicate a core problem. */
+       if ((dynamic.tv_sec * NANOSEC + dynamic.tv_nsec) >
+           ((normal.tv_sec * NANOSEC + normal.tv_nsec) * 10))
+               error = -ETIME;
+
+       return (error);
+}
+
 splat_subsystem_t *
 splat_taskq_init(void)
 {
@@ -1470,6 +1516,8 @@ splat_taskq_init(void)
                      SPLAT_TASKQ_TEST9_ID, splat_taskq_test9);
        SPLAT_TEST_INIT(sub, SPLAT_TASKQ_TEST10_NAME, SPLAT_TASKQ_TEST10_DESC,
                      SPLAT_TASKQ_TEST10_ID, splat_taskq_test10);
+       SPLAT_TEST_INIT(sub, SPLAT_TASKQ_TEST11_NAME, SPLAT_TASKQ_TEST11_DESC,
+                     SPLAT_TASKQ_TEST11_ID, splat_taskq_test11);
 
         return sub;
 }
@@ -1478,6 +1526,7 @@ void
 splat_taskq_fini(splat_subsystem_t *sub)
 {
         ASSERT(sub);
+       SPLAT_TEST_FINI(sub, SPLAT_TASKQ_TEST11_ID);
        SPLAT_TEST_FINI(sub, SPLAT_TASKQ_TEST10_ID);
        SPLAT_TEST_FINI(sub, SPLAT_TASKQ_TEST9_ID);
        SPLAT_TEST_FINI(sub, SPLAT_TASKQ_TEST8_ID);
index 3255e37e5b8d515ba2dd607dcbd0c8615baf8dcc..8a44714078d4d5eae7461835703ea97fe1c1e636 100644 (file)
@@ -112,7 +112,7 @@ splat_thread_test1(struct file *file, void *arg)
        tp.tp_rc = 0;
 
        thr = (kthread_t *)thread_create(NULL, 0, splat_thread_work1, &tp, 0,
-                                        &p0, TS_RUN, minclsyspri);
+                                        &p0, TS_RUN, defclsyspri);
        /* Must never fail under Solaris, but we check anyway since this
         * can happen in the linux SPL, we may want to change this behavior */
        if (thr == NULL)
@@ -161,7 +161,7 @@ splat_thread_test2(struct file *file, void *arg)
        tp.tp_rc = 0;
 
        thr = (kthread_t *)thread_create(NULL, 0, splat_thread_work2, &tp, 0,
-                                        &p0, TS_RUN, minclsyspri);
+                                        &p0, TS_RUN, defclsyspri);
        /* Must never fail under Solaris, but we check anyway since this
         * can happen in the linux SPL, we may want to change this behavior */
        if (thr == NULL)
@@ -278,7 +278,7 @@ splat_thread_test3(struct file *file, void *arg)
        /* Start tsd wait threads */
        for (i = 0; i < SPLAT_THREAD_TEST_THREADS; i++) {
                if (thread_create(NULL, 0, splat_thread_work3_wait,
-                                 &tp, 0, &p0, TS_RUN, minclsyspri))
+                                 &tp, 0, &p0, TS_RUN, defclsyspri))
                        wait_count++;
        }
 
@@ -295,7 +295,7 @@ splat_thread_test3(struct file *file, void *arg)
        /* Start tsd exit threads */
        for (i = 0; i < SPLAT_THREAD_TEST_THREADS; i++) {
                if (thread_create(NULL, 0, splat_thread_work3_exit,
-                                 &tp, 0, &p0, TS_RUN, minclsyspri))
+                                 &tp, 0, &p0, TS_RUN, defclsyspri))
                        exit_count++;
        }
 
index f3f17ec9256677c2adf12c0a250a7d2516ac5885..bffcf492ff5dae3c607f7cb9a2c37f0feb30f406 100644 (file)
@@ -223,6 +223,7 @@ out:
         return -rc;
 } /* splat_vnode_test3() */
 
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(4,1,0)
 static int
 splat_vnode_test4(struct file *file, void *arg)
 {
@@ -303,6 +304,7 @@ out:
 
         return -rc;
 } /* splat_vnode_test4() */
+#endif
 
 static int
 splat_vnode_test5(struct file *file, void *arg)
@@ -413,8 +415,10 @@ splat_vnode_init(void)
                        SPLAT_VNODE_TEST2_ID, splat_vnode_test2);
         SPLAT_TEST_INIT(sub, SPLAT_VNODE_TEST3_NAME, SPLAT_VNODE_TEST3_DESC,
                        SPLAT_VNODE_TEST3_ID, splat_vnode_test3);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(4,1,0)
         SPLAT_TEST_INIT(sub, SPLAT_VNODE_TEST4_NAME, SPLAT_VNODE_TEST4_DESC,
                        SPLAT_VNODE_TEST4_ID, splat_vnode_test4);
+#endif
         SPLAT_TEST_INIT(sub, SPLAT_VNODE_TEST5_NAME, SPLAT_VNODE_TEST5_DESC,
                        SPLAT_VNODE_TEST5_ID, splat_vnode_test5);
         SPLAT_TEST_INIT(sub, SPLAT_VNODE_TEST6_NAME, SPLAT_VNODE_TEST6_DESC,
@@ -430,7 +434,9 @@ splat_vnode_fini(splat_subsystem_t *sub)
 
         SPLAT_TEST_FINI(sub, SPLAT_VNODE_TEST6_ID);
         SPLAT_TEST_FINI(sub, SPLAT_VNODE_TEST5_ID);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(4,1,0)
         SPLAT_TEST_FINI(sub, SPLAT_VNODE_TEST4_ID);
+#endif
         SPLAT_TEST_FINI(sub, SPLAT_VNODE_TEST3_ID);
         SPLAT_TEST_FINI(sub, SPLAT_VNODE_TEST2_ID);
         SPLAT_TEST_FINI(sub, SPLAT_VNODE_TEST1_ID);
index e787f6cc3950eab01664fe4b9adef4e62c853f68..03402b5ba1ba1cc2cd115c3099763f1d2ef13355 100644 (file)
@@ -341,6 +341,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
index 3959fb3055673f79c8ab75f0a03f350d97e97b05..54cba79142221b2791e0ee35d6fd6fec9d085284 100644 (file)
@@ -282,6 +282,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
index 6d8e058cb01acb3244c0700baf05bed2df683bfd..949660ebc024d171b5eaa545381412442728ef5b 100644 (file)
@@ -62,14 +62,8 @@ echo -e "support or upgrade DKMS to a more current version."
 exit 1
 
 %preun
-# Only remove the modules if they are for this %{version}-%{release}.  A
-# package upgrade can replace them if only the %{release} is changed.
-RELEASE="/var/lib/dkms/%{module}/%{version}/build/%{module}.release"
-if [ -f $RELEASE ] && [ `cat $RELEASE`%{?dist} = "%{version}-%{release}" ]; then
-    echo -e
-    echo -e "Uninstall of %{module} module (version %{version}) beginning:"
-    dkms remove -m %{module} -v %{version} --all --rpm_safe_upgrade
-fi
+echo -e "Uninstall of %{module} module (version %{version}) beginning:"
+dkms remove -m %{module} -v %{version} --all --rpm_safe_upgrade
 exit 0
 
 %changelog
index d95e2df58d579a70ea0a7df9c3e99380706a1361..5f5854a3be32742ad4f0225c6a4fae70c2145b91 100644 (file)
@@ -160,9 +160,18 @@ chmod u+x ${RPM_BUILD_ROOT}%{kmodinstdir_prefix}/*/extra/*/*/*
 rm -rf $RPM_BUILD_ROOT
 
 %changelog
-* Wed Jun  24 2015 Ned Bass <bass6@llnl.gov> - 0.6.4.2-1
-- No changes from 0.6.4-1
-- Bump version to match ZFS release
+* Tue Oct 13 2015 Ned Bass <bass6@llnl.gov> - 0.6.5.3-1
+- Fix CPU hotplug zfsonlinux/spl#482
+- Disable dynamic taskqs by default to avoid deadlock zfsonlinux/spl#484
+* Tue Sep 29 2015 Ned Bass <bass6@llnl.gov> - 0.6.5.2-1
+- Released 0.6.5.2-1
+- Fix PAX Patch/Grsec SLAB_USERCOPY panic zfsonlinux/zfs#3796
+- Always remove during dkms uninstall/update zfsonlinux/spl#476
+* Sat Sep 19 2015 Ned Bass <bass6@llnl.gov> - 0.6.5.1-1
+- Released 0.6.5.1-1, no changes from spl-0.6.5
+* Thu Sep 10 2015 Brian Behlendorf <behlendorf1@llnl.gov> - 0.6.5-1
+- Released 0.6.5-1, detailed release notes are available at:
+- https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.6.5
 * Wed Apr  8 2015 Brian Behlendorf <behlendorf1@llnl.gov> - 0.6.4-1
 - Released 0.6.4-1
 * Thu Jun 12 2014 Brian Behlendorf <behlendorf1@llnl.gov> - 0.6.3-1
index 1faeba965e52b368f32d50ca8023a2a38c334e95..ca68723ff906c4f91a8e4100633c92caf1752bc2 100644 (file)
@@ -38,9 +38,18 @@ make install DESTDIR=%{?buildroot}
 %{_mandir}/man5/*
 
 %changelog
-* Wed Jun  24 2015 Ned Bass <bass6@llnl.gov> - 0.6.4.2-1
-- No changes from 0.6.4-1
-- Bump version to match ZFS release
+* Tue Oct 13 2015 Ned Bass <bass6@llnl.gov> - 0.6.5.3-1
+- Fix CPU hotplug zfsonlinux/spl#482
+- Disable dynamic taskqs by default to avoid deadlock zfsonlinux/spl#484
+* Tue Sep 29 2015 Ned Bass <bass6@llnl.gov> - 0.6.5.2-1
+- Released 0.6.5.2-1
+- Fix PAX Patch/Grsec SLAB_USERCOPY panic zfsonlinux/zfs#3796
+- Always remove during dkms uninstall/update zfsonlinux/spl#476
+* Sat Sep 19 2015 Ned Bass <bass6@llnl.gov> - 0.6.5.1-1
+- Released 0.6.5.1-1, no changes from spl-0.6.5
+* Thu Sep 10 2015 Brian Behlendorf <behlendorf1@llnl.gov> - 0.6.5-1
+- Released 0.6.5-1, detailed release notes are available at:
+- https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.6.5
 * Wed Apr  8 2015 Brian Behlendorf <behlendorf1@llnl.gov> - 0.6.4-1
 - Released 0.6.4-1
 * Thu Jun 12 2014 Brian Behlendorf <behlendorf1@llnl.gov> - 0.6.3-1
index 6dadab74c8e4bc473762bc1fb45559384a6c8efc..ba0359cfc14d7404e64ae7534c76a6297f0332ca 100644 (file)
@@ -282,6 +282,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
index 6d8e058cb01acb3244c0700baf05bed2df683bfd..949660ebc024d171b5eaa545381412442728ef5b 100644 (file)
@@ -62,14 +62,8 @@ echo -e "support or upgrade DKMS to a more current version."
 exit 1
 
 %preun
-# Only remove the modules if they are for this %{version}-%{release}.  A
-# package upgrade can replace them if only the %{release} is changed.
-RELEASE="/var/lib/dkms/%{module}/%{version}/build/%{module}.release"
-if [ -f $RELEASE ] && [ `cat $RELEASE`%{?dist} = "%{version}-%{release}" ]; then
-    echo -e
-    echo -e "Uninstall of %{module} module (version %{version}) beginning:"
-    dkms remove -m %{module} -v %{version} --all --rpm_safe_upgrade
-fi
+echo -e "Uninstall of %{module} module (version %{version}) beginning:"
+dkms remove -m %{module} -v %{version} --all --rpm_safe_upgrade
 exit 0
 
 %changelog
index 1faeba965e52b368f32d50ca8023a2a38c334e95..ca68723ff906c4f91a8e4100633c92caf1752bc2 100644 (file)
@@ -38,9 +38,18 @@ make install DESTDIR=%{?buildroot}
 %{_mandir}/man5/*
 
 %changelog
-* Wed Jun  24 2015 Ned Bass <bass6@llnl.gov> - 0.6.4.2-1
-- No changes from 0.6.4-1
-- Bump version to match ZFS release
+* Tue Oct 13 2015 Ned Bass <bass6@llnl.gov> - 0.6.5.3-1
+- Fix CPU hotplug zfsonlinux/spl#482
+- Disable dynamic taskqs by default to avoid deadlock zfsonlinux/spl#484
+* Tue Sep 29 2015 Ned Bass <bass6@llnl.gov> - 0.6.5.2-1
+- Released 0.6.5.2-1
+- Fix PAX Patch/Grsec SLAB_USERCOPY panic zfsonlinux/zfs#3796
+- Always remove during dkms uninstall/update zfsonlinux/spl#476
+* Sat Sep 19 2015 Ned Bass <bass6@llnl.gov> - 0.6.5.1-1
+- Released 0.6.5.1-1, no changes from spl-0.6.5
+* Thu Sep 10 2015 Brian Behlendorf <behlendorf1@llnl.gov> - 0.6.5-1
+- Released 0.6.5-1, detailed release notes are available at:
+- https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.6.5
 * Wed Apr  8 2015 Brian Behlendorf <behlendorf1@llnl.gov> - 0.6.4-1
 - Released 0.6.4-1
 * Thu Jun 12 2014 Brian Behlendorf <behlendorf1@llnl.gov> - 0.6.3-1
index f146ec6e674a4cf76f3bd038edc42a1b26938e2b..a718c4b1abf97e5b0478dc79152c7219b1449d74 100644 (file)
@@ -1,4 +1,4 @@
 EXTRA_DIST = check.sh dkms.mkconf dkms.postbuild kmodtool
 
 check:
-       $(top_srcdir)/scripts/check.sh
+       scripts/check.sh
index 01fd70ad747e2d2a0463fc13e2d64f62ded438c5..f61879e5c6162643b57c11632631f8f459c8f449 100644 (file)
@@ -281,6 +281,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
@@ -487,7 +488,7 @@ uninstall-am:
 
 
 check:
-       $(top_srcdir)/scripts/check.sh
+       scripts/check.sh
 
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
index 858636c5998e2279b1a84785ed59570f7a846164..cfd94db79c4496b79a0b3f5fbc6114ec3340fd7f 100644 (file)
 /* Define to the version of this package. */
 #undef PACKAGE_VERSION
 
-/* rwsem_is_locked() acquires sem->wait_lock */
-#undef RWSEM_IS_LOCKED_TAKES_WAIT_LOCK
-
 /* struct rw_semaphore member wait_lock is raw_spinlock_t */
 #undef RWSEM_SPINLOCK_IS_RAW
 
index 8ba7a76eeafd84eb16792581e03656695db9a6d5..759a48439bfa72f760d1150082c223baa90680da 100644 (file)
@@ -29,3 +29,5 @@ Unless otherwise noted, all files in this distribution are released
 under the Common Development and Distribution License (CDDL).
 Exceptions are noted within the associated source files.  See the file
 OPENSOLARIS.LICENSE for more information.
+
+Refer to the git commit log for authoritative copyright attribution.
index db61ab28a43f503e09caf92ab8b5cf326e98ed90..8e5bbd6a91622c9253892adf5de33ecf0835c5ed 100644 (file)
--- a/zfs/META
+++ b/zfs/META
@@ -1,8 +1,8 @@
 Meta:         1
 Name:         zfs
 Branch:       1.0
-Version:      0.6.4.2
-Release:      0ubuntu1
+Version:      0.6.5.3
+Release:      0ubuntu2
 Release-Tags: relext
 License:      CDDL
 Author:       OpenZFS on Linux
index 49b417a81d9e6ec5c090f37a6f9901e3d3d6eaf3..f8abb5f2c6e52984c211511bc84571450cc7d373 100644 (file)
@@ -1,13 +1,12 @@
-
 ACLOCAL_AMFLAGS = -I config
 
-include $(top_srcdir)/config/rpm.am
-include $(top_srcdir)/config/deb.am
-include $(top_srcdir)/config/tgz.am
+include config/rpm.am
+include config/deb.am
+include config/tgz.am
 
 SUBDIRS = include rpm
 if CONFIG_USER
-SUBDIRS += dracut udev etc man scripts lib cmd contrib
+SUBDIRS += udev etc man scripts lib cmd contrib
 endif
 if CONFIG_KERNEL
 SUBDIRS += module
@@ -40,16 +39,28 @@ dist-hook:
        sed -i 's/Release:[[:print:]]*/Release:      $(RELEASE)/' \
                $(distdir)/META
 
-checkstyle:
+checkstyle: cstyle shellcheck
+
+cstyle:
        @find ${top_srcdir} -name '*.[hc]' ! -name 'zfs_config.*' \
                ! -name '*.mod.c' -type f -exec scripts/cstyle.pl {} \+
 
+shellcheck:
+       @if type shellcheck > /dev/null 2>&1; then \
+               (find ${top_srcdir} -type f -name '*.sh.in' -o -type f \
+                -name '*.sh'; find etc/init.d/zfs*.in -type f) | \
+                grep -v 'zfs-script-config' | \
+                while read file; do \
+                       shellcheck --format gcc "$$file"; \
+                done; \
+        fi
+
 ctags:
-       $(RM) $(top_srcdir)/tags
+       $(RM) tags
        find $(top_srcdir) -name .git -prune -o -name '*.[hc]' | xargs ctags
 
 etags:
-       $(RM) $(top_srcdir)/TAGS
+       $(RM) TAGS
        find $(top_srcdir) -name .pc -prune -o -name '*.[hc]' | xargs etags -a
 
 tags: ctags etags
index 9d2279756e8abacf6daa6518130fba736e89b360..31652023d7e8b6a037b93d88eb9b64c31fe813dc 100644 (file)
@@ -97,11 +97,11 @@ POST_UNINSTALL = :
 build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
-@CONFIG_USER_TRUE@am__append_1 = dracut udev etc man scripts lib cmd contrib
+@CONFIG_USER_TRUE@am__append_1 = udev etc man scripts lib cmd contrib
 @CONFIG_KERNEL_TRUE@am__append_2 = module
 subdir = .
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps =  \
+am__aclocal_m4_deps = $(top_srcdir)/config/always-no-bool-compare.m4 \
        $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \
        $(top_srcdir)/config/dkms.m4 \
        $(top_srcdir)/config/kernel-acl.m4 \
@@ -113,20 +113,11 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-bio-bvec-iter.m4 \
        $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \
        $(top_srcdir)/config/kernel-bio-failfast.m4 \
-       $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \
-       $(top_srcdir)/config/kernel-blk-end-request.m4 \
-       $(top_srcdir)/config/kernel-blk-fetch-request.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-discard.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-barrier.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-discard.m4 \
        $(top_srcdir)/config/kernel-blk-queue-flush.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \
-       $(top_srcdir)/config/kernel-blk-requeue-request.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-pos.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \
        $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \
        $(top_srcdir)/config/kernel-blkdev-get.m4 \
        $(top_srcdir)/config/kernel-block-device-operations-release-void.m4 \
@@ -134,6 +125,7 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-clear-inode.m4 \
        $(top_srcdir)/config/kernel-commit-metadata.m4 \
        $(top_srcdir)/config/kernel-create-nameidata.m4 \
+       $(top_srcdir)/config/kernel-current_bio_tail.m4 \
        $(top_srcdir)/config/kernel-d-make-root.m4 \
        $(top_srcdir)/config/kernel-d-obtain-alias.m4 \
        $(top_srcdir)/config/kernel-d-prune-aliases.m4 \
@@ -147,23 +139,25 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-fallocate.m4 \
        $(top_srcdir)/config/kernel-file-inode.m4 \
        $(top_srcdir)/config/kernel-fmode-t.m4 \
+       $(top_srcdir)/config/kernel-follow-down-one.m4 \
        $(top_srcdir)/config/kernel-follow-link-nameidata.m4 \
        $(top_srcdir)/config/kernel-fsync.m4 \
+       $(top_srcdir)/config/kernel-generic_io_acct.m4 \
        $(top_srcdir)/config/kernel-get-disk-ro.m4 \
        $(top_srcdir)/config/kernel-get-gendisk.m4 \
        $(top_srcdir)/config/kernel-insert-inode-locked.m4 \
        $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \
        $(top_srcdir)/config/kernel-is_owner_or_cap.m4 \
+       $(top_srcdir)/config/kernel-kmap-atomic-args.m4 \
        $(top_srcdir)/config/kernel-kobj-name-len.m4 \
        $(top_srcdir)/config/kernel-lookup-bdev.m4 \
        $(top_srcdir)/config/kernel-lookup-nameidata.m4 \
        $(top_srcdir)/config/kernel-lseek-execute.m4 \
+       $(top_srcdir)/config/kernel-mk-request-fn.m4 \
        $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \
        $(top_srcdir)/config/kernel-mount-nodev.m4 \
        $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \
        $(top_srcdir)/config/kernel-put-link-nameidata.m4 \
-       $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \
-       $(top_srcdir)/config/kernel-rq-is_sync.m4 \
        $(top_srcdir)/config/kernel-security-inode-init.m4 \
        $(top_srcdir)/config/kernel-set-nlink.m4 \
        $(top_srcdir)/config/kernel-sget-args.m4 \
@@ -292,24 +286,23 @@ am__define_uniq_tagged_files = \
 ETAGS = etags
 CTAGS = ctags
 CSCOPE = cscope
-DIST_SUBDIRS = include rpm dracut udev etc man scripts lib cmd contrib \
-       module
-am__DIST_COMMON = $(srcdir)/Makefile.in \
+DIST_SUBDIRS = include rpm udev etc man scripts lib cmd contrib module
+am__DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/config/deb.am \
+       $(srcdir)/config/rpm.am $(srcdir)/config/tgz.am \
        $(srcdir)/zfs-script-config.sh.in $(srcdir)/zfs.release.in \
        $(srcdir)/zfs_config.h.in $(top_srcdir)/config/compile \
        $(top_srcdir)/config/config.guess \
-       $(top_srcdir)/config/config.sub $(top_srcdir)/config/deb.am \
+       $(top_srcdir)/config/config.sub \
        $(top_srcdir)/config/install-sh $(top_srcdir)/config/ltmain.sh \
-       $(top_srcdir)/config/missing $(top_srcdir)/config/rpm.am \
-       $(top_srcdir)/config/tgz.am $(top_srcdir)/module/Makefile.in \
+       $(top_srcdir)/config/missing $(top_srcdir)/module/Makefile.in \
        $(top_srcdir)/module/avl/Makefile.in \
        $(top_srcdir)/module/nvpair/Makefile.in \
        $(top_srcdir)/module/unicode/Makefile.in \
        $(top_srcdir)/module/zcommon/Makefile.in \
        $(top_srcdir)/module/zfs/Makefile.in \
        $(top_srcdir)/module/zpios/Makefile.in AUTHORS config/compile \
-       config/config.guess config/config.sub config/depcomp \
-       config/install-sh config/ltmain.sh config/missing
+       config/config.guess config/config.sub config/install-sh \
+       config/ltmain.sh config/missing
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 distdir = $(PACKAGE)-$(VERSION)
 top_distdir = $(distdir)
@@ -375,9 +368,11 @@ DEBUG_CFLAGS = @DEBUG_CFLAGS@
 DEBUG_DMU_TX = @DEBUG_DMU_TX@
 DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@
 DEBUG_ZFS = @DEBUG_ZFS@
+DEFAULT_INITCONF_DIR = @DEFAULT_INITCONF_DIR@
 DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@
 DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@
 DEFAULT_PACKAGE = @DEFAULT_PACKAGE@
+DEFINE_INITRAMFS = @DEFINE_INITRAMFS@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
 DLLTOOL = @DLLTOOL@
@@ -427,6 +422,7 @@ MANIFEST_TOOL = @MANIFEST_TOOL@
 MKDIR_P = @MKDIR_P@
 NM = @NM@
 NMEDIT = @NMEDIT@
+NO_BOOL_COMPARE = @NO_BOOL_COMPARE@
 NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@
 OBJDUMP = @OBJDUMP@
 OBJEXT = @OBJEXT@
@@ -561,7 +557,7 @@ all: zfs_config.h
 .SUFFIXES:
 am--refresh: Makefile
        @:
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/config/rpm.am $(top_srcdir)/config/deb.am $(top_srcdir)/config/tgz.am $(am__configure_deps)
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(srcdir)/config/rpm.am $(srcdir)/config/deb.am $(srcdir)/config/tgz.am $(am__configure_deps)
        @for dep in $?; do \
          case '$(am__configure_deps)' in \
            *$$dep*) \
@@ -583,7 +579,7 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
            echo ' cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)'; \
            cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe);; \
        esac;
-$(top_srcdir)/config/rpm.am $(top_srcdir)/config/deb.am $(top_srcdir)/config/tgz.am $(am__empty):
+$(srcdir)/config/rpm.am $(srcdir)/config/deb.am $(srcdir)/config/tgz.am $(am__empty):
 
 $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
        $(SHELL) ./config.status --recheck
@@ -1150,7 +1146,7 @@ rpm-local:
        mkdir -p $(rpmbuild)/SPECS && \
        cp ${RPM_SPEC_DIR}/$(rpmspec) $(rpmbuild)/SPECS && \
        mkdir -p $(rpmbuild)/SOURCES && \
-       cp scripts/kmodtool $(rpmbuild)/SOURCES && \
+       cp $(top_srcdir)/scripts/kmodtool $(rpmbuild)/SOURCES && \
        cp $(distdir).tar.gz $(rpmbuild)/SOURCES)
 
 srpm-common: dist
@@ -1220,9 +1216,12 @@ deb-utils: deb-local rpm-utils
 @CONFIG_USER_TRUE@     pkg6=libzfs2-devel-$${version}.$${arch}.rpm; \
 @CONFIG_USER_TRUE@     pkg7=$${name}-test-$${version}.$${arch}.rpm; \
 @CONFIG_USER_TRUE@     pkg8=$${name}-dracut-$${version}.$${arch}.rpm; \
+@CONFIG_USER_TRUE@     pkg9=$${name}-initramfs-$${version}.$${arch}.rpm; \
 @CONFIG_USER_TRUE@     fakeroot $(ALIEN) --bump=0 --scripts --to-deb \
-@CONFIG_USER_TRUE@         $$pkg1 $$pkg2 $$pkg3 $$pkg4 $$pkg5 $$pkg6 $$pkg7 $$pkg8; \
-@CONFIG_USER_TRUE@     $(RM) $$pkg1 $$pkg2 $$pkg3 $$pkg4 $$pkg5 $$pkg6 $$pkg7 $$pkg8;
+@CONFIG_USER_TRUE@         $$pkg1 $$pkg2 $$pkg3 $$pkg4 $$pkg5 $$pkg6 $$pkg7 \
+@CONFIG_USER_TRUE@         $$pkg8 $$pkg9;
+@CONFIG_USER_TRUE@     $(RM) $$pkg1 $$pkg2 $$pkg3 $$pkg4 $$pkg5 $$pkg6 $$pkg7 \
+@CONFIG_USER_TRUE@         $$pkg8 $$pkg9;
 
 deb: deb-kmod deb-utils
 tgz-local:
@@ -1270,16 +1269,28 @@ dist-hook:
        sed -i 's/Release:[[:print:]]*/Release:      $(RELEASE)/' \
                $(distdir)/META
 
-checkstyle:
+checkstyle: cstyle shellcheck
+
+cstyle:
        @find ${top_srcdir} -name '*.[hc]' ! -name 'zfs_config.*' \
                ! -name '*.mod.c' -type f -exec scripts/cstyle.pl {} \+
 
+shellcheck:
+       @if type shellcheck > /dev/null 2>&1; then \
+               (find ${top_srcdir} -type f -name '*.sh.in' -o -type f \
+                -name '*.sh'; find etc/init.d/zfs*.in -type f) | \
+                grep -v 'zfs-script-config' | \
+                while read file; do \
+                       shellcheck --format gcc "$$file"; \
+                done; \
+        fi
+
 ctags:
-       $(RM) $(top_srcdir)/tags
+       $(RM) tags
        find $(top_srcdir) -name .git -prune -o -name '*.[hc]' | xargs ctags
 
 etags:
-       $(RM) $(top_srcdir)/TAGS
+       $(RM) TAGS
        find $(top_srcdir) -name .pc -prune -o -name '*.[hc]' | xargs etags -a
 
 tags: ctags etags
index 14d77e4fc1fcd876ee52ae69bbad3486879770ae..758cbecd407e9809f6dbd4b85666ad766e1f2e2f 100644 (file)
@@ -1206,6 +1206,7 @@ AC_SUBST([am__tar])
 AC_SUBST([am__untar])
 ]) # _AM_PROG_TAR
 
+m4_include([config/always-no-bool-compare.m4])
 m4_include([config/always-no-unused-but-set-variable.m4])
 m4_include([config/dkms.m4])
 m4_include([config/kernel-acl.m4])
@@ -1217,20 +1218,11 @@ m4_include([config/kernel-bdi-setup-and-register.m4])
 m4_include([config/kernel-bio-bvec-iter.m4])
 m4_include([config/kernel-bio-end-io-t-args.m4])
 m4_include([config/kernel-bio-failfast.m4])
-m4_include([config/kernel-bio-rw-syncio.m4])
-m4_include([config/kernel-blk-end-request.m4])
-m4_include([config/kernel-blk-fetch-request.m4])
-m4_include([config/kernel-blk-queue-discard.m4])
+m4_include([config/kernel-bio-rw-barrier.m4])
+m4_include([config/kernel-bio-rw-discard.m4])
 m4_include([config/kernel-blk-queue-flush.m4])
-m4_include([config/kernel-blk-queue-io-opt.m4])
 m4_include([config/kernel-blk-queue-max-hw-sectors.m4])
 m4_include([config/kernel-blk-queue-max-segments.m4])
-m4_include([config/kernel-blk-queue-nonrot.m4])
-m4_include([config/kernel-blk-queue-physical-block-size.m4])
-m4_include([config/kernel-blk-requeue-request.m4])
-m4_include([config/kernel-blk-rq-bytes.m4])
-m4_include([config/kernel-blk-rq-pos.m4])
-m4_include([config/kernel-blk-rq-sectors.m4])
 m4_include([config/kernel-blkdev-get-by-path.m4])
 m4_include([config/kernel-blkdev-get.m4])
 m4_include([config/kernel-block-device-operations-release-void.m4])
@@ -1238,6 +1230,7 @@ m4_include([config/kernel-check-disk-size-change.m4])
 m4_include([config/kernel-clear-inode.m4])
 m4_include([config/kernel-commit-metadata.m4])
 m4_include([config/kernel-create-nameidata.m4])
+m4_include([config/kernel-current_bio_tail.m4])
 m4_include([config/kernel-d-make-root.m4])
 m4_include([config/kernel-d-obtain-alias.m4])
 m4_include([config/kernel-d-prune-aliases.m4])
@@ -1251,23 +1244,25 @@ m4_include([config/kernel-evict-inode.m4])
 m4_include([config/kernel-fallocate.m4])
 m4_include([config/kernel-file-inode.m4])
 m4_include([config/kernel-fmode-t.m4])
+m4_include([config/kernel-follow-down-one.m4])
 m4_include([config/kernel-follow-link-nameidata.m4])
 m4_include([config/kernel-fsync.m4])
+m4_include([config/kernel-generic_io_acct.m4])
 m4_include([config/kernel-get-disk-ro.m4])
 m4_include([config/kernel-get-gendisk.m4])
 m4_include([config/kernel-insert-inode-locked.m4])
 m4_include([config/kernel-invalidate-bdev-args.m4])
 m4_include([config/kernel-is_owner_or_cap.m4])
+m4_include([config/kernel-kmap-atomic-args.m4])
 m4_include([config/kernel-kobj-name-len.m4])
 m4_include([config/kernel-lookup-bdev.m4])
 m4_include([config/kernel-lookup-nameidata.m4])
 m4_include([config/kernel-lseek-execute.m4])
+m4_include([config/kernel-mk-request-fn.m4])
 m4_include([config/kernel-mkdir-umode-t.m4])
 m4_include([config/kernel-mount-nodev.m4])
 m4_include([config/kernel-open-bdev-exclusive.m4])
 m4_include([config/kernel-put-link-nameidata.m4])
-m4_include([config/kernel-rq-for-each_segment.m4])
-m4_include([config/kernel-rq-is_sync.m4])
 m4_include([config/kernel-security-inode-init.m4])
 m4_include([config/kernel-set-nlink.m4])
 m4_include([config/kernel-sget-args.m4])
index ff2165e12a84eb3bcedc8a6963e141c3a01a0571..af20ddb3964f9c0273d699d448c070d571e68884 100644 (file)
@@ -3,6 +3,7 @@ DEFAULT_INCLUDES = -include ${top_builddir}/zfs_config.h
 AM_LIBTOOLFLAGS = --silent
 AM_CFLAGS  = ${DEBUG_CFLAGS} -Wall -Wstrict-prototypes
 AM_CFLAGS += ${NO_UNUSED_BUT_SET_VARIABLE}
+AM_CFLAGS += ${NO_BOOL_COMPARE}
 AM_CFLAGS += -fno-strict-aliasing
 AM_CPPFLAGS  = -D_GNU_SOURCE -D__EXTENSIONS__ -D_REENTRANT
 AM_CPPFLAGS += -D_POSIX_PTHREAD_SEMANTICS -D_FILE_OFFSET_BITS=64
diff --git a/zfs/config/always-no-bool-compare.m4 b/zfs/config/always-no-bool-compare.m4
new file mode 100644 (file)
index 0000000..316b04b
--- /dev/null
@@ -0,0 +1,27 @@
+dnl #
+dnl # Check if gcc supports -Wno-bool-compare option.
+dnl #
+dnl # We actually invoke gcc with the -Wbool-compare option and infer
+dnl # whether the 'no-' version exists based upon the result.  This is
+dnl # required because gcc always returns success when checking any of
+dnl # the 'no-' prefixed options.
+dnl #
+AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_NO_BOOL_COMPARE], [
+       AC_MSG_CHECKING([for -Wno-bool-compare support])
+
+       saved_flags="$CFLAGS"
+       CFLAGS="$CFLAGS -Wbool-compare"
+
+       AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])],
+       [
+               NO_BOOL_COMPARE=-Wno-bool-compare
+               AC_MSG_RESULT([yes])
+       ],
+       [
+               NO_BOOL_COMPARE=
+               AC_MSG_RESULT([no])
+       ])
+
+       CFLAGS="$saved_flags"
+       AC_SUBST([NO_BOOL_COMPARE])
+])
index 1f5c50c0d1529d50b94dc3533ca72a47f0fa5849..16592509d49e52301080275c3647a66349c33795 100755 (executable)
@@ -1,8 +1,8 @@
 #! /bin/sh
 # Attempt to guess a canonical system name.
-#   Copyright 1992-2014 Free Software Foundation, Inc.
+#   Copyright 1992-2015 Free Software Foundation, Inc.
 
-timestamp='2014-03-23'
+timestamp='2015-08-20'
 
 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
@@ -24,12 +24,12 @@ timestamp='2014-03-23'
 # program.  This Exception is an additional permission under section 7
 # of the GNU General Public License, version 3 ("GPLv3").
 #
-# Originally written by Per Bothner.
+# Originally written by Per Bothner; maintained since 2000 by Ben Elliston.
 #
 # You can get the latest version of this script from:
 # http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
 #
-# Please send patches with a ChangeLog entry to config-patches@gnu.org.
+# Please send patches to <config-patches@gnu.org>.
 
 
 me=`echo "$0" | sed -e 's,.*/,,'`
@@ -50,7 +50,7 @@ version="\
 GNU config.guess ($timestamp)
 
 Originally written by Per Bothner.
-Copyright 1992-2014 Free Software Foundation, Inc.
+Copyright 1992-2015 Free Software Foundation, Inc.
 
 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -168,20 +168,27 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
        # Note: NetBSD doesn't particularly care about the vendor
        # portion of the name.  We always set it to "unknown".
        sysctl="sysctl -n hw.machine_arch"
-       UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
-           /usr/sbin/$sysctl 2>/dev/null || echo unknown)`
+       UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \
+           /sbin/$sysctl 2>/dev/null || \
+           /usr/sbin/$sysctl 2>/dev/null || \
+           echo unknown)`
        case "${UNAME_MACHINE_ARCH}" in
            armeb) machine=armeb-unknown ;;
            arm*) machine=arm-unknown ;;
            sh3el) machine=shl-unknown ;;
            sh3eb) machine=sh-unknown ;;
            sh5el) machine=sh5le-unknown ;;
+           earmv*)
+               arch=`echo ${UNAME_MACHINE_ARCH} | sed -e 's,^e\(armv[0-9]\).*$,\1,'`
+               endian=`echo ${UNAME_MACHINE_ARCH} | sed -ne 's,^.*\(eb\)$,\1,p'`
+               machine=${arch}${endian}-unknown
+               ;;
            *) machine=${UNAME_MACHINE_ARCH}-unknown ;;
        esac
        # The Operating System including object format, if it has switched
        # to ELF recently, or will in the future.
        case "${UNAME_MACHINE_ARCH}" in
-           arm*|i386|m68k|ns32k|sh3*|sparc|vax)
+           arm*|earm*|i386|m68k|ns32k|sh3*|sparc|vax)
                eval $set_cc_for_build
                if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
                        | grep -q __ELF__
@@ -197,6 +204,13 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
                os=netbsd
                ;;
        esac
+       # Determine ABI tags.
+       case "${UNAME_MACHINE_ARCH}" in
+           earm*)
+               expr='s/^earmv[0-9]/-eabi/;s/eb$//'
+               abi=`echo ${UNAME_MACHINE_ARCH} | sed -e "$expr"`
+               ;;
+       esac
        # The OS release
        # Debian GNU/NetBSD machines have a different userland, and
        # thus, need a distinct triplet. However, they do not need
@@ -207,13 +221,13 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
                release='-gnu'
                ;;
            *)
-               release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+               release=`echo ${UNAME_RELEASE} | sed -e 's/[-_].*//' | cut -d. -f1,2`
                ;;
        esac
        # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
        # contains redundant information, the shorter form:
        # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
-       echo "${machine}-${os}${release}"
+       echo "${machine}-${os}${release}${abi}"
        exit ;;
     *:Bitrig:*:*)
        UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'`
@@ -235,6 +249,9 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
     *:MirBSD:*:*)
        echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
        exit ;;
+    *:Sortix:*:*)
+       echo ${UNAME_MACHINE}-unknown-sortix
+       exit ;;
     alpha:OSF1:*:*)
        case $UNAME_RELEASE in
        *4.0)
@@ -579,8 +596,9 @@ EOF
        else
                IBM_ARCH=powerpc
        fi
-       if [ -x /usr/bin/oslevel ] ; then
-               IBM_REV=`/usr/bin/oslevel`
+       if [ -x /usr/bin/lslpp ] ; then
+               IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc |
+                          awk -F: '{ print $3 }' | sed s/[0-9]*$/0/`
        else
                IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
        fi
@@ -932,6 +950,9 @@ EOF
     crisv32:Linux:*:*)
        echo ${UNAME_MACHINE}-axis-linux-${LIBC}
        exit ;;
+    e2k:Linux:*:*)
+       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+       exit ;;
     frv:Linux:*:*)
        echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
        exit ;;
@@ -1020,7 +1041,7 @@ EOF
        echo ${UNAME_MACHINE}-dec-linux-${LIBC}
        exit ;;
     x86_64:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+       echo ${UNAME_MACHINE}-pc-linux-${LIBC}
        exit ;;
     xtensa*:Linux:*:*)
        echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
index bba4efb80574987fcf6d85c71e68e55bfeb48ba2..1acc966a33bf509f7c50f87d7678fbb813089ca6 100755 (executable)
@@ -1,8 +1,8 @@
 #! /bin/sh
 # Configuration validation subroutine script.
-#   Copyright 1992-2014 Free Software Foundation, Inc.
+#   Copyright 1992-2015 Free Software Foundation, Inc.
 
-timestamp='2014-09-11'
+timestamp='2015-08-20'
 
 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
@@ -25,7 +25,7 @@ timestamp='2014-09-11'
 # of the GNU General Public License, version 3 ("GPLv3").
 
 
-# Please send patches with a ChangeLog entry to config-patches@gnu.org.
+# Please send patches to <config-patches@gnu.org>.
 #
 # Configuration subroutine to validate and canonicalize a configuration type.
 # Supply the specified configuration type as an argument.
@@ -68,7 +68,7 @@ Report bugs and patches to <config-patches@gnu.org>."
 version="\
 GNU config.sub ($timestamp)
 
-Copyright 1992-2014 Free Software Foundation, Inc.
+Copyright 1992-2015 Free Software Foundation, Inc.
 
 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -117,7 +117,7 @@ maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
 case $maybe_os in
   nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \
   linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \
-  knetbsd*-gnu* | netbsd*-gnu* | \
+  knetbsd*-gnu* | netbsd*-gnu* | netbsd*-eabi* | \
   kopensolaris*-gnu* | \
   storm-chaos* | os2-emx* | rtmk-nova*)
     os=-$maybe_os
@@ -255,12 +255,13 @@ case $basic_machine in
        | arc | arceb \
        | arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \
        | avr | avr32 \
+       | ba \
        | be32 | be64 \
        | bfin \
        | c4x | c8051 | clipper \
        | d10v | d30v | dlx | dsp16xx \
-       | epiphany \
-       | fido | fr30 | frv \
+       | e2k | epiphany \
+       | fido | fr30 | frv | ft32 \
        | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
        | hexagon \
        | i370 | i860 | i960 | ia64 \
@@ -305,7 +306,7 @@ case $basic_machine in
        | riscv32 | riscv64 \
        | rl78 | rx \
        | score \
-       | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
+       | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[234]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
        | sh64 | sh64le \
        | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \
        | sparcv8 | sparcv9 | sparcv9b | sparcv9v \
@@ -313,6 +314,7 @@ case $basic_machine in
        | tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \
        | ubicom32 \
        | v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \
+       | visium \
        | we32k \
        | x86 | xc16x | xstormy16 | xtensa \
        | z8k | z80)
@@ -327,6 +329,9 @@ case $basic_machine in
        c6x)
                basic_machine=tic6x-unknown
                ;;
+       leon|leon[3-9])
+               basic_machine=sparc-$basic_machine
+               ;;
        m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | nvptx | picochip)
                basic_machine=$basic_machine-unknown
                os=-none
@@ -372,12 +377,13 @@ case $basic_machine in
        | alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \
        | arm-*  | armbe-* | armle-* | armeb-* | armv*-* \
        | avr-* | avr32-* \
+       | ba-* \
        | be32-* | be64-* \
        | bfin-* | bs2000-* \
        | c[123]* | c30-* | [cjt]90-* | c4x-* \
        | c8051-* | clipper-* | craynv-* | cydra-* \
        | d10v-* | d30v-* | dlx-* \
-       | elxsi-* \
+       | e2k-* | elxsi-* \
        | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \
        | h8300-* | h8500-* \
        | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
@@ -424,12 +430,13 @@ case $basic_machine in
        | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
        | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \
        | pyramid-* \
+       | riscv32-* | riscv64-* \
        | rl78-* | romp-* | rs6000-* | rx-* \
        | sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
        | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
        | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
        | sparclite-* \
-       | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx?-* \
+       | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx*-* \
        | tahoe-* \
        | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
        | tile*-* \
@@ -437,6 +444,7 @@ case $basic_machine in
        | ubicom32-* \
        | v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \
        | vax-* \
+       | visium-* \
        | we32k-* \
        | x86-* | x86_64-* | xc16x-* | xps100-* \
        | xstormy16-* | xtensa*-* \
@@ -513,6 +521,9 @@ case $basic_machine in
                basic_machine=i386-pc
                os=-aros
                ;;
+        asmjs)
+               basic_machine=asmjs-unknown
+               ;;
        aux)
                basic_machine=m68k-apple
                os=-aux
@@ -774,6 +785,9 @@ case $basic_machine in
                basic_machine=m68k-isi
                os=-sysv
                ;;
+       leon-*|leon[3-9]-*)
+               basic_machine=sparc-`echo $basic_machine | sed 's/-.*//'`
+               ;;
        m68knommu)
                basic_machine=m68k-unknown
                os=-linux
@@ -1365,7 +1379,7 @@ case $os in
              | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \
              | -sym* | -kopensolaris* | -plan9* \
              | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
-             | -aos* | -aros* \
+             | -aos* | -aros* | -cloudabi* | -sortix* \
              | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
              | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
              | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
index acde650ced1a7d2f3f449c2067af572d0a9395ac..648417b2adc6e5dfb35a9c8b1e32fc0f7cbf73b1 100644 (file)
@@ -37,9 +37,12 @@ if CONFIG_USER
        pkg6=libzfs2-devel-$${version}.$${arch}.rpm; \
        pkg7=$${name}-test-$${version}.$${arch}.rpm; \
        pkg8=$${name}-dracut-$${version}.$${arch}.rpm; \
+       pkg9=$${name}-initramfs-$${version}.$${arch}.rpm; \
        fakeroot $(ALIEN) --bump=0 --scripts --to-deb \
-           $$pkg1 $$pkg2 $$pkg3 $$pkg4 $$pkg5 $$pkg6 $$pkg7 $$pkg8; \
-       $(RM) $$pkg1 $$pkg2 $$pkg3 $$pkg4 $$pkg5 $$pkg6 $$pkg7 $$pkg8;
+           $$pkg1 $$pkg2 $$pkg3 $$pkg4 $$pkg5 $$pkg6 $$pkg7 \
+           $$pkg8 $$pkg9;
+       $(RM) $$pkg1 $$pkg2 $$pkg3 $$pkg4 $$pkg5 $$pkg6 $$pkg7 \
+           $$pkg8 $$pkg9;
 endif
 
 deb: deb-kmod deb-utils
index 11eefb28bfcaf4b7b39e67c69ed9d1b2b37d0f49..d1062e17ec1e2d922a591b7cb41d99b813af8524 100644 (file)
@@ -7,8 +7,8 @@ AC_DEFUN([ZFS_AC_KERNEL_BDI_SETUP_AND_REGISTER], [
        AC_MSG_CHECKING([whether bdi_setup_and_register() wants 2 args])
        ZFS_LINUX_TRY_COMPILE_SYMBOL([
                #include <linux/backing-dev.h>
-       ], [
                struct backing_dev_info bdi;
+       ], [
                char *name = "bdi";
                int error __attribute__((unused)) =
                    bdi_setup_and_register(&bdi, name);
@@ -21,8 +21,8 @@ AC_DEFUN([ZFS_AC_KERNEL_BDI_SETUP_AND_REGISTER], [
                AC_MSG_CHECKING([whether bdi_setup_and_register() wants 3 args])
                ZFS_LINUX_TRY_COMPILE_SYMBOL([
                        #include <linux/backing-dev.h>
-               ], [
                        struct backing_dev_info bdi;
+               ], [
                        char *name = "bdi";
                        unsigned int cap = BDI_CAP_MAP_COPY;
                        int error __attribute__((unused)) =
index 6b9a5269eea12109ab0e3f9aa0ef995cc2b908d6..cfbec05238ceeb96ee2755d23e34031c9661ed26 100644 (file)
@@ -1,24 +1,8 @@
 dnl #
 dnl # Preferred interface for setting FAILFAST on a bio:
-dnl #   2.6.12-2.6.27: BIO_RW_FAILFAST
 dnl #   2.6.28-2.6.35: BIO_RW_FAILFAST_{DEV|TRANSPORT|DRIVER}
-dnl #   2.6.36-2.6.xx: REQ_FAILFAST_{DEV|TRANSPORT|DRIVER}
+dnl #       >= 2.6.36: REQ_FAILFAST_{DEV|TRANSPORT|DRIVER}
 dnl #
-AC_DEFUN([ZFS_AC_KERNEL_BIO_FAILFAST], [
-       AC_MSG_CHECKING([whether BIO_RW_FAILFAST is defined])
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/bio.h>
-       ],[
-               int flags __attribute__ ((unused));
-               flags = (1 << BIO_RW_FAILFAST);
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_BIO_RW_FAILFAST, 1,
-                         [BIO_RW_FAILFAST is defined])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-])
 
 AC_DEFUN([ZFS_AC_KERNEL_BIO_FAILFAST_DTD], [
        AC_MSG_CHECKING([whether BIO_RW_FAILFAST_* are defined])
@@ -47,7 +31,7 @@ AC_DEFUN([ZFS_AC_KERNEL_REQ_FAILFAST_MASK], [
                flags = REQ_FAILFAST_MASK;
        ],[
                AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_BIO_REQ_FAILFAST_MASK, 1,
+               AC_DEFINE(HAVE_REQ_FAILFAST_MASK, 1,
                          [REQ_FAILFAST_MASK is defined])
        ],[
                AC_MSG_RESULT(no)
diff --git a/zfs/config/kernel-bio-rw-barrier.m4 b/zfs/config/kernel-bio-rw-barrier.m4
new file mode 100644 (file)
index 0000000..bcf0f7e
--- /dev/null
@@ -0,0 +1,25 @@
+dnl #
+dnl # Interface for issuing a barrier bio:
+dnl # 2.6.28-2.6.35: BIO_RW_BARRIER
+dnl # 2.6.36-3.x:    REQ_BARRIER
+dnl #
+
+dnl # Since REQ_BARRIER is a preprocessor definition, there is no need for an
+dnl # autotools check for it. Also, REQ_BARRIER existed in the request layer
+dnl # until torvalds/linux@7b6d91daee5cac6402186ff224c3af39d79f4a0e unified the
+dnl # request layer and bio layer flags, so it would be wrong to assume that
+dnl # the APIs are mutually exclusive contrary to the typical case.
+AC_DEFUN([ZFS_AC_KERNEL_BIO_RW_BARRIER], [
+       AC_MSG_CHECKING([whether BIO_RW_BARRIER is defined])
+       ZFS_LINUX_TRY_COMPILE([
+               #include <linux/bio.h>
+       ],[
+               int flags __attribute__ ((unused));
+               flags = BIO_RW_BARRIER;
+       ],[
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(HAVE_BIO_RW_BARRIER, 1, [BIO_RW_BARRIER is defined])
+       ],[
+               AC_MSG_RESULT(no)
+       ])
+])
diff --git a/zfs/config/kernel-bio-rw-discard.m4 b/zfs/config/kernel-bio-rw-discard.m4
new file mode 100644 (file)
index 0000000..0554b9a
--- /dev/null
@@ -0,0 +1,25 @@
+dnl #
+dnl # Interface for issuing a discard bio:
+dnl # 2.6.28-2.6.35: BIO_RW_DISCARD
+dnl # 2.6.36-3.x:    REQ_DISCARD
+dnl #
+
+dnl # Since REQ_DISCARD is a preprocessor definition, there is no need for an
+dnl # autotools check for it. Also, REQ_DISCARD existed in the request layer
+dnl # until torvalds/linux@7b6d91daee5cac6402186ff224c3af39d79f4a0e unified the
+dnl # request layer and bio layer flags, so it would be wrong to assume that
+dnl # the APIs are mutually exclusive contrary to the typical case.
+AC_DEFUN([ZFS_AC_KERNEL_BIO_RW_DISCARD], [
+       AC_MSG_CHECKING([whether BIO_RW_DISCARD is defined])
+       ZFS_LINUX_TRY_COMPILE([
+               #include <linux/bio.h>
+       ],[
+               int flags __attribute__ ((unused));
+               flags = BIO_RW_DISCARD;
+       ],[
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(HAVE_BIO_RW_DISCARD, 1, [BIO_RW_DISCARD is defined])
+       ],[
+               AC_MSG_RESULT(no)
+       ])
+])
diff --git a/zfs/config/kernel-bio-rw-syncio.m4 b/zfs/config/kernel-bio-rw-syncio.m4
deleted file mode 100644 (file)
index 4bff80a..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-dnl #
-dnl # Preferred interface for flagging a synchronous bio:
-dnl # 2.6.12-2.6.29: BIO_RW_SYNC
-dnl # 2.6.30-2.6.35: BIO_RW_SYNCIO
-dnl # 2.6.36-2.6.xx: REQ_SYNC
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_BIO_RW_SYNC], [
-       AC_MSG_CHECKING([whether BIO_RW_SYNC is defined])
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/bio.h>
-       ],[
-               int flags __attribute__ ((unused));
-               flags = BIO_RW_SYNC;
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_BIO_RW_SYNC, 1, [BIO_RW_SYNC is defined])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-])
-
-AC_DEFUN([ZFS_AC_KERNEL_BIO_RW_SYNCIO], [
-       AC_MSG_CHECKING([whether BIO_RW_SYNCIO is defined])
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/bio.h>
-       ],[
-               int flags __attribute__ ((unused));
-               flags = BIO_RW_SYNCIO;
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_BIO_RW_SYNCIO, 1, [BIO_RW_SYNCIO is defined])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-])
-
-AC_DEFUN([ZFS_AC_KERNEL_REQ_SYNC], [
-       AC_MSG_CHECKING([whether REQ_SYNC is defined])
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/bio.h>
-       ],[
-               int flags __attribute__ ((unused));
-               flags = REQ_SYNC;
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_REQ_SYNC, 1, [REQ_SYNC is defined])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-])
diff --git a/zfs/config/kernel-blk-end-request.m4 b/zfs/config/kernel-blk-end-request.m4
deleted file mode 100644 (file)
index c2980e5..0000000
+++ /dev/null
@@ -1,40 +0,0 @@
-dnl #
-dnl # 2.6.31 API change
-dnl # In 2.6.29 kernels blk_end_request() was a GPL-only symbol, this was
-dnl # changed in 2.6.31 so it may be used by non-GPL modules.
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_BLK_END_REQUEST], [
-       AC_MSG_CHECKING([whether blk_end_request() is available])
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/blkdev.h>
-       ],[
-               struct request *req = NULL;
-               (void) blk_end_request(req, 0, 0);
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_BLK_END_REQUEST, 1,
-                         [blk_end_request() is available])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-
-       AC_MSG_CHECKING([whether blk_end_request() is GPL-only])
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/module.h>
-               #include <linux/blkdev.h>
-               
-               MODULE_LICENSE("$ZFS_META_LICENSE");
-       ],[
-               struct request *req = NULL;
-               (void) blk_end_request(req, 0, 0);
-       ],[
-               AC_MSG_RESULT(no)
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_BLK_END_REQUEST_GPL_ONLY, 1,
-                         [blk_end_request() is GPL-only])
-       ])
-       EXTRA_KCFLAGS="$tmp_flags"
-])
diff --git a/zfs/config/kernel-blk-fetch-request.m4 b/zfs/config/kernel-blk-fetch-request.m4
deleted file mode 100644 (file)
index c4e1146..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-dnl #
-dnl # 2.6.31 API change
-dnl # Request queue peek/retrieval interface cleanup, the blk_fetch_request()
-dnl # function replaces the elv_next_request() and blk_fetch_request()
-dnl # functions.  The updated blk_fetch_request() function returns the
-dnl # next available request and removed it from the request queue.
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_BLK_FETCH_REQUEST], [
-       AC_MSG_CHECKING([whether blk_fetch_request() is available])
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/blkdev.h>
-       ],[
-               struct request_queue *q = NULL;
-               (void) blk_fetch_request(q);
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_BLK_FETCH_REQUEST, 1,
-                         [blk_fetch_request() is available])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-       EXTRA_KCFLAGS="$tmp_flags"
-])
diff --git a/zfs/config/kernel-blk-queue-discard.m4 b/zfs/config/kernel-blk-queue-discard.m4
deleted file mode 100644 (file)
index 8306c88..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-dnl #
-dnl # 2.6.32 API change
-dnl # Discard requests were moved to the normal I/O path.
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_DISCARD], [
-       AC_MSG_CHECKING([whether blk_queue_discard() is available])
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/blkdev.h>
-       ],[
-               struct request_queue *q = NULL;
-               (void) blk_queue_discard(q);
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_BLK_QUEUE_DISCARD, 1,
-                         [blk_queue_discard() is available])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-       EXTRA_KCFLAGS="$tmp_flags"
-])
diff --git a/zfs/config/kernel-blk-queue-io-opt.m4 b/zfs/config/kernel-blk-queue-io-opt.m4
deleted file mode 100644 (file)
index fb9b684..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-dnl #
-dnl # 2.6.30 API change
-dnl # The blk_queue_io_opt() function was added to indicate the optimal
-dnl # I/O size for the device.
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_IO_OPT], [
-       AC_MSG_CHECKING([whether blk_queue_io_opt() is available])
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/blkdev.h>
-       ],[
-               struct request_queue *q = NULL;
-               unsigned int opt = 1;
-               (void) blk_queue_io_opt(q, opt);
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_BLK_QUEUE_IO_OPT, 1,
-                         [blk_queue_io_opt() is available])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-       EXTRA_KCFLAGS="$tmp_flags"
-])
diff --git a/zfs/config/kernel-blk-queue-nonrot.m4 b/zfs/config/kernel-blk-queue-nonrot.m4
deleted file mode 100644 (file)
index aa6d678..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-dnl #
-dnl # 2.6.27 API change
-dnl # The blk_queue_nonrot() function and QUEUE_FLAG_NONROT flag were
-dnl # added so non-rotational devices could be identified.  These devices
-dnl # have no seek time which the higher level elevator uses to optimize
-dnl # how the I/O issued to the device.
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_NONROT], [
-       AC_MSG_CHECKING([whether blk_queue_nonrot() is available])
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/blkdev.h>
-       ],[
-               struct request_queue *q = NULL;
-               (void) blk_queue_nonrot(q);
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_BLK_QUEUE_NONROT, 1,
-                         [blk_queue_nonrot() is available])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-       EXTRA_KCFLAGS="$tmp_flags"
-])
diff --git a/zfs/config/kernel-blk-queue-physical-block-size.m4 b/zfs/config/kernel-blk-queue-physical-block-size.m4
deleted file mode 100644 (file)
index a585b28..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-dnl #
-dnl # 2.6.30 API change
-dnl # The blk_queue_physical_block_size() function was introduced to
-dnl # indicate the smallest I/O the device can write without incurring
-dnl # a read-modify-write penalty.
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_PHYSICAL_BLOCK_SIZE], [
-       AC_MSG_CHECKING([whether blk_queue_physical_block_size() is available])
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/blkdev.h>
-       ],[
-               struct request_queue *q = NULL;
-               unsigned short block_size = 1;
-               (void) blk_queue_physical_block_size(q, block_size);
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_BLK_QUEUE_PHYSICAL_BLOCK_SIZE, 1,
-                         [blk_queue_physical_block_size() is available])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-       EXTRA_KCFLAGS="$tmp_flags"
-])
diff --git a/zfs/config/kernel-blk-requeue-request.m4 b/zfs/config/kernel-blk-requeue-request.m4
deleted file mode 100644 (file)
index 286c4b9..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-dnl #
-dnl # 2.6.31 API change
-dnl # Request queue peek/retrieval interface cleanup, the
-dnl # elv_requeue_request() function has been replaced with the
-dnl # blk_requeue_request() function.
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_BLK_REQUEUE_REQUEST], [
-       AC_MSG_CHECKING([whether blk_requeue_request() is available])
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/blkdev.h>
-       ],[
-               struct request_queue *q = NULL;
-               struct request *req = NULL;
-               blk_requeue_request(q, req);
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_BLK_REQUEUE_REQUEST, 1,
-                         [blk_requeue_request() is available])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-       EXTRA_KCFLAGS="$tmp_flags"
-])
diff --git a/zfs/config/kernel-blk-rq-bytes.m4 b/zfs/config/kernel-blk-rq-bytes.m4
deleted file mode 100644 (file)
index bedbcc6..0000000
+++ /dev/null
@@ -1,41 +0,0 @@
-dnl #
-dnl # 2.6.29 API change
-dnl # In the 2.6.29 kernel blk_rq_bytes() was available as a GPL-only symbol.
-dnl # So we need to check the symbol license as well.  As of 2.6.31 the
-dnl blk_rq_bytes() helper was changed to a static inline which we can use.
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_BLK_RQ_BYTES], [
-       AC_MSG_CHECKING([whether blk_rq_bytes() is available])
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/blkdev.h>
-       ],[
-               struct request *req = NULL;
-               (void) blk_rq_bytes(req);
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_BLK_RQ_BYTES, 1,
-                         [blk_rq_bytes() is available])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-
-       AC_MSG_CHECKING([whether blk_rq_bytes() is GPL-only])
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/module.h>
-               #include <linux/blkdev.h>
-
-               MODULE_LICENSE("$ZFS_META_LICENSE");
-       ],[
-               struct request *req = NULL;
-               (void) blk_rq_bytes(req);
-       ],[
-               AC_MSG_RESULT(no)
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_BLK_RQ_BYTES_GPL_ONLY, 1,
-                         [blk_rq_bytes() is GPL-only])
-       ])
-       EXTRA_KCFLAGS="$tmp_flags"
-])
diff --git a/zfs/config/kernel-blk-rq-pos.m4 b/zfs/config/kernel-blk-rq-pos.m4
deleted file mode 100644 (file)
index efa595f..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-dnl #
-dnl # 2.6.31 API change
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_BLK_RQ_POS], [
-       AC_MSG_CHECKING([whether blk_rq_pos() is available])
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/blkdev.h>
-       ],[
-               struct request *req = NULL;
-               (void) blk_rq_pos(req);
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_BLK_RQ_POS, 1,
-                         [blk_rq_pos() is available])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-       EXTRA_KCFLAGS="$tmp_flags"
-])
diff --git a/zfs/config/kernel-blk-rq-sectors.m4 b/zfs/config/kernel-blk-rq-sectors.m4
deleted file mode 100644 (file)
index dea5bb5..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-dnl #
-dnl # 2.6.31 API change
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_BLK_RQ_SECTORS], [
-       AC_MSG_CHECKING([whether blk_rq_sectors() is available])
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/blkdev.h>
-       ],[
-               struct request *req = NULL;
-               (void) blk_rq_sectors(req);
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_BLK_RQ_SECTORS, 1,
-                         [blk_rq_sectors() is available])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-       EXTRA_KCFLAGS="$tmp_flags"
-])
diff --git a/zfs/config/kernel-current_bio_tail.m4 b/zfs/config/kernel-current_bio_tail.m4
new file mode 100644 (file)
index 0000000..b72f21e
--- /dev/null
@@ -0,0 +1,33 @@
+dnl #
+dnl # 2.6.34 API change
+dnl # current->bio_tail and current->bio_list were struct bio pointers prior to
+dnl # Linux 2.6.34. They were refactored into a struct bio_list pointer called
+dnl # current->bio_list in Linux 2.6.34.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_CURRENT_BIO_TAIL], [
+       AC_MSG_CHECKING([whether current->bio_tail exists])
+       ZFS_LINUX_TRY_COMPILE([
+               #include <linux/sched.h>
+       ],[
+               current->bio_tail = (struct bio **) NULL;
+       ],[
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(HAVE_CURRENT_BIO_TAIL, 1,
+                   [current->bio_tail exists])
+       ],[
+               AC_MSG_RESULT(no)
+               AC_MSG_CHECKING([whether current->bio_list exists])
+               ZFS_LINUX_TRY_COMPILE([
+                       #include <linux/sched.h>
+               ],[
+                       current->bio_list = (struct bio_list *) NULL;
+               ],[
+                       AC_MSG_RESULT(yes)
+                       AC_DEFINE(HAVE_CURRENT_BIO_LIST, 1,
+                           [current->bio_list exists])
+               ],[
+                       AC_MSG_ERROR(no - Please file a bug report at
+                           https://github.com/zfsonlinux/zfs/issues/new)
+               ])
+       ])
+])
diff --git a/zfs/config/kernel-follow-down-one.m4 b/zfs/config/kernel-follow-down-one.m4
new file mode 100644 (file)
index 0000000..63fa779
--- /dev/null
@@ -0,0 +1,20 @@
+dnl #
+dnl # 2.6.38 API change
+dnl # follow_down() was renamed to follow_down_one().  The original
+dnl # follow_down() symbol still exists but will traverse down all the layers.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_FOLLOW_DOWN_ONE], [
+       AC_MSG_CHECKING([whether follow_down_one() is available])
+       ZFS_LINUX_TRY_COMPILE([
+               #include <linux/namei.h>
+       ],[
+               struct path *p = NULL;
+               follow_down_one(p);
+       ],[
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(HAVE_FOLLOW_DOWN_ONE, 1,
+                   [follow_down_one() is available])
+       ],[
+               AC_MSG_RESULT(no)
+       ])
+])
diff --git a/zfs/config/kernel-generic_io_acct.m4 b/zfs/config/kernel-generic_io_acct.m4
new file mode 100644 (file)
index 0000000..25bfa38
--- /dev/null
@@ -0,0 +1,26 @@
+dnl #
+dnl # 3.19 API addition
+dnl #
+dnl # torvalds/linux@394ffa503bc40e32d7f54a9b817264e81ce131b4 allows us to
+dnl # increment iostat counters without generic_make_request().
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_GENERIC_IO_ACCT], [
+       AC_MSG_CHECKING([whether generic IO accounting symbols are available])
+       ZFS_LINUX_TRY_COMPILE_SYMBOL([
+               #include <linux/bio.h>
+
+               void (*generic_start_io_acct_f)(int, unsigned long,
+                   struct hd_struct *) = &generic_start_io_acct;
+               void (*generic_end_io_acct_f)(int, struct hd_struct *,
+                   unsigned long) = &generic_end_io_acct;
+       ], [
+               generic_start_io_acct(0, 0, NULL);
+               generic_end_io_acct(0, NULL, 0);
+       ], [generic_start_io_acct], [block/bio.c], [
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(HAVE_GENERIC_IO_ACCT, 1,
+                   [generic_start_io_acct()/generic_end_io_acct() available])
+       ], [
+               AC_MSG_RESULT(no)
+       ])
+])
diff --git a/zfs/config/kernel-kmap-atomic-args.m4 b/zfs/config/kernel-kmap-atomic-args.m4
new file mode 100644 (file)
index 0000000..beb1692
--- /dev/null
@@ -0,0 +1,20 @@
+dnl #
+dnl # 2.6.37 API change
+dnl # kmap_atomic() changed from assigning a hard-coded named slot to using
+dnl # push/pop based dynamic allocation.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS], [
+       AC_MSG_CHECKING([whether kmap_atomic wants 1 args])
+       ZFS_LINUX_TRY_COMPILE([
+               #include <linux/pagemap.h>
+       ],[
+               struct page page;
+               kmap_atomic(&page);
+       ],[
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(HAVE_1ARG_KMAP_ATOMIC, 1,
+                         [kmap_atomic wants 1 args])
+       ],[
+               AC_MSG_RESULT(no)
+       ])
+])
diff --git a/zfs/config/kernel-mk-request-fn.m4 b/zfs/config/kernel-mk-request-fn.m4
new file mode 100644 (file)
index 0000000..88ee2eb
--- /dev/null
@@ -0,0 +1,43 @@
+dnl #
+dnl # Linux 3.2 API Change
+dnl # make_request_fn returns void instead of int.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [
+       AC_MSG_CHECKING([whether make_request_fn() returns int])
+       ZFS_LINUX_TRY_COMPILE([
+               #include <linux/blkdev.h>
+
+               int make_request(struct request_queue *q, struct bio *bio)
+               {
+                       return (0);
+               }
+       ],[
+               blk_queue_make_request(NULL, &make_request);
+       ],[
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(MAKE_REQUEST_FN_RET, int,
+                   [make_request_fn() returns int])
+               AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_INT, 1,
+                   [Noting that make_request_fn() returns int])
+       ],[
+               AC_MSG_RESULT(no)
+               AC_MSG_CHECKING([whether make_request_fn() returns void])
+               ZFS_LINUX_TRY_COMPILE([
+                       #include <linux/blkdev.h>
+
+                       void make_request(struct request_queue *q, struct bio *bio)
+                       {
+                               return;
+                       }
+               ],[
+                       blk_queue_make_request(NULL, &make_request);
+               ],[
+                       AC_MSG_RESULT(yes)
+                       AC_DEFINE(MAKE_REQUEST_FN_RET, void,
+                           [make_request_fn() returns void])
+               ],[
+                       AC_MSG_ERROR(no - Please file a bug report at
+                           https://github.com/zfsonlinux/zfs/issues/new)
+               ])
+       ])
+])
diff --git a/zfs/config/kernel-rq-for-each_segment.m4 b/zfs/config/kernel-rq-for-each_segment.m4
deleted file mode 100644 (file)
index 84ce7d1..0000000
+++ /dev/null
@@ -1,47 +0,0 @@
-dnl #
-dnl # 2.6.x API change
-dnl #
-dnl # 3.14 API change
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_RQ_FOR_EACH_SEGMENT], [
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
-
-       AC_MSG_CHECKING([whether rq_for_each_segment() wants bio_vec *])
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/blkdev.h>
-       ],[
-               struct bio_vec *bv;
-               struct req_iterator iter;
-               struct request *req = NULL;
-               rq_for_each_segment(bv, req, iter) { }
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_RQ_FOR_EACH_SEGMENT, 1,
-                         [rq_for_each_segment() is available])
-               AC_DEFINE(HAVE_RQ_FOR_EACH_SEGMENT_BVP, 1,
-                         [rq_for_each_segment() wants bio_vec *])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-
-       AC_MSG_CHECKING([whether rq_for_each_segment() wants bio_vec])
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/blkdev.h>
-       ],[
-               struct bio_vec bv;
-               struct req_iterator iter;
-               struct request *req = NULL;
-               rq_for_each_segment(bv, req, iter) { }
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_RQ_FOR_EACH_SEGMENT, 1,
-                         [rq_for_each_segment() is available])
-               AC_DEFINE(HAVE_RQ_FOR_EACH_SEGMENT_BV, 1,
-                         [rq_for_each_segment() wants bio_vec])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-
-       EXTRA_KCFLAGS="$tmp_flags"
-])
diff --git a/zfs/config/kernel-rq-is_sync.m4 b/zfs/config/kernel-rq-is_sync.m4
deleted file mode 100644 (file)
index f6f51c8..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-dnl #
-dnl # 2.6.x API change
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_RQ_IS_SYNC], [
-       AC_MSG_CHECKING([whether rq_is_sync() is available])
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/blkdev.h>
-       ],[
-               struct request *req = NULL;
-               (void) rq_is_sync(req);
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_RQ_IS_SYNC, 1,
-                         [rq_is_sync() is available])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-       EXTRA_KCFLAGS="$tmp_flags"
-])
index 11700cf9494d61c6cf38156ade56c804067c70db..0a65f39ef21b4d66efd0066f19bc956358c4a56b 100644 (file)
@@ -7,6 +7,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
        ZFS_AC_TEST_MODULE
        ZFS_AC_KERNEL_CONFIG
        ZFS_AC_KERNEL_DECLARE_EVENT_CLASS
+       ZFS_AC_KERNEL_CURRENT_BIO_TAIL
        ZFS_AC_KERNEL_BDEV_BLOCK_DEVICE_OPERATIONS
        ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
        ZFS_AC_KERNEL_TYPE_FMODE_T
@@ -19,30 +20,16 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
        ZFS_AC_KERNEL_BDEV_LOGICAL_BLOCK_SIZE
        ZFS_AC_KERNEL_BDEV_PHYSICAL_BLOCK_SIZE
        ZFS_AC_KERNEL_BIO_BVEC_ITER
-       ZFS_AC_KERNEL_BIO_FAILFAST
        ZFS_AC_KERNEL_BIO_FAILFAST_DTD
        ZFS_AC_KERNEL_REQ_FAILFAST_MASK
        ZFS_AC_KERNEL_BIO_END_IO_T_ARGS
-       ZFS_AC_KERNEL_BIO_RW_SYNC
-       ZFS_AC_KERNEL_BIO_RW_SYNCIO
-       ZFS_AC_KERNEL_REQ_SYNC
-       ZFS_AC_KERNEL_BLK_END_REQUEST
+       ZFS_AC_KERNEL_BIO_RW_BARRIER
+       ZFS_AC_KERNEL_BIO_RW_DISCARD
        ZFS_AC_KERNEL_BLK_QUEUE_FLUSH
        ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS
        ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS
-       ZFS_AC_KERNEL_BLK_QUEUE_PHYSICAL_BLOCK_SIZE
-       ZFS_AC_KERNEL_BLK_QUEUE_IO_OPT
-       ZFS_AC_KERNEL_BLK_QUEUE_NONROT
-       ZFS_AC_KERNEL_BLK_QUEUE_DISCARD
-       ZFS_AC_KERNEL_BLK_FETCH_REQUEST
-       ZFS_AC_KERNEL_BLK_REQUEUE_REQUEST
-       ZFS_AC_KERNEL_BLK_RQ_BYTES
-       ZFS_AC_KERNEL_BLK_RQ_POS
-       ZFS_AC_KERNEL_BLK_RQ_SECTORS
        ZFS_AC_KERNEL_GET_DISK_RO
        ZFS_AC_KERNEL_GET_GENDISK
-       ZFS_AC_KERNEL_RQ_IS_SYNC
-       ZFS_AC_KERNEL_RQ_FOR_EACH_SEGMENT
        ZFS_AC_KERNEL_DISCARD_GRANULARITY
        ZFS_AC_KERNEL_CONST_XATTR_HANDLER
        ZFS_AC_KERNEL_XATTR_HANDLER_GET
@@ -101,6 +88,10 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
        ZFS_AC_KERNEL_LSEEK_EXECUTE
        ZFS_AC_KERNEL_VFS_ITERATE
        ZFS_AC_KERNEL_VFS_RW_ITERATE
+       ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS
+       ZFS_AC_KERNEL_FOLLOW_DOWN_ONE
+       ZFS_AC_KERNEL_MAKE_REQUEST_FN
+       ZFS_AC_KERNEL_GENERIC_IO_ACCT
 
        AS_IF([test "$LINUX_OBJ" != "$LINUX"], [
                KERNELMAKE_PARAMS="$KERNELMAKE_PARAMS O=$LINUX_OBJ"
@@ -111,6 +102,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
        dnl # -Wall -fno-strict-aliasing -Wstrict-prototypes and other
        dnl # compiler options are added by the kernel build system.
        KERNELCPPFLAGS="$KERNELCPPFLAGS $NO_UNUSED_BUT_SET_VARIABLE"
+       KERNELCPPFLAGS="$KERNELCPPFLAGS $NO_BOOL_COMPARE"
        KERNELCPPFLAGS="$KERNELCPPFLAGS -DHAVE_SPL -D_KERNEL"
        KERNELCPPFLAGS="$KERNELCPPFLAGS -DTEXT_DOMAIN=\\\"zfs-linux-kernel\\\""
 
@@ -338,6 +330,8 @@ AC_DEFUN([ZFS_AC_SPL], [
                                splbuild="${splsrc}/${LINUX_VERSION}"
                        ], [ test -e "${splsrc}/spl_config.h" ], [
                                splbuild="${splsrc}"
+                       ], [ find -L "${splsrc}" -name spl_config.h 2> /dev/null | grep -wq spl_config.h ], [
+                               splbuild=$(find -L "${splsrc}" -name spl_config.h | sed 's,/spl_config.h,,')
                        ], [
                                splbuild="[Not found]"
                        ])
index ad494f1e57e85ac929c3b6ece68664ce059639a7..0a6c7670840bea920a10865d1de0ae874336705d 100644 (file)
@@ -2,7 +2,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_MOUNT_HELPER], [
        AC_ARG_WITH(mounthelperdir,
                AC_HELP_STRING([--with-mounthelperdir=DIR],
                [install mount.zfs in dir [[/sbin]]]),
-               mounthelperdir=$withval,mounthelperdir=$sbindir)
+               mounthelperdir=$withval,mounthelperdir=/sbin)
 
        AC_SUBST(mounthelperdir)
 ])
index 311c754d429d4dbc00eb35e5fc312dc42abf7db4..51a20b3e6a10d736b3e6422f3faafbb1c93560db 100644 (file)
@@ -51,7 +51,7 @@ rpm-local:
        mkdir -p $(rpmbuild)/SPECS && \
        cp ${RPM_SPEC_DIR}/$(rpmspec) $(rpmbuild)/SPECS && \
        mkdir -p $(rpmbuild)/SOURCES && \
-       cp scripts/kmodtool $(rpmbuild)/SOURCES && \
+       cp $(top_srcdir)/scripts/kmodtool $(rpmbuild)/SOURCES && \
        cp $(distdir).tar.gz $(rpmbuild)/SOURCES)
 
 srpm-common: dist
index d2fe4e8e9c17b65c50032a274350301989bb3980..9eda5d9c34e0f56fd5a59a19f4bf82a1e9cbd351 100644 (file)
@@ -62,6 +62,7 @@ AC_DEFUN([ZFS_AC_DEBUG_DMU_TX], [
 
 AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [
        ZFS_AC_CONFIG_ALWAYS_NO_UNUSED_BUT_SET_VARIABLE
+       ZFS_AC_CONFIG_ALWAYS_NO_BOOL_COMPARE
 ])
 
 AC_DEFUN([ZFS_AC_CONFIG], [
@@ -139,7 +140,7 @@ AC_DEFUN([ZFS_AC_RPM], [
        ])
 
        RPM_DEFINE_COMMON='--define "$(DEBUG_ZFS) 1" --define "$(DEBUG_DMU_TX) 1"'
-       RPM_DEFINE_UTIL='--define "_dracutdir $(dracutdir)" --define "_udevdir $(udevdir)" --define "_udevruledir $(udevruledir)"'
+       RPM_DEFINE_UTIL='--define "_dracutdir $(dracutdir)" --define "_udevdir $(udevdir)" --define "_udevruledir $(udevruledir)" --define "_initconfdir $(DEFAULT_INITCONF_DIR)" $(DEFINE_INITRAMFS)'
        RPM_DEFINE_KMOD='--define "kernels $(LINUX_VERSION)" --define "require_spldir $(SPL)" --define "require_splobj $(SPL_OBJ)" --define "ksrc $(LINUX)" --define "kobj $(LINUX_OBJ)"'
        RPM_DEFINE_DKMS=
 
@@ -285,7 +286,6 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [
                debian)     DEFAULT_PACKAGE=deb  ;;
                *)          DEFAULT_PACKAGE=rpm  ;;
        esac
-
        AC_MSG_RESULT([$DEFAULT_PACKAGE])
        AC_SUBST(DEFAULT_PACKAGE)
 
@@ -308,9 +308,32 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [
                debian)     DEFAULT_INIT_SCRIPT=lsb    ;;
                *)          DEFAULT_INIT_SCRIPT=lsb    ;;
        esac
-
        AC_MSG_RESULT([$DEFAULT_INIT_SCRIPT])
        AC_SUBST(DEFAULT_INIT_SCRIPT)
+
+       AC_MSG_CHECKING([default init config directory])
+       case "$VENDOR" in
+               gentoo)     DEFAULT_INITCONF_DIR=/etc/conf.d    ;;
+               toss)       DEFAULT_INITCONF_DIR=/etc/sysconfig ;;
+               redhat)     DEFAULT_INITCONF_DIR=/etc/sysconfig ;;
+               fedora)     DEFAULT_INITCONF_DIR=/etc/sysconfig ;;
+               sles)       DEFAULT_INITCONF_DIR=/etc/sysconfig ;;
+               ubuntu)     DEFAULT_INITCONF_DIR=/etc/default   ;;
+               debian)     DEFAULT_INITCONF_DIR=/etc/default   ;;
+               *)          DEFAULT_INITCONF_DIR=/etc/default   ;;
+       esac
+       AC_MSG_RESULT([$DEFAULT_INITCONF_DIR])
+       AC_SUBST(DEFAULT_INITCONF_DIR)
+
+       AC_MSG_CHECKING([whether initramfs-tools is available])
+       if test -d /usr/share/initramfs-tools ; then
+               DEFINE_INITRAMFS='--define "_initramfs 1"'
+               AC_MSG_RESULT([yes])
+       else
+               DEFINE_INITRAMFS=''
+               AC_MSG_RESULT([no])
+       fi
+       AC_SUBST(DEFINE_INITRAMFS)
 ])
 
 dnl #
index 04f0f3453ca3edd89da2b24b42647162f6fcde14..d76bf34624db92baffb88a5d0a970dee03447787 100755 (executable)
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for zfs 0.6.4.2.
+# Generated by GNU Autoconf 2.69 for zfs 0.6.5.3.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -587,8 +587,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='zfs'
 PACKAGE_TARNAME='zfs'
-PACKAGE_VERSION='0.6.4.2'
-PACKAGE_STRING='zfs 0.6.4.2'
+PACKAGE_VERSION='0.6.5.3'
+PACKAGE_STRING='zfs 0.6.5.3'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
 
@@ -650,7 +650,6 @@ LINUX_SYMBOLS
 LINUX_VERSION
 LINUX_OBJ
 LINUX
-runstatedir
 FRAME_LARGER_THAN
 LIBBLKID
 LIBUUID
@@ -665,6 +664,7 @@ ZFS_INIT_SYSTEMD
 udevruledir
 udevdir
 mounthelperdir
+NO_BOOL_COMPARE
 NO_UNUSED_BUT_SET_VARIABLE
 ZFS_CONFIG
 TARGET_ASM_DIR
@@ -692,6 +692,8 @@ HAVE_RPMBUILD
 RPM_VERSION
 RPM
 HAVE_RPM
+DEFINE_INITRAMFS
+DEFAULT_INITCONF_DIR
 DEFAULT_INIT_SCRIPT
 DEFAULT_INIT_DIR
 DEFAULT_PACKAGE
@@ -812,6 +814,7 @@ infodir
 docdir
 oldincludedir
 includedir
+runstatedir
 localstatedir
 sharedstatedir
 sysconfdir
@@ -914,6 +917,7 @@ datadir='${datarootdir}'
 sysconfdir='${prefix}/etc'
 sharedstatedir='${prefix}/com'
 localstatedir='${prefix}/var'
+runstatedir='${localstatedir}/run'
 includedir='${prefix}/include'
 oldincludedir='/usr/include'
 docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
@@ -1166,6 +1170,15 @@ do
   | -silent | --silent | --silen | --sile | --sil)
     silent=yes ;;
 
+  -runstatedir | --runstatedir | --runstatedi | --runstated \
+  | --runstate | --runstat | --runsta | --runst | --runs \
+  | --run | --ru | --r)
+    ac_prev=runstatedir ;;
+  -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
+  | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
+  | --run=* | --ru=* | --r=*)
+    runstatedir=$ac_optarg ;;
+
   -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
     ac_prev=sbindir ;;
   -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
@@ -1303,7 +1316,7 @@ fi
 for ac_var in  exec_prefix prefix bindir sbindir libexecdir datarootdir \
                datadir sysconfdir sharedstatedir localstatedir includedir \
                oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
-               libdir localedir mandir
+               libdir localedir mandir runstatedir
 do
   eval ac_val=\$$ac_var
   # Remove trailing slashes.
@@ -1416,7 +1429,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures zfs 0.6.4.2 to adapt to many kinds of systems.
+\`configure' configures zfs 0.6.5.3 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1456,6 +1469,7 @@ Fine tuning of the installation directories:
   --sysconfdir=DIR        read-only single-machine data [PREFIX/etc]
   --sharedstatedir=DIR    modifiable architecture-independent data [PREFIX/com]
   --localstatedir=DIR     modifiable single-machine data [PREFIX/var]
+  --runstatedir=DIR       modifiable per-process data [LOCALSTATEDIR/run]
   --libdir=DIR            object code libraries [EPREFIX/lib]
   --includedir=DIR        C header files [PREFIX/include]
   --oldincludedir=DIR     C header files for non-gcc [/usr/include]
@@ -1487,7 +1501,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of zfs 0.6.4.2:";;
+     short | recursive ) echo "Configuration of zfs 0.6.5.3:";;
    esac
   cat <<\_ACEOF
 
@@ -1625,7 +1639,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-zfs configure 0.6.4.2
+zfs configure 0.6.5.3
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -1990,7 +2004,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by zfs $as_me 0.6.4.2, which was
+It was created by zfs $as_me 0.6.5.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -3149,7 +3163,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='zfs'
- VERSION='0.6.4.2'
+ VERSION='0.6.5.3'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -12002,7 +12016,6 @@ $as_echo_n "checking default package type... " >&6; }
                debian)     DEFAULT_PACKAGE=deb  ;;
                *)          DEFAULT_PACKAGE=rpm  ;;
        esac
-
        { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DEFAULT_PACKAGE" >&5
 $as_echo "$DEFAULT_PACKAGE" >&6; }
 
@@ -12029,11 +12042,39 @@ $as_echo_n "checking default init script type... " >&6; }
                debian)     DEFAULT_INIT_SCRIPT=lsb    ;;
                *)          DEFAULT_INIT_SCRIPT=lsb    ;;
        esac
-
        { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DEFAULT_INIT_SCRIPT" >&5
 $as_echo "$DEFAULT_INIT_SCRIPT" >&6; }
 
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking default init config directory" >&5
+$as_echo_n "checking default init config directory... " >&6; }
+       case "$VENDOR" in
+               gentoo)     DEFAULT_INITCONF_DIR=/etc/conf.d    ;;
+               toss)       DEFAULT_INITCONF_DIR=/etc/sysconfig ;;
+               redhat)     DEFAULT_INITCONF_DIR=/etc/sysconfig ;;
+               fedora)     DEFAULT_INITCONF_DIR=/etc/sysconfig ;;
+               sles)       DEFAULT_INITCONF_DIR=/etc/sysconfig ;;
+               ubuntu)     DEFAULT_INITCONF_DIR=/etc/default   ;;
+               debian)     DEFAULT_INITCONF_DIR=/etc/default   ;;
+               *)          DEFAULT_INITCONF_DIR=/etc/default   ;;
+       esac
+       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DEFAULT_INITCONF_DIR" >&5
+$as_echo "$DEFAULT_INITCONF_DIR" >&6; }
+
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether initramfs-tools is available" >&5
+$as_echo_n "checking whether initramfs-tools is available... " >&6; }
+       if test -d /usr/share/initramfs-tools ; then
+               DEFINE_INITRAMFS='--define "_initramfs 1"'
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+       else
+               DEFINE_INITRAMFS=''
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+       fi
+
+
 
        RPM=rpm
        RPMBUILD=rpmbuild
@@ -12073,7 +12114,7 @@ $as_echo "$HAVE_RPMBUILD" >&6; }
 fi
 
        RPM_DEFINE_COMMON='--define "$(DEBUG_ZFS) 1" --define "$(DEBUG_DMU_TX) 1"'
-       RPM_DEFINE_UTIL='--define "_dracutdir $(dracutdir)" --define "_udevdir $(udevdir)" --define "_udevruledir $(udevruledir)"'
+       RPM_DEFINE_UTIL='--define "_dracutdir $(dracutdir)" --define "_udevdir $(udevdir)" --define "_udevruledir $(udevruledir)" --define "_initconfdir $(DEFAULT_INITCONF_DIR)" $(DEFINE_INITRAMFS)'
        RPM_DEFINE_KMOD='--define "kernels $(LINUX_VERSION)" --define "require_spldir $(SPL)" --define "require_splobj $(SPL_OBJ)" --define "ksrc $(LINUX)" --define "kobj $(LINUX_OBJ)"'
        RPM_DEFINE_DKMS=
 
@@ -12248,6 +12289,42 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
 
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for -Wno-bool-compare support" >&5
+$as_echo_n "checking for -Wno-bool-compare support... " >&6; }
+
+       saved_flags="$CFLAGS"
+       CFLAGS="$CFLAGS -Wbool-compare"
+
+       cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+               NO_BOOL_COMPARE=-Wno-bool-compare
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+else
+
+               NO_BOOL_COMPARE=
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+       CFLAGS="$saved_flags"
+
+
+
 
        case "$ZFS_CONFIG" in
                user)
@@ -12274,7 +12351,7 @@ fi
 if test "${with_mounthelperdir+set}" = set; then :
   withval=$with_mounthelperdir; mounthelperdir=$withval
 else
-  mounthelperdir=$sbindir
+  mounthelperdir=/sbin
 fi
 
 
@@ -13278,6 +13355,10 @@ elif  test -e "${splsrc}/spl_config.h" ; then :
 
                                splbuild="${splsrc}"
 
+elif  find -L "${splsrc}" -name spl_config.h 2> /dev/null | grep -wq spl_config.h ; then :
+
+                               splbuild=$(find -L "${splsrc}" -name spl_config.h | sed 's,/spl_config.h,,')
+
 else
 
                                splbuild="Not found"
        EXTRA_KCFLAGS="$tmp_flags"
 
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether current->bio_tail exists" >&5
+$as_echo_n "checking whether current->bio_tail exists... " >&6; }
+
+
+cat confdefs.h - <<_ACEOF >conftest.c
+
+
+               #include <linux/sched.h>
+
+int
+main (void)
+{
+
+               current->bio_tail = (struct bio **) NULL;
+
+  ;
+  return 0;
+}
+
+_ACEOF
+
+
+
+cat - <<_ACEOF >conftest.h
+
+_ACEOF
+
+
+       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
+       echo "obj-m := conftest.o" >build/Makefile
+       modpost_flag=''
+       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
+       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_CURRENT_BIO_TAIL 1" >>confdefs.h
+
+
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether current->bio_list exists" >&5
+$as_echo_n "checking whether current->bio_list exists... " >&6; }
+
+
+cat confdefs.h - <<_ACEOF >conftest.c
+
+
+                       #include <linux/sched.h>
+
+int
+main (void)
+{
+
+                       current->bio_list = (struct bio_list *) NULL;
+
+  ;
+  return 0;
+}
+
+_ACEOF
+
+
+
+cat - <<_ACEOF >conftest.h
+
+_ACEOF
+
+
+       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
+       echo "obj-m := conftest.o" >build/Makefile
+       modpost_flag=''
+       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
+       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then :
+
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_CURRENT_BIO_LIST 1" >>confdefs.h
+
+
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+                       as_fn_error $? "no - Please file a bug report at
+                           https://github.com/zfsonlinux/zfs/issues/new" "$LINENO" 5
+
+
+
+fi
+       rm -Rf build
+
+
+
+
+
+fi
+       rm -Rf build
+
+
+
+
        { $as_echo "$as_me:${as_lineno-$LINENO}: checking block device operation prototypes" >&5
 $as_echo_n "checking block device operation prototypes... " >&6; }
        tmp_flags="$EXTRA_KCFLAGS"
@@ -14678,8 +14887,8 @@ fi
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether BIO_RW_FAILFAST is defined" >&5
-$as_echo_n "checking whether BIO_RW_FAILFAST is defined... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether BIO_RW_FAILFAST_* are defined" >&5
+$as_echo_n "checking whether BIO_RW_FAILFAST_* are defined... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
@@ -14692,7 +14901,9 @@ main (void)
 {
 
                int flags __attribute__ ((unused));
-               flags = (1 << BIO_RW_FAILFAST);
+               flags = ((1 << BIO_RW_FAILFAST_DEV) |
+                        (1 << BIO_RW_FAILFAST_TRANSPORT) |
+                        (1 << BIO_RW_FAILFAST_DRIVER));
 
   ;
   return 0;
@@ -14726,7 +14937,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BIO_RW_FAILFAST 1" >>confdefs.h
+$as_echo "#define HAVE_BIO_RW_FAILFAST_DTD 1" >>confdefs.h
 
 
 else
@@ -14744,8 +14955,8 @@ fi
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether BIO_RW_FAILFAST_* are defined" >&5
-$as_echo_n "checking whether BIO_RW_FAILFAST_* are defined... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether REQ_FAILFAST_MASK is defined" >&5
+$as_echo_n "checking whether REQ_FAILFAST_MASK is defined... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
@@ -14758,9 +14969,7 @@ main (void)
 {
 
                int flags __attribute__ ((unused));
-               flags = ((1 << BIO_RW_FAILFAST_DEV) |
-                        (1 << BIO_RW_FAILFAST_TRANSPORT) |
-                        (1 << BIO_RW_FAILFAST_DRIVER));
+               flags = REQ_FAILFAST_MASK;
 
   ;
   return 0;
@@ -14794,7 +15003,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BIO_RW_FAILFAST_DTD 1" >>confdefs.h
+$as_echo "#define HAVE_REQ_FAILFAST_MASK 1" >>confdefs.h
 
 
 else
@@ -14812,8 +15021,8 @@ fi
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether REQ_FAILFAST_MASK is defined" >&5
-$as_echo_n "checking whether REQ_FAILFAST_MASK is defined... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bio_end_io_t wants 1 arg" >&5
+$as_echo_n "checking whether bio_end_io_t wants 1 arg... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
@@ -14821,12 +15030,14 @@ cat confdefs.h - <<_ACEOF >conftest.c
 
                #include <linux/bio.h>
 
+               void wanted_end_io(struct bio *bio) { return; }
+
+               bio_end_io_t *end_io __attribute__ ((unused)) = wanted_end_io;
+
 int
 main (void)
 {
 
-               int flags __attribute__ ((unused));
-               flags = REQ_FAILFAST_MASK;
 
   ;
   return 0;
@@ -14860,7 +15071,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BIO_REQ_FAILFAST_MASK 1" >>confdefs.h
+$as_echo "#define HAVE_1ARG_BIO_END_IO_T 1" >>confdefs.h
 
 
 else
@@ -14878,8 +15089,8 @@ fi
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bio_end_io_t wants 2 args" >&5
-$as_echo_n "checking whether bio_end_io_t wants 2 args... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether BIO_RW_BARRIER is defined" >&5
+$as_echo_n "checking whether BIO_RW_BARRIER is defined... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
@@ -14887,14 +15098,12 @@ cat confdefs.h - <<_ACEOF >conftest.c
 
                #include <linux/bio.h>
 
-               void wanted_end_io(struct bio *bio, int x) { return; }
-
-               bio_end_io_t *end_io __attribute__ ((unused)) = wanted_end_io;
-
 int
 main (void)
 {
 
+               int flags __attribute__ ((unused));
+               flags = BIO_RW_BARRIER;
 
   ;
   return 0;
@@ -14928,7 +15137,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_2ARGS_BIO_END_IO_T 1" >>confdefs.h
+$as_echo "#define HAVE_BIO_RW_BARRIER 1" >>confdefs.h
 
 
 else
@@ -14946,8 +15155,8 @@ fi
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether BIO_RW_SYNC is defined" >&5
-$as_echo_n "checking whether BIO_RW_SYNC is defined... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether BIO_RW_DISCARD is defined" >&5
+$as_echo_n "checking whether BIO_RW_DISCARD is defined... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
@@ -14960,7 +15169,7 @@ main (void)
 {
 
                int flags __attribute__ ((unused));
-               flags = BIO_RW_SYNC;
+               flags = BIO_RW_DISCARD;
 
   ;
   return 0;
@@ -14994,7 +15203,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BIO_RW_SYNC 1" >>confdefs.h
+$as_echo "#define HAVE_BIO_RW_DISCARD 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether BIO_RW_SYNCIO is defined" >&5
-$as_echo_n "checking whether BIO_RW_SYNCIO is defined... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_flush() is available" >&5
+$as_echo_n "checking whether blk_queue_flush() is available... " >&6; }
+       tmp_flags="$EXTRA_KCFLAGS"
+       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/bio.h>
+               #include <linux/blkdev.h>
 
 int
 main (void)
 {
 
-               int flags __attribute__ ((unused));
-               flags = BIO_RW_SYNCIO;
+               struct request_queue *q = NULL;
+               (void) blk_queue_flush(q, REQ_FLUSH);
 
   ;
   return 0;
@@ -15060,7 +15271,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BIO_RW_SYNCIO 1" >>confdefs.h
+$as_echo "#define HAVE_BLK_QUEUE_FLUSH 1" >>confdefs.h
 
 
 else
 
 
 
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether REQ_SYNC is defined" >&5
-$as_echo_n "checking whether REQ_SYNC is defined... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_flush() is GPL-only" >&5
+$as_echo_n "checking whether blk_queue_flush() is GPL-only... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/bio.h>
+               #include <linux/module.h>
+               #include <linux/blkdev.h>
+
+               MODULE_LICENSE("$ZFS_META_LICENSE");
 
 int
 main (void)
 {
 
-               int flags __attribute__ ((unused));
-               flags = REQ_SYNC;
+               struct request_queue *q = NULL;
+               (void) blk_queue_flush(q, REQ_FLUSH);
 
   ;
   return 0;
@@ -15123,18 +15336,18 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_REQ_SYNC 1" >>confdefs.h
-
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_BLK_QUEUE_FLUSH_GPL_ONLY 1" >>confdefs.h
+
 
 
 
        rm -Rf build
 
 
+       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_end_request() is available" >&5
-$as_echo_n "checking whether blk_end_request() is available... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_max_hw_sectors() is available" >&5
+$as_echo_n "checking whether blk_queue_max_hw_sectors() is available... " >&6; }
        tmp_flags="$EXTRA_KCFLAGS"
        EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
 
@@ -15159,8 +15373,8 @@ int
 main (void)
 {
 
-               struct request *req = NULL;
-               (void) blk_end_request(req, 0, 0);
+               struct request_queue *q = NULL;
+               (void) blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
 
   ;
   return 0;
@@ -15194,7 +15408,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_END_REQUEST 1" >>confdefs.h
+$as_echo "#define HAVE_BLK_QUEUE_MAX_HW_SECTORS 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
+       EXTRA_KCFLAGS="$tmp_flags"
+
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_end_request() is GPL-only" >&5
-$as_echo_n "checking whether blk_end_request() is GPL-only... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_max_segments() is available" >&5
+$as_echo_n "checking whether blk_queue_max_segments() is available... " >&6; }
+       tmp_flags="$EXTRA_KCFLAGS"
+       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/module.h>
                #include <linux/blkdev.h>
 
-               MODULE_LICENSE("$ZFS_META_LICENSE");
-
 int
 main (void)
 {
 
-               struct request *req = NULL;
-               (void) blk_end_request(req, 0, 0);
+               struct request_queue *q = NULL;
+               (void) blk_queue_max_segments(q, BLK_MAX_SEGMENTS);
 
   ;
   return 0;
@@ -15259,19 +15474,19 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_END_REQUEST_GPL_ONLY 1" >>confdefs.h
-
-
+$as_echo "#define HAVE_BLK_QUEUE_MAX_SEGMENTS 1" >>confdefs.h
+
+
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
 
 
 fi
@@ -15281,8 +15496,8 @@ fi
        EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_flush() is available" >&5
-$as_echo_n "checking whether blk_queue_flush() is available... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether get_disk_ro() is available" >&5
+$as_echo_n "checking whether get_disk_ro() is available... " >&6; }
        tmp_flags="$EXTRA_KCFLAGS"
        EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
 
@@ -15296,8 +15511,8 @@ int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               (void) blk_queue_flush(q, REQ_FLUSH);
+               struct gendisk *disk = NULL;
+               (void) get_disk_ro(disk);
 
   ;
   return 0;
@@ -15331,7 +15546,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_QUEUE_FLUSH 1" >>confdefs.h
+$as_echo "#define HAVE_GET_DISK_RO 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
+       EXTRA_KCFLAGS="$tmp_flags"
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_flush() is GPL-only" >&5
-$as_echo_n "checking whether blk_queue_flush() is GPL-only... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether get_gendisk() is available" >&5
+$as_echo_n "checking whether get_gendisk() is available... " >&6; }
 
 
-cat confdefs.h - <<_ACEOF >conftest.c
 
+cat confdefs.h - <<_ACEOF >conftest.c
 
-               #include <linux/module.h>
-               #include <linux/blkdev.h>
 
-               MODULE_LICENSE("$ZFS_META_LICENSE");
+               #include <linux/genhd.h>
 
 int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               (void) blk_queue_flush(q, REQ_FLUSH);
+               get_gendisk(0, NULL);
 
   ;
   return 0;
@@ -15395,33 +15608,69 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
+  rc=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+ rc=1
+
+
+fi
+       rm -Rf build
+
+
+       if test $rc -ne 0; then :
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+       else
+               if test "x$enable_linux_builtin" != xyes; then
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+       grep -q -E '[[:space:]]get_gendisk[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in block/genhd.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(get_gendisk)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
 
-$as_echo "#define HAVE_BLK_QUEUE_FLUSH_GPL_ONLY 1" >>confdefs.h
+               fi
+               if test $rc -ne 0; then :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
+               else :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
+$as_echo "#define HAVE_GET_GENDISK 1" >>confdefs.h
 
-fi
-       rm -Rf build
 
+               fi
+       fi
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_max_hw_sectors() is available" >&5
-$as_echo_n "checking whether blk_queue_max_hw_sectors() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ql->discard_granularity is available" >&5
+$as_echo_n "checking whether ql->discard_granularity is available... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
@@ -15433,8 +15682,9 @@ int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               (void) blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
+               struct queue_limits ql __attribute__ ((unused));
+
+               ql.discard_granularity = 0;
 
   ;
   return 0;
@@ -15468,7 +15718,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_QUEUE_MAX_HW_SECTORS 1" >>confdefs.h
+$as_echo "#define HAVE_DISCARD_GRANULARITY 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
-
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_max_segments() is available" >&5
-$as_echo_n "checking whether blk_queue_max_segments() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether super_block uses const struct xattr_hander" >&5
+$as_echo_n "checking whether super_block uses const struct xattr_hander... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/fs.h>
+               #include <linux/xattr.h>
+
+               const struct xattr_handler xattr_test_handler = {
+                       .prefix = "test",
+                       .get    = NULL,
+                       .set    = NULL,
+               };
+
+               const struct xattr_handler *xattr_handlers[] = {
+                       &xattr_test_handler,
+               };
+
+               const struct super_block sb __attribute__ ((unused)) = {
+                       .s_xattr = xattr_handlers,
+               };
 
 int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               (void) blk_queue_max_segments(q, BLK_MAX_SEGMENTS);
 
   ;
   return 0;
@@ -15537,7 +15796,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_QUEUE_MAX_SEGMENTS 1" >>confdefs.h
+$as_echo "#define HAVE_CONST_XATTR_HANDLER 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_physical_block_size() is available" >&5
-$as_echo_n "checking whether blk_queue_physical_block_size() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether xattr_handler->get() wants dentry" >&5
+$as_echo_n "checking whether xattr_handler->get() wants dentry... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/xattr.h>
+
+               int get(struct dentry *dentry, const char *name,
+                   void *buffer, size_t size, int handler_flags) { return 0; }
+               static const struct xattr_handler
+                   xops __attribute__ ((unused)) = {
+                       .get = get,
+               };
 
 int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               unsigned short block_size = 1;
-               (void) blk_queue_physical_block_size(q, block_size);
 
   ;
   return 0;
@@ -15607,7 +15867,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_QUEUE_PHYSICAL_BLOCK_SIZE 1" >>confdefs.h
+$as_echo "#define HAVE_DENTRY_XATTR_GET 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_io_opt() is available" >&5
-$as_echo_n "checking whether blk_queue_io_opt() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether xattr_handler->set() wants dentry" >&5
+$as_echo_n "checking whether xattr_handler->set() wants dentry... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/xattr.h>
+
+               int set(struct dentry *dentry, const char *name,
+                   const void *buffer, size_t size, int flags,
+                   int handler_flags) { return 0; }
+               static const struct xattr_handler
+                   xops __attribute__ ((unused)) = {
+                       .set = set,
+               };
 
 int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               unsigned int opt = 1;
-               (void) blk_queue_io_opt(q, opt);
 
   ;
   return 0;
@@ -15677,7 +15939,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_QUEUE_IO_OPT 1" >>confdefs.h
+$as_echo "#define HAVE_DENTRY_XATTR_SET 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_nonrot() is available" >&5
-$as_echo_n "checking whether blk_queue_nonrot() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether xattr_handler->list() wants dentry" >&5
+$as_echo_n "checking whether xattr_handler->list() wants dentry... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/xattr.h>
+
+               size_t list(struct dentry *dentry, char *list, size_t list_size,
+                   const char *name, size_t name_len, int handler_flags)
+                   { return 0; }
+               static const struct xattr_handler
+                   xops __attribute__ ((unused)) = {
+                       .list = list,
+               };
 
 int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               (void) blk_queue_nonrot(q);
 
   ;
   return 0;
@@ -15746,7 +16011,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_QUEUE_NONROT 1" >>confdefs.h
+$as_echo "#define HAVE_DENTRY_XATTR_LIST 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_discard() is available" >&5
-$as_echo_n "checking whether blk_queue_discard() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether inode_owner_or_capable() exists" >&5
+$as_echo_n "checking whether inode_owner_or_capable() exists... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               (void) blk_queue_discard(q);
+               struct inode *ip = NULL;
+               (void) inode_owner_or_capable(ip);
 
   ;
   return 0;
@@ -15815,7 +16077,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_QUEUE_DISCARD 1" >>confdefs.h
+$as_echo "#define HAVE_INODE_OWNER_OR_CAPABLE 1" >>confdefs.h
 
 
 else
@@ -15824,33 +16086,22 @@ sed 's/^/| /' conftest.$ac_ext >&5
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
-
-
-
-fi
-       rm -Rf build
-
-
-       EXTRA_KCFLAGS="$tmp_flags"
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_fetch_request() is available" >&5
-$as_echo_n "checking whether blk_fetch_request() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether is_owner_or_cap() exists" >&5
+$as_echo_n "checking whether is_owner_or_cap() exists... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+                       #include <linux/fs.h>
+                       #include <linux/sched.h>
 
 int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               (void) blk_fetch_request(q);
+                       struct inode *ip = NULL;
+                       (void) is_owner_or_cap(ip);
 
   ;
   return 0;
@@ -15881,18 +16132,18 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_FETCH_REQUEST 1" >>confdefs.h
+$as_echo "#define HAVE_IS_OWNER_OR_CAP 1" >>confdefs.h
 
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+                       as_fn_error $? "no - Please file a bug report at
+                           https://github.com/zfsonlinux/zfs/issues/new" "$LINENO" 5
 
 
 
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_requeue_request() is available" >&5
-$as_echo_n "checking whether blk_requeue_request() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+
+fi
+       rm -Rf build
+
+
+
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether posix_acl_from_xattr() needs user_ns" >&5
+$as_echo_n "checking whether posix_acl_from_xattr() needs user_ns... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/cred.h>
+               #include <linux/fs.h>
+               #include <linux/posix_acl_xattr.h>
 
 int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               struct request *req = NULL;
-               blk_requeue_request(q, req);
+               posix_acl_from_xattr(&init_user_ns, NULL, 0);
 
   ;
   return 0;
@@ -15954,7 +16209,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_REQUEUE_REQUEST 1" >>confdefs.h
+$as_echo "#define HAVE_POSIX_ACL_FROM_XATTR_USERNS 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_rq_bytes() is available" >&5
-$as_echo_n "checking whether blk_rq_bytes() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether posix_acl_release() is available" >&5
+$as_echo_n "checking whether posix_acl_release() is available... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/cred.h>
+               #include <linux/fs.h>
+               #include <linux/posix_acl.h>
 
 int
 main (void)
 {
 
-               struct request *req = NULL;
-               (void) blk_rq_bytes(req);
+               struct posix_acl* tmp = posix_acl_alloc(1, 0);
+               posix_acl_release(tmp);
 
   ;
   return 0;
@@ -16023,7 +16277,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_RQ_BYTES 1" >>confdefs.h
+$as_echo "#define HAVE_POSIX_ACL_RELEASE 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_rq_bytes() is GPL-only" >&5
-$as_echo_n "checking whether blk_rq_bytes() is GPL-only... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether posix_acl_release() is GPL-only" >&5
+$as_echo_n "checking whether posix_acl_release() is GPL-only... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/module.h>
-               #include <linux/blkdev.h>
-
-               MODULE_LICENSE("$ZFS_META_LICENSE");
+               #include <linux/cred.h>
+               #include <linux/fs.h>
+               #include <linux/posix_acl.h>
+
+               MODULE_LICENSE("$ZFS_META_LICENSE");
 
 int
 main (void)
 {
 
-               struct request *req = NULL;
-               (void) blk_rq_bytes(req);
+               struct posix_acl* tmp = posix_acl_alloc(1, 0);
+               posix_acl_release(tmp);
 
   ;
   return 0;
@@ -16098,7 +16353,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_RQ_BYTES_GPL_ONLY 1" >>confdefs.h
+$as_echo "#define HAVE_POSIX_ACL_RELEASE_GPL_ONLY 1" >>confdefs.h
 
 
 
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_rq_pos() is available" >&5
-$as_echo_n "checking whether blk_rq_pos() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether posix_acl_chmod exists" >&5
+$as_echo_n "checking whether posix_acl_chmod exists... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/fs.h>
+               #include <linux/posix_acl.h>
 
 int
 main (void)
 {
 
-               struct request *req = NULL;
-               (void) blk_rq_pos(req);
+               posix_acl_chmod(NULL, 0, 0)
 
   ;
   return 0;
@@ -16160,7 +16412,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_RQ_POS 1" >>confdefs.h
+$as_echo "#define HAVE_POSIX_ACL_CHMOD 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
-
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_rq_sectors() is available" >&5
-$as_echo_n "checking whether blk_rq_sectors() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether __posix_acl_chmod exists" >&5
+$as_echo_n "checking whether __posix_acl_chmod exists... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/fs.h>
+               #include <linux/posix_acl.h>
 
 int
 main (void)
 {
 
-               struct request *req = NULL;
-               (void) blk_rq_sectors(req);
+               __posix_acl_chmod(NULL, 0, 0)
 
   ;
   return 0;
@@ -16229,7 +16477,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_RQ_SECTORS 1" >>confdefs.h
+$as_echo "#define HAVE___POSIX_ACL_CHMOD 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether get_disk_ro() is available" >&5
-$as_echo_n "checking whether get_disk_ro() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether inode has i_acl and i_default_acl" >&5
+$as_echo_n "checking whether inode has i_acl and i_default_acl... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
-               struct gendisk *disk = NULL;
-               (void) get_disk_ro(disk);
+               struct inode ino;
+               ino.i_acl = NULL;
+               ino.i_default_acl = NULL;
 
   ;
   return 0;
@@ -16298,7 +16544,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_GET_DISK_RO 1" >>confdefs.h
+$as_echo "#define HAVE_POSIX_ACL_CACHING 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether get_gendisk() is available" >&5
-$as_echo_n "checking whether get_gendisk() is available... " >&6; }
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether posix_acl_equiv_mode() wants umode_t" >&5
+$as_echo_n "checking whether posix_acl_equiv_mode() wants umode_t... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/genhd.h>
+               #include <linux/fs.h>
+               #include <linux/posix_acl.h>
 
 int
 main (void)
 {
 
-               get_gendisk(0, NULL);
+               umode_t tmp;
+               posix_acl_equiv_mode(NULL,&tmp);
 
   ;
   return 0;
@@ -16360,84 +16607,48 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
-  rc=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
-fi
-       rm -Rf build
+$as_echo "#define HAVE_POSIX_ACL_EQUIV_MODE_UMODE_T 1" >>confdefs.h
 
 
-       if test $rc -ne 0; then :
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
-       else
-               if test "x$enable_linux_builtin" != xyes; then
-
-       grep -q -E '[[:space:]]get_gendisk[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in block/genhd.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(get_gendisk)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
-
-               fi
-               if test $rc -ne 0; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
 
-               else :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+fi
+       rm -Rf build
 
-$as_echo "#define HAVE_GET_GENDISK 1" >>confdefs.h
 
 
-               fi
-       fi
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->permission() exists" >&5
+$as_echo_n "checking whether iops->permission() exists... " >&6; }
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether rq_is_sync() is available" >&5
-$as_echo_n "checking whether rq_is_sync() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+cat confdefs.h - <<_ACEOF >conftest.c
 
 
-cat confdefs.h - <<_ACEOF >conftest.c
+               #include <linux/fs.h>
 
+               int permission_fn(struct inode *inode, int mask) { return 0; }
 
-               #include <linux/blkdev.h>
+               static const struct inode_operations
+                   iops __attribute__ ((unused)) = {
+                       .permission = permission_fn,
+               };
 
 int
 main (void)
 {
 
-               struct request *req = NULL;
-               (void) rq_is_sync(req);
 
   ;
   return 0;
@@ -16471,7 +16682,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_RQ_IS_SYNC 1" >>confdefs.h
+$as_echo "#define HAVE_PERMISSION 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
-
 
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether rq_for_each_segment() wants bio_vec *" >&5
-$as_echo_n "checking whether rq_for_each_segment() wants bio_vec *... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->permission() wants nameidata" >&5
+$as_echo_n "checking whether iops->permission() wants nameidata... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/fs.h>
+
+               int permission_fn(struct inode *inode, int mask,
+                   struct nameidata *nd) { return 0; }
+
+               static const struct inode_operations
+                   iops __attribute__ ((unused)) = {
+                       .permission = permission_fn,
+               };
 
 int
 main (void)
 {
 
-               struct bio_vec *bv;
-               struct req_iterator iter;
-               struct request *req = NULL;
-               rq_for_each_segment(bv, req, iter) { }
 
   ;
   return 0;
@@ -16543,10 +16754,10 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_RQ_FOR_EACH_SEGMENT 1" >>confdefs.h
+$as_echo "#define HAVE_PERMISSION 1" >>confdefs.h
 
 
-$as_echo "#define HAVE_RQ_FOR_EACH_SEGMENT_BVP 1" >>confdefs.h
+$as_echo "#define HAVE_PERMISSION_WITH_NAMEIDATA 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether rq_for_each_segment() wants bio_vec" >&5
-$as_echo_n "checking whether rq_for_each_segment() wants bio_vec... " >&6; }
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->check_acl() exists" >&5
+$as_echo_n "checking whether iops->check_acl() exists... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/fs.h>
+
+               int check_acl_fn(struct inode *inode, int mask) { return 0; }
+
+               static const struct inode_operations
+                   iops __attribute__ ((unused)) = {
+                       .check_acl = check_acl_fn,
+               };
 
 int
 main (void)
 {
 
-               struct bio_vec bv;
-               struct req_iterator iter;
-               struct request *req = NULL;
-               rq_for_each_segment(bv, req, iter) { }
 
   ;
   return 0;
@@ -16613,10 +16828,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_RQ_FOR_EACH_SEGMENT 1" >>confdefs.h
-
-
-$as_echo "#define HAVE_RQ_FOR_EACH_SEGMENT_BV 1" >>confdefs.h
+$as_echo "#define HAVE_CHECK_ACL 1" >>confdefs.h
 
 
 else
 
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
-
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ql->discard_granularity is available" >&5
-$as_echo_n "checking whether ql->discard_granularity is available... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->check_acl() wants flags" >&5
+$as_echo_n "checking whether iops->check_acl() wants flags... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/fs.h>
+
+               int check_acl_fn(struct inode *inode, int mask,
+                   unsigned int flags) { return 0; }
+
+               static const struct inode_operations
+                   iops __attribute__ ((unused)) = {
+                       .check_acl = check_acl_fn,
+               };
 
 int
 main (void)
 {
 
-               struct queue_limits ql __attribute__ ((unused));
-
-               ql.discard_granularity = 0;
 
   ;
   return 0;
@@ -16685,7 +16900,10 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_DISCARD_GRANULARITY 1" >>confdefs.h
+$as_echo "#define HAVE_CHECK_ACL 1" >>confdefs.h
+
+
+$as_echo "#define HAVE_CHECK_ACL_WITH_FLAGS 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether super_block uses const struct xattr_hander" >&5
-$as_echo_n "checking whether super_block uses const struct xattr_hander... " >&6; }
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->get_acl() exists" >&5
+$as_echo_n "checking whether iops->get_acl() exists... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
                #include <linux/fs.h>
-               #include <linux/xattr.h>
-
-               const struct xattr_handler xattr_test_handler = {
-                       .prefix = "test",
-                       .get    = NULL,
-                       .set    = NULL,
-               };
 
-               const struct xattr_handler *xattr_handlers[] = {
-                       &xattr_test_handler,
-               };
+               struct posix_acl *get_acl_fn(struct inode *inode, int type)
+                   { return NULL; }
 
-               const struct super_block sb __attribute__ ((unused)) = {
-                       .s_xattr = xattr_handlers,
+               static const struct inode_operations
+                   iops __attribute__ ((unused)) = {
+                       .get_acl = get_acl_fn,
                };
 
 int
@@ -16763,7 +16975,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_CONST_XATTR_HANDLER 1" >>confdefs.h
+$as_echo "#define HAVE_GET_ACL 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether xattr_handler->get() wants dentry" >&5
-$as_echo_n "checking whether xattr_handler->get() wants dentry... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether current_umask exists" >&5
+$as_echo_n "checking whether current_umask exists... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/xattr.h>
-
-               int get(struct dentry *dentry, const char *name,
-                   void *buffer, size_t size, int handler_flags) { return 0; }
-               static const struct xattr_handler
-                   xops __attribute__ ((unused)) = {
-                       .get = get,
-               };
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
+               current_umask();
 
   ;
   return 0;
@@ -16834,7 +17040,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_DENTRY_XATTR_GET 1" >>confdefs.h
+$as_echo "#define HAVE_CURRENT_UMASK 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether xattr_handler->set() wants dentry" >&5
-$as_echo_n "checking whether xattr_handler->set() wants dentry... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether sops->show_options() wants dentry" >&5
+$as_echo_n "checking whether sops->show_options() wants dentry... " >&6; }
+
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/xattr.h>
+               #include <linux/fs.h>
 
-               int set(struct dentry *dentry, const char *name,
-                   const void *buffer, size_t size, int flags,
-                   int handler_flags) { return 0; }
-               static const struct xattr_handler
-                   xops __attribute__ ((unused)) = {
-                       .set = set,
+               int show_options (struct seq_file * x, struct dentry * y) { return 0; };
+               static struct super_operations sops __attribute__ ((unused)) = {
+                       .show_options = show_options,
                };
 
 int
@@ -16906,7 +17110,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_DENTRY_XATTR_SET 1" >>confdefs.h
+$as_echo "#define HAVE_SHOW_OPTIONS_WITH_DENTRY 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether xattr_handler->list() wants dentry" >&5
-$as_echo_n "checking whether xattr_handler->list() wants dentry... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether file_inode() is available" >&5
+$as_echo_n "checking whether file_inode() is available... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/xattr.h>
-
-               size_t list(struct dentry *dentry, char *list, size_t list_size,
-                   const char *name, size_t name_len, int handler_flags)
-                   { return 0; }
-               static const struct xattr_handler
-                   xops __attribute__ ((unused)) = {
-                       .list = list,
-               };
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
+               struct file *f = NULL;
+               file_inode(f);
 
   ;
   return 0;
@@ -16978,7 +17176,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_DENTRY_XATTR_LIST 1" >>confdefs.h
+$as_echo "#define HAVE_FILE_INODE 1" >>confdefs.h
 
 
 else
@@ -16996,8 +17194,9 @@ fi
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether inode_owner_or_capable() exists" >&5
-$as_echo_n "checking whether inode_owner_or_capable() exists... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->fsync() wants" >&5
+$as_echo_n "checking whether fops->fsync() wants... " >&6; }
+
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
@@ -17005,12 +17204,18 @@ cat confdefs.h - <<_ACEOF >conftest.c
 
                #include <linux/fs.h>
 
+               int test_fsync(struct file *f, struct dentry *dentry, int x)
+                   { return 0; }
+
+               static const struct file_operations
+                   fops __attribute__ ((unused)) = {
+                       .fsync = test_fsync,
+               };
+
 int
 main (void)
 {
 
-               struct inode *ip = NULL;
-               (void) inode_owner_or_capable(ip);
 
   ;
   return 0;
@@ -17041,34 +17246,43 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: dentry" >&5
+$as_echo "dentry" >&6; }
 
-$as_echo "#define HAVE_INODE_OWNER_OR_CAPABLE 1" >>confdefs.h
+$as_echo "#define HAVE_FSYNC_WITH_DENTRY 1" >>confdefs.h
 
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether is_owner_or_cap() exists" >&5
-$as_echo_n "checking whether is_owner_or_cap() exists... " >&6; }
+
+
+
+fi
+       rm -Rf build
+
+
+
+
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-                       #include <linux/fs.h>
-                       #include <linux/sched.h>
+               #include <linux/fs.h>
+
+               int test_fsync(struct file *f, int x) { return 0; }
+
+               static const struct file_operations
+                   fops __attribute__ ((unused)) = {
+                       .fsync = test_fsync,
+               };
 
 int
 main (void)
 {
 
-                       struct inode *ip = NULL;
-                       (void) is_owner_or_cap(ip);
 
   ;
   return 0;
@@ -17099,25 +17313,16 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no dentry" >&5
+$as_echo "no dentry" >&6; }
 
-$as_echo "#define HAVE_IS_OWNER_OR_CAP 1" >>confdefs.h
+$as_echo "#define HAVE_FSYNC_WITHOUT_DENTRY 1" >>confdefs.h
 
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-                       as_fn_error $? "no - Please file a bug report at
-                           https://github.com/zfsonlinux/zfs/issues/new" "$LINENO" 5
-
-
-
-fi
-       rm -Rf build
-
-
 
 
 
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether posix_acl_from_xattr() needs user_ns" >&5
-$as_echo_n "checking whether posix_acl_from_xattr() needs user_ns... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/cred.h>
                #include <linux/fs.h>
-               #include <linux/posix_acl_xattr.h>
+
+               int test_fsync(struct file *f, loff_t a, loff_t b, int c)
+                   { return 0; }
+
+               static const struct file_operations
+                   fops __attribute__ ((unused)) = {
+                       .fsync = test_fsync,
+               };
 
 int
 main (void)
 {
 
-               posix_acl_from_xattr(&init_user_ns, NULL, 0);
 
   ;
   return 0;
@@ -17173,18 +17381,16 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: range" >&5
+$as_echo "range" >&6; }
 
-$as_echo "#define HAVE_POSIX_ACL_FROM_XATTR_USERNS 1" >>confdefs.h
+$as_echo "#define HAVE_FSYNC_RANGE 1" >>confdefs.h
 
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
 
 
 
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether posix_acl_release() is available" >&5
-$as_echo_n "checking whether posix_acl_release() is available... " >&6; }
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether sops->evict_inode() exists" >&5
+$as_echo_n "checking whether sops->evict_inode() exists... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/cred.h>
                #include <linux/fs.h>
-               #include <linux/posix_acl.h>
+               void evict_inode (struct inode * t) { return; }
+               static struct super_operations sops __attribute__ ((unused)) = {
+                       .evict_inode = evict_inode,
+               };
 
 int
 main (void)
 {
 
-               struct posix_acl* tmp = posix_acl_alloc(1, 0);
-               posix_acl_release(tmp);
 
   ;
   return 0;
@@ -17244,7 +17451,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_POSIX_ACL_RELEASE 1" >>confdefs.h
+$as_echo "#define HAVE_EVICT_INODE 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether posix_acl_release() is GPL-only" >&5
-$as_echo_n "checking whether posix_acl_release() is GPL-only... " >&6; }
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether sops->dirty_inode() wants flags" >&5
+$as_echo_n "checking whether sops->dirty_inode() wants flags... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/cred.h>
                #include <linux/fs.h>
-               #include <linux/posix_acl.h>
 
-               MODULE_LICENSE("$ZFS_META_LICENSE");
+               void dirty_inode(struct inode *a, int b) { return; }
+
+               static const struct super_operations
+                   sops __attribute__ ((unused)) = {
+                       .dirty_inode = dirty_inode,
+               };
 
 int
 main (void)
 {
 
-               struct posix_acl* tmp = posix_acl_alloc(1, 0);
-               posix_acl_release(tmp);
 
   ;
   return 0;
@@ -17310,18 +17519,18 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_DIRTY_INODE_WITH_FLAGS 1" >>confdefs.h
+
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_POSIX_ACL_RELEASE_GPL_ONLY 1" >>confdefs.h
-
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
 
 
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether posix_acl_chmod exists" >&5
-$as_echo_n "checking whether posix_acl_chmod exists... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether sops->nr_cached_objects() exists" >&5
+$as_echo_n "checking whether sops->nr_cached_objects() exists... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
                #include <linux/fs.h>
-               #include <linux/posix_acl.h>
+
+               int nr_cached_objects(struct super_block *sb) { return 0; }
+
+               static const struct super_operations
+                   sops __attribute__ ((unused)) = {
+                       .nr_cached_objects = nr_cached_objects,
+               };
 
 int
 main (void)
 {
 
-               posix_acl_chmod(NULL, 0, 0)
 
   ;
   return 0;
@@ -17379,7 +17593,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_POSIX_ACL_CHMOD 1" >>confdefs.h
+$as_echo "#define HAVE_NR_CACHED_OBJECTS 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether __posix_acl_chmod exists" >&5
-$as_echo_n "checking whether __posix_acl_chmod exists... " >&6; }
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether sops->free_cached_objects() exists" >&5
+$as_echo_n "checking whether sops->free_cached_objects() exists... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
                #include <linux/fs.h>
-               #include <linux/posix_acl.h>
+
+               void free_cached_objects(struct super_block *sb, int x)
+                   { return; }
+
+               static const struct super_operations
+                   sops __attribute__ ((unused)) = {
+                       .free_cached_objects = free_cached_objects,
+               };
 
 int
 main (void)
 {
 
-               __posix_acl_chmod(NULL, 0, 0)
 
   ;
   return 0;
@@ -17444,7 +17665,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE___POSIX_ACL_CHMOD 1" >>confdefs.h
+$as_echo "#define HAVE_FREE_CACHED_OBJECTS 1" >>confdefs.h
 
 
 else
@@ -17462,8 +17683,9 @@ fi
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether inode has i_acl and i_default_acl" >&5
-$as_echo_n "checking whether inode has i_acl and i_default_acl... " >&6; }
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->fallocate() exists" >&5
+$as_echo_n "checking whether fops->fallocate() exists... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
@@ -17471,13 +17693,18 @@ cat confdefs.h - <<_ACEOF >conftest.c
 
                #include <linux/fs.h>
 
+               long test_fallocate(struct file *file, int mode,
+                   loff_t offset, loff_t len) { return 0; }
+
+               static const struct file_operations
+                   fops __attribute__ ((unused)) = {
+                       .fallocate = test_fallocate,
+               };
+
 int
 main (void)
 {
 
-               struct inode ino;
-               ino.i_acl = NULL;
-               ino.i_default_acl = NULL;
 
   ;
   return 0;
@@ -17511,7 +17738,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_POSIX_ACL_CACHING 1" >>confdefs.h
+$as_echo "#define HAVE_FILE_FALLOCATE 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether posix_acl_equiv_mode() wants umode_t" >&5
-$as_echo_n "checking whether posix_acl_equiv_mode() wants umode_t... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->fallocate() exists" >&5
+$as_echo_n "checking whether iops->fallocate() exists... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
                #include <linux/fs.h>
-               #include <linux/posix_acl.h>
+
+               long test_fallocate(struct inode *inode, int mode,
+                   loff_t offset, loff_t len) { return 0; }
+
+               static const struct inode_operations
+                   fops __attribute__ ((unused)) = {
+                       .fallocate = test_fallocate,
+               };
 
 int
 main (void)
 {
 
-               umode_t tmp;
-               posix_acl_equiv_mode(NULL,&tmp);
 
   ;
   return 0;
@@ -17578,7 +17810,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_POSIX_ACL_EQUIV_MODE_UMODE_T 1" >>confdefs.h
+$as_echo "#define HAVE_INODE_FALLOCATE 1" >>confdefs.h
 
 
 else
@@ -17596,8 +17828,9 @@ fi
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->permission() exists" >&5
-$as_echo_n "checking whether iops->permission() exists... " >&6; }
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->create()/mkdir()/mknod() take umode_t" >&5
+$as_echo_n "checking whether iops->create()/mkdir()/mknod() take umode_t... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
@@ -17605,11 +17838,12 @@ cat confdefs.h - <<_ACEOF >conftest.c
 
                #include <linux/fs.h>
 
-               int permission_fn(struct inode *inode, int mask) { return 0; }
+               int mkdir(struct inode *inode, struct dentry *dentry,
+                   umode_t umode) { return 0; }
 
                static const struct inode_operations
                    iops __attribute__ ((unused)) = {
-                       .permission = permission_fn,
+                       .mkdir = mkdir,
                };
 
 int
@@ -17649,7 +17883,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_PERMISSION 1" >>confdefs.h
+$as_echo "#define HAVE_MKDIR_UMODE_T 1" >>confdefs.h
 
 
 else
@@ -17667,8 +17901,8 @@ fi
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->permission() wants nameidata" >&5
-$as_echo_n "checking whether iops->permission() wants nameidata... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->lookup() passes nameidata" >&5
+$as_echo_n "checking whether iops->lookup() passes nameidata... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
@@ -17676,12 +17910,13 @@ cat confdefs.h - <<_ACEOF >conftest.c
 
                #include <linux/fs.h>
 
-               int permission_fn(struct inode *inode, int mask,
-                   struct nameidata *nd) { return 0; }
+               struct dentry *inode_lookup(struct inode *inode,
+                   struct dentry *dentry, struct nameidata *nidata)
+                   { return NULL; }
 
-               static const struct inode_operations
-                   iops __attribute__ ((unused)) = {
-                       .permission = permission_fn,
+               static const struct inode_operations iops
+                   __attribute__ ((unused)) = {
+                       .lookup = inode_lookup,
                };
 
 int
@@ -17721,10 +17956,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_PERMISSION 1" >>confdefs.h
-
-
-$as_echo "#define HAVE_PERMISSION_WITH_NAMEIDATA 1" >>confdefs.h
+$as_echo "#define HAVE_LOOKUP_NAMEIDATA 1" >>confdefs.h
 
 
 else
@@ -17742,8 +17974,8 @@ fi
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->check_acl() exists" >&5
-$as_echo_n "checking whether iops->check_acl() exists... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->create() passes nameidata" >&5
+$as_echo_n "checking whether iops->create() passes nameidata... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
@@ -17751,11 +17983,17 @@ cat confdefs.h - <<_ACEOF >conftest.c
 
                #include <linux/fs.h>
 
-               int check_acl_fn(struct inode *inode, int mask) { return 0; }
+               #ifdef HAVE_MKDIR_UMODE_T
+               int inode_create(struct inode *inode ,struct dentry *dentry,
+                   umode_t umode, struct nameidata *nidata) { return 0; }
+               #else
+               int inode_create(struct inode *inode,struct dentry *dentry,
+                   int umode, struct nameidata * nidata) { return 0; }
+               #endif
 
                static const struct inode_operations
                    iops __attribute__ ((unused)) = {
-                       .check_acl = check_acl_fn,
+                       .create         = inode_create,
                };
 
 int
@@ -17795,7 +18033,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_CHECK_ACL 1" >>confdefs.h
+$as_echo "#define HAVE_CREATE_NAMEIDATA 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->check_acl() wants flags" >&5
-$as_echo_n "checking whether iops->check_acl() wants flags... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->follow_link() passes nameidata" >&5
+$as_echo_n "checking whether iops->follow_link() passes nameidata... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
                #include <linux/fs.h>
+               const char *follow_link(struct dentry *de, void **cookie)
+                   { return "symlink"; }
+               static struct inode_operations iops __attribute__ ((unused)) = {
+                       .follow_link = follow_link,
+               };
 
-               int check_acl_fn(struct inode *inode, int mask,
-                   unsigned int flags) { return 0; }
-
-               static const struct inode_operations
-                   iops __attribute__ ((unused)) = {
-                       .check_acl = check_acl_fn,
-               };
-
-int
-main (void)
-{
+int
+main (void)
+{
 
 
   ;
@@ -17864,21 +18099,18 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_CHECK_ACL 1" >>confdefs.h
-
-
-$as_echo "#define HAVE_CHECK_ACL_WITH_FLAGS 1" >>confdefs.h
-
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_FOLLOW_LINK_NAMEIDATA 1" >>confdefs.h
+
 
 
 
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->get_acl() exists" >&5
-$as_echo_n "checking whether iops->get_acl() exists... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->put_link() passes nameidata" >&5
+$as_echo_n "checking whether iops->put_link() passes nameidata... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
                #include <linux/fs.h>
-
-               struct posix_acl *get_acl_fn(struct inode *inode, int type)
-                   { return NULL; }
-
-               static const struct inode_operations
-                   iops __attribute__ ((unused)) = {
-                       .get_acl = get_acl_fn,
+               void put_link(struct inode *ip, void *cookie) { return; }
+               static struct inode_operations iops __attribute__ ((unused)) = {
+                       .put_link = put_link,
                };
 
 int
@@ -17939,18 +18167,18 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_GET_ACL 1" >>confdefs.h
-
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_PUT_LINK_NAMEIDATA 1" >>confdefs.h
+
 
 
 
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether current_umask exists" >&5
-$as_echo_n "checking whether current_umask exists... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->truncate_range() exists" >&5
+$as_echo_n "checking whether iops->truncate_range() exists... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
                #include <linux/fs.h>
+               void truncate_range(struct inode *inode, loff_t start,
+                                   loff_t end) { return; }
+               static struct inode_operations iops __attribute__ ((unused)) = {
+                       .truncate_range = truncate_range,
+               };
 
 int
 main (void)
 {
 
-               current_umask();
 
   ;
   return 0;
@@ -18007,7 +18239,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_CURRENT_UMASK 1" >>confdefs.h
+$as_echo "#define HAVE_INODE_TRUNCATE_RANGE 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether sops->show_options() wants dentry" >&5
-$as_echo_n "checking whether sops->show_options() wants dentry... " >&6; }
-
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether dops->d_automount() exists" >&5
+$as_echo_n "checking whether dops->d_automount() exists... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
-
-               int show_options (struct seq_file * x, struct dentry * y) { return 0; };
-               static struct super_operations sops __attribute__ ((unused)) = {
-                       .show_options = show_options,
+               #include <linux/dcache.h>
+               struct vfsmount *d_automount(struct path *p) { return NULL; }
+               struct dentry_operations dops __attribute__ ((unused)) = {
+                       .d_automount = d_automount,
                };
 
 int
@@ -18077,7 +18307,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_SHOW_OPTIONS_WITH_DENTRY 1" >>confdefs.h
+$as_echo "#define HAVE_AUTOMOUNT 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether file_inode() is available" >&5
-$as_echo_n "checking whether file_inode() is available... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether eops->encode_fh() wants inode" >&5
+$as_echo_n "checking whether eops->encode_fh() wants inode... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
+               #include <linux/exportfs.h>
+               int encode_fh(struct inode *inode, __u32 *fh, int *max_len,
+                             struct inode *parent) { return 0; }
+               static struct export_operations eops __attribute__ ((unused))={
+                       .encode_fh = encode_fh,
+               };
 
 int
 main (void)
 {
 
-               struct file *f = NULL;
-               file_inode(f);
 
   ;
   return 0;
@@ -18143,7 +18376,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_FILE_INODE 1" >>confdefs.h
+$as_echo "#define HAVE_ENCODE_FH_WITH_INODE 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->fsync() wants" >&5
-$as_echo_n "checking whether fops->fsync() wants... " >&6; }
-
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether eops->commit_metadata() exists" >&5
+$as_echo_n "checking whether eops->commit_metadata() exists... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
-
-               int test_fsync(struct file *f, struct dentry *dentry, int x)
-                   { return 0; }
-
-               static const struct file_operations
-                   fops __attribute__ ((unused)) = {
-                       .fsync = test_fsync,
+               #include <linux/exportfs.h>
+               int commit_metadata(struct inode *inode) { return 0; }
+               static struct export_operations eops __attribute__ ((unused))={
+                       .commit_metadata = commit_metadata,
                };
 
 int
@@ -18213,16 +18441,18 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: dentry" >&5
-$as_echo "dentry" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_FSYNC_WITH_DENTRY 1" >>confdefs.h
+$as_echo "#define HAVE_COMMIT_METADATA 1" >>confdefs.h
 
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
 
 
@@ -18231,6 +18461,8 @@ fi
 
 
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether clear_inode() is available" >&5
+$as_echo_n "checking whether clear_inode() is available... " >&6; }
 
 
 
@@ -18239,17 +18471,11 @@ cat confdefs.h - <<_ACEOF >conftest.c
 
                #include <linux/fs.h>
 
-               int test_fsync(struct file *f, int x) { return 0; }
-
-               static const struct file_operations
-                   fops __attribute__ ((unused)) = {
-                       .fsync = test_fsync,
-               };
-
 int
 main (void)
 {
 
+               clear_inode(NULL);
 
   ;
   return 0;
@@ -18279,45 +18505,81 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
+  rc=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+ rc=1
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no dentry" >&5
-$as_echo "no dentry" >&6; }
 
-$as_echo "#define HAVE_FSYNC_WITHOUT_DENTRY 1" >>confdefs.h
+fi
+       rm -Rf build
 
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+       if test $rc -ne 0; then :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
+       else
+               if test "x$enable_linux_builtin" != xyes; then
 
+       grep -q -E '[[:space:]]clear_inode[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in fs/inode.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(clear_inode)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
 
-fi
-       rm -Rf build
+               fi
+               if test $rc -ne 0; then :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
+               else :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
+$as_echo "#define HAVE_CLEAR_INODE 1" >>confdefs.h
 
 
-cat confdefs.h - <<_ACEOF >conftest.c
+               fi
+       fi
 
 
-               #include <linux/fs.h>
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether insert_inode_locked() is available" >&5
+$as_echo_n "checking whether insert_inode_locked() is available... " >&6; }
 
-               int test_fsync(struct file *f, loff_t a, loff_t b, int c)
-                   { return 0; }
 
-               static const struct file_operations
-                   fops __attribute__ ((unused)) = {
-                       .fsync = test_fsync,
-               };
+
+cat confdefs.h - <<_ACEOF >conftest.c
+
+
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
+               insert_inode_locked(NULL);
 
   ;
   return 0;
@@ -18347,44 +18609,81 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: range" >&5
-$as_echo "range" >&6; }
-
-$as_echo "#define HAVE_FSYNC_RANGE 1" >>confdefs.h
-
-
+  rc=0
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
-
-
+ rc=1
 
 
 fi
        rm -Rf build
 
 
+       if test $rc -ne 0; then :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
+       else
+               if test "x$enable_linux_builtin" != xyes; then
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether sops->evict_inode() exists" >&5
-$as_echo_n "checking whether sops->evict_inode() exists... " >&6; }
+       grep -q -E '[[:space:]]insert_inode_locked[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in fs/inode.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(insert_inode_locked)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
 
+               fi
+               if test $rc -ne 0; then :
 
-cat confdefs.h - <<_ACEOF >conftest.c
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
+               else :
 
-               #include <linux/fs.h>
-               void evict_inode (struct inode * t) { return; }
-               static struct super_operations sops __attribute__ ((unused)) = {
-                       .evict_inode = evict_inode,
-               };
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
-int
-main (void)
-{
+$as_echo "#define HAVE_INSERT_INODE_LOCKED 1" >>confdefs.h
+
+
+               fi
+       fi
+
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether d_make_root() is available" >&5
+$as_echo_n "checking whether d_make_root() is available... " >&6; }
+
+
+
+cat confdefs.h - <<_ACEOF >conftest.c
+
+
+               #include <linux/dcache.h>
 
+int
+main (void)
+{
+
+               d_make_root(NULL);
 
   ;
   return 0;
@@ -18414,48 +18713,81 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
+  rc=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+ rc=1
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_EVICT_INODE 1" >>confdefs.h
+fi
+       rm -Rf build
 
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+       if test $rc -ne 0; then :
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
+       else
+               if test "x$enable_linux_builtin" != xyes; then
+
+       grep -q -E '[[:space:]]d_make_root[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in fs/dcache.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(d_make_root)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
 
+               fi
+               if test $rc -ne 0; then :
 
-fi
-       rm -Rf build
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
+               else :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
+$as_echo "#define HAVE_D_MAKE_ROOT 1" >>confdefs.h
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether sops->dirty_inode() wants flags" >&5
-$as_echo_n "checking whether sops->dirty_inode() wants flags... " >&6; }
 
+               fi
+       fi
 
-cat confdefs.h - <<_ACEOF >conftest.c
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether d_obtain_alias() is available" >&5
+$as_echo_n "checking whether d_obtain_alias() is available... " >&6; }
 
-               #include <linux/fs.h>
 
-               void dirty_inode(struct inode *a, int b) { return; }
 
-               static const struct super_operations
-                   sops __attribute__ ((unused)) = {
-                       .dirty_inode = dirty_inode,
-               };
+cat confdefs.h - <<_ACEOF >conftest.c
+
+
+               #include <linux/dcache.h>
 
 int
 main (void)
 {
 
+               d_obtain_alias(NULL);
 
   ;
   return 0;
@@ -18485,48 +18817,82 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
+  rc=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+ rc=1
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_DIRTY_INODE_WITH_FLAGS 1" >>confdefs.h
+fi
+       rm -Rf build
 
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+       if test $rc -ne 0; then :
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
+       else
+               if test "x$enable_linux_builtin" != xyes; then
 
+       grep -q -E '[[:space:]]d_obtain_alias[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in fs/dcache.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(d_obtain_alias)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
 
-fi
-       rm -Rf build
+               fi
+               if test $rc -ne 0; then :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
+               else :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether sops->nr_cached_objects() exists" >&5
-$as_echo_n "checking whether sops->nr_cached_objects() exists... " >&6; }
+$as_echo "#define HAVE_D_OBTAIN_ALIAS 1" >>confdefs.h
 
 
-cat confdefs.h - <<_ACEOF >conftest.c
+               fi
+       fi
 
 
-               #include <linux/fs.h>
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether d_prune_aliases() is available" >&5
+$as_echo_n "checking whether d_prune_aliases() is available... " >&6; }
 
-               int nr_cached_objects(struct super_block *sb) { return 0; }
 
-               static const struct super_operations
-                   sops __attribute__ ((unused)) = {
-                       .nr_cached_objects = nr_cached_objects,
-               };
+
+cat confdefs.h - <<_ACEOF >conftest.c
+
+
+               #include <linux/dcache.h>
 
 int
 main (void)
 {
 
+               struct inode *ip = NULL;
+               d_prune_aliases(ip);
 
   ;
   return 0;
@@ -18556,49 +18922,81 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
+  rc=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+ rc=1
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_NR_CACHED_OBJECTS 1" >>confdefs.h
+fi
+       rm -Rf build
 
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+       if test $rc -ne 0; then :
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
+       else
+               if test "x$enable_linux_builtin" != xyes; then
+
+       grep -q -E '[[:space:]]d_prune_aliases[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in fs/dcache.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(d_prune_aliases)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
+
+               fi
+               if test $rc -ne 0; then :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
-fi
-       rm -Rf build
+               else :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
+$as_echo "#define HAVE_D_PRUNE_ALIASES 1" >>confdefs.h
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether sops->free_cached_objects() exists" >&5
-$as_echo_n "checking whether sops->free_cached_objects() exists... " >&6; }
+               fi
+       fi
 
 
-cat confdefs.h - <<_ACEOF >conftest.c
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether d_set_d_op() is available" >&5
+$as_echo_n "checking whether d_set_d_op() is available... " >&6; }
 
 
-               #include <linux/fs.h>
 
-               void free_cached_objects(struct super_block *sb, int x)
-                   { return; }
+cat confdefs.h - <<_ACEOF >conftest.c
 
-               static const struct super_operations
-                   sops __attribute__ ((unused)) = {
-                       .free_cached_objects = free_cached_objects,
-               };
+
+               #include <linux/dcache.h>
 
 int
 main (void)
 {
 
+               d_set_d_op(NULL, NULL);
 
   ;
   return 0;
@@ -18628,44 +19026,82 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_FREE_CACHED_OBJECTS 1" >>confdefs.h
-
-
+  rc=0
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
+ rc=1
 
 
 fi
        rm -Rf build
 
 
+       if test $rc -ne 0; then :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
+       else
+               if test "x$enable_linux_builtin" != xyes; then
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->fallocate() exists" >&5
-$as_echo_n "checking whether fops->fallocate() exists... " >&6; }
-
-
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-               #include <linux/fs.h>
-
-               long test_fallocate(struct file *file, int mode,
-                   loff_t offset, loff_t len) { return 0; }
-
-               static const struct file_operations
-                   fops __attribute__ ((unused)) = {
-                       .fallocate = test_fallocate,
+       grep -q -E '[[:space:]]d_set_d_op[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in fs/dcache.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(d_set_d_op)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
+
+               fi
+               if test $rc -ne 0; then :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+               else :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_D_SET_D_OP 1" >>confdefs.h
+
+
+               fi
+       fi
+
+
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether dops->d_revalidate() takes struct nameidata" >&5
+$as_echo_n "checking whether dops->d_revalidate() takes struct nameidata... " >&6; }
+
+
+cat confdefs.h - <<_ACEOF >conftest.c
+
+
+               #include <linux/dcache.h>
+
+               int revalidate (struct dentry *dentry,
+                   struct nameidata *nidata) { return 0; }
+
+               static const struct dentry_operations
+                   dops __attribute__ ((unused)) = {
+                       .d_revalidate   = revalidate,
                };
 
 int
@@ -18705,7 +19141,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_FILE_FALLOCATE 1" >>confdefs.h
+$as_echo "#define HAVE_D_REVALIDATE_NAMEIDATA 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->fallocate() exists" >&5
-$as_echo_n "checking whether iops->fallocate() exists... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether dentry uses const struct dentry_operations" >&5
+$as_echo_n "checking whether dentry uses const struct dentry_operations... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
-
-               long test_fallocate(struct inode *inode, int mode,
-                   loff_t offset, loff_t len) { return 0; }
+               #include <linux/dcache.h>
 
-               static const struct inode_operations
-                   fops __attribute__ ((unused)) = {
-                       .fallocate = test_fallocate,
+               const struct dentry_operations test_d_op = {
+                       .d_revalidate = NULL,
                };
 
 int
 main (void)
 {
 
+               struct dentry d __attribute__ ((unused));
+
+               d.d_op = &test_d_op;
 
   ;
   return 0;
@@ -18777,7 +19212,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_INODE_FALLOCATE 1" >>confdefs.h
+$as_echo "#define HAVE_CONST_DENTRY_OPERATIONS 1" >>confdefs.h
 
 
 else
 
 
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether check_disk_size_change() is available" >&5
+$as_echo_n "checking whether check_disk_size_change() is available... " >&6; }
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->create()/mkdir()/mknod() take umode_t" >&5
-$as_echo_n "checking whether iops->create()/mkdir()/mknod() take umode_t... " >&6; }
-
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
                #include <linux/fs.h>
 
-               int mkdir(struct inode *inode, struct dentry *dentry,
-                   umode_t umode) { return 0; }
-
-               static const struct inode_operations
-                   iops __attribute__ ((unused)) = {
-                       .mkdir = mkdir,
-               };
-
 int
 main (void)
 {
 
+               check_disk_size_change(NULL, NULL);
 
   ;
   return 0;
@@ -18846,50 +19273,81 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
+  rc=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+ rc=1
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_MKDIR_UMODE_T 1" >>confdefs.h
+fi
+       rm -Rf build
 
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+       if test $rc -ne 0; then :
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
+       else
+               if test "x$enable_linux_builtin" != xyes; then
+
+       grep -q -E '[[:space:]]check_disk_size_change[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in fs/block_dev.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(check_disk_size_change)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
+
+               fi
+               if test $rc -ne 0; then :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
-fi
-       rm -Rf build
+               else :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
+$as_echo "#define HAVE_CHECK_DISK_SIZE_CHANGE 1" >>confdefs.h
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->lookup() passes nameidata" >&5
-$as_echo_n "checking whether iops->lookup() passes nameidata... " >&6; }
+               fi
+       fi
 
 
-cat confdefs.h - <<_ACEOF >conftest.c
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether truncate_setsize() is available" >&5
+$as_echo_n "checking whether truncate_setsize() is available... " >&6; }
 
 
-               #include <linux/fs.h>
 
-               struct dentry *inode_lookup(struct inode *inode,
-                   struct dentry *dentry, struct nameidata *nidata)
-                   { return NULL; }
+cat confdefs.h - <<_ACEOF >conftest.c
 
-               static const struct inode_operations iops
-                   __attribute__ ((unused)) = {
-                       .lookup = inode_lookup,
-               };
+
+               #include <linux/mm.h>
 
 int
 main (void)
 {
 
+               truncate_setsize(NULL, 0);
 
   ;
   return 0;
@@ -18919,54 +19377,88 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
+  rc=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+ rc=1
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_LOOKUP_NAMEIDATA 1" >>confdefs.h
+fi
+       rm -Rf build
 
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+       if test $rc -ne 0; then :
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
+       else
+               if test "x$enable_linux_builtin" != xyes; then
+
+       grep -q -E '[[:space:]]truncate_setsize[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in mm/truncate.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(truncate_setsize)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
+
+               fi
+               if test $rc -ne 0; then :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
-fi
-       rm -Rf build
+               else :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
+$as_echo "#define HAVE_TRUNCATE_SETSIZE 1" >>confdefs.h
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->create() passes nameidata" >&5
-$as_echo_n "checking whether iops->create() passes nameidata... " >&6; }
+               fi
+       fi
 
 
-cat confdefs.h - <<_ACEOF >conftest.c
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether security_inode_init_security wants 6 args" >&5
+$as_echo_n "checking whether security_inode_init_security wants 6 args... " >&6; }
 
-               #include <linux/fs.h>
 
-               #ifdef HAVE_MKDIR_UMODE_T
-               int inode_create(struct inode *inode ,struct dentry *dentry,
-                   umode_t umode, struct nameidata *nidata) { return 0; }
-               #else
-               int inode_create(struct inode *inode,struct dentry *dentry,
-                   int umode, struct nameidata * nidata) { return 0; }
-               #endif
+cat confdefs.h - <<_ACEOF >conftest.c
 
-               static const struct inode_operations
-                   iops __attribute__ ((unused)) = {
-                       .create         = inode_create,
-               };
+
+               #include <linux/security.h>
 
 int
 main (void)
 {
 
+               struct inode *ip __attribute__ ((unused)) = NULL;
+               struct inode *dip __attribute__ ((unused)) = NULL;
+               const struct qstr *str __attribute__ ((unused)) = NULL;
+               char *name __attribute__ ((unused)) = NULL;
+               void *value __attribute__ ((unused)) = NULL;
+               size_t len __attribute__ ((unused)) = 0;
+
+               security_inode_init_security(ip, dip, str, &name, &value, &len);
 
   ;
   return 0;
@@ -19000,7 +19492,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_CREATE_NAMEIDATA 1" >>confdefs.h
+$as_echo "#define HAVE_6ARGS_SECURITY_INODE_INIT_SECURITY 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->follow_link() passes nameidata" >&5
-$as_echo_n "checking whether iops->follow_link() passes nameidata... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether security_inode_init_security wants callback" >&5
+$as_echo_n "checking whether security_inode_init_security wants callback... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
-               const char *follow_link(struct dentry *de, void **cookie)
-                   { return "symlink"; }
-               static struct inode_operations iops __attribute__ ((unused)) = {
-                       .follow_link = follow_link,
-               };
+               #include <linux/security.h>
 
 int
 main (void)
 {
 
-
+               struct inode *ip __attribute__ ((unused)) = NULL;
+               struct inode *dip __attribute__ ((unused)) = NULL;
+               const struct qstr *str __attribute__ ((unused)) = NULL;
+               initxattrs func __attribute__ ((unused)) = NULL;
+
+               security_inode_init_security(ip, dip, str, func, NULL);
+
   ;
   return 0;
 }
@@ -19066,18 +19559,18 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY 1" >>confdefs.h
+
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_FOLLOW_LINK_NAMEIDATA 1" >>confdefs.h
-
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
 
 
 
 
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether mount_nodev() is available" >&5
+$as_echo_n "checking whether mount_nodev() is available... " >&6; }
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->put_link() passes nameidata" >&5
-$as_echo_n "checking whether iops->put_link() passes nameidata... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
                #include <linux/fs.h>
-               void put_link(struct inode *ip, void *cookie) { return; }
-               static struct inode_operations iops __attribute__ ((unused)) = {
-                       .put_link = put_link,
-               };
 
 int
 main (void)
 {
 
+               mount_nodev(NULL, 0, NULL, NULL);
 
   ;
   return 0;
@@ -19133,40 +19623,84 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
+  rc=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+ rc=1
+
+
+fi
+       rm -Rf build
+
+
+       if test $rc -ne 0; then :
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+       else
+               if test "x$enable_linux_builtin" != xyes; then
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+       grep -q -E '[[:space:]]mount_nodev[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in fs/super.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(mount_nodev)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
 
-$as_echo "#define HAVE_PUT_LINK_NAMEIDATA 1" >>confdefs.h
+               fi
+               if test $rc -ne 0; then :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
+               else :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
-fi
-       rm -Rf build
+$as_echo "#define HAVE_MOUNT_NODEV 1" >>confdefs.h
 
 
+               fi
+       fi
+
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether iops->truncate_range() exists" >&5
-$as_echo_n "checking whether iops->truncate_range() exists... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether super_block has s_shrink" >&5
+$as_echo_n "checking whether super_block has s_shrink... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
                #include <linux/fs.h>
-               void truncate_range(struct inode *inode, loff_t start,
-                                   loff_t end) { return; }
-               static struct inode_operations iops __attribute__ ((unused)) = {
-                       .truncate_range = truncate_range,
+
+               int shrink(struct shrinker *s, struct shrink_control *sc)
+                   { return 0; }
+
+               static const struct super_block
+                   sb __attribute__ ((unused)) = {
+                       .s_shrink.shrink = shrink,
+                       .s_shrink.seeks = DEFAULT_SEEKS,
+                       .s_shrink.batch = 0,
                };
 
 int
@@ -19206,7 +19740,8 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_INODE_TRUNCATE_RANGE 1" >>confdefs.h
+$as_echo "#define HAVE_SHRINK 1" >>confdefs.h
+
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether dops->d_automount() exists" >&5
-$as_echo_n "checking whether dops->d_automount() exists... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether shrink_control has nid" >&5
+$as_echo_n "checking whether shrink_control has nid... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/dcache.h>
-               struct vfsmount *d_automount(struct path *p) { return NULL; }
-               struct dentry_operations dops __attribute__ ((unused)) = {
-                       .d_automount = d_automount,
-               };
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
+               struct shrink_control sc __attribute__ ((unused));
+               unsigned long scnidsize __attribute__ ((unused)) =
+                   sizeof(sc.nid);
 
   ;
   return 0;
@@ -19274,7 +19808,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_AUTOMOUNT 1" >>confdefs.h
+$as_echo "#define SHRINK_CONTROL_HAS_NID 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether eops->encode_fh() wants inode" >&5
-$as_echo_n "checking whether eops->encode_fh() wants inode... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether super_block has s_instances list_head" >&5
+$as_echo_n "checking whether super_block has s_instances list_head... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/exportfs.h>
-               int encode_fh(struct inode *inode, __u32 *fh, int *max_len,
-                             struct inode *parent) { return 0; }
-               static struct export_operations eops __attribute__ ((unused))={
-                       .encode_fh = encode_fh,
-               };
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
+               struct super_block sb __attribute__ ((unused));
+
+               INIT_LIST_HEAD(&sb.s_instances);
 
   ;
   return 0;
@@ -19343,7 +19875,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_ENCODE_FH_WITH_INODE 1" >>confdefs.h
+$as_echo "#define HAVE_S_INSTANCES_LIST_HEAD 1" >>confdefs.h
 
 
 else
 
 
 
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether eops->commit_metadata() exists" >&5
-$as_echo_n "checking whether eops->commit_metadata() exists... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether super_block has s_d_op" >&5
+$as_echo_n "checking whether super_block has s_d_op... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/exportfs.h>
-               int commit_metadata(struct inode *inode) { return 0; }
-               static struct export_operations eops __attribute__ ((unused))={
-                       .commit_metadata = commit_metadata,
-               };
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
+               struct super_block sb __attribute__ ((unused));
+               sb.s_d_op = NULL;
 
   ;
   return 0;
@@ -19411,7 +19940,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_COMMIT_METADATA 1" >>confdefs.h
+$as_echo "#define HAVE_S_D_OP 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether clear_inode() is available" >&5
-$as_echo_n "checking whether clear_inode() is available... " >&6; }
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bdi_setup_and_register() wants 2 args" >&5
+$as_echo_n "checking whether bdi_setup_and_register() wants 2 args... " >&6; }
 
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
+               #include <linux/backing-dev.h>
+               struct backing_dev_info bdi;
 
 int
 main (void)
 {
 
-               clear_inode(NULL);
+               char *name = "bdi";
+               int error __attribute__((unused)) =
+                   bdi_setup_and_register(&bdi, name);
 
   ;
   return 0;
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
-
-       else
-               if test "x$enable_linux_builtin" != xyes; then
-
-       grep -q -E '[[:space:]]clear_inode[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in fs/inode.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(clear_inode)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
-
-               fi
-               if test $rc -ne 0; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-               else :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_CLEAR_INODE 1" >>confdefs.h
-
-
-               fi
-       fi
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether insert_inode_locked() is available" >&5
-$as_echo_n "checking whether insert_inode_locked() is available... " >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bdi_setup_and_register() wants 3 args" >&5
+$as_echo_n "checking whether bdi_setup_and_register() wants 3 args... " >&6; }
 
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
+                       #include <linux/backing-dev.h>
+                       struct backing_dev_info bdi;
 
 int
 main (void)
 {
 
-               insert_inode_locked(NULL);
+                       char *name = "bdi";
+                       unsigned int cap = BDI_CAP_MAP_COPY;
+                       int error __attribute__((unused)) =
+                           bdi_setup_and_register(&bdi, name, cap);
 
   ;
   return 0;
 
        if test $rc -ne 0; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
        else
                if test "x$enable_linux_builtin" != xyes; then
 
-       grep -q -E '[[:space:]]insert_inode_locked[[:space:]]' \
+       grep -q -E '[[:space:]]bdi_setup_and_register[[:space:]]' \
                $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
        rc=$?
        if test $rc -ne 0; then
                export=0
-               for file in fs/inode.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(insert_inode_locked)" \
+               for file in mm/backing-dev.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(bdi_setup_and_register)" \
                                "$LINUX/$file" 2>/dev/null
                        rc=$?
                        if test $rc -eq 0; then
@@ -19621,91 +20113,31 @@ $as_echo "no" >&6; }
                fi
                if test $rc -ne 0; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
                else :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_INSERT_INODE_LOCKED 1" >>confdefs.h
+$as_echo "#define HAVE_3ARGS_BDI_SETUP_AND_REGISTER 1" >>confdefs.h
 
 
                fi
        fi
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether d_make_root() is available" >&5
-$as_echo_n "checking whether d_make_root() is available... " >&6; }
-
-
-
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-               #include <linux/dcache.h>
-
-int
-main (void)
-{
-
-               d_make_root(NULL);
-
-  ;
-  return 0;
-}
+       else
+               if test "x$enable_linux_builtin" != xyes; then
 
-_ACEOF
-
-
-
-cat - <<_ACEOF >conftest.h
-
-_ACEOF
-
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  rc=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
-
-
-fi
-       rm -Rf build
-
-
-       if test $rc -ne 0; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-       else
-               if test "x$enable_linux_builtin" != xyes; then
-
-       grep -q -E '[[:space:]]d_make_root[[:space:]]' \
+       grep -q -E '[[:space:]]bdi_setup_and_register[[:space:]]' \
                $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
        rc=$?
        if test $rc -ne 0; then
                export=0
-               for file in fs/dcache.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(d_make_root)" \
+               for file in mm/backing-dev.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(bdi_setup_and_register)" \
                                "$LINUX/$file" 2>/dev/null
                        rc=$?
                        if test $rc -eq 0; then
@@ -19727,34 +20159,25 @@ $as_echo "no" >&6; }
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
-
-               else :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_D_MAKE_ROOT 1" >>confdefs.h
-
-
-               fi
-       fi
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether d_obtain_alias() is available" >&5
-$as_echo_n "checking whether d_obtain_alias() is available... " >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bdi_setup_and_register() wants 3 args" >&5
+$as_echo_n "checking whether bdi_setup_and_register() wants 3 args... " >&6; }
 
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/dcache.h>
+                       #include <linux/backing-dev.h>
+                       struct backing_dev_info bdi;
 
 int
 main (void)
 {
 
-               d_obtain_alias(NULL);
+                       char *name = "bdi";
+                       unsigned int cap = BDI_CAP_MAP_COPY;
+                       int error __attribute__((unused)) =
+                           bdi_setup_and_register(&bdi, name, cap);
 
   ;
   return 0;
 
        if test $rc -ne 0; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
        else
                if test "x$enable_linux_builtin" != xyes; then
 
-       grep -q -E '[[:space:]]d_obtain_alias[[:space:]]' \
+       grep -q -E '[[:space:]]bdi_setup_and_register[[:space:]]' \
                $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
        rc=$?
        if test $rc -ne 0; then
                export=0
-               for file in fs/dcache.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(d_obtain_alias)" \
+               for file in mm/backing-dev.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(bdi_setup_and_register)" \
                                "$LINUX/$file" 2>/dev/null
                        rc=$?
                        if test $rc -eq 0; then
@@ -19829,37 +20252,50 @@ $as_echo "no" >&6; }
                fi
                if test $rc -ne 0; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
+               else :
+
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_3ARGS_BDI_SETUP_AND_REGISTER 1" >>confdefs.h
+
+
+               fi
+       fi
+
+
                else :
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_D_OBTAIN_ALIAS 1" >>confdefs.h
+$as_echo "#define HAVE_2ARGS_BDI_SETUP_AND_REGISTER 1" >>confdefs.h
 
 
                fi
        fi
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether d_prune_aliases() is available" >&5
-$as_echo_n "checking whether d_prune_aliases() is available... " >&6; }
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether set_nlink() is available" >&5
+$as_echo_n "checking whether set_nlink() is available... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/dcache.h>
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
-               struct inode *ip = NULL;
-               d_prune_aliases(ip);
+               struct inode node;
+               unsigned int link = 0;
+               (void) set_nlink(&node, link);
 
   ;
   return 0;
@@ -19889,81 +20325,48 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
-  rc=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
-
-
-fi
-       rm -Rf build
-
-
-       if test $rc -ne 0; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
-       else
-               if test "x$enable_linux_builtin" != xyes; then
+$as_echo "#define HAVE_SET_NLINK 1" >>confdefs.h
 
-       grep -q -E '[[:space:]]d_prune_aliases[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in fs/dcache.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(d_prune_aliases)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
 
-               fi
-               if test $rc -ne 0; then :
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
-               else :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_D_PRUNE_ALIASES 1" >>confdefs.h
 
+fi
+       rm -Rf build
 
-               fi
-       fi
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether d_set_d_op() is available" >&5
-$as_echo_n "checking whether d_set_d_op() is available... " >&6; }
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether elevator_change() is available" >&5
+$as_echo_n "checking whether elevator_change() is available... " >&6; }
+       tmp_flags="$EXTRA_KCFLAGS"
+       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/dcache.h>
+               #include <linux/blkdev.h>
+               #include <linux/elevator.h>
 
 int
 main (void)
 {
 
-               d_set_d_op(NULL, NULL);
+               int ret;
+               struct request_queue *q = NULL;
+               char *elevator = NULL;
+               ret = elevator_change(q, elevator);
 
   ;
   return 0;
@@ -19993,88 +20396,47 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
-  rc=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
-
-
-fi
-       rm -Rf build
-
-
-       if test $rc -ne 0; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
-       else
-               if test "x$enable_linux_builtin" != xyes; then
+$as_echo "#define HAVE_ELEVATOR_CHANGE 1" >>confdefs.h
 
-       grep -q -E '[[:space:]]d_set_d_op[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in fs/dcache.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(d_set_d_op)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
 
-               fi
-               if test $rc -ne 0; then :
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
-               else :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_D_SET_D_OP 1" >>confdefs.h
 
 
-               fi
-       fi
+fi
+       rm -Rf build
 
 
+       EXTRA_KCFLAGS="$tmp_flags"
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether dops->d_revalidate() takes struct nameidata" >&5
-$as_echo_n "checking whether dops->d_revalidate() takes struct nameidata... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether sget() wants 5 args" >&5
+$as_echo_n "checking whether sget() wants 5 args... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/dcache.h>
-
-               int revalidate (struct dentry *dentry,
-                   struct nameidata *nidata) { return 0; }
-
-               static const struct dentry_operations
-                   dops __attribute__ ((unused)) = {
-                       .d_revalidate   = revalidate,
-               };
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
+               struct file_system_type *type = NULL;
+               int (*test)(struct super_block *,void *) = NULL;
+               int (*set)(struct super_block *,void *) = NULL;
+               int flags = 0;
+               void *data = NULL;
+               (void) sget(type, test, set, flags, data);
 
   ;
   return 0;
@@ -20108,7 +20470,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_D_REVALIDATE_NAMEIDATA 1" >>confdefs.h
+$as_echo "#define HAVE_5ARG_SGET 1" >>confdefs.h
 
 
 else
 
 
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether lseek_execute() is available" >&5
+$as_echo_n "checking whether lseek_execute() is available... " >&6; }
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether dentry uses const struct dentry_operations" >&5
-$as_echo_n "checking whether dentry uses const struct dentry_operations... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/dcache.h>
-
-               const struct dentry_operations test_d_op = {
-                       .d_revalidate = NULL,
-               };
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
-               struct dentry d __attribute__ ((unused));
+               struct file *fp __attribute__ ((unused)) = NULL;
+               struct inode *ip __attribute__ ((unused)) = NULL;
+               loff_t offset __attribute__ ((unused)) = 0;
+               loff_t maxsize __attribute__ ((unused)) = 0;
 
-               d.d_op = &test_d_op;
+               lseek_execute(fp, ip, offset, maxsize);
 
   ;
   return 0;
@@ -20175,42 +20536,87 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
+  rc=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+ rc=1
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_CONST_DENTRY_OPERATIONS 1" >>confdefs.h
+fi
+       rm -Rf build
 
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+       if test $rc -ne 0; then :
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
+       else
+               if test "x$enable_linux_builtin" != xyes; then
 
+       grep -q -E '[[:space:]]lseek_execute[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?   # 0 only when lseek_execute matched in $LINUX_OBJ/$LINUX_SYMBOLS
+       if test $rc -ne 0; then
+               export=0   # fall back to scanning the kernel source for an EXPORT_SYMBOL line
+               for file in fs/read_write.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(lseek_execute)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
 
-fi
-       rm -Rf build
+               fi
+               if test $rc -ne 0; then :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
+               else :
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether check_disk_size_change() is available" >&5
-$as_echo_n "checking whether check_disk_size_change() is available... " >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_LSEEK_EXECUTE 1" >>confdefs.h
+
+
+               fi
+       fi
 
 
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->iterate() is available" >&5
+$as_echo_n "checking whether fops->iterate() is available... " >&6; }
+
+
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
                #include <linux/fs.h>
+               int iterate(struct file *filp, struct dir_context * context)
+                   { return 0; }
+
+               static const struct file_operations fops
+                   __attribute__ ((unused)) = {
+                       .iterate         = iterate,
+               };
 
 int
 main (void)
 {
 
-               check_disk_size_change(NULL, NULL);
 
   ;
   return 0;
@@ -20240,81 +20646,40 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
-  rc=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
-
-
-fi
-       rm -Rf build
-
-
-       if test $rc -ne 0; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-       else
-               if test "x$enable_linux_builtin" != xyes; then
-
-       grep -q -E '[[:space:]]check_disk_size_change[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in fs/block_dev.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(check_disk_size_change)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
-
-               fi
-               if test $rc -ne 0; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-               else :
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_CHECK_DISK_SIZE_CHANGE 1" >>confdefs.h
-
+$as_echo "#define HAVE_VFS_ITERATE 1" >>confdefs.h
 
-               fi
-       fi
 
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether truncate_setsize() is available" >&5
-$as_echo_n "checking whether truncate_setsize() is available... " >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->readdir() is available" >&5
+$as_echo_n "checking whether fops->readdir() is available... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/mm.h>
+                       #include <linux/fs.h>
+                       int readdir(struct file *filp, void *entry, filldir_t func)
+                           { return 0; }
+
+                       static const struct file_operations fops
+                           __attribute__ ((unused)) = {
+                               .readdir = readdir,
+                       };
 
 int
 main (void)
 {
 
-               truncate_setsize(NULL, 0);
 
   ;
   return 0;
@@ -20344,88 +20709,58 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
-  rc=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
 
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
-fi
-       rm -Rf build
+$as_echo "#define HAVE_VFS_READDIR 1" >>confdefs.h
 
 
-       if test $rc -ne 0; then :
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+                       as_fn_error $? "no; file a bug report with ZFSOnLinux" "$LINENO" 5
 
-       else
-               if test "x$enable_linux_builtin" != xyes; then
 
-       grep -q -E '[[:space:]]truncate_setsize[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in mm/truncate.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(truncate_setsize)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
 
-               fi
-               if test $rc -ne 0; then :
+fi
+       rm -Rf build
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
 
-               else :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_TRUNCATE_SETSIZE 1" >>confdefs.h
 
 
-               fi
-       fi
+fi
+       rm -Rf build
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether security_inode_init_security wants 6 args" >&5
-$as_echo_n "checking whether security_inode_init_security wants 6 args... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->read/write_iter() are available" >&5
+$as_echo_n "checking whether fops->read/write_iter() are available... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/security.h>
+               #include <linux/fs.h>
+
+               ssize_t test_read(struct kiocb *kiocb, struct iov_iter *to)
+                   { return 0; }
+               ssize_t test_write(struct kiocb *kiocb, struct iov_iter *from)
+                   { return 0; }
+
+               static const struct file_operations
+                   fops __attribute__ ((unused)) = {
+                   .read_iter = test_read,
+                   .write_iter = test_write,
+               };
 
 int
 main (void)
 {
 
-               struct inode *ip __attribute__ ((unused)) = NULL;
-               struct inode *dip __attribute__ ((unused)) = NULL;
-               const struct qstr *str __attribute__ ((unused)) = NULL;
-               char *name __attribute__ ((unused)) = NULL;
-               void *value __attribute__ ((unused)) = NULL;
-               size_t len __attribute__ ((unused)) = 0;
-
-               security_inode_init_security(ip, dip, str, &name, &value, &len);
 
   ;
   return 0;
@@ -20459,7 +20794,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_6ARGS_SECURITY_INODE_INIT_SECURITY 1" >>confdefs.h
+$as_echo "#define HAVE_VFS_RW_ITERATE 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether security_inode_init_security wants callback" >&5
-$as_echo_n "checking whether security_inode_init_security wants callback... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether kmap_atomic wants 1 args" >&5
+$as_echo_n "checking whether kmap_atomic wants 1 args... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/security.h>
+               #include <linux/pagemap.h>
 
 int
 main (void)
 {
 
-               struct inode *ip __attribute__ ((unused)) = NULL;
-               struct inode *dip __attribute__ ((unused)) = NULL;
-               const struct qstr *str __attribute__ ((unused)) = NULL;
-               initxattrs func __attribute__ ((unused)) = NULL;
-
-               security_inode_init_security(ip, dip, str, func, NULL);
+               struct page page;
+               kmap_atomic(&page);
 
   ;
   return 0;
@@ -20529,7 +20860,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY 1" >>confdefs.h
+$as_echo "#define HAVE_1ARG_KMAP_ATOMIC 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether mount_nodev() is available" >&5
-$as_echo_n "checking whether mount_nodev() is available... " >&6; }
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether follow_down_one() is available" >&5
+$as_echo_n "checking whether follow_down_one() is available... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
+               #include <linux/namei.h>
 
 int
 main (void)
 {
 
-               mount_nodev(NULL, 0, NULL, NULL);
+               struct path *p = NULL;
+               follow_down_one(p);
 
   ;
   return 0;
@@ -20590,90 +20922,47 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
-  rc=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
-
-
-fi
-       rm -Rf build
-
-
-       if test $rc -ne 0; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
-       else
-               if test "x$enable_linux_builtin" != xyes; then
+$as_echo "#define HAVE_FOLLOW_DOWN_ONE 1" >>confdefs.h
 
-       grep -q -E '[[:space:]]mount_nodev[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in fs/super.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(mount_nodev)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
 
-               fi
-               if test $rc -ne 0; then :
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
-               else :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_MOUNT_NODEV 1" >>confdefs.h
 
+fi
+       rm -Rf build
 
-               fi
-       fi
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether super_block has s_shrink" >&5
-$as_echo_n "checking whether super_block has s_shrink... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether make_request_fn() returns int" >&5
+$as_echo_n "checking whether make_request_fn() returns int... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
-
-               int shrink(struct shrinker *s, struct shrink_control *sc)
-                   { return 0; }
+               #include <linux/blkdev.h>
 
-               static const struct super_block
-                   sb __attribute__ ((unused)) = {
-                       .s_shrink.shrink = shrink,
-                       .s_shrink.seeks = DEFAULT_SEEKS,
-                       .s_shrink.batch = 0,
-               };
+               int make_request(struct request_queue *q, struct bio *bio)
+               {
+                       return (0);
+               }
 
 int
 main (void)
 {
 
+               blk_queue_make_request(NULL, &make_request);
 
   ;
   return 0;
@@ -20707,8 +20996,10 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_SHRINK 1" >>confdefs.h
+$as_echo "#define MAKE_REQUEST_FN_RET int" >>confdefs.h
+
 
+$as_echo "#define HAVE_MAKE_REQUEST_FN_RET_INT 1" >>confdefs.h
 
 
 else
@@ -20717,31 +21008,25 @@ sed 's/^/| /' conftest.$ac_ext >&5
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
-
-
-
-fi
-       rm -Rf build
-
-
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether shrink_control has nid" >&5
-$as_echo_n "checking whether shrink_control has nid... " >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether make_request_fn() returns void" >&5
+$as_echo_n "checking whether make_request_fn() returns void... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
+                       #include <linux/blkdev.h>
+
+                       void make_request(struct request_queue *q, struct bio *bio)
+                       {
+                               return;
+                       }
 
 int
 main (void)
 {
 
-               struct shrink_control sc __attribute__ ((unused));
-               unsigned long scnidsize __attribute__ ((unused)) =
-                   sizeof(sc.nid);
+                       blk_queue_make_request(NULL, &make_request);
 
   ;
   return 0;
@@ -20772,18 +21057,18 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define SHRINK_CONTROL_HAS_NID 1" >>confdefs.h
+$as_echo "#define MAKE_REQUEST_FN_RET void" >>confdefs.h
 
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+                       as_fn_error $? "no - Please file a bug report at
+                           https://github.com/zfsonlinux/zfs/issues/new" "$LINENO" 5
 
 
 
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether super_block has s_instances list_head" >&5
-$as_echo_n "checking whether super_block has s_instances list_head... " >&6; }
+
+fi
+       rm -Rf build
+
+
+
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether generic IO accounting symbols are available" >&5
+$as_echo_n "checking whether generic IO accounting symbols are available... " >&6; }
+
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
+               #include <linux/bio.h>
+
+               void (*generic_start_io_acct_f)(int, unsigned long,
+                   struct hd_struct *) = &generic_start_io_acct;
+               void (*generic_end_io_acct_f)(int, struct hd_struct *,
+                   unsigned long) = &generic_end_io_acct;
 
 int
 main (void)
 {
 
-               struct super_block sb __attribute__ ((unused));
-
-               INIT_LIST_HEAD(&sb.s_instances);
+               generic_start_io_acct(0, 0, NULL);
+               generic_end_io_acct(0, NULL, 0);
 
   ;
   return 0;
@@ -20838,3272 +21135,1236 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_S_INSTANCES_LIST_HEAD 1" >>confdefs.h
-
-
+  rc=0
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
+ rc=1
 
 
 fi
        rm -Rf build
 
 
+       if test $rc -ne 0; then :
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether super_block has s_d_op" >&5
-$as_echo_n "checking whether super_block has s_d_op... " >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
+       else
+               if test "x$enable_linux_builtin" != xyes; then
 
-cat confdefs.h - <<_ACEOF >conftest.c
+       grep -q -E '[[:space:]]generic_start_io_acct[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in block/bio.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(generic_start_io_acct)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
 
+               fi
+               if test $rc -ne 0; then :
 
-               #include <linux/fs.h>
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
-int
-main (void)
-{
+               else :
 
-               struct super_block sb __attribute__ ((unused));
-               sb.s_d_op = NULL;
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
-  ;
-  return 0;
-}
+$as_echo "#define HAVE_GENERIC_IO_ACCT 1" >>confdefs.h
 
-_ACEOF
 
+               fi
+       fi
 
 
-cat - <<_ACEOF >conftest.h
 
-_ACEOF
+       if test "$LINUX_OBJ" != "$LINUX"; then :
 
+               KERNELMAKE_PARAMS="$KERNELMAKE_PARAMS O=$LINUX_OBJ"
 
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
+fi
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_S_D_OP 1" >>confdefs.h
 
+                       KERNELCPPFLAGS="$KERNELCPPFLAGS $NO_UNUSED_BUT_SET_VARIABLE"
+       KERNELCPPFLAGS="$KERNELCPPFLAGS $NO_BOOL_COMPARE"
+       KERNELCPPFLAGS="$KERNELCPPFLAGS -DHAVE_SPL -D_KERNEL"
+       KERNELCPPFLAGS="$KERNELCPPFLAGS -DTEXT_DOMAIN=\\\"zfs-linux-kernel\\\""
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+ ;;
+               all)
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for dkms.conf file" >&5
+$as_echo_n "checking for dkms.conf file... " >&6; }
+        if test -e dkms.conf; then :
 
+               as_fn_error $? "
+       *** ZFS should not be manually built in the DKMS source tree.
+       *** Remove all ZFS packages before compiling the ZoL sources.
+       *** Running \"make install\" breaks ZFS packages." "$LINENO" 5
 
-fi
-       rm -Rf build
+else
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: not found" >&5
+$as_echo "not found" >&6; }
 
+fi
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bdi_setup_and_register() wants 2 args" >&5
-$as_echo_n "checking whether bdi_setup_and_register() wants 2 args... " >&6; }
 
+# Check whether --with-mounthelperdir was given.
+if test "${with_mounthelperdir+set}" = set; then :
+  withval=$with_mounthelperdir; mounthelperdir=$withval
+else
+  mounthelperdir=/sbin
+fi
 
 
-cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/backing-dev.h>
 
-int
-main (void)
-{
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for udev directories" >&5
+$as_echo_n "checking for udev directories... " >&6; }
 
-               struct backing_dev_info bdi;
-               char *name = "bdi";
-               int error __attribute__((unused)) =
-                   bdi_setup_and_register(&bdi, name);
+# Check whether --with-udevdir was given.
+if test "${with_udevdir+set}" = set; then :
+  withval=$with_udevdir; udevdir=$withval
+else
+  udevdir=check
+fi
 
-  ;
-  return 0;
-}
 
-_ACEOF
+       if test "x$udevdir" = xcheck; then :
 
+               path1=/lib/udev
+               path2=/usr/lib/udev
+               default=$path2
 
+               if test -d "$path1"; then :
+  udevdir="$path1"
+else
 
-cat - <<_ACEOF >conftest.h
+                       if test -d "$path2"; then :
+  udevdir="$path2"
+else
+  udevdir="$default"
+fi
 
-_ACEOF
+fi
 
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  rc=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
+fi
 
 
+# Check whether --with-udevruledir was given.
+if test "${with_udevruledir+set}" = set; then :
+  withval=$with_udevruledir; udevruledir=$withval
+else
+  udevruledir="${udevdir}/rules.d"
 fi
-       rm -Rf build
 
 
-       if test $rc -ne 0; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bdi_setup_and_register() wants 3 args" >&5
-$as_echo_n "checking whether bdi_setup_and_register() wants 3 args... " >&6; }
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $udevdir;$udevruledir" >&5
+$as_echo "$udevdir;$udevruledir" >&6; }
 
 
-cat confdefs.h - <<_ACEOF >conftest.c
+       # Check whether --enable-systemd was given.
+if test "${enable_systemd+set}" = set; then :
+  enableval=$enable_systemd;
+else
+  enable_systemd=yes
+fi
 
 
-                       #include <linux/backing-dev.h>
 
-int
-main (void)
-{
+# Check whether --with-systemdunitdir was given.
+if test "${with_systemdunitdir+set}" = set; then :
+  withval=$with_systemdunitdir; systemdunitdir=$withval
+else
+  systemdunitdir=/usr/lib/systemd/system
+fi
 
-                       struct backing_dev_info bdi;
-                       char *name = "bdi";
-                       unsigned int cap = BDI_CAP_MAP_COPY;
-                       int error __attribute__((unused)) =
-                           bdi_setup_and_register(&bdi, name, cap);
 
-  ;
-  return 0;
-}
 
-_ACEOF
+# Check whether --with-systemdpresetdir was given.
+if test "${with_systemdpresetdir+set}" = set; then :
+  withval=$with_systemdpresetdir; systemdpresetdir=$withval
+else
+  systemdpresetdir=/usr/lib/systemd/system-preset
+fi
 
 
 
-cat - <<_ACEOF >conftest.h
+# Check whether --with-systemdmodulesloaddir was given (sets $systemdmodulesloaddir).
+if test "${with_systemdmodulesloaddir+set}" = set; then :
+  withval=$with_systemdmodulesloaddir; systemdmodulesloaddir=$withval
+else
+  systemdmodulesloaddir=/usr/lib/modules-load.d
+fi
 
-_ACEOF
 
 
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  rc=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
+       if test "x$enable_systemd" = xyes; then :
 
+               ZFS_INIT_SYSTEMD=systemd
+               ZFS_MODULE_LOAD=modules-load.d
+               modulesloaddir=$systemdmodulesloaddir
 
 fi
-       rm -Rf build
 
 
-       if test $rc -ne 0; then :
 
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
 
-       else
-               if test "x$enable_linux_builtin" != xyes; then
 
-       grep -q -E '[[:space:]]bdi_setup_and_register[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in mm/backing-dev.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(bdi_setup_and_register)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
 
-               fi
-               if test $rc -ne 0; then :
 
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
 
-               else :
+       # Check whether --enable-sysvinit was given.
+if test "${enable_sysvinit+set}" = set; then :
+  enableval=$enable_sysvinit;
+else
+  enable_sysvinit=yes
+fi
 
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_3ARGS_BDI_SETUP_AND_REGISTER 1" >>confdefs.h
+       if test "x$enable_sysvinit" = xyes; then :
+  ZFS_INIT_SYSV=init.d
+fi
 
 
-               fi
-       fi
 
 
-       else
-               if test "x$enable_linux_builtin" != xyes; then
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for dracut directory" >&5
+$as_echo_n "checking for dracut directory... " >&6; }
 
-       grep -q -E '[[:space:]]bdi_setup_and_register[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in mm/backing-dev.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(bdi_setup_and_register)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
+# Check whether --with-dracutdir was given.
+if test "${with_dracutdir+set}" = set; then :
+  withval=$with_dracutdir; dracutdir=$withval
+else
+  dracutdir=check
+fi
 
-               fi
-               if test $rc -ne 0; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bdi_setup_and_register() wants 3 args" >&5
-$as_echo_n "checking whether bdi_setup_and_register() wants 3 args... " >&6; }
+       if test "x$dracutdir" = xcheck; then :
 
+               path1=/usr/share/dracut
+               path2=/usr/lib/dracut
+               default=$path2
 
+               if test -d "$path1"; then :
+  dracutdir="$path1"
+else
 
-cat confdefs.h - <<_ACEOF >conftest.c
+                       if test -d "$path2"; then :
+  dracutdir="$path2"
+else
+  dracutdir="$default"
+fi
 
+fi
 
-                       #include <linux/backing-dev.h>
+fi
 
-int
-main (void)
-{
 
-                       struct backing_dev_info bdi;
-                       char *name = "bdi";
-                       unsigned int cap = BDI_CAP_MAP_COPY;
-                       int error __attribute__((unused)) =
-                           bdi_setup_and_register(&bdi, name, cap);
+       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $dracutdir" >&5
+$as_echo "$dracutdir" >&6; }
 
-  ;
-  return 0;
-}
 
-_ACEOF
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for target asm dir" >&5
+$as_echo_n "checking for target asm dir... " >&6; }
+       TARGET_ARCH=`echo ${target_cpu} | sed -e s/i.86/i386/`
 
+       case $TARGET_ARCH in
+       i386|x86_64)
+               TARGET_ASM_DIR=asm-${TARGET_ARCH}
+               ;;
+       *)
+               TARGET_ASM_DIR=asm-generic
+               ;;
+       esac
 
 
-cat - <<_ACEOF >conftest.h
+       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $TARGET_ASM_DIR" >&5
+$as_echo "$TARGET_ASM_DIR" >&6; }
 
-_ACEOF
 
+       ZLIB=
+
+       ac_fn_c_check_header_mongrel "$LINENO" "zlib.h" "ac_cv_header_zlib_h" "$ac_includes_default"
+if test "x$ac_cv_header_zlib_h" = xyes; then :
 
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  rc=0
 else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "
+       *** zlib.h missing, zlib-devel package required
+See \`config.log' for more details" "$LINENO" 5; }
+fi
 
 
-fi
-       rm -Rf build
-
-
-       if test $rc -ne 0; then :
-
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-       else
-               if test "x$enable_linux_builtin" != xyes; then
-
-       grep -q -E '[[:space:]]bdi_setup_and_register[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in mm/backing-dev.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(bdi_setup_and_register)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
-
-               fi
-               if test $rc -ne 0; then :
-
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-               else :
-
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_3ARGS_BDI_SETUP_AND_REGISTER 1" >>confdefs.h
-
-
-               fi
-       fi
-
-
-               else :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_2ARGS_BDI_SETUP_AND_REGISTER 1" >>confdefs.h
-
-
-               fi
-       fi
-
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether set_nlink() is available" >&5
-$as_echo_n "checking whether set_nlink() is available... " >&6; }
-
-
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-               #include <linux/fs.h>
-
-int
-main (void)
-{
-
-               struct inode node;
-               unsigned int link = 0;
-               (void) set_nlink(&node, link);
-
-  ;
-  return 0;
-}
-
-_ACEOF
-
-
-
-cat - <<_ACEOF >conftest.h
-
-_ACEOF
-
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_SET_NLINK 1" >>confdefs.h
-
-
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-
-
-fi
-       rm -Rf build
-
-
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether elevator_change() is available" >&5
-$as_echo_n "checking whether elevator_change() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
-
-
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-               #include <linux/blkdev.h>
-               #include <linux/elevator.h>
-
-int
-main (void)
-{
-
-               int ret;
-               struct request_queue *q = NULL;
-               char *elevator = NULL;
-               ret = elevator_change(q, elevator);
-
-  ;
-  return 0;
-}
-
-_ACEOF
-
-
-
-cat - <<_ACEOF >conftest.h
-
-_ACEOF
-
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_ELEVATOR_CHANGE 1" >>confdefs.h
-
-
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-
-
-fi
-       rm -Rf build
-
-
-       EXTRA_KCFLAGS="$tmp_flags"
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether sget() wants 5 args" >&5
-$as_echo_n "checking whether sget() wants 5 args... " >&6; }
-
-
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-               #include <linux/fs.h>
-
-int
-main (void)
-{
-
-               struct file_system_type *type = NULL;
-               int (*test)(struct super_block *,void *) = NULL;
-               int (*set)(struct super_block *,void *) = NULL;
-               int flags = 0;
-               void *data = NULL;
-               (void) sget(type, test, set, flags, data);
-
-  ;
-  return 0;
-}
-
-_ACEOF
-
-
-
-cat - <<_ACEOF >conftest.h
-
-_ACEOF
-
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_5ARG_SGET 1" >>confdefs.h
-
-
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-
-
-fi
-       rm -Rf build
-
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether lseek_execute() is available" >&5
-$as_echo_n "checking whether lseek_execute() is available... " >&6; }
-
-
-
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-               #include <linux/fs.h>
-
-int
-main (void)
-{
-
-               struct file *fp __attribute__ ((unused)) = NULL;
-               struct inode *ip __attribute__ ((unused)) = NULL;
-               loff_t offset __attribute__ ((unused)) = 0;
-               loff_t maxsize __attribute__ ((unused)) = 0;
-
-               lseek_execute(fp, ip, offset, maxsize);
-
-  ;
-  return 0;
-}
-
-_ACEOF
-
-
-
-cat - <<_ACEOF >conftest.h
-
-_ACEOF
-
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  rc=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
-
-
-fi
-       rm -Rf build
-
-
-       if test $rc -ne 0; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-       else
-               if test "x$enable_linux_builtin" != xyes; then
-
-       grep -q -E '[[:space:]]lseek_exclusive[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in fs/read_write.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(lseek_exclusive)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
-
-               fi
-               if test $rc -ne 0; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-               else :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_LSEEK_EXECUTE 1" >>confdefs.h
-
-
-               fi
-       fi
-
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->iterate() is available" >&5
-$as_echo_n "checking whether fops->iterate() is available... " >&6; }
-
-
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-               #include <linux/fs.h>
-               int iterate(struct file *filp, struct dir_context * context)
-                   { return 0; }
-
-               static const struct file_operations fops
-                   __attribute__ ((unused)) = {
-                       .iterate         = iterate,
-               };
-
-int
-main (void)
-{
-
-
-  ;
-  return 0;
-}
-
-_ACEOF
-
-
-
-cat - <<_ACEOF >conftest.h
-
-_ACEOF
-
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_VFS_ITERATE 1" >>confdefs.h
-
-
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->readdir() is available" >&5
-$as_echo_n "checking whether fops->readdir() is available... " >&6; }
-
-
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-                       #include <linux/fs.h>
-                       int readdir(struct file *filp, void *entry, filldir_t func)
-                           { return 0; }
-
-                       static const struct file_operations fops
-                           __attribute__ ((unused)) = {
-                               .readdir = readdir,
-                       };
-
-int
-main (void)
-{
-
-
-  ;
-  return 0;
-}
-
-_ACEOF
-
-
-
-cat - <<_ACEOF >conftest.h
-
-_ACEOF
-
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_VFS_READDIR 1" >>confdefs.h
-
-
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-                       as_fn_error $? "no; file a bug report with ZFSOnLinux" "$LINENO" 5
-
-
-
-fi
-       rm -Rf build
-
-
-
-
-
-
-fi
-       rm -Rf build
-
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->read/write_iter() are available" >&5
-$as_echo_n "checking whether fops->read/write_iter() are available... " >&6; }
-
-
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-               #include <linux/fs.h>
-
-               ssize_t test_read(struct kiocb *kiocb, struct iov_iter *to)
-                   { return 0; }
-               ssize_t test_write(struct kiocb *kiocb, struct iov_iter *from)
-                   { return 0; }
-
-               static const struct file_operations
-                   fops __attribute__ ((unused)) = {
-                   .read_iter = test_read,
-                   .write_iter = test_write,
-               };
-
-int
-main (void)
-{
-
-
-  ;
-  return 0;
-}
-
-_ACEOF
-
-
-
-cat - <<_ACEOF >conftest.h
-
-_ACEOF
-
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_VFS_RW_ITERATE 1" >>confdefs.h
-
-
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-
-
-fi
-       rm -Rf build
-
-
-
-
-       if test "$LINUX_OBJ" != "$LINUX"; then :
-
-               KERNELMAKE_PARAMS="$KERNELMAKE_PARAMS O=$LINUX_OBJ"
-
-fi
-
-
-
-                       KERNELCPPFLAGS="$KERNELCPPFLAGS $NO_UNUSED_BUT_SET_VARIABLE"
-       KERNELCPPFLAGS="$KERNELCPPFLAGS -DHAVE_SPL -D_KERNEL"
-       KERNELCPPFLAGS="$KERNELCPPFLAGS -DTEXT_DOMAIN=\\\"zfs-linux-kernel\\\""
-
-
- ;;
-               all)
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for dkms.conf file" >&5
-$as_echo_n "checking for dkms.conf file... " >&6; }
-        if test -e dkms.conf; then :
-
-               as_fn_error $? "
-       *** ZFS should not be manually built in the DKMS source tree.
-       *** Remove all ZFS packages before compiling the ZoL sources.
-       *** Running \"make install\" breaks ZFS packages." "$LINENO" 5
-
-else
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: not found" >&5
-$as_echo "not found" >&6; }
-
-fi
-
-
-
-# Check whether --with-mounthelperdir was given.
-if test "${with_mounthelperdir+set}" = set; then :
-  withval=$with_mounthelperdir; mounthelperdir=$withval
-else
-  mounthelperdir=$sbindir
-fi
-
-
-
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for udev directories" >&5
-$as_echo_n "checking for udev directories... " >&6; }
-
-# Check whether --with-udevdir was given.
-if test "${with_udevdir+set}" = set; then :
-  withval=$with_udevdir; udevdir=$withval
-else
-  udevdir=check
-fi
-
-
-       if test "x$udevdir" = xcheck; then :
-
-               path1=/lib/udev
-               path2=/usr/lib/udev
-               default=$path2
-
-               if test -d "$path1"; then :
-  udevdir="$path1"
-else
-
-                       if test -d "$path2"; then :
-  udevdir="$path2"
-else
-  udevdir="$default"
-fi
-
-fi
-
-fi
-
-
-# Check whether --with-udevruledir was given.
-if test "${with_udevruledir+set}" = set; then :
-  withval=$with_udevruledir; udevruledir=$withval
-else
-  udevruledir="${udevdir}/rules.d"
-fi
-
-
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $udevdir;$udevruledir" >&5
-$as_echo "$udevdir;$udevruledir" >&6; }
-
-
-       # Check whether --enable-systemd was given.
-if test "${enable_systemd+set}" = set; then :
-  enableval=$enable_systemd;
-else
-  enable_systemd=yes
-fi
-
-
-
-# Check whether --with-systemdunitdir was given.
-if test "${with_systemdunitdir+set}" = set; then :
-  withval=$with_systemdunitdir; systemdunitdir=$withval
-else
-  systemdunitdir=/usr/lib/systemd/system
-fi
-
-
-
-# Check whether --with-systemdpresetdir was given.
-if test "${with_systemdpresetdir+set}" = set; then :
-  withval=$with_systemdpresetdir; systemdpresetdir=$withval
-else
-  systemdpresetdir=/usr/lib/systemd/system-preset
-fi
-
-
-
-# Check whether --with-systemdmodulesloaddir was given.
-if test "${with_systemdmodulesloaddir+set}" = set; then :
-  withval=$with_systemdmodulesloaddir; systemdmoduleloaddir=$withval
-else
-  systemdmodulesloaddir=/usr/lib/modules-load.d
-fi
-
-
-
-       if test "x$enable_systemd" = xyes; then :
-
-               ZFS_INIT_SYSTEMD=systemd
-               ZFS_MODULE_LOAD=modules-load.d
-               modulesloaddir=$systemdmodulesloaddir
-
-fi
-
-
-
-
-
-
-
-
-       # Check whether --enable-sysvinit was given.
-if test "${enable_sysvinit+set}" = set; then :
-  enableval=$enable_sysvinit;
-else
-  enable_sysvinit=yes
-fi
-
-
-       if test "x$enable_sysvinit" = xyes; then :
-  ZFS_INIT_SYSV=init.d
-fi
-
-
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for dracut directory" >&5
-$as_echo_n "checking for dracut directory... " >&6; }
-
-# Check whether --with-dracutdir was given.
-if test "${with_dracutdir+set}" = set; then :
-  withval=$with_dracutdir; dracutdir=$withval
-else
-  dracutdir=check
-fi
-
-
-       if test "x$dracutdir" = xcheck; then :
-
-               path1=/usr/share/dracut
-               path2=/usr/lib/dracut
-               default=$path2
-
-               if test -d "$path1"; then :
-  dracutdir="$path1"
-else
-
-                       if test -d "$path2"; then :
-  dracutdir="$path2"
-else
-  dracutdir="$default"
-fi
-
-fi
-
-fi
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $dracutdir" >&5
-$as_echo "$dracutdir" >&6; }
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for target asm dir" >&5
-$as_echo_n "checking for target asm dir... " >&6; }
-       TARGET_ARCH=`echo ${target_cpu} | sed -e s/i.86/i386/`
-
-       case $TARGET_ARCH in
-       i386|x86_64)
-               TARGET_ASM_DIR=asm-${TARGET_ARCH}
-               ;;
-       *)
-               TARGET_ASM_DIR=asm-generic
-               ;;
-       esac
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $TARGET_ASM_DIR" >&5
-$as_echo "$TARGET_ASM_DIR" >&6; }
-
-
-       ZLIB=
-
-       ac_fn_c_check_header_mongrel "$LINENO" "zlib.h" "ac_cv_header_zlib_h" "$ac_includes_default"
-if test "x$ac_cv_header_zlib_h" = xyes; then :
-
-else
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "
-       *** zlib.h missing, zlib-devel package required
-See \`config.log' for more details" "$LINENO" 5; }
-fi
-
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for compress2 in -lz" >&5
-$as_echo_n "checking for compress2 in -lz... " >&6; }
-if ${ac_cv_lib_z_compress2+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_lib_save_LIBS=$LIBS
-LIBS="-lz  $LIBS"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-/* Override any GCC internal prototype to avoid an error.
-   Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
-#ifdef __cplusplus
-extern "C"
-#endif
-char compress2 ();
-int
-main ()
-{
-return compress2 ();
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  ac_cv_lib_z_compress2=yes
-else
-  ac_cv_lib_z_compress2=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-LIBS=$ac_check_lib_save_LIBS
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_z_compress2" >&5
-$as_echo "$ac_cv_lib_z_compress2" >&6; }
-if test "x$ac_cv_lib_z_compress2" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_LIBZ 1
-_ACEOF
-
-  LIBS="-lz $LIBS"
-
-else
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "
-       *** compress2() missing, zlib-devel package required
-See \`config.log' for more details" "$LINENO" 5; }
-fi
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for uncompress in -lz" >&5
-$as_echo_n "checking for uncompress in -lz... " >&6; }
-if ${ac_cv_lib_z_uncompress+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_lib_save_LIBS=$LIBS
-LIBS="-lz  $LIBS"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-/* Override any GCC internal prototype to avoid an error.
-   Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
-#ifdef __cplusplus
-extern "C"
-#endif
-char uncompress ();
-int
-main ()
-{
-return uncompress ();
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  ac_cv_lib_z_uncompress=yes
-else
-  ac_cv_lib_z_uncompress=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-LIBS=$ac_check_lib_save_LIBS
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_z_uncompress" >&5
-$as_echo "$ac_cv_lib_z_uncompress" >&6; }
-if test "x$ac_cv_lib_z_uncompress" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_LIBZ 1
-_ACEOF
-
-  LIBS="-lz $LIBS"
-
-else
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "
-       *** uncompress() missing, zlib-devel package required
-See \`config.log' for more details" "$LINENO" 5; }
-fi
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for crc32 in -lz" >&5
-$as_echo_n "checking for crc32 in -lz... " >&6; }
-if ${ac_cv_lib_z_crc32+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_lib_save_LIBS=$LIBS
-LIBS="-lz  $LIBS"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-/* Override any GCC internal prototype to avoid an error.
-   Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
-#ifdef __cplusplus
-extern "C"
-#endif
-char crc32 ();
-int
-main ()
-{
-return crc32 ();
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  ac_cv_lib_z_crc32=yes
-else
-  ac_cv_lib_z_crc32=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-LIBS=$ac_check_lib_save_LIBS
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_z_crc32" >&5
-$as_echo "$ac_cv_lib_z_crc32" >&6; }
-if test "x$ac_cv_lib_z_crc32" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_LIBZ 1
-_ACEOF
-
-  LIBS="-lz $LIBS"
-
-else
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "
-       *** crc32() missing, zlib-devel package required
-See \`config.log' for more details" "$LINENO" 5; }
-fi
-
-
-       ZLIB="-lz"
-
-
-$as_echo "#define HAVE_ZLIB 1" >>confdefs.h
-
-
-
-       LIBUUID=
-
-       ac_fn_c_check_header_mongrel "$LINENO" "uuid/uuid.h" "ac_cv_header_uuid_uuid_h" "$ac_includes_default"
-if test "x$ac_cv_header_uuid_uuid_h" = xyes; then :
-
-else
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "
-       *** uuid/uuid.h missing, libuuid-devel package required
-See \`config.log' for more details" "$LINENO" 5; }
-fi
-
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for uuid_generate in -luuid" >&5
-$as_echo_n "checking for uuid_generate in -luuid... " >&6; }
-if ${ac_cv_lib_uuid_uuid_generate+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_lib_save_LIBS=$LIBS
-LIBS="-luuid  $LIBS"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-/* Override any GCC internal prototype to avoid an error.
-   Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
-#ifdef __cplusplus
-extern "C"
-#endif
-char uuid_generate ();
-int
-main ()
-{
-return uuid_generate ();
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  ac_cv_lib_uuid_uuid_generate=yes
-else
-  ac_cv_lib_uuid_uuid_generate=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-LIBS=$ac_check_lib_save_LIBS
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_uuid_uuid_generate" >&5
-$as_echo "$ac_cv_lib_uuid_uuid_generate" >&6; }
-if test "x$ac_cv_lib_uuid_uuid_generate" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_LIBUUID 1
-_ACEOF
-
-  LIBS="-luuid $LIBS"
-
-else
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "
-       *** uuid_generate() missing, libuuid-devel package required
-See \`config.log' for more details" "$LINENO" 5; }
-fi
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for uuid_is_null in -luuid" >&5
-$as_echo_n "checking for uuid_is_null in -luuid... " >&6; }
-if ${ac_cv_lib_uuid_uuid_is_null+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_lib_save_LIBS=$LIBS
-LIBS="-luuid  $LIBS"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-/* Override any GCC internal prototype to avoid an error.
-   Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
-#ifdef __cplusplus
-extern "C"
-#endif
-char uuid_is_null ();
-int
-main ()
-{
-return uuid_is_null ();
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  ac_cv_lib_uuid_uuid_is_null=yes
-else
-  ac_cv_lib_uuid_uuid_is_null=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-LIBS=$ac_check_lib_save_LIBS
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_uuid_uuid_is_null" >&5
-$as_echo "$ac_cv_lib_uuid_uuid_is_null" >&6; }
-if test "x$ac_cv_lib_uuid_uuid_is_null" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_LIBUUID 1
-_ACEOF
-
-  LIBS="-luuid $LIBS"
-
-else
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "
-       *** uuid_is_null() missing, libuuid-devel package required
-See \`config.log' for more details" "$LINENO" 5; }
-fi
-
-
-       LIBUUID="-luuid"
-
-
-$as_echo "#define HAVE_LIBUUID 1" >>confdefs.h
-
-
-
-
-# Check whether --with-blkid was given.
-if test "${with_blkid+set}" = set; then :
-  withval=$with_blkid;
-else
-  with_blkid=check
-fi
-
-
-       LIBBLKID=
-       if test "x$with_blkid" = xyes; then :
-
-               LIBBLKID="-lblkid"
-
-
-$as_echo "#define HAVE_LIBBLKID 1" >>confdefs.h
-
-
-fi
-
-       if test "x$with_blkid" = xcheck; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: checking for blkid_get_cache in -lblkid" >&5
-$as_echo_n "checking for blkid_get_cache in -lblkid... " >&6; }
-if ${ac_cv_lib_blkid_blkid_get_cache+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_lib_save_LIBS=$LIBS
-LIBS="-lblkid  $LIBS"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-/* Override any GCC internal prototype to avoid an error.
-   Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
-#ifdef __cplusplus
-extern "C"
-#endif
-char blkid_get_cache ();
-int
-main ()
-{
-return blkid_get_cache ();
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  ac_cv_lib_blkid_blkid_get_cache=yes
-else
-  ac_cv_lib_blkid_blkid_get_cache=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-LIBS=$ac_check_lib_save_LIBS
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_blkid_blkid_get_cache" >&5
-$as_echo "$ac_cv_lib_blkid_blkid_get_cache" >&6; }
-if test "x$ac_cv_lib_blkid_blkid_get_cache" = xyes; then :
-
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for blkid zfs support" >&5
-$as_echo_n "checking for blkid zfs support... " >&6; }
-
-                       ZFS_DEV=`mktemp`
-                       truncate -s 64M $ZFS_DEV
-                       echo -en "\x0c\xb1\xba\0\0\0\0\0" | \
-                               dd of=$ZFS_DEV bs=1k count=8 \
-                               seek=128 conv=notrunc &>/dev/null \
-                               >/dev/null 2>/dev/null
-                       echo -en "\x0c\xb1\xba\0\0\0\0\0" | \
-                               dd of=$ZFS_DEV bs=1k count=8 \
-                               seek=132 conv=notrunc &>/dev/null \
-                               >/dev/null 2>/dev/null
-                       echo -en "\x0c\xb1\xba\0\0\0\0\0" | \
-                               dd of=$ZFS_DEV bs=1k count=8 \
-                               seek=136 conv=notrunc &>/dev/null \
-                               >/dev/null 2>/dev/null
-                       echo -en "\x0c\xb1\xba\0\0\0\0\0" | \
-                               dd of=$ZFS_DEV bs=1k count=8 \
-                               seek=140 conv=notrunc &>/dev/null \
-                               >/dev/null 2>/dev/null
-
-                       saved_LIBS="$LIBS"
-                       LIBS="-lblkid"
-
-                       if test "$cross_compiling" = yes; then :
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "cannot run test program while cross compiling
-See \`config.log' for more details" "$LINENO" 5; }
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-                               #include <stdio.h>
-                               #include <stdlib.h>
-                               #include <blkid/blkid.h>
-
-int
-main ()
-{
-
-                               blkid_cache cache;
-                               char *value;
-
-                               if (blkid_get_cache(&cache, NULL) < 0)
-                                       return 1;
-
-                               value = blkid_get_tag_value(cache, "TYPE",
-                                                           "$ZFS_DEV");
-                               if (!value) {
-                                       blkid_put_cache(cache);
-                                       return 2;
-                               }
-
-                               if (strcmp(value, "zfs_member")) {
-                                       free(value);
-                                       blkid_put_cache(cache);
-                                       return 0;
-                               }
-
-                               free(value);
-                               blkid_put_cache(cache);
-
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_run "$LINENO"; then :
-
-                               rm -f $ZFS_DEV
-                               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-                               LIBBLKID="-lblkid"
-
-
-$as_echo "#define HAVE_LIBBLKID 1" >>confdefs.h
-
-
-else
-
-                               rm -f $ZFS_DEV
-                               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-                               if test "x$with_blkid" != xcheck; then :
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "--with-blkid given but unavailable
-See \`config.log' for more details" "$LINENO" 5; }
-fi
-
-fi
-rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
-  conftest.$ac_objext conftest.beam conftest.$ac_ext
-fi
-
-
-                       LIBS="$saved_LIBS"
-
-else
-
-                       if test "x$with_blkid" != xcheck; then :
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "--with-blkid given but unavailable
-See \`config.log' for more details" "$LINENO" 5; }
-fi
-
-
-fi
-
-
-fi
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for -Wframe-larger-than=<size> support" >&5
-$as_echo_n "checking for -Wframe-larger-than=<size> support... " >&6; }
-
-       saved_flags="$CFLAGS"
-       CFLAGS="$CFLAGS -Wframe-larger-than=1024"
-
-       cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-int
-main ()
-{
-
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
-
-               FRAME_LARGER_THAN=-Wframe-larger-than=1024
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-else
-
-               FRAME_LARGER_THAN=
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-
-       CFLAGS="$saved_flags"
-
-
-
-       if test "x$runstatedir" = x; then
-               runstatedir='${localstatedir}/run'
-
-       fi
-
-       for ac_func in mlockall
-do :
-  ac_fn_c_check_func "$LINENO" "mlockall" "ac_cv_func_mlockall"
-if test "x$ac_cv_func_mlockall" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_MLOCKALL 1
-_ACEOF
-
-fi
-done
-
-
-
-
-
-# Check whether --with-linux was given.
-if test "${with_linux+set}" = set; then :
-  withval=$with_linux; kernelsrc="$withval"
-fi
-
-
-
-# Check whether --with-linux-obj was given.
-if test "${with_linux_obj+set}" = set; then :
-  withval=$with_linux_obj; kernelbuild="$withval"
-fi
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking kernel source directory" >&5
-$as_echo_n "checking kernel source directory... " >&6; }
-       if test -z "$kernelsrc"; then :
-
-               if test -e "/lib/modules/$(uname -r)/source"; then :
-
-                       headersdir="/lib/modules/$(uname -r)/source"
-                       sourcelink=$(readlink -f "$headersdir")
-
-elif test -e "/lib/modules/$(uname -r)/build"; then :
-
-                       headersdir="/lib/modules/$(uname -r)/build"
-                       sourcelink=$(readlink -f "$headersdir")
-
-else
-
-                       sourcelink=$(ls -1d /usr/src/kernels/* \
-                                    /usr/src/linux-* \
-                                    2>/dev/null | grep -v obj | tail -1)
-
-fi
-
-               if test -n "$sourcelink" && test -e ${sourcelink}; then :
-
-                       kernelsrc=`readlink -f ${sourcelink}`
-
-else
-
-                       kernelsrc="Not found"
-
-fi
-
-else
-
-               if test "$kernelsrc" = "NONE"; then :
-
-                       kernsrcver=NONE
-
-fi
-
-fi
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $kernelsrc" >&5
-$as_echo "$kernelsrc" >&6; }
-       if test ! -d "$kernelsrc"; then :
-
-               as_fn_error $? "
-       *** Please make sure the kernel devel package for your distribution
-       *** is installed and then try again.  If that fails, you can specify the
-       *** location of the kernel source with the '--with-linux=PATH' option." "$LINENO" 5
-
-fi
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking kernel build directory" >&5
-$as_echo_n "checking kernel build directory... " >&6; }
-       if test -z "$kernelbuild"; then :
-
-               if test -e "/lib/modules/$(uname -r)/build"; then :
-
-                       kernelbuild=`readlink -f /lib/modules/$(uname -r)/build`
-
-elif test -d ${kernelsrc}-obj/${target_cpu}/${target_cpu}; then :
-
-                       kernelbuild=${kernelsrc}-obj/${target_cpu}/${target_cpu}
-
-elif test -d ${kernelsrc}-obj/${target_cpu}/default; then :
-
-                       kernelbuild=${kernelsrc}-obj/${target_cpu}/default
-
-elif test -d `dirname ${kernelsrc}`/build-${target_cpu}; then :
-
-                       kernelbuild=`dirname ${kernelsrc}`/build-${target_cpu}
-
-else
-
-                       kernelbuild=${kernelsrc}
-
-fi
-
-fi
-       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $kernelbuild" >&5
-$as_echo "$kernelbuild" >&6; }
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking kernel source version" >&5
-$as_echo_n "checking kernel source version... " >&6; }
-       utsrelease1=$kernelbuild/include/linux/version.h
-       utsrelease2=$kernelbuild/include/linux/utsrelease.h
-       utsrelease3=$kernelbuild/include/generated/utsrelease.h
-       if test -r $utsrelease1 && fgrep -q UTS_RELEASE $utsrelease1; then :
-
-               utsrelease=linux/version.h
-
-elif test -r $utsrelease2 && fgrep -q UTS_RELEASE $utsrelease2; then :
-
-               utsrelease=linux/utsrelease.h
-
-elif test -r $utsrelease3 && fgrep -q UTS_RELEASE $utsrelease3; then :
-
-               utsrelease=generated/utsrelease.h
-
-fi
-
-       if test "$utsrelease"; then :
-
-               kernsrcver=`(echo "#include <$utsrelease>";
-                            echo "kernsrcver=UTS_RELEASE") |
-                            cpp -I $kernelbuild/include |
-                            grep "^kernsrcver=" | cut -d \" -f 2`
-
-               if test -z "$kernsrcver"; then :
-
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: Not found" >&5
-$as_echo "Not found" >&6; }
-                       as_fn_error $? "*** Cannot determine kernel version." "$LINENO" 5
-
-fi
-
-else
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: Not found" >&5
-$as_echo "Not found" >&6; }
-               if test "x$enable_linux_builtin" != xyes; then
-                       as_fn_error $? "*** Cannot find UTS_RELEASE definition." "$LINENO" 5
-               else
-                       as_fn_error $? "
-       *** Cannot find UTS_RELEASE definition.
-       *** Please run 'make prepare' inside the kernel source tree." "$LINENO" 5
-               fi
-
-fi
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $kernsrcver" >&5
-$as_echo "$kernsrcver" >&6; }
-
-       LINUX=${kernelsrc}
-       LINUX_OBJ=${kernelbuild}
-       LINUX_VERSION=${kernsrcver}
-
-
-
-
-
-
-       modpost=$LINUX/scripts/Makefile.modpost
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking kernel file name for module symbols" >&5
-$as_echo_n "checking kernel file name for module symbols... " >&6; }
-       if test "x$enable_linux_builtin" != xyes -a -f "$modpost"; then :
-
-               if grep -q Modules.symvers $modpost; then :
-
-                       LINUX_SYMBOLS=Modules.symvers
-
-else
-
-                       LINUX_SYMBOLS=Module.symvers
-
-fi
-
-               if test ! -f "$LINUX_OBJ/$LINUX_SYMBOLS"; then :
-
-                       as_fn_error $? "
-       *** Please make sure the kernel devel package for your distribution
-       *** is installed.  If you are building with a custom kernel, make sure the
-       *** kernel is configured, built, and the '--with-linux=PATH' configure
-       *** option refers to the location of the kernel source." "$LINENO" 5
-
-fi
-
-else
-
-               LINUX_SYMBOLS=NONE
-
-fi
-       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $LINUX_SYMBOLS" >&5
-$as_echo "$LINUX_SYMBOLS" >&6; }
-
-
-
-
-
-# Check whether --with-spl was given.
-if test "${with_spl+set}" = set; then :
-  withval=$with_spl; splsrc="$withval"
-fi
-
-
-
-# Check whether --with-spl-obj was given.
-if test "${with_spl_obj+set}" = set; then :
-  withval=$with_spl_obj; splbuild="$withval"
-fi
-
-
-
-# Check whether --with-spl-timeout was given.
-if test "${with_spl_timeout+set}" = set; then :
-  withval=$with_spl_timeout; timeout="$withval"
-else
-  timeout=0
-fi
-
-
-                                       splsrc0="/var/lib/dkms/spl/${VERSION}/build"
-       splsrc1="/usr/local/src/spl-${VERSION}/${LINUX_VERSION}"
-       splsrc2="/usr/local/src/spl-${VERSION}"
-       splsrc3="/usr/src/spl-${VERSION}/${LINUX_VERSION}"
-       splsrc4="/usr/src/spl-${VERSION}"
-       splsrc5="../spl/"
-       splsrc6="$LINUX"
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking spl source directory" >&5
-$as_echo_n "checking spl source directory... " >&6; }
-       if test -z "${splsrc}"; then :
-
-               if  test -e "${splsrc0}/spl.release.in"; then :
-
-                       splsrc=${splsrc0}
-
-elif  test -e "${splsrc1}/spl.release.in"; then :
-
-                       splsrc=${splsrc1}
-
-elif  test -e "${splsrc2}/spl.release.in"; then :
-
-                       splsrc=${splsrc2}
-
-elif  test -e "${splsrc3}/spl.release.in"; then :
-
-                       splsrc=$(readlink -f "${splsrc3}")
-
-elif  test -e "${splsrc4}/spl.release.in" ; then :
-
-                       splsrc=${splsrc4}
-
-elif  test -e "${splsrc5}/spl.release.in"; then :
-
-                       splsrc=$(readlink -f "${splsrc5}")
-
-elif  test -e "${splsrc6}/spl.release.in" ; then :
-
-                       splsrc=${splsrc6}
-
-else
-
-                       splsrc="Not found"
-
-fi
-
-else
-
-               if test "$splsrc" = "NONE"; then :
-
-                       splbuild=NONE
-                       splsrcver=NONE
-
-fi
-
-fi
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $splsrc" >&5
-$as_echo "$splsrc" >&6; }
-       if  test ! -e "$splsrc/spl.release.in"; then :
-
-               as_fn_error $? "
-       *** Please make sure the kmod spl devel package for your distribution
-       *** is installed then try again.  If that fails you can specify the
-       *** location of the spl source with the '--with-spl=PATH' option." "$LINENO" 5
-
-fi
-
-                                                                                                       { $as_echo "$as_me:${as_lineno-$LINENO}: checking spl build directory" >&5
-$as_echo_n "checking spl build directory... " >&6; }
-       while true; do
-               if test -z "$splbuild"; then :
-
-                       if  test -e "${splsrc}/${LINUX_VERSION}/spl_config.h" ; then :
-
-                               splbuild="${splsrc}/${LINUX_VERSION}"
-
-elif  test -e "${splsrc}/spl_config.h" ; then :
-
-                               splbuild="${splsrc}"
-
-else
-
-                               splbuild="Not found"
-
-fi
-
-fi
-               if test -e "$splbuild/spl_config.h" -o $timeout -le 0; then :
-
-                       break;
-
-else
-
-                       sleep 1
-                       timeout=$((timeout-1))
-
-fi
-       done
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $splbuild" >&5
-$as_echo "$splbuild" >&6; }
-       if  ! test -e "$splbuild/spl_config.h"; then :
-
-               as_fn_error $? "
-       *** Please make sure the kmod spl devel <kernel> package for your
-       *** distribution is installed then try again.  If that fails you
-       *** can specify the location of the spl objects with the
-       *** '--with-spl-obj=PATH' option." "$LINENO" 5
-
-fi
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking spl source version" >&5
-$as_echo_n "checking spl source version... " >&6; }
-       if test -r $splbuild/spl_config.h &&
-               fgrep -q SPL_META_VERSION $splbuild/spl_config.h; then :
-
-
-               splsrcver=`(echo "#include <spl_config.h>";
-                           echo "splsrcver=SPL_META_VERSION-SPL_META_RELEASE") |
-                           cpp -I $splbuild |
-                           grep "^splsrcver=" | tr -d \" | cut -d= -f2`
-
-fi
-
-       if test -z "$splsrcver"; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: Not found" >&5
-$as_echo "Not found" >&6; }
-               as_fn_error $? "
-       *** Cannot determine the version of the spl source.
-       *** Please prepare the spl source before running this script" "$LINENO" 5
-
-fi
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $splsrcver" >&5
-$as_echo "$splsrcver" >&6; }
-
-       SPL=${splsrc}
-       SPL_OBJ=${splbuild}
-       SPL_VERSION=${splsrcver}
-
-
-
-
-
-                                                                                                               { $as_echo "$as_me:${as_lineno-$LINENO}: checking spl file name for module symbols" >&5
-$as_echo_n "checking spl file name for module symbols... " >&6; }
-       SPL_SYMBOLS=NONE
-
-       while true; do
-               if test -r $SPL_OBJ/Module.symvers; then :
-
-                       SPL_SYMBOLS=Module.symvers
-
-elif test -r $SPL_OBJ/Modules.symvers; then :
-
-                       SPL_SYMBOLS=Modules.symvers
-
-elif test -r $SPL_OBJ/module/Module.symvers; then :
-
-                       SPL_SYMBOLS=Module.symvers
-
-elif test -r $SPL_OBJ/module/Modules.symvers; then :
-
-                       SPL_SYMBOLS=Modules.symvers
-
-fi
-
-               if test $SPL_SYMBOLS != NONE -o $timeout -le 0; then :
-
-                       break;
-
-else
-
-                       sleep 1
-                       timeout=$((timeout-1))
-
-fi
-       done
-
-       if test "$SPL_SYMBOLS" = NONE; then :
-
-               SPL_SYMBOLS=$LINUX_SYMBOLS
-
-fi
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $SPL_SYMBOLS" >&5
-$as_echo "$SPL_SYMBOLS" >&6; }
-
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether modules can be built" >&5
-$as_echo_n "checking whether modules can be built... " >&6; }
-
-
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-int
-main (void)
-{
-
-  ;
-  return 0;
-}
-
-_ACEOF
-
-
-
-cat - <<_ACEOF >conftest.h
-
-_ACEOF
-
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-               if test "x$enable_linux_builtin" != xyes; then
-                       as_fn_error $? "*** Unable to build an empty module." "$LINENO" 5
-               else
-                       as_fn_error $? "
-       *** Unable to build an empty module.
-       *** Please run 'make scripts' inside the kernel source tree." "$LINENO" 5
-               fi
-
-
-
-fi
-       rm -Rf build
-
-
-
-
-       if test "$cross_compiling" = yes; then :
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "cannot run test program while cross compiling
-See \`config.log' for more details" "$LINENO" 5; }
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-
-                       #include "$LINUX/include/linux/license.h"
-
-int
-main ()
-{
-
-                       return !license_is_gpl_compatible("$ZFS_META_LICENSE");
-
-  ;
-  return 0;
-}
-
-_ACEOF
-if ac_fn_c_try_run "$LINENO"; then :
-
-
-$as_echo "#define ZFS_IS_GPL_COMPATIBLE 1" >>confdefs.h
-
-
-fi
-rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
-  conftest.$ac_objext conftest.beam conftest.$ac_ext
-fi
-
-
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether Linux was built with CONFIG_DEBUG_LOCK_ALLOC" >&5
-$as_echo_n "checking whether Linux was built with CONFIG_DEBUG_LOCK_ALLOC... " >&6; }
-
-
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-               #include <linux/module.h>
-
-int
-main (void)
-{
-
-               #ifndef CONFIG_DEBUG_LOCK_ALLOC
-               #error CONFIG_DEBUG_LOCK_ALLOC not #defined
-               #endif
-
-  ;
-  return 0;
-}
-
-_ACEOF
-
-
-
-cat - <<_ACEOF >conftest.h
-
-_ACEOF
-
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether mutex_lock() is GPL-only" >&5
-$as_echo_n "checking whether mutex_lock() is GPL-only... " >&6; }
-               tmp_flags="$EXTRA_KCFLAGS"
-
-
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-                       #include <linux/module.h>
-                       #include <linux/mutex.h>
 
-                       MODULE_LICENSE("$ZFS_META_LICENSE");
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for compress2 in -lz" >&5
+$as_echo_n "checking for compress2 in -lz... " >&6; }
+if ${ac_cv_lib_z_compress2+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lz  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char compress2 ();
 int
-main (void)
+main ()
 {
-
-                       struct mutex lock;
-
-                       mutex_init(&lock);
-                       mutex_lock(&lock);
-                       mutex_unlock(&lock);
-
+return compress2 ();
   ;
   return 0;
 }
-
-_ACEOF
-
-
-
-cat - <<_ACEOF >conftest.h
-
 _ACEOF
-
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_z_compress2=yes
 else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-                       as_fn_error $? "
-       *** Kernel built with CONFIG_DEBUG_LOCK_ALLOC which is incompatible
-       *** with the CDDL license and will prevent the module linking stage
-       *** from succeeding.  You must rebuild your kernel without this
-       *** option enabled." "$LINENO" 5
-
-
-
+  ac_cv_lib_z_compress2=no
 fi
-       rm -Rf build
-
-
-               EXTRA_KCFLAGS="$tmp_flags"
-
-
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-
-
-
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
 fi
-       rm -Rf build
-
-
-
-
-
-
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="-I\$(src)"
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether DECLARE_EVENT_CLASS() is available" >&5
-$as_echo_n "checking whether DECLARE_EVENT_CLASS() is available... " >&6; }
-
-
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-               #include <linux/module.h>
-               MODULE_LICENSE(ZFS_META_LICENSE);
-
-               #define CREATE_TRACE_POINTS
-               #include "conftest.h"
-
-int
-main (void)
-{
-
-               trace_zfs_autoconf_event_one(1UL);
-               trace_zfs_autoconf_event_two(2UL);
-
-  ;
-  return 0;
-}
-
-_ACEOF
-
-
-
-cat - <<_ACEOF >conftest.h
-
-               #if !defined(_CONFTEST_H) || defined(TRACE_HEADER_MULTI_READ)
-               #define _CONFTEST_H
-
-               #undef  TRACE_SYSTEM
-               #define TRACE_SYSTEM zfs
-               #include <linux/tracepoint.h>
-
-               DECLARE_EVENT_CLASS(zfs_autoconf_event_class,
-                       TP_PROTO(unsigned long i),
-                       TP_ARGS(i),
-                       TP_STRUCT__entry(
-                               __field(unsigned long, i)
-                       ),
-                       TP_fast_assign(
-                               __entry->i = i;
-                       ),
-                       TP_printk("i = %lu", __entry->i)
-               );
-
-               #define DEFINE_AUTOCONF_EVENT(name) \
-               DEFINE_EVENT(zfs_autoconf_event_class, name, \
-                       TP_PROTO(unsigned long i), \
-                       TP_ARGS(i))
-               DEFINE_AUTOCONF_EVENT(zfs_autoconf_event_one);
-               DEFINE_AUTOCONF_EVENT(zfs_autoconf_event_two);
-
-               #endif /* _CONFTEST_H */
-
-               #undef  TRACE_INCLUDE_PATH
-               #define TRACE_INCLUDE_PATH .
-               #define TRACE_INCLUDE_FILE conftest
-               #include <trace/define_trace.h>
-
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_z_compress2" >&5
+$as_echo "$ac_cv_lib_z_compress2" >&6; }
+if test "x$ac_cv_lib_z_compress2" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBZ 1
 _ACEOF
 
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_DECLARE_EVENT_CLASS 1" >>confdefs.h
-
+  LIBS="-lz $LIBS"
 
 else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-
-
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "
+       *** compress2() missing, zlib-devel package required
+See \`config.log' for more details" "$LINENO" 5; }
 fi
-       rm -Rf build
-
-
-       EXTRA_KCFLAGS="$tmp_flags"
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking block device operation prototypes" >&5
-$as_echo_n "checking block device operation prototypes... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
-
-
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-               #include <linux/blkdev.h>
 
-               int blk_open(struct block_device *bdev, fmode_t mode)
-                   { return 0; }
-               int blk_ioctl(struct block_device *bdev, fmode_t mode,
-                   unsigned x, unsigned long y) { return 0; }
-               int blk_compat_ioctl(struct block_device * bdev, fmode_t mode,
-                   unsigned x, unsigned long y) { return 0; }
 
-               static const struct block_device_operations
-                   bops __attribute__ ((unused)) = {
-                       .open           = blk_open,
-                       .release        = NULL,
-                       .ioctl          = blk_ioctl,
-                       .compat_ioctl   = blk_compat_ioctl,
-               };
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for uncompress in -lz" >&5
+$as_echo_n "checking for uncompress in -lz... " >&6; }
+if ${ac_cv_lib_z_uncompress+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lz  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char uncompress ();
 int
-main (void)
+main ()
 {
-
-
+return uncompress ();
   ;
   return 0;
 }
-
 _ACEOF
-
-
-
-cat - <<_ACEOF >conftest.h
-
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_z_uncompress=yes
+else
+  ac_cv_lib_z_uncompress=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_z_uncompress" >&5
+$as_echo "$ac_cv_lib_z_uncompress" >&6; }
+if test "x$ac_cv_lib_z_uncompress" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBZ 1
 _ACEOF
 
+  LIBS="-lz $LIBS"
 
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: struct block_device" >&5
-$as_echo "struct block_device" >&6; }
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "
+       *** uncompress() missing, zlib-devel package required
+See \`config.log' for more details" "$LINENO" 5; }
+fi
 
-$as_echo "#define HAVE_BDEV_BLOCK_DEVICE_OPERATIONS 1" >>confdefs.h
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for crc32 in -lz" >&5
+$as_echo_n "checking for crc32 in -lz... " >&6; }
+if ${ac_cv_lib_z_crc32+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lz  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char crc32 ();
+int
+main ()
+{
+return crc32 ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_z_crc32=yes
 else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+  ac_cv_lib_z_crc32=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_z_crc32" >&5
+$as_echo "$ac_cv_lib_z_crc32" >&6; }
+if test "x$ac_cv_lib_z_crc32" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBZ 1
+_ACEOF
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: struct inode" >&5
-$as_echo "struct inode" >&6; }
+  LIBS="-lz $LIBS"
 
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "
+       *** crc32() missing, zlib-devel package required
+See \`config.log' for more details" "$LINENO" 5; }
+fi
 
 
-fi
-       rm -Rf build
+       ZLIB="-lz"
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
+$as_echo "#define HAVE_ZLIB 1" >>confdefs.h
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether block_device_operations.release is void" >&5
-$as_echo_n "checking whether block_device_operations.release is void... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
 
+       LIBUUID=
 
-cat confdefs.h - <<_ACEOF >conftest.c
+       ac_fn_c_check_header_mongrel "$LINENO" "uuid/uuid.h" "ac_cv_header_uuid_uuid_h" "$ac_includes_default"
+if test "x$ac_cv_header_uuid_uuid_h" = xyes; then :
 
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "
+       *** uuid/uuid.h missing, libuuid-devel package required
+See \`config.log' for more details" "$LINENO" 5; }
+fi
 
-               #include <linux/blkdev.h>
 
-               void blk_release(struct gendisk *g, fmode_t mode) { return; }
 
-               static const struct block_device_operations
-                   bops __attribute__ ((unused)) = {
-                       .open           = NULL,
-                       .release        = blk_release,
-                       .ioctl          = NULL,
-                       .compat_ioctl   = NULL,
-               };
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for uuid_generate in -luuid" >&5
+$as_echo_n "checking for uuid_generate in -luuid... " >&6; }
+if ${ac_cv_lib_uuid_uuid_generate+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-luuid  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char uuid_generate ();
 int
-main (void)
+main ()
 {
-
-
+return uuid_generate ();
   ;
   return 0;
 }
-
 _ACEOF
-
-
-
-cat - <<_ACEOF >conftest.h
-
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_uuid_uuid_generate=yes
+else
+  ac_cv_lib_uuid_uuid_generate=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_uuid_uuid_generate" >&5
+$as_echo "$ac_cv_lib_uuid_uuid_generate" >&6; }
+if test "x$ac_cv_lib_uuid_uuid_generate" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBUUID 1
 _ACEOF
 
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: void" >&5
-$as_echo "void" >&6; }
-
-$as_echo "#define HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID 1" >>confdefs.h
-
+  LIBS="-luuid $LIBS"
 
 else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: int" >&5
-$as_echo "int" >&6; }
-
-
-
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "
+       *** uuid_generate() missing, libuuid-devel package required
+See \`config.log' for more details" "$LINENO" 5; }
 fi
-       rm -Rf build
-
-
-       EXTRA_KCFLAGS="$tmp_flags"
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether kernel defines fmode_t" >&5
-$as_echo_n "checking whether kernel defines fmode_t... " >&6; }
 
 
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-               #include <linux/types.h>
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for uuid_is_null in -luuid" >&5
+$as_echo_n "checking for uuid_is_null in -luuid... " >&6; }
+if ${ac_cv_lib_uuid_uuid_is_null+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-luuid  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char uuid_is_null ();
 int
-main (void)
+main ()
 {
-
-               fmode_t *ptr __attribute__ ((unused));
-
+return uuid_is_null ();
   ;
   return 0;
 }
-
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_uuid_uuid_is_null=yes
+else
+  ac_cv_lib_uuid_uuid_is_null=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_uuid_uuid_is_null" >&5
+$as_echo "$ac_cv_lib_uuid_uuid_is_null" >&6; }
+if test "x$ac_cv_lib_uuid_uuid_is_null" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBUUID 1
 _ACEOF
 
+  LIBS="-luuid $LIBS"
 
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "
+       *** uuid_is_null() missing, libuuid-devel package required
+See \`config.log' for more details" "$LINENO" 5; }
+fi
 
-cat - <<_ACEOF >conftest.h
 
-_ACEOF
+       LIBUUID="-luuid"
 
 
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
+$as_echo "#define HAVE_LIBUUID 1" >>confdefs.h
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_FMODE_T 1" >>confdefs.h
 
 
+# Check whether --with-blkid was given.
+if test "${with_blkid+set}" = set; then :
+  withval=$with_blkid;
 else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+  with_blkid=check
+fi
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
 
+       LIBBLKID=
+       if test "x$with_blkid" = xyes; then :
+
+               LIBBLKID="-lblkid"
+
+
+$as_echo "#define HAVE_LIBBLKID 1" >>confdefs.h
 
 
 fi
-       rm -Rf build
 
+       if test "x$with_blkid" = xcheck; then :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: checking for blkid_get_cache in -lblkid" >&5
+$as_echo_n "checking for blkid_get_cache in -lblkid... " >&6; }
+if ${ac_cv_lib_blkid_blkid_get_cache+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lblkid  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char blkid_get_cache ();
+int
+main ()
+{
+return blkid_get_cache ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_blkid_blkid_get_cache=yes
+else
+  ac_cv_lib_blkid_blkid_get_cache=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_blkid_blkid_get_cache" >&5
+$as_echo "$ac_cv_lib_blkid_blkid_get_cache" >&6; }
+if test "x$ac_cv_lib_blkid_blkid_get_cache" = xyes; then :
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether kernel defines KOBJ_NAME_LEN" >&5
-$as_echo_n "checking whether kernel defines KOBJ_NAME_LEN... " >&6; }
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for blkid zfs support" >&5
+$as_echo_n "checking for blkid zfs support... " >&6; }
 
+                       ZFS_DEV=`mktemp`
+                       truncate -s 64M $ZFS_DEV
+                       echo -en "\x0c\xb1\xba\0\0\0\0\0" | \
+                               dd of=$ZFS_DEV bs=1k count=8 \
+                               seek=128 conv=notrunc &>/dev/null \
+                               >/dev/null 2>/dev/null
+                       echo -en "\x0c\xb1\xba\0\0\0\0\0" | \
+                               dd of=$ZFS_DEV bs=1k count=8 \
+                               seek=132 conv=notrunc &>/dev/null \
+                               >/dev/null 2>/dev/null
+                       echo -en "\x0c\xb1\xba\0\0\0\0\0" | \
+                               dd of=$ZFS_DEV bs=1k count=8 \
+                               seek=136 conv=notrunc &>/dev/null \
+                               >/dev/null 2>/dev/null
+                       echo -en "\x0c\xb1\xba\0\0\0\0\0" | \
+                               dd of=$ZFS_DEV bs=1k count=8 \
+                               seek=140 conv=notrunc &>/dev/null \
+                               >/dev/null 2>/dev/null
 
-cat confdefs.h - <<_ACEOF >conftest.c
+                       saved_LIBS="$LIBS"
+                       LIBS="-lblkid"
 
+                       if test "$cross_compiling" = yes; then :
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cannot run test program while cross compiling
+See \`config.log' for more details" "$LINENO" 5; }
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
-               #include <linux/kobject.h>
+                               #include <stdio.h>
+                               #include <stdlib.h>
+                               #include <string.h> /* declares strcmp() used by the probe below */
+                               #include <blkid/blkid.h>
 
 int
-main (void)
+main ()
 {
 
-               int val __attribute__ ((unused));
-               val = KOBJ_NAME_LEN;
-
-  ;
-  return 0;
-}
+                               blkid_cache cache;
+                               char *value;
 
-_ACEOF
+                               if (blkid_get_cache(&cache, NULL) < 0)
+                                       return 1;
 
+                               value = blkid_get_tag_value(cache, "TYPE",
+                                                           "$ZFS_DEV");
+                               if (!value) {
+                                       blkid_put_cache(cache);
+                                       return 2;
+                               }
 
+                               if (strcmp(value, "zfs_member")) { /* TYPE tag is something other than zfs_member */
+                                       free(value);
+                                       blkid_put_cache(cache);
+                                       return 3; /* must be non-zero so the run test fails and configure reports no */
+                               }
 
-cat - <<_ACEOF >conftest.h
+                               free(value);
+                               blkid_put_cache(cache);
 
+  ;
+  return 0;
+}
 _ACEOF
+if ac_fn_c_try_run "$LINENO"; then :
 
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+                               rm -f $ZFS_DEV
+                               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
+                               LIBBLKID="-lblkid"
 
-$as_echo "#define HAVE_KOBJ_NAME_LEN 1" >>confdefs.h
+
+$as_echo "#define HAVE_LIBBLKID 1" >>confdefs.h
 
 
 else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+                               rm -f $ZFS_DEV
+                               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
+                               if test "x$with_blkid" != xcheck; then :
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "--with-blkid given but unavailable
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+  conftest.$ac_objext conftest.beam conftest.$ac_ext
+fi
 
 
+                       LIBS="$saved_LIBS"
+
+else
 
+                       if test "x$with_blkid" != xcheck; then :
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "--with-blkid given but unavailable
+See \`config.log' for more details" "$LINENO" 5; }
 fi
-       rm -Rf build
 
 
+fi
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blkdev_get() wants 3 args" >&5
-$as_echo_n "checking whether blkdev_get() wants 3 args... " >&6; }
+fi
 
 
-cat confdefs.h - <<_ACEOF >conftest.c
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for -Wframe-larger-than=<size> support" >&5
+$as_echo_n "checking for -Wframe-larger-than=<size> support... " >&6; }
 
+       saved_flags="$CFLAGS"
+       CFLAGS="$CFLAGS -Wframe-larger-than=1024"
 
-               #include <linux/fs.h>
+       cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
 int
-main (void)
+main ()
 {
 
-               struct block_device *bdev = NULL;
-               (void) blkdev_get(bdev, 0, NULL);
-
   ;
   return 0;
 }
-
 _ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+               FRAME_LARGER_THAN=-Wframe-larger-than=1024
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
+else
 
+               FRAME_LARGER_THAN=
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
-cat - <<_ACEOF >conftest.h
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
-_ACEOF
+       CFLAGS="$saved_flags"
 
 
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+       if test "x$runstatedir" = x; then
+               runstatedir='${localstatedir}/run'
+
+       fi
 
-$as_echo "#define HAVE_3ARG_BLKDEV_GET 1" >>confdefs.h
+       for ac_func in mlockall
+do :
+  ac_fn_c_check_func "$LINENO" "mlockall" "ac_cv_func_mlockall"
+if test "x$ac_cv_func_mlockall" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_MLOCKALL 1
+_ACEOF
 
+fi
+done
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
 
 
 
+# Check whether --with-linux was given.
+if test "${with_linux+set}" = set; then :
+  withval=$with_linux; kernelsrc="$withval"
 fi
-       rm -Rf build
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blkdev_get_by_path() is available" >&5
-$as_echo_n "checking whether blkdev_get_by_path() is available... " >&6; }
+# Check whether --with-linux-obj was given.
+if test "${with_linux_obj+set}" = set; then :
+  withval=$with_linux_obj; kernelbuild="$withval"
+fi
 
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking kernel source directory" >&5
+$as_echo_n "checking kernel source directory... " >&6; }
+       if test -z "$kernelsrc"; then :
 
-cat confdefs.h - <<_ACEOF >conftest.c
+               if test -e "/lib/modules/$(uname -r)/source"; then :
 
+                       headersdir="/lib/modules/$(uname -r)/source"
+                       sourcelink=$(readlink -f "$headersdir")
 
-               #include <linux/fs.h>
+elif test -e "/lib/modules/$(uname -r)/build"; then :
 
-int
-main (void)
-{
+                       headersdir="/lib/modules/$(uname -r)/build"
+                       sourcelink=$(readlink -f "$headersdir")
 
-               blkdev_get_by_path(NULL, 0, NULL);
+else
 
-  ;
-  return 0;
-}
+                       sourcelink=$(ls -1d /usr/src/kernels/* \
+                                    /usr/src/linux-* \
+                                    2>/dev/null | grep -v obj | tail -1)
 
-_ACEOF
+fi
 
+               if test -n "$sourcelink" && test -e ${sourcelink}; then :
 
+                       kernelsrc=`readlink -f ${sourcelink}`
 
-cat - <<_ACEOF >conftest.h
+else
 
-_ACEOF
+                       kernelsrc="Not found"
 
+fi
 
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  rc=0
 else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
 
+               if test "$kernelsrc" = "NONE"; then :
+
+                       kernsrcver=NONE
 
 fi
-       rm -Rf build
 
+fi
 
-       if test $rc -ne 0; then :
+       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $kernelsrc" >&5
+$as_echo "$kernelsrc" >&6; }
+       if test ! -d "$kernelsrc"; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+               as_fn_error $? "
+       *** Please make sure the kernel devel package for your distribution
+       *** is installed and then try again.  If that fails, you can specify the
+       *** location of the kernel source with the '--with-linux=PATH' option." "$LINENO" 5
 
-       else
-               if test "x$enable_linux_builtin" != xyes; then
+fi
 
-       grep -q -E '[[:space:]]blkdev_get_by_path[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in fs/block_dev.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(blkdev_get_by_path)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking kernel build directory" >&5
+$as_echo_n "checking kernel build directory... " >&6; }
+       if test -z "$kernelbuild"; then :
 
-               fi
-               if test $rc -ne 0; then :
+               if test -e "/lib/modules/$(uname -r)/build"; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+                       kernelbuild=`readlink -f /lib/modules/$(uname -r)/build`
 
-               else :
+elif test -d ${kernelsrc}-obj/${target_cpu}/${target_cpu}; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+                       kernelbuild=${kernelsrc}-obj/${target_cpu}/${target_cpu}
 
-$as_echo "#define HAVE_BLKDEV_GET_BY_PATH 1" >>confdefs.h
+elif test -d ${kernelsrc}-obj/${target_cpu}/default; then :
 
+                       kernelbuild=${kernelsrc}-obj/${target_cpu}/default
 
-               fi
-       fi
+elif test -d `dirname ${kernelsrc}`/build-${target_cpu}; then :
 
+                       kernelbuild=`dirname ${kernelsrc}`/build-${target_cpu}
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether open_bdev_exclusive() is available" >&5
-$as_echo_n "checking whether open_bdev_exclusive() is available... " >&6; }
+else
 
+                       kernelbuild=${kernelsrc}
 
+fi
 
-cat confdefs.h - <<_ACEOF >conftest.c
+fi
+       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $kernelbuild" >&5
+$as_echo "$kernelbuild" >&6; }
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking kernel source version" >&5
+$as_echo_n "checking kernel source version... " >&6; }
+       utsrelease1=$kernelbuild/include/linux/version.h
+       utsrelease2=$kernelbuild/include/linux/utsrelease.h
+       utsrelease3=$kernelbuild/include/generated/utsrelease.h
+       if test -r $utsrelease1 && fgrep -q UTS_RELEASE $utsrelease1; then :
 
-               #include <linux/fs.h>
+               utsrelease=linux/version.h
 
-int
-main (void)
-{
+elif test -r $utsrelease2 && fgrep -q UTS_RELEASE $utsrelease2; then :
 
-               open_bdev_exclusive(NULL, 0, NULL);
+               utsrelease=linux/utsrelease.h
 
-  ;
-  return 0;
-}
+elif test -r $utsrelease3 && fgrep -q UTS_RELEASE $utsrelease3; then :
 
-_ACEOF
+               utsrelease=generated/utsrelease.h
 
+fi
 
+       if test "$utsrelease"; then :
 
-cat - <<_ACEOF >conftest.h
+               kernsrcver=`(echo "#include <$utsrelease>";
+                            echo "kernsrcver=UTS_RELEASE") |
+                            cpp -I $kernelbuild/include |
+                            grep "^kernsrcver=" | cut -d \" -f 2`
 
-_ACEOF
+               if test -z "$kernsrcver"; then :
 
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: Not found" >&5
+$as_echo "Not found" >&6; }
+                       as_fn_error $? "*** Cannot determine kernel version." "$LINENO" 5
+
+fi
 
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  rc=0
 else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: Not found" >&5
+$as_echo "Not found" >&6; }
+               if test "x$enable_linux_builtin" != xyes; then
+                       as_fn_error $? "*** Cannot find UTS_RELEASE definition." "$LINENO" 5
+               else
+                       as_fn_error $? "
+       *** Cannot find UTS_RELEASE definition.
+       *** Please run 'make prepare' inside the kernel source tree." "$LINENO" 5
+               fi
 
 fi
-       rm -Rf build
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $kernsrcver" >&5
+$as_echo "$kernsrcver" >&6; }
 
-       if test $rc -ne 0; then :
+       LINUX=${kernelsrc}
+       LINUX_OBJ=${kernelbuild}
+       LINUX_VERSION=${kernsrcver}
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
 
-       else
-               if test "x$enable_linux_builtin" != xyes; then
 
-       grep -q -E '[[:space:]]open_bdev_exclusive[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in fs/block_dev.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(open_bdev_exclusive)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
 
-               fi
-               if test $rc -ne 0; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
 
-               else :
+       modpost=$LINUX/scripts/Makefile.modpost
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking kernel file name for module symbols" >&5
+$as_echo_n "checking kernel file name for module symbols... " >&6; }
+       if test "x$enable_linux_builtin" != xyes -a -f "$modpost"; then :
+
+               if grep -q Modules.symvers $modpost; then :
+
+                       LINUX_SYMBOLS=Modules.symvers
+
+else
+
+                       LINUX_SYMBOLS=Module.symvers
+
+fi
+
+               if test ! -f "$LINUX_OBJ/$LINUX_SYMBOLS"; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+                       as_fn_error $? "
+       *** Please make sure the kernel devel package for your distribution
+       *** is installed.  If you are building with a custom kernel, make sure the
+       *** kernel is configured, built, and the '--with-linux=PATH' configure
+       *** option refers to the location of the kernel source." "$LINENO" 5
 
-$as_echo "#define HAVE_OPEN_BDEV_EXCLUSIVE 1" >>confdefs.h
+fi
 
+else
 
-               fi
-       fi
+               LINUX_SYMBOLS=NONE
 
+fi
+       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $LINUX_SYMBOLS" >&5
+$as_echo "$LINUX_SYMBOLS" >&6; }
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether lookup_bdev() is available" >&5
-$as_echo_n "checking whether lookup_bdev() is available... " >&6; }
 
 
 
-cat confdefs.h - <<_ACEOF >conftest.c
 
+# Check whether --with-spl was given.
+if test "${with_spl+set}" = set; then :
+  withval=$with_spl; splsrc="$withval"
+fi
 
-               #include <linux/fs.h>
 
-int
-main (void)
-{
 
-               lookup_bdev(NULL);
+# Check whether --with-spl-obj was given.
+if test "${with_spl_obj+set}" = set; then :
+  withval=$with_spl_obj; splbuild="$withval"
+fi
 
-  ;
-  return 0;
-}
 
-_ACEOF
 
+# Check whether --with-spl-timeout was given.
+if test "${with_spl_timeout+set}" = set; then :
+  withval=$with_spl_timeout; timeout="$withval"
+else
+  timeout=0
+fi
 
 
-cat - <<_ACEOF >conftest.h
+                                       splsrc0="/var/lib/dkms/spl/${VERSION}/build"
+       splsrc1="/usr/local/src/spl-${VERSION}/${LINUX_VERSION}"
+       splsrc2="/usr/local/src/spl-${VERSION}"
+       splsrc3="/usr/src/spl-${VERSION}/${LINUX_VERSION}"
+       splsrc4="/usr/src/spl-${VERSION}"
+       splsrc5="../spl/"
+       splsrc6="$LINUX"
 
-_ACEOF
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking spl source directory" >&5
+$as_echo_n "checking spl source directory... " >&6; }
+       if test -z "${splsrc}"; then :
 
+               if  test -e "${splsrc0}/spl.release.in"; then :
 
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  rc=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
+                       splsrc=${splsrc0}
 
+elif  test -e "${splsrc1}/spl.release.in"; then :
 
-fi
-       rm -Rf build
+                       splsrc=${splsrc1}
 
+elif  test -e "${splsrc2}/spl.release.in"; then :
 
-       if test $rc -ne 0; then :
+                       splsrc=${splsrc2}
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+elif  test -e "${splsrc3}/spl.release.in"; then :
 
-       else
-               if test "x$enable_linux_builtin" != xyes; then
+                       splsrc=$(readlink -f "${splsrc3}")
 
-       grep -q -E '[[:space:]]lookup_bdev[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in fs/block_dev.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(lookup_bdev)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
+elif  test -e "${splsrc4}/spl.release.in" ; then :
 
-               fi
-               if test $rc -ne 0; then :
+                       splsrc=${splsrc4}
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+elif  test -e "${splsrc5}/spl.release.in"; then :
 
-               else :
+                       splsrc=$(readlink -f "${splsrc5}")
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+elif  test -e "${splsrc6}/spl.release.in" ; then :
 
-$as_echo "#define HAVE_LOOKUP_BDEV 1" >>confdefs.h
+                       splsrc=${splsrc6}
 
+else
 
-               fi
-       fi
+                       splsrc="Not found"
 
+fi
 
+else
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether invalidate_bdev() wants 1 arg" >&5
-$as_echo_n "checking whether invalidate_bdev() wants 1 arg... " >&6; }
+               if test "$splsrc" = "NONE"; then :
 
+                       splbuild=NONE
+                       splsrcver=NONE
 
-cat confdefs.h - <<_ACEOF >conftest.c
+fi
 
+fi
 
-               #include <linux/buffer_head.h>
+       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $splsrc" >&5
+$as_echo "$splsrc" >&6; }
+       if  test ! -e "$splsrc/spl.release.in"; then :
 
-int
-main (void)
-{
+               as_fn_error $? "
+       *** Please make sure the kmod spl devel package for your distribution
+       *** is installed then try again.  If that fails you can specify the
+       *** location of the spl source with the '--with-spl=PATH' option." "$LINENO" 5
 
-               struct block_device *bdev = NULL;
-               invalidate_bdev(bdev);
+fi
 
-  ;
-  return 0;
-}
+                                                                                                       { $as_echo "$as_me:${as_lineno-$LINENO}: checking spl build directory" >&5
+$as_echo_n "checking spl build directory... " >&6; }
+       while true; do
+               if test -z "$splbuild"; then :
 
-_ACEOF
+                       if  test -e "${splsrc}/${LINUX_VERSION}/spl_config.h" ; then :
 
+                               splbuild="${splsrc}/${LINUX_VERSION}"
 
+elif  test -e "${splsrc}/spl_config.h" ; then :
 
-cat - <<_ACEOF >conftest.h
+                               splbuild="${splsrc}"
 
-_ACEOF
+elif  find -L "${splsrc}" -name spl_config.h 2> /dev/null | grep -wq spl_config.h ; then :
 
+                               splbuild=$(find -L "${splsrc}" -name spl_config.h | sed 's,/spl_config.h,,')
 
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
+else
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+                               splbuild="Not found"
 
-$as_echo "#define HAVE_1ARG_INVALIDATE_BDEV 1" >>confdefs.h
+fi
+
+fi
+               if test -e "$splbuild/spl_config.h" -o $timeout -le 0; then :
 
+                       break;
 
 else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+                       sleep 1
+                       timeout=$((timeout-1))
+
+fi
+       done
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $splbuild" >&5
+$as_echo "$splbuild" >&6; }
+       if  ! test -e "$splbuild/spl_config.h"; then :
 
+               as_fn_error $? "
+       *** Please make sure the kmod spl devel <kernel> package for your
+       *** distribution is installed then try again.  If that fails you
+       *** can specify the location of the spl objects with the
+       *** '--with-spl-obj=PATH' option." "$LINENO" 5
 
 fi
-       rm -Rf build
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking spl source version" >&5
+$as_echo_n "checking spl source version... " >&6; }
+       if test -r $splbuild/spl_config.h &&
+               fgrep -q SPL_META_VERSION $splbuild/spl_config.h; then :
 
 
+               splsrcver=`(echo "#include <spl_config.h>";
+                           echo "splsrcver=SPL_META_VERSION-SPL_META_RELEASE") |
+                           cpp -I $splbuild |
+                           grep "^splsrcver=" | tr -d \" | cut -d= -f2`
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bdev_logical_block_size() is available" >&5
-$as_echo_n "checking whether bdev_logical_block_size() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+fi
 
+       if test -z "$splsrcver"; then :
 
-cat confdefs.h - <<_ACEOF >conftest.c
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: Not found" >&5
+$as_echo "Not found" >&6; }
+               as_fn_error $? "
+       *** Cannot determine the version of the spl source.
+       *** Please prepare the spl source before running this script" "$LINENO" 5
 
+fi
 
-               #include <linux/blkdev.h>
+       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $splsrcver" >&5
+$as_echo "$splsrcver" >&6; }
 
-int
-main (void)
-{
+       SPL=${splsrc}
+       SPL_OBJ=${splbuild}
+       SPL_VERSION=${splsrcver}
 
-               struct block_device *bdev = NULL;
-               bdev_logical_block_size(bdev);
 
-  ;
-  return 0;
-}
 
-_ACEOF
 
 
+                                                                                                               { $as_echo "$as_me:${as_lineno-$LINENO}: checking spl file name for module symbols" >&5
+$as_echo_n "checking spl file name for module symbols... " >&6; }
+       SPL_SYMBOLS=NONE
+
+       while true; do
+               if test -r $SPL_OBJ/Module.symvers; then :
+
+                       SPL_SYMBOLS=Module.symvers
+
+elif test -r $SPL_OBJ/Modules.symvers; then :
 
-cat - <<_ACEOF >conftest.h
+                       SPL_SYMBOLS=Modules.symvers
 
-_ACEOF
+elif test -r $SPL_OBJ/module/Module.symvers; then :
 
+                       SPL_SYMBOLS=Module.symvers
 
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
+elif test -r $SPL_OBJ/module/Modules.symvers; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+                       SPL_SYMBOLS=Modules.symvers
 
-$as_echo "#define HAVE_BDEV_LOGICAL_BLOCK_SIZE 1" >>confdefs.h
+fi
+
+               if test $SPL_SYMBOLS != NONE -o $timeout -le 0; then :
 
+                       break;
 
 else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+                       sleep 1
+                       timeout=$((timeout-1))
 
+fi
+       done
+
+       if test "$SPL_SYMBOLS" = NONE; then :
 
+               SPL_SYMBOLS=$LINUX_SYMBOLS
 
 fi
-       rm -Rf build
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $SPL_SYMBOLS" >&5
+$as_echo "$SPL_SYMBOLS" >&6; }
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bdev_physical_block_size() is available" >&5
-$as_echo_n "checking whether bdev_physical_block_size() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether modules can be built" >&5
+$as_echo_n "checking whether modules can be built... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
-
 int
 main (void)
 {
 
-               struct block_device *bdev = NULL;
-               bdev_physical_block_size(bdev);
-
   ;
   return 0;
 }
@@ -24136,15 +22397,19 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BDEV_PHYSICAL_BLOCK_SIZE 1" >>confdefs.h
-
-
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
+               if test "x$enable_linux_builtin" != xyes; then
+                       as_fn_error $? "*** Unable to build an empty module." "$LINENO" 5
+               else
+                       as_fn_error $? "
+       *** Unable to build an empty module.
+       *** Please run 'make scripts' inside the kernel source tree." "$LINENO" 5
+               fi
 
 
 
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bio has bi_iter" >&5
-$as_echo_n "checking whether bio has bi_iter... " >&6; }
 
 
-cat confdefs.h - <<_ACEOF >conftest.c
+       if test "$cross_compiling" = yes; then :
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cannot run test program while cross compiling
+See \`config.log' for more details" "$LINENO" 5; }
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
 
-               #include <linux/bio.h>
+                       #include "$LINUX/include/linux/license.h"
 
 int
-main (void)
+main ()
 {
 
-               struct bio bio;
-               bio.bi_iter.bi_sector = 0;
+                       return !license_is_gpl_compatible("$ZFS_META_LICENSE");
 
   ;
   return 0;
 }
 
 _ACEOF
+if ac_fn_c_try_run "$LINENO"; then :
 
 
-
-cat - <<_ACEOF >conftest.h
-
-_ACEOF
-
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_BIO_BVEC_ITER 1" >>confdefs.h
-
-
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
+$as_echo "#define ZFS_IS_GPL_COMPATIBLE 1" >>confdefs.h
 
 
 fi
-       rm -Rf build
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+  conftest.$ac_objext conftest.beam conftest.$ac_ext
+fi
 
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether BIO_RW_FAILFAST is defined" >&5
-$as_echo_n "checking whether BIO_RW_FAILFAST is defined... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether Linux was built with CONFIG_DEBUG_LOCK_ALLOC" >&5
+$as_echo_n "checking whether Linux was built with CONFIG_DEBUG_LOCK_ALLOC... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/bio.h>
+               #include <linux/module.h>
 
 int
 main (void)
 {
 
-               int flags __attribute__ ((unused));
-               flags = (1 << BIO_RW_FAILFAST);
+               #ifndef CONFIG_DEBUG_LOCK_ALLOC
+               #error CONFIG_DEBUG_LOCK_ALLOC not #defined
+               #endif
 
   ;
   return 0;
@@ -24269,41 +22505,28 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BIO_RW_FAILFAST 1" >>confdefs.h
-
-
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-
-
-fi
-       rm -Rf build
-
-
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether BIO_RW_FAILFAST_* are defined" >&5
-$as_echo_n "checking whether BIO_RW_FAILFAST_* are defined... " >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether mutex_lock() is GPL-only" >&5
+$as_echo_n "checking whether mutex_lock() is GPL-only... " >&6; }
+               tmp_flags="$EXTRA_KCFLAGS"
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/bio.h>
+                       #include <linux/module.h>
+                       #include <linux/mutex.h>
+
+                       MODULE_LICENSE("$ZFS_META_LICENSE");
 
 int
 main (void)
 {
 
-               int flags __attribute__ ((unused));
-               flags = ((1 << BIO_RW_FAILFAST_DEV) |
-                        (1 << BIO_RW_FAILFAST_TRANSPORT) |
-                        (1 << BIO_RW_FAILFAST_DRIVER));
+                       struct mutex lock;
+
+                       mutex_init(&lock);
+                       mutex_lock(&lock);
+                       mutex_unlock(&lock);
 
   ;
   return 0;
@@ -24334,10 +22557,28 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
+                       as_fn_error $? "
+       *** Kernel built with CONFIG_DEBUG_LOCK_ALLOC which is incompatible
+       *** with the CDDL license and will prevent the module linking stage
+       *** from succeeding.  You must rebuild your kernel without this
+       *** option enabled." "$LINENO" 5
+
 
-$as_echo "#define HAVE_BIO_RW_FAILFAST_DTD 1" >>confdefs.h
+
+fi
+       rm -Rf build
+
+
+               EXTRA_KCFLAGS="$tmp_flags"
 
 
 else
@@ -24349,27 +22590,37 @@ $as_echo "no" >&6; }
 
 
 
+
 fi
        rm -Rf build
 
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether REQ_FAILFAST_MASK is defined" >&5
-$as_echo_n "checking whether REQ_FAILFAST_MASK is defined... " >&6; }
+
+
+       tmp_flags="$EXTRA_KCFLAGS"
+       EXTRA_KCFLAGS="-I\$(src)"
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether DECLARE_EVENT_CLASS() is available" >&5
+$as_echo_n "checking whether DECLARE_EVENT_CLASS() is available... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/bio.h>
+               #include <linux/module.h>
+               MODULE_LICENSE(ZFS_META_LICENSE);
+
+               #define CREATE_TRACE_POINTS
+               #include "conftest.h"
 
 int
 main (void)
 {
 
-               int flags __attribute__ ((unused));
-               flags = REQ_FAILFAST_MASK;
+               trace_zfs_autoconf_event_one(1UL);
+               trace_zfs_autoconf_event_two(2UL);
 
   ;
   return 0;
@@ -24381,6 +22632,39 @@ _ACEOF
 
 cat - <<_ACEOF >conftest.h
 
+               #if !defined(_CONFTEST_H) || defined(TRACE_HEADER_MULTI_READ)
+               #define _CONFTEST_H
+
+               #undef  TRACE_SYSTEM
+               #define TRACE_SYSTEM zfs
+               #include <linux/tracepoint.h>
+
+               DECLARE_EVENT_CLASS(zfs_autoconf_event_class,
+                       TP_PROTO(unsigned long i),
+                       TP_ARGS(i),
+                       TP_STRUCT__entry(
+                               __field(unsigned long, i)
+                       ),
+                       TP_fast_assign(
+                               __entry->i = i;
+                       ),
+                       TP_printk("i = %lu", __entry->i)
+               );
+
+               #define DEFINE_AUTOCONF_EVENT(name) \
+               DEFINE_EVENT(zfs_autoconf_event_class, name, \
+                       TP_PROTO(unsigned long i), \
+                       TP_ARGS(i))
+               DEFINE_AUTOCONF_EVENT(zfs_autoconf_event_one);
+               DEFINE_AUTOCONF_EVENT(zfs_autoconf_event_two);
+
+               #endif /* _CONFTEST_H */
+
+               #undef  TRACE_INCLUDE_PATH
+               #define TRACE_INCLUDE_PATH .
+               #define TRACE_INCLUDE_FILE conftest
+               #include <trace/define_trace.h>
+
 _ACEOF
 
 
@@ -24403,7 +22687,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BIO_REQ_FAILFAST_MASK 1" >>confdefs.h
+$as_echo "#define HAVE_DECLARE_EVENT_CLASS 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
+       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bio_end_io_t wants 2 args" >&5
-$as_echo_n "checking whether bio_end_io_t wants 2 args... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether current->bio_tail exists" >&5
+$as_echo_n "checking whether current->bio_tail exists... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/bio.h>
-
-               void wanted_end_io(struct bio *bio, int x) { return; }
-
-               bio_end_io_t *end_io __attribute__ ((unused)) = wanted_end_io;
+               #include <linux/sched.h>
 
 int
 main (void)
 {
 
+               current->bio_tail = (struct bio **) NULL;
 
   ;
   return 0;
@@ -24471,7 +22753,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_2ARGS_BIO_END_IO_T 1" >>confdefs.h
+$as_echo "#define HAVE_CURRENT_BIO_TAIL 1" >>confdefs.h
 
 
 else
@@ -24480,30 +22762,20 @@ sed 's/^/| /' conftest.$ac_ext >&5
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
-
-
-
-fi
-       rm -Rf build
-
-
-
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether BIO_RW_SYNC is defined" >&5
-$as_echo_n "checking whether BIO_RW_SYNC is defined... " >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether current->bio_list exists" >&5
+$as_echo_n "checking whether current->bio_list exists... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/bio.h>
+                       #include <linux/sched.h>
 
 int
 main (void)
 {
 
-               int flags __attribute__ ((unused));
-               flags = BIO_RW_SYNC;
+                       current->bio_list = (struct bio_list *) NULL;
 
   ;
   return 0;
@@ -24534,18 +22806,25 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BIO_RW_SYNC 1" >>confdefs.h
+$as_echo "#define HAVE_CURRENT_BIO_LIST 1" >>confdefs.h
 
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+                       as_fn_error $? "no - Please file a bug report at
+                           https://github.com/zfsonlinux/zfs/issues/new" "$LINENO" 5
+
+
+
+fi
+       rm -Rf build
+
+
 
 
 
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether BIO_RW_SYNCIO is defined" >&5
-$as_echo_n "checking whether BIO_RW_SYNCIO is defined... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking block device operation prototypes" >&5
+$as_echo_n "checking block device operation prototypes... " >&6; }
+       tmp_flags="$EXTRA_KCFLAGS"
+       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/bio.h>
+               #include <linux/blkdev.h>
+
+               int blk_open(struct block_device *bdev, fmode_t mode)
+                   { return 0; }
+               int blk_ioctl(struct block_device *bdev, fmode_t mode,
+                   unsigned x, unsigned long y) { return 0; }
+               int blk_compat_ioctl(struct block_device * bdev, fmode_t mode,
+                   unsigned x, unsigned long y) { return 0; }
+
+               static const struct block_device_operations
+                   bops __attribute__ ((unused)) = {
+                       .open           = blk_open,
+                       .release        = NULL,
+                       .ioctl          = blk_ioctl,
+                       .compat_ioctl   = blk_compat_ioctl,
+               };
 
 int
 main (void)
 {
 
-               int flags __attribute__ ((unused));
-               flags = BIO_RW_SYNCIO;
 
   ;
   return 0;
@@ -24600,18 +22894,18 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: struct block_device" >&5
+$as_echo "struct block_device" >&6; }
 
-$as_echo "#define HAVE_BIO_RW_SYNCIO 1" >>confdefs.h
+$as_echo "#define HAVE_BDEV_BLOCK_DEVICE_OPERATIONS 1" >>confdefs.h
 
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: struct inode" >&5
+$as_echo "struct inode" >&6; }
 
 
 
        rm -Rf build
 
 
+       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether REQ_SYNC is defined" >&5
-$as_echo_n "checking whether REQ_SYNC is defined... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether block_device_operations.release is void" >&5
+$as_echo_n "checking whether block_device_operations.release is void... " >&6; }
+       tmp_flags="$EXTRA_KCFLAGS"
+       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/bio.h>
+               #include <linux/blkdev.h>
+
+               void blk_release(struct gendisk *g, fmode_t mode) { return; }
+
+               static const struct block_device_operations
+                   bops __attribute__ ((unused)) = {
+                       .open           = NULL,
+                       .release        = blk_release,
+                       .ioctl          = NULL,
+                       .compat_ioctl   = NULL,
+               };
 
 int
 main (void)
 {
 
-               int flags __attribute__ ((unused));
-               flags = REQ_SYNC;
 
   ;
   return 0;
@@ -24666,18 +22971,18 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: void" >&5
+$as_echo "void" >&6; }
 
-$as_echo "#define HAVE_REQ_SYNC 1" >>confdefs.h
+$as_echo "#define HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID 1" >>confdefs.h
 
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: int" >&5
+$as_echo "int" >&6; }
 
 
 
        rm -Rf build
 
 
+       EXTRA_KCFLAGS="$tmp_flags"
 
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_end_request() is available" >&5
-$as_echo_n "checking whether blk_end_request() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether kernel defines fmode_t" >&5
+$as_echo_n "checking whether kernel defines fmode_t... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/types.h>
 
 int
 main (void)
 {
 
-               struct request *req = NULL;
-               (void) blk_end_request(req, 0, 0);
+               fmode_t *ptr __attribute__ ((unused));
 
   ;
   return 0;
@@ -24737,7 +23039,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_END_REQUEST 1" >>confdefs.h
+$as_echo "#define HAVE_FMODE_T 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_end_request() is GPL-only" >&5
-$as_echo_n "checking whether blk_end_request() is GPL-only... " >&6; }
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether kernel defines KOBJ_NAME_LEN" >&5
+$as_echo_n "checking whether kernel defines KOBJ_NAME_LEN... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/module.h>
-               #include <linux/blkdev.h>
-
-               MODULE_LICENSE("$ZFS_META_LICENSE");
+               #include <linux/kobject.h>
 
 int
 main (void)
 {
 
-               struct request *req = NULL;
-               (void) blk_end_request(req, 0, 0);
+               int val __attribute__ ((unused));
+               val = KOBJ_NAME_LEN;
 
   ;
   return 0;
@@ -24802,18 +23102,18 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_KOBJ_NAME_LEN 1" >>confdefs.h
+
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_BLK_END_REQUEST_GPL_ONLY 1" >>confdefs.h
-
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
 
 
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_flush() is available" >&5
-$as_echo_n "checking whether blk_queue_flush() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blkdev_get() wants 3 args" >&5
+$as_echo_n "checking whether blkdev_get() wants 3 args... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               (void) blk_queue_flush(q, REQ_FLUSH);
+               struct block_device *bdev = NULL;
+               (void) blkdev_get(bdev, 0, NULL);
 
   ;
   return 0;
@@ -24874,7 +23171,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_QUEUE_FLUSH 1" >>confdefs.h
+$as_echo "#define HAVE_3ARG_BLKDEV_GET 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_flush() is GPL-only" >&5
-$as_echo_n "checking whether blk_queue_flush() is GPL-only... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blkdev_get_by_path() is available" >&5
+$as_echo_n "checking whether blkdev_get_by_path() is available... " >&6; }
 
 
-cat confdefs.h - <<_ACEOF >conftest.c
 
+cat confdefs.h - <<_ACEOF >conftest.c
 
-               #include <linux/module.h>
-               #include <linux/blkdev.h>
 
-               MODULE_LICENSE("$ZFS_META_LICENSE");
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               (void) blk_queue_flush(q, REQ_FLUSH);
+               blkdev_get_by_path(NULL, 0, NULL);
 
   ;
   return 0;
@@ -24938,46 +23232,81 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
+  rc=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+ rc=1
+
+
+fi
+       rm -Rf build
+
+
+       if test $rc -ne 0; then :
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+       else
+               if test "x$enable_linux_builtin" != xyes; then
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+       grep -q -E '[[:space:]]blkdev_get_by_path[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in fs/block_dev.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(blkdev_get_by_path)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
 
-$as_echo "#define HAVE_BLK_QUEUE_FLUSH_GPL_ONLY 1" >>confdefs.h
+               fi
+               if test $rc -ne 0; then :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
+               else :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
+$as_echo "#define HAVE_BLKDEV_GET_BY_PATH 1" >>confdefs.h
 
-fi
-       rm -Rf build
 
+               fi
+       fi
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether open_bdev_exclusive() is available" >&5
+$as_echo_n "checking whether open_bdev_exclusive() is available... " >&6; }
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_max_hw_sectors() is available" >&5
-$as_echo_n "checking whether blk_queue_max_hw_sectors() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               (void) blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
+               open_bdev_exclusive(NULL, 0, NULL);
 
   ;
   return 0;
@@ -25007,46 +23336,81 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
+  rc=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+ rc=1
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_QUEUE_MAX_HW_SECTORS 1" >>confdefs.h
+fi
+       rm -Rf build
 
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+       if test $rc -ne 0; then :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+       else
+               if test "x$enable_linux_builtin" != xyes; then
+
+       grep -q -E '[[:space:]]open_bdev_exclusive[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in fs/block_dev.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(open_bdev_exclusive)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
+
+               fi
+               if test $rc -ne 0; then :
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
+               else :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
-fi
-       rm -Rf build
+$as_echo "#define HAVE_OPEN_BDEV_EXCLUSIVE 1" >>confdefs.h
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
+               fi
+       fi
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_max_segments() is available" >&5
-$as_echo_n "checking whether blk_queue_max_segments() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether lookup_bdev() is available" >&5
+$as_echo_n "checking whether lookup_bdev() is available... " >&6; }
+
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               (void) blk_queue_max_segments(q, BLK_MAX_SEGMENTS);
+               lookup_bdev(NULL);
 
   ;
   return 0;
@@ -25076,47 +23440,82 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
+  rc=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+ rc=1
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_QUEUE_MAX_SEGMENTS 1" >>confdefs.h
+fi
+       rm -Rf build
 
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+       if test $rc -ne 0; then :
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
+       else
+               if test "x$enable_linux_builtin" != xyes; then
 
+       grep -q -E '[[:space:]]lookup_bdev[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in fs/block_dev.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(lookup_bdev)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
 
-fi
-       rm -Rf build
+               fi
+               if test $rc -ne 0; then :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
-       EXTRA_KCFLAGS="$tmp_flags"
+               else :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_LOOKUP_BDEV 1" >>confdefs.h
+
+
+               fi
+       fi
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_physical_block_size() is available" >&5
-$as_echo_n "checking whether blk_queue_physical_block_size() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether invalidate_bdev() wants 1 arg" >&5
+$as_echo_n "checking whether invalidate_bdev() wants 1 arg... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/buffer_head.h>
 
 int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               unsigned short block_size = 1;
-               (void) blk_queue_physical_block_size(q, block_size);
+               struct block_device *bdev = NULL;
+               invalidate_bdev(bdev);
 
   ;
   return 0;
@@ -25150,7 +23549,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_QUEUE_PHYSICAL_BLOCK_SIZE 1" >>confdefs.h
+$as_echo "#define HAVE_1ARG_INVALIDATE_BDEV 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_io_opt() is available" >&5
-$as_echo_n "checking whether blk_queue_io_opt() is available... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bdev_logical_block_size() is available" >&5
+$as_echo_n "checking whether bdev_logical_block_size() is available... " >&6; }
        tmp_flags="$EXTRA_KCFLAGS"
        EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
 
@@ -25184,9 +23582,8 @@ int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               unsigned int opt = 1;
-               (void) blk_queue_io_opt(q, opt);
+               struct block_device *bdev = NULL;
+               bdev_logical_block_size(bdev);
 
   ;
   return 0;
@@ -25220,7 +23617,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_QUEUE_IO_OPT 1" >>confdefs.h
+$as_echo "#define HAVE_BDEV_LOGICAL_BLOCK_SIZE 1" >>confdefs.h
 
 
 else
@@ -25239,8 +23636,8 @@ fi
        EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_nonrot() is available" >&5
-$as_echo_n "checking whether blk_queue_nonrot() is available... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bdev_physical_block_size() is available" >&5
+$as_echo_n "checking whether bdev_physical_block_size() is available... " >&6; }
        tmp_flags="$EXTRA_KCFLAGS"
        EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
 
@@ -25254,8 +23651,8 @@ int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               (void) blk_queue_nonrot(q);
+               struct block_device *bdev = NULL;
+               bdev_physical_block_size(bdev);
 
   ;
   return 0;
@@ -25289,7 +23686,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_QUEUE_NONROT 1" >>confdefs.h
+$as_echo "#define HAVE_BDEV_PHYSICAL_BLOCK_SIZE 1" >>confdefs.h
 
 
 else
        EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_discard() is available" >&5
-$as_echo_n "checking whether blk_queue_discard() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bio has bi_iter" >&5
+$as_echo_n "checking whether bio has bi_iter... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/bio.h>
 
 int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               (void) blk_queue_discard(q);
+               struct bio bio;
+               bio.bi_iter.bi_sector = 0;
 
   ;
   return 0;
@@ -25358,7 +23753,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_QUEUE_DISCARD 1" >>confdefs.h
+$as_echo "#define HAVE_BIO_BVEC_ITER 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_fetch_request() is available" >&5
-$as_echo_n "checking whether blk_fetch_request() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether BIO_RW_FAILFAST_* are defined" >&5
+$as_echo_n "checking whether BIO_RW_FAILFAST_* are defined... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/bio.h>
 
 int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               (void) blk_fetch_request(q);
+               int flags __attribute__ ((unused));
+               flags = ((1 << BIO_RW_FAILFAST_DEV) |
+                        (1 << BIO_RW_FAILFAST_TRANSPORT) |
+                        (1 << BIO_RW_FAILFAST_DRIVER));
 
   ;
   return 0;
@@ -25427,7 +23821,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_FETCH_REQUEST 1" >>confdefs.h
+$as_echo "#define HAVE_BIO_RW_FAILFAST_DTD 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_requeue_request() is available" >&5
-$as_echo_n "checking whether blk_requeue_request() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether REQ_FAILFAST_MASK is defined" >&5
+$as_echo_n "checking whether REQ_FAILFAST_MASK is defined... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/bio.h>
 
 int
 main (void)
 {
 
-               struct request_queue *q = NULL;
-               struct request *req = NULL;
-               blk_requeue_request(q, req);
+               int flags __attribute__ ((unused));
+               flags = REQ_FAILFAST_MASK;
 
   ;
   return 0;
@@ -25497,7 +23887,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_REQUEUE_REQUEST 1" >>confdefs.h
+$as_echo "#define HAVE_REQ_FAILFAST_MASK 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_rq_bytes() is available" >&5
-$as_echo_n "checking whether blk_rq_bytes() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bio_end_io_t wants 1 arg" >&5
+$as_echo_n "checking whether bio_end_io_t wants 1 arg... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/bio.h>
+
+               void wanted_end_io(struct bio *bio) { return; }
+
+               bio_end_io_t *end_io __attribute__ ((unused)) = wanted_end_io;
 
 int
 main (void)
 {
 
-               struct request *req = NULL;
-               (void) blk_rq_bytes(req);
 
   ;
   return 0;
@@ -25566,7 +23955,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_RQ_BYTES 1" >>confdefs.h
+$as_echo "#define HAVE_1ARG_BIO_END_IO_T 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_rq_bytes() is GPL-only" >&5
-$as_echo_n "checking whether blk_rq_bytes() is GPL-only... " >&6; }
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether BIO_RW_BARRIER is defined" >&5
+$as_echo_n "checking whether BIO_RW_BARRIER is defined... " >&6; }
 
-cat confdefs.h - <<_ACEOF >conftest.c
 
+cat confdefs.h - <<_ACEOF >conftest.c
 
-               #include <linux/module.h>
-               #include <linux/blkdev.h>
 
-               MODULE_LICENSE("$ZFS_META_LICENSE");
+               #include <linux/bio.h>
 
 int
 main (void)
 {
 
-               struct request *req = NULL;
-               (void) blk_rq_bytes(req);
+               int flags __attribute__ ((unused));
+               flags = BIO_RW_BARRIER;
 
   ;
   return 0;
@@ -25631,18 +24018,18 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_BIO_RW_BARRIER 1" >>confdefs.h
+
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_BLK_RQ_BYTES_GPL_ONLY 1" >>confdefs.h
-
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
 
 
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_rq_pos() is available" >&5
-$as_echo_n "checking whether blk_rq_pos() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether BIO_RW_DISCARD is defined" >&5
+$as_echo_n "checking whether BIO_RW_DISCARD is defined... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/bio.h>
 
 int
 main (void)
 {
 
-               struct request *req = NULL;
-               (void) blk_rq_pos(req);
+               int flags __attribute__ ((unused));
+               flags = BIO_RW_DISCARD;
 
   ;
   return 0;
@@ -25703,7 +24087,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_RQ_POS 1" >>confdefs.h
+$as_echo "#define HAVE_BIO_RW_DISCARD 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_rq_sectors() is available" >&5
-$as_echo_n "checking whether blk_rq_sectors() is available... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_flush() is available" >&5
+$as_echo_n "checking whether blk_queue_flush() is available... " >&6; }
        tmp_flags="$EXTRA_KCFLAGS"
        EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
 
@@ -25737,8 +24120,8 @@ int
 main (void)
 {
 
-               struct request *req = NULL;
-               (void) blk_rq_sectors(req);
+               struct request_queue *q = NULL;
+               (void) blk_queue_flush(q, REQ_FLUSH);
 
   ;
   return 0;
@@ -25772,7 +24155,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_BLK_RQ_SECTORS 1" >>confdefs.h
+$as_echo "#define HAVE_BLK_QUEUE_FLUSH 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
-
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether get_disk_ro() is available" >&5
-$as_echo_n "checking whether get_disk_ro() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_flush() is GPL-only" >&5
+$as_echo_n "checking whether blk_queue_flush() is GPL-only... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
+               #include <linux/module.h>
                #include <linux/blkdev.h>
 
+               MODULE_LICENSE("$ZFS_META_LICENSE");
+
 int
 main (void)
 {
 
-               struct gendisk *disk = NULL;
-               (void) get_disk_ro(disk);
+               struct request_queue *q = NULL;
+               (void) blk_queue_flush(q, REQ_FLUSH);
 
   ;
   return 0;
@@ -25838,18 +24220,18 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_GET_DISK_RO 1" >>confdefs.h
-
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_BLK_QUEUE_FLUSH_GPL_ONLY 1" >>confdefs.h
+
 
 
 
 
        EXTRA_KCFLAGS="$tmp_flags"
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether get_gendisk() is available" >&5
-$as_echo_n "checking whether get_gendisk() is available... " >&6; }
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_max_hw_sectors() is available" >&5
+$as_echo_n "checking whether blk_queue_max_hw_sectors() is available... " >&6; }
+       tmp_flags="$EXTRA_KCFLAGS"
+       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/genhd.h>
+               #include <linux/blkdev.h>
 
 int
 main (void)
 {
 
-               get_gendisk(0, NULL);
+               struct request_queue *q = NULL;
+               (void) blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
 
   ;
   return 0;
@@ -25903,69 +24288,31 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
-  rc=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
-
-
-fi
-       rm -Rf build
-
 
-       if test $rc -ne 0; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-       else
-               if test "x$enable_linux_builtin" != xyes; then
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
-       grep -q -E '[[:space:]]get_gendisk[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in block/genhd.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(get_gendisk)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
+$as_echo "#define HAVE_BLK_QUEUE_MAX_HW_SECTORS 1" >>confdefs.h
 
-               fi
-               if test $rc -ne 0; then :
+
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
-               else :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_GET_GENDISK 1" >>confdefs.h
 
+fi
+       rm -Rf build
 
-               fi
-       fi
 
+       EXTRA_KCFLAGS="$tmp_flags"
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether rq_is_sync() is available" >&5
-$as_echo_n "checking whether rq_is_sync() is available... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether blk_queue_max_segments() is available" >&5
+$as_echo_n "checking whether blk_queue_max_segments() is available... " >&6; }
        tmp_flags="$EXTRA_KCFLAGS"
        EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
 
@@ -25979,8 +24326,8 @@ int
 main (void)
 {
 
-               struct request *req = NULL;
-               (void) rq_is_sync(req);
+               struct request_queue *q = NULL;
+               (void) blk_queue_max_segments(q, BLK_MAX_SEGMENTS);
 
   ;
   return 0;
@@ -26014,7 +24361,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_RQ_IS_SYNC 1" >>confdefs.h
+$as_echo "#define HAVE_BLK_QUEUE_MAX_SEGMENTS 1" >>confdefs.h
 
 
 else
        EXTRA_KCFLAGS="$tmp_flags"
 
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether get_disk_ro() is available" >&5
+$as_echo_n "checking whether get_disk_ro() is available... " >&6; }
        tmp_flags="$EXTRA_KCFLAGS"
        EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether rq_for_each_segment() wants bio_vec *" >&5
-$as_echo_n "checking whether rq_for_each_segment() wants bio_vec *... " >&6; }
-
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
@@ -26049,10 +24395,8 @@ int
 main (void)
 {
 
-               struct bio_vec *bv;
-               struct req_iterator iter;
-               struct request *req = NULL;
-               rq_for_each_segment(bv, req, iter) { }
+               struct gendisk *disk = NULL;
+               (void) get_disk_ro(disk);
 
   ;
   return 0;
@@ -26086,10 +24430,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_RQ_FOR_EACH_SEGMENT 1" >>confdefs.h
-
-
-$as_echo "#define HAVE_RQ_FOR_EACH_SEGMENT_BVP 1" >>confdefs.h
+$as_echo "#define HAVE_GET_DISK_RO 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
+       EXTRA_KCFLAGS="$tmp_flags"
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether get_gendisk() is available" >&5
+$as_echo_n "checking whether get_gendisk() is available... " >&6; }
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether rq_for_each_segment() wants bio_vec" >&5
-$as_echo_n "checking whether rq_for_each_segment() wants bio_vec... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
+               #include <linux/genhd.h>
 
 int
 main (void)
 {
 
-               struct bio_vec bv;
-               struct req_iterator iter;
-               struct request *req = NULL;
-               rq_for_each_segment(bv, req, iter) { }
+               get_gendisk(0, NULL);
 
   ;
   return 0;
@@ -26152,31 +24492,65 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
+  rc=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+ rc=1
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_RQ_FOR_EACH_SEGMENT 1" >>confdefs.h
+fi
+       rm -Rf build
+
 
+       if test $rc -ne 0; then :
 
-$as_echo "#define HAVE_RQ_FOR_EACH_SEGMENT_BV 1" >>confdefs.h
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
+       else
+               if test "x$enable_linux_builtin" != xyes; then
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+       grep -q -E '[[:space:]]get_gendisk[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in block/genhd.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(get_gendisk)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
+
+               fi
+               if test $rc -ne 0; then :
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
+               else :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
-fi
-       rm -Rf build
+$as_echo "#define HAVE_GET_GENDISK 1" >>confdefs.h
 
 
+               fi
+       fi
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
 
        { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ql->discard_granularity is available" >&5
@@ -28951,41 +27325,249 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_COMMIT_METADATA 1" >>confdefs.h
+
+
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+
+
+fi
+       rm -Rf build
+
+
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether clear_inode() is available" >&5
+$as_echo_n "checking whether clear_inode() is available... " >&6; }
+
+
+
+cat confdefs.h - <<_ACEOF >conftest.c
+
+
+               #include <linux/fs.h>
+
+int
+main (void)
+{
+
+               clear_inode(NULL);
+
+  ;
+  return 0;
+}
+
+_ACEOF
+
+
+
+cat - <<_ACEOF >conftest.h
+
+_ACEOF
+
+
+       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
+       echo "obj-m := conftest.o" >build/Makefile
+       modpost_flag=''
+       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
+       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then :
+  rc=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+ rc=1
+
+
+fi
+       rm -Rf build
+
+
+       if test $rc -ne 0; then :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+       else
+               if test "x$enable_linux_builtin" != xyes; then
+
+       grep -q -E '[[:space:]]clear_inode[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in fs/inode.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(clear_inode)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
+
+               fi
+               if test $rc -ne 0; then :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+               else :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_CLEAR_INODE 1" >>confdefs.h
+
+
+               fi
+       fi
+
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether insert_inode_locked() is available" >&5
+$as_echo_n "checking whether insert_inode_locked() is available... " >&6; }
+
+
+
+cat confdefs.h - <<_ACEOF >conftest.c
+
+
+               #include <linux/fs.h>
+
+int
+main (void)
+{
+
+               insert_inode_locked(NULL);
+
+  ;
+  return 0;
+}
+
+_ACEOF
+
+
+
+cat - <<_ACEOF >conftest.h
+
+_ACEOF
+
+
+       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
+       echo "obj-m := conftest.o" >build/Makefile
+       modpost_flag=''
+       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
+       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then :
+  rc=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+ rc=1
+
 
-$as_echo "#define HAVE_COMMIT_METADATA 1" >>confdefs.h
+fi
+       rm -Rf build
 
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+       if test $rc -ne 0; then :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+       else
+               if test "x$enable_linux_builtin" != xyes; then
+
+       grep -q -E '[[:space:]]insert_inode_locked[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in fs/inode.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(insert_inode_locked)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
+
+               fi
+               if test $rc -ne 0; then :
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
+               else :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
+$as_echo "#define HAVE_INSERT_INODE_LOCKED 1" >>confdefs.h
 
-fi
-       rm -Rf build
 
+               fi
+       fi
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether clear_inode() is available" >&5
-$as_echo_n "checking whether clear_inode() is available... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether d_make_root() is available" >&5
+$as_echo_n "checking whether d_make_root() is available... " >&6; }
 
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
+               #include <linux/dcache.h>
 
 int
 main (void)
 {
 
-               clear_inode(NULL);
+               d_make_root(NULL);
 
   ;
   return 0;
@@ -29034,13 +27616,13 @@ $as_echo "no" >&6; }
        else
                if test "x$enable_linux_builtin" != xyes; then
 
-       grep -q -E '[[:space:]]clear_inode[[:space:]]' \
+       grep -q -E '[[:space:]]d_make_root[[:space:]]' \
                $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
        rc=$?
        if test $rc -ne 0; then
                export=0
-               for file in fs/inode.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(clear_inode)" \
+               for file in fs/dcache.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(d_make_root)" \
                                "$LINUX/$file" 2>/dev/null
                        rc=$?
                        if test $rc -eq 0; then
@@ -29068,28 +27650,28 @@ $as_echo "no" >&6; }
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_CLEAR_INODE 1" >>confdefs.h
+$as_echo "#define HAVE_D_MAKE_ROOT 1" >>confdefs.h
 
 
                fi
        fi
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether insert_inode_locked() is available" >&5
-$as_echo_n "checking whether insert_inode_locked() is available... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether d_obtain_alias() is available" >&5
+$as_echo_n "checking whether d_obtain_alias() is available... " >&6; }
 
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
+               #include <linux/dcache.h>
 
 int
 main (void)
 {
 
-               insert_inode_locked(NULL);
+               d_obtain_alias(NULL);
 
   ;
   return 0;
@@ -29138,13 +27720,13 @@ $as_echo "no" >&6; }
        else
                if test "x$enable_linux_builtin" != xyes; then
 
-       grep -q -E '[[:space:]]insert_inode_locked[[:space:]]' \
+       grep -q -E '[[:space:]]d_obtain_alias[[:space:]]' \
                $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
        rc=$?
        if test $rc -ne 0; then
                export=0
-               for file in fs/inode.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(insert_inode_locked)" \
+               for file in fs/dcache.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(d_obtain_alias)" \
                                "$LINUX/$file" 2>/dev/null
                        rc=$?
                        if test $rc -eq 0; then
@@ -29172,15 +27754,15 @@ $as_echo "no" >&6; }
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_INSERT_INODE_LOCKED 1" >>confdefs.h
+$as_echo "#define HAVE_D_OBTAIN_ALIAS 1" >>confdefs.h
 
 
                fi
        fi
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether d_make_root() is available" >&5
-$as_echo_n "checking whether d_make_root() is available... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether d_prune_aliases() is available" >&5
+$as_echo_n "checking whether d_prune_aliases() is available... " >&6; }
 
 
 
@@ -29193,7 +27775,8 @@ int
 main (void)
 {
 
-               d_make_root(NULL);
+               struct inode *ip = NULL;
+               d_prune_aliases(ip);
 
   ;
   return 0;
@@ -29242,13 +27825,13 @@ $as_echo "no" >&6; }
        else
                if test "x$enable_linux_builtin" != xyes; then
 
-       grep -q -E '[[:space:]]d_make_root[[:space:]]' \
+       grep -q -E '[[:space:]]d_prune_aliases[[:space:]]' \
                $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
        rc=$?
        if test $rc -ne 0; then
                export=0
                for file in fs/dcache.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(d_make_root)" \
+                       grep -q -E "EXPORT_SYMBOL.*(d_prune_aliases)" \
                                "$LINUX/$file" 2>/dev/null
                        rc=$?
                        if test $rc -eq 0; then
@@ -29276,15 +27859,15 @@ $as_echo "no" >&6; }
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_D_MAKE_ROOT 1" >>confdefs.h
+$as_echo "#define HAVE_D_PRUNE_ALIASES 1" >>confdefs.h
 
 
                fi
        fi
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether d_obtain_alias() is available" >&5
-$as_echo_n "checking whether d_obtain_alias() is available... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether d_set_d_op() is available" >&5
+$as_echo_n "checking whether d_set_d_op() is available... " >&6; }
 
 
 
@@ -29297,7 +27880,7 @@ int
 main (void)
 {
 
-               d_obtain_alias(NULL);
+               d_set_d_op(NULL, NULL);
 
   ;
   return 0;
@@ -29346,13 +27929,13 @@ $as_echo "no" >&6; }
        else
                if test "x$enable_linux_builtin" != xyes; then
 
-       grep -q -E '[[:space:]]d_obtain_alias[[:space:]]' \
+       grep -q -E '[[:space:]]d_set_d_op[[:space:]]' \
                $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
        rc=$?
        if test $rc -ne 0; then
                export=0
                for file in fs/dcache.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(d_obtain_alias)" \
+                       grep -q -E "EXPORT_SYMBOL.*(d_set_d_op)" \
                                "$LINUX/$file" 2>/dev/null
                        rc=$?
                        if test $rc -eq 0; then
@@ -29369,40 +27952,182 @@ $as_echo "no" >&6; }
                rc=0
        fi
 
-               fi
-               if test $rc -ne 0; then :
+               fi
+               if test $rc -ne 0; then :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+               else :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_D_SET_D_OP 1" >>confdefs.h
+
+
+               fi
+       fi
+
+
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether dops->d_revalidate() takes struct nameidata" >&5
+$as_echo_n "checking whether dops->d_revalidate() takes struct nameidata... " >&6; }
+
+
+cat confdefs.h - <<_ACEOF >conftest.c
+
+
+               #include <linux/dcache.h>
+
+               int revalidate (struct dentry *dentry,
+                   struct nameidata *nidata) { return 0; }
+
+               static const struct dentry_operations
+                   dops __attribute__ ((unused)) = {
+                       .d_revalidate   = revalidate,
+               };
+
+int
+main (void)
+{
+
+
+  ;
+  return 0;
+}
+
+_ACEOF
+
+
+
+cat - <<_ACEOF >conftest.h
+
+_ACEOF
+
+
+       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
+       echo "obj-m := conftest.o" >build/Makefile
+       modpost_flag=''
+       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
+       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_D_REVALIDATE_NAMEIDATA 1" >>confdefs.h
+
+
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+
+
+fi
+       rm -Rf build
+
+
+
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether dentry uses const struct dentry_operations" >&5
+$as_echo_n "checking whether dentry uses const struct dentry_operations... " >&6; }
+
+
+cat confdefs.h - <<_ACEOF >conftest.c
+
+
+               #include <linux/dcache.h>
+
+               const struct dentry_operations test_d_op = {
+                       .d_revalidate = NULL,
+               };
+
+int
+main (void)
+{
+
+               struct dentry d __attribute__ ((unused));
+
+               d.d_op = &test_d_op;
+
+  ;
+  return 0;
+}
+
+_ACEOF
+
+
+
+cat - <<_ACEOF >conftest.h
+
+_ACEOF
+
+
+       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
+       echo "obj-m := conftest.o" >build/Makefile
+       modpost_flag=''
+       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
+       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_CONST_DENTRY_OPERATIONS 1" >>confdefs.h
+
+
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
-               else :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_D_OBTAIN_ALIAS 1" >>confdefs.h
 
+fi
+       rm -Rf build
 
-               fi
-       fi
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether d_prune_aliases() is available" >&5
-$as_echo_n "checking whether d_prune_aliases() is available... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether check_disk_size_change() is available" >&5
+$as_echo_n "checking whether check_disk_size_change() is available... " >&6; }
 
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/dcache.h>
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
-               struct inode *ip = NULL;
-               d_prune_aliases(ip);
+               check_disk_size_change(NULL, NULL);
 
   ;
   return 0;
@@ -29451,13 +28176,13 @@ $as_echo "no" >&6; }
        else
                if test "x$enable_linux_builtin" != xyes; then
 
-       grep -q -E '[[:space:]]d_prune_aliases[[:space:]]' \
+       grep -q -E '[[:space:]]check_disk_size_change[[:space:]]' \
                $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
        rc=$?
        if test $rc -ne 0; then
                export=0
-               for file in fs/dcache.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(d_prune_aliases)" \
+               for file in fs/block_dev.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(check_disk_size_change)" \
                                "$LINUX/$file" 2>/dev/null
                        rc=$?
                        if test $rc -eq 0; then
@@ -29485,28 +28210,28 @@ $as_echo "no" >&6; }
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_D_PRUNE_ALIASES 1" >>confdefs.h
+$as_echo "#define HAVE_CHECK_DISK_SIZE_CHANGE 1" >>confdefs.h
 
 
                fi
        fi
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether d_set_d_op() is available" >&5
-$as_echo_n "checking whether d_set_d_op() is available... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether truncate_setsize() is available" >&5
+$as_echo_n "checking whether truncate_setsize() is available... " >&6; }
 
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/dcache.h>
+               #include <linux/mm.h>
 
 int
 main (void)
 {
 
-               d_set_d_op(NULL, NULL);
+               truncate_setsize(NULL, 0);
 
   ;
   return 0;
@@ -29555,13 +28280,13 @@ $as_echo "no" >&6; }
        else
                if test "x$enable_linux_builtin" != xyes; then
 
-       grep -q -E '[[:space:]]d_set_d_op[[:space:]]' \
+       grep -q -E '[[:space:]]truncate_setsize[[:space:]]' \
                $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
        rc=$?
        if test $rc -ne 0; then
                export=0
-               for file in fs/dcache.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(d_set_d_op)" \
+               for file in mm/truncate.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(truncate_setsize)" \
                                "$LINUX/$file" 2>/dev/null
                        rc=$?
                        if test $rc -eq 0; then
@@ -29589,7 +28314,7 @@ $as_echo "no" >&6; }
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_D_SET_D_OP 1" >>confdefs.h
+$as_echo "#define HAVE_TRUNCATE_SETSIZE 1" >>confdefs.h
 
 
                fi
@@ -29597,27 +28322,27 @@ $as_echo "#define HAVE_D_SET_D_OP 1" >>confdefs.h
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether dops->d_revalidate() takes struct nameidata" >&5
-$as_echo_n "checking whether dops->d_revalidate() takes struct nameidata... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether security_inode_init_security wants 6 args" >&5
+$as_echo_n "checking whether security_inode_init_security wants 6 args... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/dcache.h>
-
-               int revalidate (struct dentry *dentry,
-                   struct nameidata *nidata) { return 0; }
-
-               static const struct dentry_operations
-                   dops __attribute__ ((unused)) = {
-                       .d_revalidate   = revalidate,
-               };
+               #include <linux/security.h>
 
 int
 main (void)
 {
 
+               struct inode *ip __attribute__ ((unused)) = NULL;
+               struct inode *dip __attribute__ ((unused)) = NULL;
+               const struct qstr *str __attribute__ ((unused)) = NULL;
+               char *name __attribute__ ((unused)) = NULL;
+               void *value __attribute__ ((unused)) = NULL;
+               size_t len __attribute__ ((unused)) = 0;
+
+               security_inode_init_security(ip, dip, str, &name, &value, &len);
 
   ;
   return 0;
@@ -29651,7 +28376,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_D_REVALIDATE_NAMEIDATA 1" >>confdefs.h
+$as_echo "#define HAVE_6ARGS_SECURITY_INODE_INIT_SECURITY 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether dentry uses const struct dentry_operations" >&5
-$as_echo_n "checking whether dentry uses const struct dentry_operations... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether security_inode_init_security wants callback" >&5
+$as_echo_n "checking whether security_inode_init_security wants callback... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/dcache.h>
-
-               const struct dentry_operations test_d_op = {
-                       .d_revalidate = NULL,
-               };
+               #include <linux/security.h>
 
 int
 main (void)
 {
 
-               struct dentry d __attribute__ ((unused));
+               struct inode *ip __attribute__ ((unused)) = NULL;
+               struct inode *dip __attribute__ ((unused)) = NULL;
+               const struct qstr *str __attribute__ ((unused)) = NULL;
+               initxattrs func __attribute__ ((unused)) = NULL;
 
-               d.d_op = &test_d_op;
+               security_inode_init_security(ip, dip, str, func, NULL);
 
   ;
   return 0;
@@ -29722,7 +28446,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_CONST_DENTRY_OPERATIONS 1" >>confdefs.h
+$as_echo "#define HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY 1" >>confdefs.h
 
 
 else
@@ -29739,8 +28463,8 @@ fi
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether check_disk_size_change() is available" >&5
-$as_echo_n "checking whether check_disk_size_change() is available... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether mount_nodev() is available" >&5
+$as_echo_n "checking whether mount_nodev() is available... " >&6; }
 
 
 
@@ -29753,7 +28477,7 @@ int
 main (void)
 {
 
-               check_disk_size_change(NULL, NULL);
+               mount_nodev(NULL, 0, NULL, NULL);
 
   ;
   return 0;
@@ -29802,13 +28526,13 @@ $as_echo "no" >&6; }
        else
                if test "x$enable_linux_builtin" != xyes; then
 
-       grep -q -E '[[:space:]]check_disk_size_change[[:space:]]' \
+       grep -q -E '[[:space:]]mount_nodev[[:space:]]' \
                $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
        rc=$?
        if test $rc -ne 0; then
                export=0
-               for file in fs/block_dev.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(check_disk_size_change)" \
+               for file in fs/super.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(mount_nodev)" \
                                "$LINUX/$file" 2>/dev/null
                        rc=$?
                        if test $rc -eq 0; then
@@ -29836,28 +28560,172 @@ $as_echo "no" >&6; }
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_CHECK_DISK_SIZE_CHANGE 1" >>confdefs.h
+$as_echo "#define HAVE_MOUNT_NODEV 1" >>confdefs.h
 
 
                fi
        fi
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether truncate_setsize() is available" >&5
-$as_echo_n "checking whether truncate_setsize() is available... " >&6; }
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether super_block has s_shrink" >&5
+$as_echo_n "checking whether super_block has s_shrink... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/mm.h>
+               #include <linux/fs.h>
+
+               int shrink(struct shrinker *s, struct shrink_control *sc)
+                   { return 0; }
+
+               static const struct super_block
+                   sb __attribute__ ((unused)) = {
+                       .s_shrink.shrink = shrink,
+                       .s_shrink.seeks = DEFAULT_SEEKS,
+                       .s_shrink.batch = 0,
+               };
+
+int
+main (void)
+{
+
+
+  ;
+  return 0;
+}
+
+_ACEOF
+
+
+
+cat - <<_ACEOF >conftest.h
+
+_ACEOF
+
+
+       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
+       echo "obj-m := conftest.o" >build/Makefile
+       modpost_flag=''
+       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
+       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_SHRINK 1" >>confdefs.h
+
+
+
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+
+
+fi
+       rm -Rf build
+
+
+
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether shrink_control has nid" >&5
+$as_echo_n "checking whether shrink_control has nid... " >&6; }
+
+
+cat confdefs.h - <<_ACEOF >conftest.c
+
+
+               #include <linux/fs.h>
+
+int
+main (void)
+{
+
+               struct shrink_control sc __attribute__ ((unused));
+               unsigned long scnidsize __attribute__ ((unused)) =
+                   sizeof(sc.nid);
+
+  ;
+  return 0;
+}
+
+_ACEOF
+
+
+
+cat - <<_ACEOF >conftest.h
+
+_ACEOF
+
+
+       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
+       echo "obj-m := conftest.o" >build/Makefile
+       modpost_flag=''
+       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
+       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define SHRINK_CONTROL_HAS_NID 1" >>confdefs.h
+
+
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+
+
+fi
+       rm -Rf build
+
+
+
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether super_block has s_instances list_head" >&5
+$as_echo_n "checking whether super_block has s_instances list_head... " >&6; }
+
+
+cat confdefs.h - <<_ACEOF >conftest.c
+
+
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
-               truncate_setsize(NULL, 0);
+               struct super_block sb __attribute__ ((unused));
+
+               INIT_LIST_HEAD(&sb.s_instances);
 
   ;
   return 0;
@@ -29887,88 +28755,42 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
-  rc=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
-
 
-fi
-       rm -Rf build
-
-
-       if test $rc -ne 0; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
-       else
-               if test "x$enable_linux_builtin" != xyes; then
+$as_echo "#define HAVE_S_INSTANCES_LIST_HEAD 1" >>confdefs.h
 
-       grep -q -E '[[:space:]]truncate_setsize[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in mm/truncate.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(truncate_setsize)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
 
-               fi
-               if test $rc -ne 0; then :
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
-               else :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_TRUNCATE_SETSIZE 1" >>confdefs.h
 
 
-               fi
-       fi
+fi
+       rm -Rf build
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether security_inode_init_security wants 6 args" >&5
-$as_echo_n "checking whether security_inode_init_security wants 6 args... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether super_block has s_d_op" >&5
+$as_echo_n "checking whether super_block has s_d_op... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/security.h>
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
-               struct inode *ip __attribute__ ((unused)) = NULL;
-               struct inode *dip __attribute__ ((unused)) = NULL;
-               const struct qstr *str __attribute__ ((unused)) = NULL;
-               char *name __attribute__ ((unused)) = NULL;
-               void *value __attribute__ ((unused)) = NULL;
-               size_t len __attribute__ ((unused)) = 0;
-
-               security_inode_init_security(ip, dip, str, &name, &value, &len);
+               struct super_block sb __attribute__ ((unused));
+               sb.s_d_op = NULL;
 
   ;
   return 0;
@@ -30002,7 +28824,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_6ARGS_SECURITY_INODE_INIT_SECURITY 1" >>confdefs.h
+$as_echo "#define HAVE_S_D_OP 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether security_inode_init_security wants callback" >&5
-$as_echo_n "checking whether security_inode_init_security wants callback... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bdi_setup_and_register() wants 2 args" >&5
+$as_echo_n "checking whether bdi_setup_and_register() wants 2 args... " >&6; }
+
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/security.h>
+               #include <linux/backing-dev.h>
+               struct backing_dev_info bdi;
 
 int
 main (void)
 {
 
-               struct inode *ip __attribute__ ((unused)) = NULL;
-               struct inode *dip __attribute__ ((unused)) = NULL;
-               const struct qstr *str __attribute__ ((unused)) = NULL;
-               initxattrs func __attribute__ ((unused)) = NULL;
-
-               security_inode_init_security(ip, dip, str, func, NULL);
+               char *name = "bdi";
+               int error __attribute__((unused)) =
+                   bdi_setup_and_register(&bdi, name);
 
   ;
   return 0;
@@ -30068,42 +28889,40 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY 1" >>confdefs.h
-
-
+  rc=0
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
+ rc=1
 
 
 fi
        rm -Rf build
 
 
+       if test $rc -ne 0; then :
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether mount_nodev() is available" >&5
-$as_echo_n "checking whether mount_nodev() is available... " >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bdi_setup_and_register() wants 3 args" >&5
+$as_echo_n "checking whether bdi_setup_and_register() wants 3 args... " >&6; }
 
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
+                       #include <linux/backing-dev.h>
+                       struct backing_dev_info bdi;
 
 int
 main (void)
 {
 
-               mount_nodev(NULL, 0, NULL, NULL);
+                       char *name = "bdi";
+                       unsigned int cap = BDI_CAP_MAP_COPY;
+                       int error __attribute__((unused)) =
+                           bdi_setup_and_register(&bdi, name, cap);
 
   ;
   return 0;
 
        if test $rc -ne 0; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
        else
                if test "x$enable_linux_builtin" != xyes; then
 
-       grep -q -E '[[:space:]]mount_nodev[[:space:]]' \
+       grep -q -E '[[:space:]]bdi_setup_and_register[[:space:]]' \
                $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
        rc=$?
        if test $rc -ne 0; then
                export=0
-               for file in fs/super.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(mount_nodev)" \
+               for file in mm/backing-dev.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(bdi_setup_and_register)" \
                                "$LINUX/$file" 2>/dev/null
                        rc=$?
                        if test $rc -eq 0; then
@@ -30178,45 +28997,71 @@ $as_echo "no" >&6; }
                fi
                if test $rc -ne 0; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
                else :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_MOUNT_NODEV 1" >>confdefs.h
+$as_echo "#define HAVE_3ARGS_BDI_SETUP_AND_REGISTER 1" >>confdefs.h
 
 
                fi
        fi
 
 
+       else
+               if test "x$enable_linux_builtin" != xyes; then
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether super_block has s_shrink" >&5
-$as_echo_n "checking whether super_block has s_shrink... " >&6; }
+       grep -q -E '[[:space:]]bdi_setup_and_register[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in mm/backing-dev.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(bdi_setup_and_register)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
 
+               fi
+               if test $rc -ne 0; then :
 
-cat confdefs.h - <<_ACEOF >conftest.c
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bdi_setup_and_register() wants 3 args" >&5
+$as_echo_n "checking whether bdi_setup_and_register() wants 3 args... " >&6; }
 
 
-               #include <linux/fs.h>
 
-               int shrink(struct shrinker *s, struct shrink_control *sc)
-                   { return 0; }
+cat confdefs.h - <<_ACEOF >conftest.c
 
-               static const struct super_block
-                   sb __attribute__ ((unused)) = {
-                       .s_shrink.shrink = shrink,
-                       .s_shrink.seeks = DEFAULT_SEEKS,
-                       .s_shrink.batch = 0,
-               };
+
+                       #include <linux/backing-dev.h>
+                       struct backing_dev_info bdi;
 
 int
 main (void)
 {
 
+                       char *name = "bdi";
+                       unsigned int cap = BDI_CAP_MAP_COPY;
+                       int error __attribute__((unused)) =
+                           bdi_setup_and_register(&bdi, name, cap);
 
   ;
   return 0;
@@ -30246,98 +29091,81 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_SHRINK 1" >>confdefs.h
-
-
-
+  rc=0
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
+ rc=1
 
 
 fi
        rm -Rf build
 
 
+       if test $rc -ne 0; then :
 
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether shrink_control has nid" >&5
-$as_echo_n "checking whether shrink_control has nid... " >&6; }
-
-
-cat confdefs.h - <<_ACEOF >conftest.c
-
-
-               #include <linux/fs.h>
-
-int
-main (void)
-{
-
-               struct shrink_control sc __attribute__ ((unused));
-               unsigned long scnidsize __attribute__ ((unused)) =
-                   sizeof(sc.nid);
-
-  ;
-  return 0;
-}
-
-_ACEOF
-
-
+       else
+               if test "x$enable_linux_builtin" != xyes; then
 
-cat - <<_ACEOF >conftest.h
+       grep -q -E '[[:space:]]bdi_setup_and_register[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in mm/backing-dev.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(bdi_setup_and_register)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
 
-_ACEOF
+               fi
+               if test $rc -ne 0; then :
 
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
+               else :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define SHRINK_CONTROL_HAS_NID 1" >>confdefs.h
+$as_echo "#define HAVE_3ARGS_BDI_SETUP_AND_REGISTER 1" >>confdefs.h
 
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+               fi
+       fi
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
 
+               else :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
-fi
-       rm -Rf build
+$as_echo "#define HAVE_2ARGS_BDI_SETUP_AND_REGISTER 1" >>confdefs.h
 
 
+               fi
+       fi
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether super_block has s_instances list_head" >&5
-$as_echo_n "checking whether super_block has s_instances list_head... " >&6; }
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether set_nlink() is available" >&5
+$as_echo_n "checking whether set_nlink() is available... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
@@ -30349,9 +29177,9 @@ int
 main (void)
 {
 
-               struct super_block sb __attribute__ ((unused));
-
-               INIT_LIST_HEAD(&sb.s_instances);
+               struct inode node;
+               unsigned int link = 0;
+               (void) set_nlink(&node, link);
 
   ;
   return 0;
@@ -30385,7 +29213,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_S_INSTANCES_LIST_HEAD 1" >>confdefs.h
+$as_echo "#define HAVE_SET_NLINK 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether super_block has s_d_op" >&5
-$as_echo_n "checking whether super_block has s_d_op... " >&6; }
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether elevator_change() is available" >&5
+$as_echo_n "checking whether elevator_change() is available... " >&6; }
+       tmp_flags="$EXTRA_KCFLAGS"
+       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
+               #include <linux/blkdev.h>
+               #include <linux/elevator.h>
 
 int
 main (void)
 {
 
-               struct super_block sb __attribute__ ((unused));
-               sb.s_d_op = NULL;
+               int ret;
+               struct request_queue *q = NULL;
+               char *elevator = NULL;
+               ret = elevator_change(q, elevator);
 
   ;
   return 0;
@@ -30450,7 +29284,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_S_D_OP 1" >>confdefs.h
+$as_echo "#define HAVE_ELEVATOR_CHANGE 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
+       EXTRA_KCFLAGS="$tmp_flags"
 
-
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bdi_setup_and_register() wants 2 args" >&5
-$as_echo_n "checking whether bdi_setup_and_register() wants 2 args... " >&6; }
-
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether sget() wants 5 args" >&5
+$as_echo_n "checking whether sget() wants 5 args... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/backing-dev.h>
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
-               struct backing_dev_info bdi;
-               char *name = "bdi";
-               int error __attribute__((unused)) =
-                   bdi_setup_and_register(&bdi, name);
+               struct file_system_type *type = NULL;
+               int (*test)(struct super_block *,void *) = NULL;
+               int (*set)(struct super_block *,void *) = NULL;
+               int flags = 0;
+               void *data = NULL;
+               (void) sget(type, test, set, flags, data);
 
   ;
   return 0;
@@ -30515,40 +29350,47 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
-  rc=0
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_5ARG_SGET 1" >>confdefs.h
+
+
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
 
 
 fi
        rm -Rf build
 
 
-       if test $rc -ne 0; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bdi_setup_and_register() wants 3 args" >&5
-$as_echo_n "checking whether bdi_setup_and_register() wants 3 args... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether lseek_execute() is available" >&5
+$as_echo_n "checking whether lseek_execute() is available... " >&6; }
 
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-                       #include <linux/backing-dev.h>
+               #include <linux/fs.h>
 
 int
 main (void)
 {
 
-                       struct backing_dev_info bdi;
-                       char *name = "bdi";
-                       unsigned int cap = BDI_CAP_MAP_COPY;
-                       int error __attribute__((unused)) =
-                           bdi_setup_and_register(&bdi, name, cap);
+               struct file *fp __attribute__ ((unused)) = NULL;
+               struct inode *ip __attribute__ ((unused)) = NULL;
+               loff_t offset __attribute__ ((unused)) = 0;
+               loff_t maxsize __attribute__ ((unused)) = 0;
+
+               lseek_execute(fp, ip, offset, maxsize);
 
   ;
   return 0;
 
        if test $rc -ne 0; then :
 
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
        else
                if test "x$enable_linux_builtin" != xyes; then
 
-       grep -q -E '[[:space:]]bdi_setup_and_register[[:space:]]' \
+       grep -q -E '[[:space:]]lseek_execute[[:space:]]' \
                $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
        rc=$?
        if test $rc -ne 0; then
                export=0
-               for file in mm/backing-dev.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(bdi_setup_and_register)" \
+               for file in fs/read_write.c; do # probe the same symbol the compile test calls
+                       grep -q -E "EXPORT_SYMBOL.*(lseek_execute)" \
                                "$LINUX/$file" 2>/dev/null
                        rc=$?
                        if test $rc -eq 0; then
@@ -30623,71 +29465,42 @@ $as_echo "no" >&6; }
                fi
                if test $rc -ne 0; then :
 
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
                else :
 
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_3ARGS_BDI_SETUP_AND_REGISTER 1" >>confdefs.h
-
-
-               fi
-       fi
-
+$as_echo "#define HAVE_LSEEK_EXECUTE 1" >>confdefs.h
 
-       else
-               if test "x$enable_linux_builtin" != xyes; then
 
-       grep -q -E '[[:space:]]bdi_setup_and_register[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in mm/backing-dev.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(bdi_setup_and_register)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
                fi
-       else :
-               rc=0
        fi
 
-               fi
-               if test $rc -ne 0; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether bdi_setup_and_register() wants 3 args" >&5
-$as_echo_n "checking whether bdi_setup_and_register() wants 3 args... " >&6; }
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->iterate() is available" >&5
+$as_echo_n "checking whether fops->iterate() is available... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-                       #include <linux/backing-dev.h>
+               #include <linux/fs.h>
+               int iterate(struct file *filp, struct dir_context * context)
+                   { return 0; }
+
+               static const struct file_operations fops
+                   __attribute__ ((unused)) = {
+                       .iterate         = iterate,
+               };
 
 int
 main (void)
 {
 
-                       struct backing_dev_info bdi;
-                       char *name = "bdi";
-                       unsigned int cap = BDI_CAP_MAP_COPY;
-                       int error __attribute__((unused)) =
-                           bdi_setup_and_register(&bdi, name, cap);
 
   ;
   return 0;
@@ -30697,115 +29510,60 @@ _ACEOF
 
 
 
-cat - <<_ACEOF >conftest.h
-
-_ACEOF
-
-
-       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
-       echo "obj-m := conftest.o" >build/Makefile
-       modpost_flag=''
-       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
-       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  rc=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
-
-
-fi
-       rm -Rf build
-
-
-       if test $rc -ne 0; then :
-
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-       else
-               if test "x$enable_linux_builtin" != xyes; then
-
-       grep -q -E '[[:space:]]bdi_setup_and_register[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in mm/backing-dev.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(bdi_setup_and_register)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
-
-               fi
-               if test $rc -ne 0; then :
-
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-               else :
-
-                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-$as_echo "#define HAVE_3ARGS_BDI_SETUP_AND_REGISTER 1" >>confdefs.h
-
+cat - <<_ACEOF >conftest.h
 
-               fi
-       fi
+_ACEOF
 
 
-               else :
+       rm -Rf build && mkdir -p build && touch build/conftest.mod.c
+       echo "obj-m := conftest.o" >build/Makefile
+       modpost_flag=''
+       test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage
+       if { ac_try='cp conftest.c conftest.h build && make modules -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; } >/dev/null && { ac_try='test -s build/conftest.o'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then :
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_2ARGS_BDI_SETUP_AND_REGISTER 1" >>confdefs.h
-
+$as_echo "#define HAVE_VFS_ITERATE 1" >>confdefs.h
 
-               fi
-       fi
 
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether set_nlink() is available" >&5
-$as_echo_n "checking whether set_nlink() is available... " >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->readdir() is available" >&5
+$as_echo_n "checking whether fops->readdir() is available... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
+                       #include <linux/fs.h>
+                       int readdir(struct file *filp, void *entry, filldir_t func)
+                           { return 0; }
+
+                       static const struct file_operations fops
+                           __attribute__ ((unused)) = {
+                               .readdir = readdir,
+                       };
 
 int
 main (void)
 {
 
-               struct inode node;
-               unsigned int link = 0;
-               (void) set_nlink(&node, link);
 
   ;
   return 0;
@@ -30836,18 +29594,17 @@ _ACEOF
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_SET_NLINK 1" >>confdefs.h
+$as_echo "#define HAVE_VFS_READDIR 1" >>confdefs.h
 
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+                       as_fn_error $? "no; file a bug report with ZFSOnLinux" "$LINENO" 5
 
 
 
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether elevator_change() is available" >&5
-$as_echo_n "checking whether elevator_change() is available... " >&6; }
-       tmp_flags="$EXTRA_KCFLAGS"
-       EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+
+
+fi
+       rm -Rf build
+
+
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->read/write_iter() are available" >&5
+$as_echo_n "checking whether fops->read/write_iter() are available... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/blkdev.h>
-               #include <linux/elevator.h>
+               #include <linux/fs.h>
+
+               ssize_t test_read(struct kiocb *kiocb, struct iov_iter *to)
+                   { return 0; }
+               ssize_t test_write(struct kiocb *kiocb, struct iov_iter *from)
+                   { return 0; }
+
+               static const struct file_operations
+                   fops __attribute__ ((unused)) = {
+                   .read_iter = test_read,
+                   .write_iter = test_write,
+               };
 
 int
 main (void)
 {
 
-               int ret;
-               struct request_queue *q = NULL;
-               char *elevator = NULL;
-               ret = elevator_change(q, elevator);
 
   ;
   return 0;
@@ -30910,7 +29678,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_ELEVATOR_CHANGE 1" >>confdefs.h
+$as_echo "#define HAVE_VFS_RW_ITERATE 1" >>confdefs.h
 
 
 else
        rm -Rf build
 
 
-       EXTRA_KCFLAGS="$tmp_flags"
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether sget() wants 5 args" >&5
-$as_echo_n "checking whether sget() wants 5 args... " >&6; }
+
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether kmap_atomic wants 1 args" >&5
+$as_echo_n "checking whether kmap_atomic wants 1 args... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
+               #include <linux/pagemap.h>
 
 int
 main (void)
 {
 
-               struct file_system_type *type = NULL;
-               int (*test)(struct super_block *,void *) = NULL;
-               int (*set)(struct super_block *,void *) = NULL;
-               int flags = 0;
-               void *data = NULL;
-               (void) sget(type, test, set, flags, data);
+               struct page page;
+               kmap_atomic(&page);
 
   ;
   return 0;
@@ -30980,7 +29744,7 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_5ARG_SGET 1" >>confdefs.h
+$as_echo "#define HAVE_1ARG_KMAP_ATOMIC 1" >>confdefs.h
 
 
 else
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether lseek_execute() is available" >&5
-$as_echo_n "checking whether lseek_execute() is available... " >&6; }
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether follow_down_one() is available" >&5
+$as_echo_n "checking whether follow_down_one() is available... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
+               #include <linux/namei.h>
 
 int
 main (void)
 {
 
-               struct file *fp __attribute__ ((unused)) = NULL;
-               struct inode *ip __attribute__ ((unused)) = NULL;
-               loff_t offset __attribute__ ((unused)) = 0;
-               loff_t maxsize __attribute__ ((unused)) = 0;
-
-               lseek_execute(fp, ip, offset, maxsize);
+               struct path *p = NULL;
+               follow_down_one(p);
 
   ;
   return 0;
@@ -31046,87 +29806,47 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
-  rc=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
- rc=1
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
 
-fi
-       rm -Rf build
-
-
-       if test $rc -ne 0; then :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-       else
-               if test "x$enable_linux_builtin" != xyes; then
+$as_echo "#define HAVE_FOLLOW_DOWN_ONE 1" >>confdefs.h
 
-       grep -q -E '[[:space:]]lseek_exclusive[[:space:]]' \
-               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
-       rc=$?
-       if test $rc -ne 0; then
-               export=0
-               for file in fs/read_write.c; do
-                       grep -q -E "EXPORT_SYMBOL.*(lseek_exclusive)" \
-                               "$LINUX/$file" 2>/dev/null
-                       rc=$?
-                       if test $rc -eq 0; then
-                               export=1
-                               break;
-                       fi
-               done
-               if test $export -eq 0; then :
-                       rc=1
-               else :
-                       rc=0
-               fi
-       else :
-               rc=0
-       fi
 
-               fi
-               if test $rc -ne 0; then :
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
-               else :
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_LSEEK_EXECUTE 1" >>confdefs.h
 
+fi
+       rm -Rf build
 
-               fi
-       fi
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->iterate() is available" >&5
-$as_echo_n "checking whether fops->iterate() is available... " >&6; }
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether make_request_fn() returns int" >&5
+$as_echo_n "checking whether make_request_fn() returns int... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
-               int iterate(struct file *filp, struct dir_context * context)
-                   { return 0; }
+               #include <linux/blkdev.h>
 
-               static const struct file_operations fops
-                   __attribute__ ((unused)) = {
-                       .iterate         = iterate,
-               };
+               int make_request(struct request_queue *q, struct bio *bio)
+               {
+                       return (0);
+               }
 
 int
 main (void)
 {
 
+               blk_queue_make_request(NULL, &make_request);
 
   ;
   return 0;
@@ -31160,7 +29880,10 @@ _ACEOF
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_VFS_ITERATE 1" >>confdefs.h
+$as_echo "#define MAKE_REQUEST_FN_RET int" >>confdefs.h
+
+
+$as_echo "#define HAVE_MAKE_REQUEST_FN_RET_INT 1" >>confdefs.h
 
 
 else
@@ -31169,27 +29892,25 @@ sed 's/^/| /' conftest.$ac_ext >&5
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
-
-               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->readdir() is available" >&5
-$as_echo_n "checking whether fops->readdir() is available... " >&6; }
+               { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether make_request_fn() returns void" >&5
+$as_echo_n "checking whether make_request_fn() returns void... " >&6; }
 
 
 cat confdefs.h - <<_ACEOF >conftest.c
 
 
-                       #include <linux/fs.h>
-                       int readdir(struct file *filp, void *entry, filldir_t func)
-                           { return 0; }
+                       #include <linux/blkdev.h>
 
-                       static const struct file_operations fops
-                           __attribute__ ((unused)) = {
-                               .readdir = readdir,
-                       };
+                       void make_request(struct request_queue *q, struct bio *bio)
+                       {
+                               return;
+                       }
 
 int
 main (void)
 {
 
+                       blk_queue_make_request(NULL, &make_request);
 
   ;
   return 0;
@@ -31223,14 +29944,15 @@ _ACEOF
                        { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_VFS_READDIR 1" >>confdefs.h
+$as_echo "#define MAKE_REQUEST_FN_RET void" >>confdefs.h
 
 
 else
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-                       as_fn_error $? "no; file a bug report with ZFSOnLinux" "$LINENO" 5
+                       as_fn_error $? "no - Please file a bug report at
+                           https://github.com/zfsonlinux/zfs/issues/new" "$LINENO" 5
 
 
 
 
 
 
-
 fi
        rm -Rf build
 
 
 
-       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether fops->read/write_iter() are available" >&5
-$as_echo_n "checking whether fops->read/write_iter() are available... " >&6; }
 
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether generic IO accounting symbols are available" >&5
+$as_echo_n "checking whether generic IO accounting symbols are available... " >&6; }
 
-cat confdefs.h - <<_ACEOF >conftest.c
 
 
-               #include <linux/fs.h>
+cat confdefs.h - <<_ACEOF >conftest.c
 
-               ssize_t test_read(struct kiocb *kiocb, struct iov_iter *to)
-                   { return 0; }
-               ssize_t test_write(struct kiocb *kiocb, struct iov_iter *from)
-                   { return 0; }
 
-               static const struct file_operations
-                   fops __attribute__ ((unused)) = {
-                   .read_iter = test_read,
-                   .write_iter = test_write,
-               };
+               #include <linux/bio.h>
+
+               void (*generic_start_io_acct_f)(int, unsigned long,
+                   struct hd_struct *) = &generic_start_io_acct;
+               void (*generic_end_io_acct_f)(int, struct hd_struct *,
+                   unsigned long) = &generic_end_io_acct;
 
 int
 main (void)
 {
 
+               generic_start_io_acct(0, 0, NULL);
+               generic_end_io_acct(0, NULL, 0);
 
   ;
   return 0;
@@ -31300,25 +30019,64 @@ _ACEOF
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then :
+  rc=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+ rc=1
 
-               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
 
-$as_echo "#define HAVE_VFS_RW_ITERATE 1" >>confdefs.h
+fi
+       rm -Rf build
 
 
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
+       if test $rc -ne 0; then :
+
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+       else
+               if test "x$enable_linux_builtin" != xyes; then
+
+       grep -q -E '[[:space:]]generic_start_io_acct[[:space:]]' \
+               $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+       rc=$?
+       if test $rc -ne 0; then
+               export=0
+               for file in block/bio.c; do
+                       grep -q -E "EXPORT_SYMBOL.*(generic_start_io_acct)" \
+                               "$LINUX/$file" 2>/dev/null
+                       rc=$?
+                       if test $rc -eq 0; then
+                               export=1
+                               break;
+                       fi
+               done
+               if test $export -eq 0; then :
+                       rc=1
+               else :
+                       rc=0
+               fi
+       else :
+               rc=0
+       fi
+
+               fi
+               if test $rc -ne 0; then :
 
                { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 
+               else :
 
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_GENERIC_IO_ACCT 1" >>confdefs.h
 
-fi
-       rm -Rf build
 
+               fi
+       fi
 
 
 
@@ -31331,6 +30089,7 @@ fi
 
 
                        KERNELCPPFLAGS="$KERNELCPPFLAGS $NO_UNUSED_BUT_SET_VARIABLE"
+       KERNELCPPFLAGS="$KERNELCPPFLAGS $NO_BOOL_COMPARE"
        KERNELCPPFLAGS="$KERNELCPPFLAGS -DHAVE_SPL -D_KERNEL"
        KERNELCPPFLAGS="$KERNELCPPFLAGS -DTEXT_DOMAIN=\\\"zfs-linux-kernel\\\""
 
@@ -31430,7 +30189,7 @@ $as_echo_n "checking whether dmu tx validation is enabled... " >&6; }
 $as_echo "$enable_debug_dmu_tx" >&6; }
 
 
-ac_config_files="$ac_config_files Makefile dracut/Makefile dracut/90zfs/Makefile udev/Makefile udev/rules.d/Makefile etc/Makefile etc/init.d/Makefile etc/zfs/Makefile etc/systemd/Makefile etc/systemd/system/Makefile etc/modules-load.d/Makefile man/Makefile man/man1/Makefile man/man5/Makefile man/man8/Makefile lib/Makefile lib/libspl/Makefile lib/libspl/asm-generic/Makefile lib/libspl/asm-i386/Makefile lib/libspl/asm-x86_64/Makefile lib/libspl/include/Makefile lib/libspl/include/ia32/Makefile lib/libspl/include/ia32/sys/Makefile lib/libspl/include/rpc/Makefile lib/libspl/include/sys/Makefile lib/libspl/include/sys/sysevent/Makefile lib/libspl/include/sys/dktp/Makefile lib/libspl/include/util/Makefile lib/libavl/Makefile lib/libefi/Makefile lib/libnvpair/Makefile lib/libunicode/Makefile lib/libuutil/Makefile lib/libzpool/Makefile lib/libzfs/libzfs.pc lib/libzfs/libzfs_core.pc lib/libzfs/Makefile lib/libzfs_core/Makefile lib/libshare/Makefile cmd/Makefile cmd/zdb/Makefile cmd/zhack/Makefile cmd/zfs/Makefile cmd/zinject/Makefile cmd/zpool/Makefile cmd/zstreamdump/Makefile cmd/ztest/Makefile cmd/zpios/Makefile cmd/mount_zfs/Makefile cmd/fsck_zfs/Makefile cmd/zvol_id/Makefile cmd/vdev_id/Makefile cmd/arcstat/Makefile cmd/dbufstat/Makefile cmd/arc_summary/Makefile cmd/zed/Makefile contrib/Makefile contrib/bash_completion.d/Makefile module/Makefile module/avl/Makefile module/nvpair/Makefile module/unicode/Makefile module/zcommon/Makefile module/zfs/Makefile module/zpios/Makefile include/Makefile include/linux/Makefile include/sys/Makefile include/sys/fs/Makefile include/sys/fm/Makefile include/sys/fm/fs/Makefile scripts/Makefile scripts/zpios-profile/Makefile scripts/zpios-test/Makefile scripts/zpool-config/Makefile scripts/common.sh rpm/Makefile rpm/redhat/Makefile rpm/redhat/zfs.spec rpm/redhat/zfs-kmod.spec rpm/redhat/zfs-dkms.spec rpm/generic/Makefile rpm/generic/zfs.spec rpm/generic/zfs-kmod.spec rpm/generic/zfs-dkms.spec zfs-script-config.sh zfs.release"
+ac_config_files="$ac_config_files Makefile udev/Makefile udev/rules.d/Makefile etc/Makefile etc/init.d/Makefile etc/zfs/Makefile etc/systemd/Makefile etc/systemd/system/Makefile etc/modules-load.d/Makefile man/Makefile man/man1/Makefile man/man5/Makefile man/man8/Makefile lib/Makefile lib/libspl/Makefile lib/libspl/asm-generic/Makefile lib/libspl/asm-i386/Makefile lib/libspl/asm-x86_64/Makefile lib/libspl/include/Makefile lib/libspl/include/ia32/Makefile lib/libspl/include/ia32/sys/Makefile lib/libspl/include/rpc/Makefile lib/libspl/include/sys/Makefile lib/libspl/include/sys/sysevent/Makefile lib/libspl/include/sys/dktp/Makefile lib/libspl/include/util/Makefile lib/libavl/Makefile lib/libefi/Makefile lib/libnvpair/Makefile lib/libunicode/Makefile lib/libuutil/Makefile lib/libzpool/Makefile lib/libzfs/libzfs.pc lib/libzfs/libzfs_core.pc lib/libzfs/Makefile lib/libzfs_core/Makefile lib/libshare/Makefile cmd/Makefile cmd/zdb/Makefile cmd/zhack/Makefile cmd/zfs/Makefile cmd/zinject/Makefile cmd/zpool/Makefile cmd/zstreamdump/Makefile cmd/ztest/Makefile cmd/zpios/Makefile cmd/mount_zfs/Makefile cmd/fsck_zfs/Makefile cmd/zvol_id/Makefile cmd/vdev_id/Makefile cmd/arcstat/Makefile cmd/dbufstat/Makefile cmd/arc_summary/Makefile cmd/zed/Makefile contrib/Makefile contrib/bash_completion.d/Makefile contrib/dracut/Makefile contrib/dracut/90zfs/Makefile contrib/initramfs/Makefile module/Makefile module/avl/Makefile module/nvpair/Makefile module/unicode/Makefile module/zcommon/Makefile module/zfs/Makefile module/zpios/Makefile include/Makefile include/linux/Makefile include/sys/Makefile include/sys/fs/Makefile include/sys/fm/Makefile include/sys/fm/fs/Makefile scripts/Makefile scripts/zpios-profile/Makefile scripts/zpios-test/Makefile scripts/zpool-config/Makefile scripts/common.sh rpm/Makefile rpm/redhat/Makefile rpm/redhat/zfs.spec rpm/redhat/zfs-kmod.spec rpm/redhat/zfs-dkms.spec rpm/generic/Makefile rpm/generic/zfs.spec rpm/generic/zfs-kmod.spec rpm/generic/zfs-dkms.spec zfs-script-config.sh zfs.release"
 
 
 cat >confcache <<\_ACEOF
@@ -31979,7 +30738,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by zfs $as_me 0.6.4.2, which was
+This file was extended by zfs $as_me 0.6.5.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -32045,7 +30804,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-zfs config.status 0.6.4.2
+zfs config.status 0.6.5.3
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
@@ -32455,8 +31214,6 @@ do
     "depfiles") CONFIG_COMMANDS="$CONFIG_COMMANDS depfiles" ;;
     "libtool") CONFIG_COMMANDS="$CONFIG_COMMANDS libtool" ;;
     "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;;
-    "dracut/Makefile") CONFIG_FILES="$CONFIG_FILES dracut/Makefile" ;;
-    "dracut/90zfs/Makefile") CONFIG_FILES="$CONFIG_FILES dracut/90zfs/Makefile" ;;
     "udev/Makefile") CONFIG_FILES="$CONFIG_FILES udev/Makefile" ;;
     "udev/rules.d/Makefile") CONFIG_FILES="$CONFIG_FILES udev/rules.d/Makefile" ;;
     "etc/Makefile") CONFIG_FILES="$CONFIG_FILES etc/Makefile" ;;
@@ -32512,6 +31269,9 @@ do
     "cmd/zed/Makefile") CONFIG_FILES="$CONFIG_FILES cmd/zed/Makefile" ;;
     "contrib/Makefile") CONFIG_FILES="$CONFIG_FILES contrib/Makefile" ;;
     "contrib/bash_completion.d/Makefile") CONFIG_FILES="$CONFIG_FILES contrib/bash_completion.d/Makefile" ;;
+    "contrib/dracut/Makefile") CONFIG_FILES="$CONFIG_FILES contrib/dracut/Makefile" ;;
+    "contrib/dracut/90zfs/Makefile") CONFIG_FILES="$CONFIG_FILES contrib/dracut/90zfs/Makefile" ;;
+    "contrib/initramfs/Makefile") CONFIG_FILES="$CONFIG_FILES contrib/initramfs/Makefile" ;;
     "module/Makefile") CONFIG_FILES="$CONFIG_FILES module/Makefile" ;;
     "module/avl/Makefile") CONFIG_FILES="$CONFIG_FILES module/avl/Makefile" ;;
     "module/nvpair/Makefile") CONFIG_FILES="$CONFIG_FILES module/nvpair/Makefile" ;;
index 63d0073e9a137c1cf60c80c8f5a4603062b80606..9907857e2e12844f78a26a61433f19a5827c2a3c 100644 (file)
@@ -58,8 +58,6 @@ ZFS_AC_DEBUG_DMU_TX
 
 AC_CONFIG_FILES([ 
        Makefile
-       dracut/Makefile
-       dracut/90zfs/Makefile
        udev/Makefile
        udev/rules.d/Makefile
        etc/Makefile
@@ -115,6 +113,9 @@ AC_CONFIG_FILES([
        cmd/zed/Makefile
        contrib/Makefile
        contrib/bash_completion.d/Makefile
+       contrib/dracut/Makefile
+       contrib/dracut/90zfs/Makefile
+       contrib/initramfs/Makefile
        module/Makefile
        module/avl/Makefile
        module/nvpair/Makefile
index 5fe60973ce9fe00a7a03041d4b53fc3b06449244..b05e5c45b3cdc91ad980a8f317ba342b7cdded74 100644 (file)
@@ -1,2 +1,2 @@
-SUBDIRS = bash_completion.d
-DIST_SUBDIRS = bash_completion.d
+SUBDIRS = bash_completion.d dracut initramfs
+DIST_SUBDIRS = bash_completion.d dracut initramfs
index 71628a6f16c848e7e22f228ebb4727fd254a7e3e..a07a353860ae10c4f89f2af366a3692f44725183 100644 (file)
@@ -90,7 +90,7 @@ host_triplet = @host@
 target_triplet = @target@
 subdir = contrib
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps =  \
+am__aclocal_m4_deps = $(top_srcdir)/config/always-no-bool-compare.m4 \
        $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \
        $(top_srcdir)/config/dkms.m4 \
        $(top_srcdir)/config/kernel-acl.m4 \
@@ -102,20 +102,11 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-bio-bvec-iter.m4 \
        $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \
        $(top_srcdir)/config/kernel-bio-failfast.m4 \
-       $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \
-       $(top_srcdir)/config/kernel-blk-end-request.m4 \
-       $(top_srcdir)/config/kernel-blk-fetch-request.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-discard.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-barrier.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-discard.m4 \
        $(top_srcdir)/config/kernel-blk-queue-flush.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \
-       $(top_srcdir)/config/kernel-blk-requeue-request.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-pos.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \
        $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \
        $(top_srcdir)/config/kernel-blkdev-get.m4 \
        $(top_srcdir)/config/kernel-block-device-operations-release-void.m4 \
@@ -123,6 +114,7 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-clear-inode.m4 \
        $(top_srcdir)/config/kernel-commit-metadata.m4 \
        $(top_srcdir)/config/kernel-create-nameidata.m4 \
+       $(top_srcdir)/config/kernel-current_bio_tail.m4 \
        $(top_srcdir)/config/kernel-d-make-root.m4 \
        $(top_srcdir)/config/kernel-d-obtain-alias.m4 \
        $(top_srcdir)/config/kernel-d-prune-aliases.m4 \
@@ -136,23 +128,25 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-fallocate.m4 \
        $(top_srcdir)/config/kernel-file-inode.m4 \
        $(top_srcdir)/config/kernel-fmode-t.m4 \
+       $(top_srcdir)/config/kernel-follow-down-one.m4 \
        $(top_srcdir)/config/kernel-follow-link-nameidata.m4 \
        $(top_srcdir)/config/kernel-fsync.m4 \
+       $(top_srcdir)/config/kernel-generic_io_acct.m4 \
        $(top_srcdir)/config/kernel-get-disk-ro.m4 \
        $(top_srcdir)/config/kernel-get-gendisk.m4 \
        $(top_srcdir)/config/kernel-insert-inode-locked.m4 \
        $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \
        $(top_srcdir)/config/kernel-is_owner_or_cap.m4 \
+       $(top_srcdir)/config/kernel-kmap-atomic-args.m4 \
        $(top_srcdir)/config/kernel-kobj-name-len.m4 \
        $(top_srcdir)/config/kernel-lookup-bdev.m4 \
        $(top_srcdir)/config/kernel-lookup-nameidata.m4 \
        $(top_srcdir)/config/kernel-lseek-execute.m4 \
+       $(top_srcdir)/config/kernel-mk-request-fn.m4 \
        $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \
        $(top_srcdir)/config/kernel-mount-nodev.m4 \
        $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \
        $(top_srcdir)/config/kernel-put-link-nameidata.m4 \
-       $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \
-       $(top_srcdir)/config/kernel-rq-is_sync.m4 \
        $(top_srcdir)/config/kernel-security-inode-init.m4 \
        $(top_srcdir)/config/kernel-set-nlink.m4 \
        $(top_srcdir)/config/kernel-sget-args.m4 \
@@ -292,9 +286,11 @@ DEBUG_CFLAGS = @DEBUG_CFLAGS@
 DEBUG_DMU_TX = @DEBUG_DMU_TX@
 DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@
 DEBUG_ZFS = @DEBUG_ZFS@
+DEFAULT_INITCONF_DIR = @DEFAULT_INITCONF_DIR@
 DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@
 DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@
 DEFAULT_PACKAGE = @DEFAULT_PACKAGE@
+DEFINE_INITRAMFS = @DEFINE_INITRAMFS@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
 DLLTOOL = @DLLTOOL@
@@ -344,6 +340,7 @@ MANIFEST_TOOL = @MANIFEST_TOOL@
 MKDIR_P = @MKDIR_P@
 NM = @NM@
 NMEDIT = @NMEDIT@
+NO_BOOL_COMPARE = @NO_BOOL_COMPARE@
 NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@
 OBJDUMP = @OBJDUMP@
 OBJEXT = @OBJEXT@
@@ -462,8 +459,8 @@ top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 udevdir = @udevdir@
 udevruledir = @udevruledir@
-SUBDIRS = bash_completion.d
-DIST_SUBDIRS = bash_completion.d
+SUBDIRS = bash_completion.d dracut initramfs
+DIST_SUBDIRS = bash_completion.d dracut initramfs
 all: all-recursive
 
 .SUFFIXES:
index 9a47a9044c53479b4acb332d6fbe14eb9fe9e5d8..814f9aa96f3f425c05d3002e801781b4c9d01682 100644 (file)
@@ -91,7 +91,7 @@ host_triplet = @host@
 target_triplet = @target@
 subdir = contrib/bash_completion.d
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps =  \
+am__aclocal_m4_deps = $(top_srcdir)/config/always-no-bool-compare.m4 \
        $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \
        $(top_srcdir)/config/dkms.m4 \
        $(top_srcdir)/config/kernel-acl.m4 \
@@ -103,20 +103,11 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-bio-bvec-iter.m4 \
        $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \
        $(top_srcdir)/config/kernel-bio-failfast.m4 \
-       $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \
-       $(top_srcdir)/config/kernel-blk-end-request.m4 \
-       $(top_srcdir)/config/kernel-blk-fetch-request.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-discard.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-barrier.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-discard.m4 \
        $(top_srcdir)/config/kernel-blk-queue-flush.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \
-       $(top_srcdir)/config/kernel-blk-requeue-request.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-pos.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \
        $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \
        $(top_srcdir)/config/kernel-blkdev-get.m4 \
        $(top_srcdir)/config/kernel-block-device-operations-release-void.m4 \
@@ -124,6 +115,7 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-clear-inode.m4 \
        $(top_srcdir)/config/kernel-commit-metadata.m4 \
        $(top_srcdir)/config/kernel-create-nameidata.m4 \
+       $(top_srcdir)/config/kernel-current_bio_tail.m4 \
        $(top_srcdir)/config/kernel-d-make-root.m4 \
        $(top_srcdir)/config/kernel-d-obtain-alias.m4 \
        $(top_srcdir)/config/kernel-d-prune-aliases.m4 \
@@ -137,23 +129,25 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-fallocate.m4 \
        $(top_srcdir)/config/kernel-file-inode.m4 \
        $(top_srcdir)/config/kernel-fmode-t.m4 \
+       $(top_srcdir)/config/kernel-follow-down-one.m4 \
        $(top_srcdir)/config/kernel-follow-link-nameidata.m4 \
        $(top_srcdir)/config/kernel-fsync.m4 \
+       $(top_srcdir)/config/kernel-generic_io_acct.m4 \
        $(top_srcdir)/config/kernel-get-disk-ro.m4 \
        $(top_srcdir)/config/kernel-get-gendisk.m4 \
        $(top_srcdir)/config/kernel-insert-inode-locked.m4 \
        $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \
        $(top_srcdir)/config/kernel-is_owner_or_cap.m4 \
+       $(top_srcdir)/config/kernel-kmap-atomic-args.m4 \
        $(top_srcdir)/config/kernel-kobj-name-len.m4 \
        $(top_srcdir)/config/kernel-lookup-bdev.m4 \
        $(top_srcdir)/config/kernel-lookup-nameidata.m4 \
        $(top_srcdir)/config/kernel-lseek-execute.m4 \
+       $(top_srcdir)/config/kernel-mk-request-fn.m4 \
        $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \
        $(top_srcdir)/config/kernel-mount-nodev.m4 \
        $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \
        $(top_srcdir)/config/kernel-put-link-nameidata.m4 \
-       $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \
-       $(top_srcdir)/config/kernel-rq-is_sync.m4 \
        $(top_srcdir)/config/kernel-security-inode-init.m4 \
        $(top_srcdir)/config/kernel-set-nlink.m4 \
        $(top_srcdir)/config/kernel-sget-args.m4 \
@@ -235,9 +229,11 @@ DEBUG_CFLAGS = @DEBUG_CFLAGS@
 DEBUG_DMU_TX = @DEBUG_DMU_TX@
 DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@
 DEBUG_ZFS = @DEBUG_ZFS@
+DEFAULT_INITCONF_DIR = @DEFAULT_INITCONF_DIR@
 DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@
 DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@
 DEFAULT_PACKAGE = @DEFAULT_PACKAGE@
+DEFINE_INITRAMFS = @DEFINE_INITRAMFS@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
 DLLTOOL = @DLLTOOL@
@@ -287,6 +283,7 @@ MANIFEST_TOOL = @MANIFEST_TOOL@
 MKDIR_P = @MKDIR_P@
 NM = @NM@
 NMEDIT = @NMEDIT@
+NO_BOOL_COMPARE = @NO_BOOL_COMPARE@
 NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@
 OBJDUMP = @OBJDUMP@
 OBJEXT = @OBJEXT@
diff --git a/zfs/contrib/dracut/90zfs/Makefile.am b/zfs/contrib/dracut/90zfs/Makefile.am
new file mode 100644 (file)
index 0000000..b778a27
--- /dev/null
@@ -0,0 +1,25 @@
+pkgdracutdir = $(dracutdir)/modules.d/90zfs
+pkgdracut_SCRIPTS = \
+       export-zfs.sh \
+       module-setup.sh \
+       mount-zfs.sh \
+       parse-zfs.sh \
+       zfs-lib.sh
+
+EXTRA_DIST = \
+       $(top_srcdir)/contrib/dracut/90zfs/export-zfs.sh.in \
+       $(top_srcdir)/contrib/dracut/90zfs/module-setup.sh.in \
+       $(top_srcdir)/contrib/dracut/90zfs/mount-zfs.sh.in \
+       $(top_srcdir)/contrib/dracut/90zfs/parse-zfs.sh.in \
+       $(top_srcdir)/contrib/dracut/90zfs/zfs-lib.sh.in
+
+$(pkgdracut_SCRIPTS):
+       -$(SED) -e 's,@bindir\@,$(bindir),g' \
+               -e 's,@sbindir\@,$(sbindir),g' \
+               -e 's,@udevdir\@,$(udevdir),g' \
+               -e 's,@udevruledir\@,$(udevruledir),g' \
+               -e 's,@sysconfdir\@,$(sysconfdir),g' \
+               "$(top_srcdir)/contrib/dracut/90zfs/$@.in" >'$@'
+
+distclean-local::
+       -$(RM) $(pkgdracut_SCRIPTS)
diff --git a/zfs/contrib/dracut/90zfs/Makefile.in b/zfs/contrib/dracut/90zfs/Makefile.in
new file mode 100644 (file)
index 0000000..a1f6d5c
--- /dev/null
@@ -0,0 +1,692 @@
+# Makefile.in generated by automake 1.15 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2014 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+VPATH = @srcdir@
+am__is_gnu_make = { \
+  if test -z '$(MAKELEVEL)'; then \
+    false; \
+  elif test -n '$(MAKE_HOST)'; then \
+    true; \
+  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+    true; \
+  else \
+    false; \
+  fi; \
+}
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
+    case $$MAKEFLAGS in \
+      *\\[\ \  ]*) \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs  ]*//g"`;; \
+    esac; \
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+target_triplet = @target@
+subdir = contrib/dracut/90zfs
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/config/always-no-bool-compare.m4 \
+       $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \
+       $(top_srcdir)/config/dkms.m4 \
+       $(top_srcdir)/config/kernel-acl.m4 \
+       $(top_srcdir)/config/kernel-automount.m4 \
+       $(top_srcdir)/config/kernel-bdev-block-device-operations.m4 \
+       $(top_srcdir)/config/kernel-bdev-logical-size.m4 \
+       $(top_srcdir)/config/kernel-bdev-physical-size.m4 \
+       $(top_srcdir)/config/kernel-bdi-setup-and-register.m4 \
+       $(top_srcdir)/config/kernel-bio-bvec-iter.m4 \
+       $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \
+       $(top_srcdir)/config/kernel-bio-failfast.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-barrier.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-discard.m4 \
+       $(top_srcdir)/config/kernel-blk-queue-flush.m4 \
+       $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \
+       $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \
+       $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \
+       $(top_srcdir)/config/kernel-blkdev-get.m4 \
+       $(top_srcdir)/config/kernel-block-device-operations-release-void.m4 \
+       $(top_srcdir)/config/kernel-check-disk-size-change.m4 \
+       $(top_srcdir)/config/kernel-clear-inode.m4 \
+       $(top_srcdir)/config/kernel-commit-metadata.m4 \
+       $(top_srcdir)/config/kernel-create-nameidata.m4 \
+       $(top_srcdir)/config/kernel-current_bio_tail.m4 \
+       $(top_srcdir)/config/kernel-d-make-root.m4 \
+       $(top_srcdir)/config/kernel-d-obtain-alias.m4 \
+       $(top_srcdir)/config/kernel-d-prune-aliases.m4 \
+       $(top_srcdir)/config/kernel-declare-event-class.m4 \
+       $(top_srcdir)/config/kernel-dentry-operations.m4 \
+       $(top_srcdir)/config/kernel-dirty-inode.m4 \
+       $(top_srcdir)/config/kernel-discard-granularity.m4 \
+       $(top_srcdir)/config/kernel-elevator-change.m4 \
+       $(top_srcdir)/config/kernel-encode-fh-inode.m4 \
+       $(top_srcdir)/config/kernel-evict-inode.m4 \
+       $(top_srcdir)/config/kernel-fallocate.m4 \
+       $(top_srcdir)/config/kernel-file-inode.m4 \
+       $(top_srcdir)/config/kernel-fmode-t.m4 \
+       $(top_srcdir)/config/kernel-follow-down-one.m4 \
+       $(top_srcdir)/config/kernel-follow-link-nameidata.m4 \
+       $(top_srcdir)/config/kernel-fsync.m4 \
+       $(top_srcdir)/config/kernel-generic_io_acct.m4 \
+       $(top_srcdir)/config/kernel-get-disk-ro.m4 \
+       $(top_srcdir)/config/kernel-get-gendisk.m4 \
+       $(top_srcdir)/config/kernel-insert-inode-locked.m4 \
+       $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \
+       $(top_srcdir)/config/kernel-is_owner_or_cap.m4 \
+       $(top_srcdir)/config/kernel-kmap-atomic-args.m4 \
+       $(top_srcdir)/config/kernel-kobj-name-len.m4 \
+       $(top_srcdir)/config/kernel-lookup-bdev.m4 \
+       $(top_srcdir)/config/kernel-lookup-nameidata.m4 \
+       $(top_srcdir)/config/kernel-lseek-execute.m4 \
+       $(top_srcdir)/config/kernel-mk-request-fn.m4 \
+       $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \
+       $(top_srcdir)/config/kernel-mount-nodev.m4 \
+       $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \
+       $(top_srcdir)/config/kernel-put-link-nameidata.m4 \
+       $(top_srcdir)/config/kernel-security-inode-init.m4 \
+       $(top_srcdir)/config/kernel-set-nlink.m4 \
+       $(top_srcdir)/config/kernel-sget-args.m4 \
+       $(top_srcdir)/config/kernel-show-options.m4 \
+       $(top_srcdir)/config/kernel-shrink.m4 \
+       $(top_srcdir)/config/kernel-truncate-range.m4 \
+       $(top_srcdir)/config/kernel-truncate-setsize.m4 \
+       $(top_srcdir)/config/kernel-vfs-iterate.m4 \
+       $(top_srcdir)/config/kernel-vfs-rw-iterate.m4 \
+       $(top_srcdir)/config/kernel-xattr-handler.m4 \
+       $(top_srcdir)/config/kernel.m4 $(top_srcdir)/config/libtool.m4 \
+       $(top_srcdir)/config/ltoptions.m4 \
+       $(top_srcdir)/config/ltsugar.m4 \
+       $(top_srcdir)/config/ltversion.m4 \
+       $(top_srcdir)/config/lt~obsolete.m4 \
+       $(top_srcdir)/config/mount-helper.m4 \
+       $(top_srcdir)/config/user-arch.m4 \
+       $(top_srcdir)/config/user-dracut.m4 \
+       $(top_srcdir)/config/user-frame-larger-than.m4 \
+       $(top_srcdir)/config/user-libblkid.m4 \
+       $(top_srcdir)/config/user-libuuid.m4 \
+       $(top_srcdir)/config/user-runstatedir.m4 \
+       $(top_srcdir)/config/user-systemd.m4 \
+       $(top_srcdir)/config/user-sysvinit.m4 \
+       $(top_srcdir)/config/user-udev.m4 \
+       $(top_srcdir)/config/user-zlib.m4 $(top_srcdir)/config/user.m4 \
+       $(top_srcdir)/config/zfs-build.m4 \
+       $(top_srcdir)/config/zfs-meta.m4 $(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+       $(ACLOCAL_M4)
+DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/zfs_config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+    *) f=$$p;; \
+  esac;
+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
+am__install_max = 40
+am__nobase_strip_setup = \
+  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
+am__nobase_strip = \
+  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
+am__nobase_list = $(am__nobase_strip_setup); \
+  for p in $$list; do echo "$$p $$p"; done | \
+  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
+  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
+    if (++n[$$2] == $(am__install_max)) \
+      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
+    END { for (dir in files) print dir, files[dir] }'
+am__base_list = \
+  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
+  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
+am__installdirs = "$(DESTDIR)$(pkgdracutdir)"
+SCRIPTS = $(pkgdracut_SCRIPTS)
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
+SOURCES =
+DIST_SOURCES =
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+am__DIST_COMMON = $(srcdir)/Makefile.in
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALIEN = @ALIEN@
+ALIEN_VERSION = @ALIEN_VERSION@
+AMTAR = @AMTAR@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AR = @AR@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEBUG_CFLAGS = @DEBUG_CFLAGS@
+DEBUG_DMU_TX = @DEBUG_DMU_TX@
+DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@
+DEBUG_ZFS = @DEBUG_ZFS@
+DEFAULT_INITCONF_DIR = @DEFAULT_INITCONF_DIR@
+DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@
+DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@
+DEFAULT_PACKAGE = @DEFAULT_PACKAGE@
+DEFINE_INITRAMFS = @DEFINE_INITRAMFS@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DPKG = @DPKG@
+DPKGBUILD = @DPKGBUILD@
+DPKGBUILD_VERSION = @DPKGBUILD_VERSION@
+DPKG_VERSION = @DPKG_VERSION@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+FGREP = @FGREP@
+FRAME_LARGER_THAN = @FRAME_LARGER_THAN@
+GREP = @GREP@
+HAVE_ALIEN = @HAVE_ALIEN@
+HAVE_DPKG = @HAVE_DPKG@
+HAVE_DPKGBUILD = @HAVE_DPKGBUILD@
+HAVE_RPM = @HAVE_RPM@
+HAVE_RPMBUILD = @HAVE_RPMBUILD@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+KERNELCPPFLAGS = @KERNELCPPFLAGS@
+KERNELMAKE_PARAMS = @KERNELMAKE_PARAMS@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBBLKID = @LIBBLKID@
+LIBOBJS = @LIBOBJS@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIBUUID = @LIBUUID@
+LINUX = @LINUX@
+LINUX_OBJ = @LINUX_OBJ@
+LINUX_SYMBOLS = @LINUX_SYMBOLS@
+LINUX_VERSION = @LINUX_VERSION@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+NM = @NM@
+NMEDIT = @NMEDIT@
+NO_BOOL_COMPARE = @NO_BOOL_COMPARE@
+NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+RANLIB = @RANLIB@
+RELEASE = @RELEASE@
+RPM = @RPM@
+RPMBUILD = @RPMBUILD@
+RPMBUILD_VERSION = @RPMBUILD_VERSION@
+RPM_DEFINE_COMMON = @RPM_DEFINE_COMMON@
+RPM_DEFINE_DKMS = @RPM_DEFINE_DKMS@
+RPM_DEFINE_KMOD = @RPM_DEFINE_KMOD@
+RPM_DEFINE_UTIL = @RPM_DEFINE_UTIL@
+RPM_SPEC_DIR = @RPM_SPEC_DIR@
+RPM_VERSION = @RPM_VERSION@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+SPL = @SPL@
+SPL_OBJ = @SPL_OBJ@
+SPL_SYMBOLS = @SPL_SYMBOLS@
+SPL_VERSION = @SPL_VERSION@
+SRPM_DEFINE_COMMON = @SRPM_DEFINE_COMMON@
+SRPM_DEFINE_DKMS = @SRPM_DEFINE_DKMS@
+SRPM_DEFINE_KMOD = @SRPM_DEFINE_KMOD@
+SRPM_DEFINE_UTIL = @SRPM_DEFINE_UTIL@
+STRIP = @STRIP@
+TARGET_ASM_DIR = @TARGET_ASM_DIR@
+VENDOR = @VENDOR@
+VERSION = @VERSION@
+ZFS_CONFIG = @ZFS_CONFIG@
+ZFS_INIT_SYSTEMD = @ZFS_INIT_SYSTEMD@
+ZFS_INIT_SYSV = @ZFS_INIT_SYSV@
+ZFS_META_ALIAS = @ZFS_META_ALIAS@
+ZFS_META_AUTHOR = @ZFS_META_AUTHOR@
+ZFS_META_DATA = @ZFS_META_DATA@
+ZFS_META_LICENSE = @ZFS_META_LICENSE@
+ZFS_META_LT_AGE = @ZFS_META_LT_AGE@
+ZFS_META_LT_CURRENT = @ZFS_META_LT_CURRENT@
+ZFS_META_LT_REVISION = @ZFS_META_LT_REVISION@
+ZFS_META_NAME = @ZFS_META_NAME@
+ZFS_META_RELEASE = @ZFS_META_RELEASE@
+ZFS_META_VERSION = @ZFS_META_VERSION@
+ZFS_MODULE_LOAD = @ZFS_MODULE_LOAD@
+ZLIB = @ZLIB@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dracutdir = @dracutdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+modulesloaddir = @modulesloaddir@
+mounthelperdir = @mounthelperdir@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+runstatedir = @runstatedir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+systemdpresetdir = @systemdpresetdir@
+systemdunitdir = @systemdunitdir@
+target = @target@
+target_alias = @target_alias@
+target_cpu = @target_cpu@
+target_os = @target_os@
+target_vendor = @target_vendor@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+udevdir = @udevdir@
+udevruledir = @udevruledir@
+pkgdracutdir = $(dracutdir)/modules.d/90zfs
+pkgdracut_SCRIPTS = \
+       export-zfs.sh \
+       module-setup.sh \
+       mount-zfs.sh \
+       parse-zfs.sh \
+       zfs-lib.sh
+
+EXTRA_DIST = \
+       $(top_srcdir)/contrib/dracut/90zfs/export-zfs.sh.in \
+       $(top_srcdir)/contrib/dracut/90zfs/module-setup.sh.in \
+       $(top_srcdir)/contrib/dracut/90zfs/mount-zfs.sh.in \
+       $(top_srcdir)/contrib/dracut/90zfs/parse-zfs.sh.in \
+       $(top_srcdir)/contrib/dracut/90zfs/zfs-lib.sh.in
+
+all: all-am
+
+.SUFFIXES:
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+       @for dep in $?; do \
+         case '$(am__configure_deps)' in \
+           *$$dep*) \
+             ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+               && { if test -f $@; then exit 0; else break; fi; }; \
+             exit 1;; \
+         esac; \
+       done; \
+       echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu contrib/dracut/90zfs/Makefile'; \
+       $(am__cd) $(top_srcdir) && \
+         $(AUTOMAKE) --gnu contrib/dracut/90zfs/Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+       @case '$?' in \
+         *config.status*) \
+           cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+         *) \
+           echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+           cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+       esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+       cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+       cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+       cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+install-pkgdracutSCRIPTS: $(pkgdracut_SCRIPTS)
+       @$(NORMAL_INSTALL)
+       @list='$(pkgdracut_SCRIPTS)'; test -n "$(pkgdracutdir)" || list=; \
+       if test -n "$$list"; then \
+         echo " $(MKDIR_P) '$(DESTDIR)$(pkgdracutdir)'"; \
+         $(MKDIR_P) "$(DESTDIR)$(pkgdracutdir)" || exit 1; \
+       fi; \
+       for p in $$list; do \
+         if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+         if test -f "$$d$$p"; then echo "$$d$$p"; echo "$$p"; else :; fi; \
+       done | \
+       sed -e 'p;s,.*/,,;n' \
+           -e 'h;s|.*|.|' \
+           -e 'p;x;s,.*/,,;$(transform)' | sed 'N;N;N;s,\n, ,g' | \
+       $(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1; } \
+         { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \
+           if ($$2 == $$4) { files[d] = files[d] " " $$1; \
+             if (++n[d] == $(am__install_max)) { \
+               print "f", d, files[d]; n[d] = 0; files[d] = "" } } \
+           else { print "f", d "/" $$4, $$1 } } \
+         END { for (d in files) print "f", d, files[d] }' | \
+       while read type dir files; do \
+            if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \
+            test -z "$$files" || { \
+              echo " $(INSTALL_SCRIPT) $$files '$(DESTDIR)$(pkgdracutdir)$$dir'"; \
+              $(INSTALL_SCRIPT) $$files "$(DESTDIR)$(pkgdracutdir)$$dir" || exit $$?; \
+            } \
+       ; done
+
+uninstall-pkgdracutSCRIPTS:
+       @$(NORMAL_UNINSTALL)
+       @list='$(pkgdracut_SCRIPTS)'; test -n "$(pkgdracutdir)" || exit 0; \
+       files=`for p in $$list; do echo "$$p"; done | \
+              sed -e 's,.*/,,;$(transform)'`; \
+       dir='$(DESTDIR)$(pkgdracutdir)'; $(am__uninstall_files_from_dir)
+
+mostlyclean-libtool:
+       -rm -f *.lo
+
+clean-libtool:
+       -rm -rf .libs _libs
+tags TAGS:
+
+ctags CTAGS:
+
+cscope cscopelist:
+
+
+distdir: $(DISTFILES)
+       @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+       topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+       list='$(DISTFILES)'; \
+         dist_files=`for file in $$list; do echo $$file; done | \
+         sed -e "s|^$$srcdirstrip/||;t" \
+             -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+       case $$dist_files in \
+         */*) $(MKDIR_P) `echo "$$dist_files" | \
+                          sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+                          sort -u` ;; \
+       esac; \
+       for file in $$dist_files; do \
+         if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+         if test -d $$d/$$file; then \
+           dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+           if test -d "$(distdir)/$$file"; then \
+             find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+           fi; \
+           if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+             cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+             find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+           fi; \
+           cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+         else \
+           test -f "$(distdir)/$$file" \
+           || cp -p $$d/$$file "$(distdir)/$$file" \
+           || exit 1; \
+         fi; \
+       done
+check-am: all-am
+check: check-am
+all-am: Makefile $(SCRIPTS)
+installdirs:
+       for dir in "$(DESTDIR)$(pkgdracutdir)"; do \
+         test -z "$$dir" || $(MKDIR_P) "$$dir"; \
+       done
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+       @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+       if test -z '$(STRIP)'; then \
+         $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+           install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+             install; \
+       else \
+         $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+           install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+           "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+       fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+       -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+       -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+       @echo "This command is intended for maintainers to use"
+       @echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-am
+       -rm -f Makefile
+distclean-am: clean-am distclean-generic distclean-local
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am: install-pkgdracutSCRIPTS
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+       -rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-generic mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am: uninstall-pkgdracutSCRIPTS
+
+.MAKE: install-am install-strip
+
+.PHONY: all all-am check check-am clean clean-generic clean-libtool \
+       cscopelist-am ctags-am distclean distclean-generic \
+       distclean-libtool distclean-local distdir dvi dvi-am html \
+       html-am info info-am install install-am install-data \
+       install-data-am install-dvi install-dvi-am install-exec \
+       install-exec-am install-html install-html-am install-info \
+       install-info-am install-man install-pdf install-pdf-am \
+       install-pkgdracutSCRIPTS install-ps install-ps-am \
+       install-strip installcheck installcheck-am installdirs \
+       maintainer-clean maintainer-clean-generic mostlyclean \
+       mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+       tags-am uninstall uninstall-am uninstall-pkgdracutSCRIPTS
+
+.PRECIOUS: Makefile
+
+
+# Generate each dracut script from its .in template under
+# $(top_srcdir)/contrib/dracut/90zfs by rewriting the bindir, sbindir,
+# udevdir, udevruledir and sysconfdir placeholders to the configured
+# directories.  The leading '-' tells make to carry on if sed fails.
+$(pkgdracut_SCRIPTS):
+       -$(SED) -e 's,@bindir\@,$(bindir),g' \
+               -e 's,@sbindir\@,$(sbindir),g' \
+               -e 's,@udevdir\@,$(udevdir),g' \
+               -e 's,@udevruledir\@,$(udevruledir),g' \
+               -e 's,@sysconfdir\@,$(sysconfdir),g' \
+               "$(top_srcdir)/contrib/dracut/90zfs/$@.in" >'$@'
+
+distclean-local::
+       -$(RM) $(pkgdracut_SCRIPTS)
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/zfs/contrib/dracut/90zfs/export-zfs.sh.in b/zfs/contrib/dracut/90zfs/export-zfs.sh.in
new file mode 100755 (executable)
index 0000000..393753f
--- /dev/null
@@ -0,0 +1,29 @@
+#!/bin/sh
+
+. /lib/dracut-zfs-lib.sh
+
+_do_zpool_export() {
+       local ret=0
+       local final="${1}"
+       local opts=""
+
+       if [ "x${final}" != "x" ]; then
+               opts="-f"
+       fi
+
+       info "Exporting ZFS storage pools."
+       export_all ${opts} || ret=$?
+
+       if [ "x${final}" != "x" ]; then
+               info "zpool list"
+               zpool list 2>&1 | vinfo
+       fi
+
+       return ${ret}
+}
+
+if command -v zpool >/dev/null; then
+       _do_zpool_export "${1}"
+else
+       :
+fi
diff --git a/zfs/contrib/dracut/90zfs/module-setup.sh.in b/zfs/contrib/dracut/90zfs/module-setup.sh.in
new file mode 100755 (executable)
index 0000000..9eb9f57
--- /dev/null
@@ -0,0 +1,61 @@
+#!/bin/sh
+
+check() {
+       # We depend on udev-rules being loaded
+       [ "${1}" = "-d" ] && return 0
+
+       # Verify the zfs tool chain
+       which zpool >/dev/null 2>&1 || return 1
+       which zfs >/dev/null 2>&1 || return 1
+
+       return 0
+}
+
+depends() {
+       echo udev-rules
+       return 0
+}
+
+installkernel() {
+       instmods zfs
+       instmods zcommon
+       instmods znvpair
+       instmods zavl
+       instmods zunicode
+       instmods spl
+       instmods zlib_deflate
+       instmods zlib_inflate
+}
+
+install() {
+       inst_rules @udevruledir@/90-zfs.rules
+       inst_rules @udevruledir@/69-vdev.rules
+       inst_rules @udevruledir@/60-zvol.rules
+       dracut_install @sbindir@/zfs
+       dracut_install @sbindir@/zpool
+       dracut_install @udevdir@/vdev_id
+       dracut_install @udevdir@/zvol_id
+       dracut_install mount.zfs
+       dracut_install hostid
+       dracut_install awk
+       dracut_install head
+       inst_hook cmdline 95 "${moddir}/parse-zfs.sh"
+       inst_hook mount 98 "${moddir}/mount-zfs.sh"
+       inst_hook shutdown 30 "${moddir}/export-zfs.sh"
+
+       inst_simple "${moddir}/zfs-lib.sh" "/lib/dracut-zfs-lib.sh"
+       if [ -e @sysconfdir@/zfs/zpool.cache ]; then
+               inst @sysconfdir@/zfs/zpool.cache
+       fi
+
+       if [ -e @sysconfdir@/zfs/vdev_id.conf ]; then
+               inst @sysconfdir@/zfs/vdev_id.conf
+       fi
+
+       # Synchronize initramfs and system hostid
+       AA=`hostid | cut -b 1,2`
+       BB=`hostid | cut -b 3,4`
+       CC=`hostid | cut -b 5,6`
+       DD=`hostid | cut -b 7,8`
+       printf "\x${DD}\x${CC}\x${BB}\x${AA}" > "${initdir}/etc/hostid"
+}
diff --git a/zfs/contrib/dracut/90zfs/mount-zfs.sh.in b/zfs/contrib/dracut/90zfs/mount-zfs.sh.in
new file mode 100755 (executable)
index 0000000..2abc876
--- /dev/null
@@ -0,0 +1,44 @@
+#!/bin/sh
+
+. /lib/dracut-zfs-lib.sh
+
+# Resolve the dataset named by ${root} (or autodetect it) and mount it
+# using the import_pool / mount_dataset helpers.
+ZFS_DATASET=""
+ZFS_POOL=""
+
+# Nothing to do unless root has the form "zfs:...".
+# NOTE(review): the top-level "return" implies this file is sourced rather
+# than executed -- confirm against how the hook is run.
+case "${root}" in
+       zfs:*) ;;
+       *) return ;;
+esac
+
+# Delay until all required block devices are present.
+udevadm settle
+
+if [ "${root}" = "zfs:AUTO" ] ; then
+       # First look for a bootfs value among the pools zpool already lists.
+       # $? below is the exit status of find_bootfs, carried through the
+       # assignment.
+       ZFS_DATASET="$(find_bootfs)"
+       if [ $? -ne 0 ] ; then
+               # Nothing found: run "zpool import -N -a" and look again.
+               zpool import -N -a ${ZPOOL_IMPORT_OPTS}
+               ZFS_DATASET="$(find_bootfs)"
+               if [ $? -ne 0 ] ; then
+                       warn "ZFS: No bootfs attribute found in importable pools."
+                       # Export everything again, retrying with -f if the
+                       # plain export reports a failure.
+                       export_all || export_all "-f"
+
+                       rootok=0
+                       return 1
+               fi
+       fi
+       info "ZFS: Using ${ZFS_DATASET} as root."
+fi
+
+# Use the autodetected dataset if there is one, else ${root} without its
+# "zfs:" prefix.  The pool name is everything before the first "/".
+ZFS_DATASET="${ZFS_DATASET:-${root#zfs:}}"
+ZFS_POOL="${ZFS_DATASET%%/*}"
+
+if import_pool "${ZFS_POOL}" ; then
+       info "ZFS: Mounting dataset ${ZFS_DATASET}..."
+       if mount_dataset "${ZFS_DATASET}" ; then
+               ROOTFS_MOUNTED=yes
+               return 0
+       fi
+fi
+
+# Reached only when the import or the mount failed.
+rootok=0
+need_shutdown
diff --git a/zfs/contrib/dracut/90zfs/parse-zfs.sh.in b/zfs/contrib/dracut/90zfs/parse-zfs.sh.in
new file mode 100755 (executable)
index 0000000..c305c78
--- /dev/null
@@ -0,0 +1,59 @@
+#!/bin/sh
+
+. /lib/dracut-lib.sh
+
+# Let the command line override our host id.
+spl_hostid=`getarg spl_hostid=`
+if [ -n "${spl_hostid}" ] ; then
+       info "ZFS: Using hostid from command line: ${spl_hostid}"
+       AA=`echo ${spl_hostid} | cut -b 1,2`
+       BB=`echo ${spl_hostid} | cut -b 3,4`
+       CC=`echo ${spl_hostid} | cut -b 5,6`
+       DD=`echo ${spl_hostid} | cut -b 7,8`
+       printf "\x${DD}\x${CC}\x${BB}\x${AA}" >/etc/hostid
+elif [ -f "/etc/hostid" ] ; then
+       info "ZFS: Using hostid from /etc/hostid: `hostid`"
+else
+       warn "ZFS: No hostid found on kernel command line or /etc/hostid."
+       warn "ZFS: Pools may not import correctly."
+fi
+
+# Normalise ${root} to "zfs:AUTO" or "zfs:<dataset>" and record, in
+# wait_for_zfs, whether a ZFS root was recognised.  Any other value of
+# ${root} is left untouched.
+wait_for_zfs=0
+case "${root}" in
+       ""|zfs|zfs:)
+               # We'll take root unset, root=zfs, or root=zfs:
+               # No root set, so we want to read the bootfs attribute.  We
+               # can't do that until udev settles so we'll set dummy values
+               # and hope for the best later on.
+               root="zfs:AUTO"
+               rootok=1
+               wait_for_zfs=1
+
+               info "ZFS: Enabling autodetection of bootfs after udev settles."
+               ;;
+
+       ZFS\=*|zfs:*|zfs:FILESYSTEM\=*|FILESYSTEM\=*)
+               # root is explicit ZFS root.  Parse it now.  We can handle
+               # a root=... param in any of the following formats:
+               # root=ZFS=rpool/ROOT
+               # root=zfs:rpool/ROOT
+               # root=zfs:FILESYSTEM=rpool/ROOT
+               # root=FILESYSTEM=rpool/ROOT
+
+               # Strip down to just the pool/fs, then put a single "zfs:"
+               # prefix back on.
+               root="${root#zfs:}"
+               root="${root#FILESYSTEM=}"
+               root="zfs:${root#ZFS=}"
+               rootok=1
+               wait_for_zfs=1
+
+               info "ZFS: Set ${root} as bootfs."
+               ;;
+esac
+
+# Make sure Dracut is happy that we have a root and will wait for ZFS
+# modules to settle before mounting.
+if [ ${wait_for_zfs} -eq 1 ]; then
+       # Placeholder /dev/root; errors (e.g. link already exists) are hidden.
+       ln -s /dev/null /dev/root 2>/dev/null
+       # One-line check script that succeeds once /dev/zfs exists.
+       echo '[ -e /dev/zfs ]' > "${hookdir}/initqueue/finished/zfs.sh"
+fi
diff --git a/zfs/contrib/dracut/90zfs/zfs-lib.sh.in b/zfs/contrib/dracut/90zfs/zfs-lib.sh.in
new file mode 100755 (executable)
index 0000000..1c223be
--- /dev/null
@@ -0,0 +1,87 @@
+#!/bin/sh
+
+command -v getarg >/dev/null || . /lib/dracut-lib.sh
+
+OLDIFS="${IFS}"
+NEWLINE="
+"
+
+ZPOOL_IMPORT_OPTS=""
+if getargbool 0 zfs_force -y zfs.force -y zfsforce ; then
+       warn "ZFS: Will force-import pools if necessary."
+       ZPOOL_IMPORT_OPTS="${ZPOOL_IMPORT_OPTS} -f"
+fi
+
+# find_bootfs
+#   returns the first dataset with the bootfs attribute.
+find_bootfs() {
+       IFS="${NEWLINE}"
+       for dataset in $(zpool list -H -o bootfs); do
+               case "${dataset}" in
+                       "" | "-")
+                               continue
+                               ;;
+                       "no pools available")
+                               IFS="${OLDIFS}"
+                               return 1
+                               ;;
+                       *)
+                               IFS="${OLDIFS}"
+                               echo "${dataset}"
+                               return 0
+                               ;;
+               esac
+       done
+
+       IFS="${OLDIFS}"
+       return 1
+}
+
+# import_pool POOL
+#   imports the given zfs pool if it isn't imported already.
+#   Returns 0 when the pool is already listed or the import succeeds,
+#   1 when the import fails.
+import_pool() {
+       local pool="${1}"
+
+       # Redirect stdout first and then stderr, so that both streams of the
+       # probe are discarded ("2>&1 > /dev/null" would leave stderr visible).
+       if ! zpool list -H "${pool}" > /dev/null 2>&1 ; then
+               info "ZFS: Importing pool ${pool}..."
+               if ! zpool import -N ${ZPOOL_IMPORT_OPTS} "${pool}" ; then
+                       warn "ZFS: Unable to import pool ${pool}"
+                       return 1
+               fi
+       fi
+
+       return 0
+}
+
+# mount_dataset DATASET
+#   mounts the given zfs dataset.
+mount_dataset() {
+       local dataset="${1}"
+       local mountpoint="$(zfs get -H -o value mountpoint "${dataset}")"
+
+       # We need zfsutil for non-legacy mounts and not for legacy mounts.
+       if [ "${mountpoint}" = "legacy" ] ; then
+               mount -t zfs "${dataset}" "${NEWROOT}"
+       else
+               mount -o zfsutil -t zfs "${dataset}" "${NEWROOT}"
+       fi
+
+       return $?
+}
+
+# export_all OPTS
+#   exports all imported zfs pools.
+export_all() {
+       local opts="${1}"
+       local ret=0
+
+       IFS="${NEWLINE}"
+       for pool in `zpool list -H -o name` ; do
+               if zpool list -H "${pool}" 2>&1 > /dev/null ; then
+                       zpool export "${pool}" ${opts} || ret=$?
+               fi
+       done
+       IFS="${OLDIFS}"
+
+       return ${ret}
+}
diff --git a/zfs/contrib/dracut/Makefile.am b/zfs/contrib/dracut/Makefile.am
new file mode 100644 (file)
index 0000000..35b88c3
--- /dev/null
@@ -0,0 +1,3 @@
+SUBDIRS = 90zfs
+
+EXTRA_DIST = README.dracut.markdown
diff --git a/zfs/contrib/dracut/Makefile.in b/zfs/contrib/dracut/Makefile.in
new file mode 100644 (file)
index 0000000..7a1a9a1
--- /dev/null
@@ -0,0 +1,779 @@
+# Makefile.in generated by automake 1.15 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2014 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+VPATH = @srcdir@
+am__is_gnu_make = { \
+  if test -z '$(MAKELEVEL)'; then \
+    false; \
+  elif test -n '$(MAKE_HOST)'; then \
+    true; \
+  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+    true; \
+  else \
+    false; \
+  fi; \
+}
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
+    case $$MAKEFLAGS in \
+      *\\[\ \  ]*) \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs  ]*//g"`;; \
+    esac; \
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+target_triplet = @target@
+subdir = contrib/dracut
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/config/always-no-bool-compare.m4 \
+       $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \
+       $(top_srcdir)/config/dkms.m4 \
+       $(top_srcdir)/config/kernel-acl.m4 \
+       $(top_srcdir)/config/kernel-automount.m4 \
+       $(top_srcdir)/config/kernel-bdev-block-device-operations.m4 \
+       $(top_srcdir)/config/kernel-bdev-logical-size.m4 \
+       $(top_srcdir)/config/kernel-bdev-physical-size.m4 \
+       $(top_srcdir)/config/kernel-bdi-setup-and-register.m4 \
+       $(top_srcdir)/config/kernel-bio-bvec-iter.m4 \
+       $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \
+       $(top_srcdir)/config/kernel-bio-failfast.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-barrier.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-discard.m4 \
+       $(top_srcdir)/config/kernel-blk-queue-flush.m4 \
+       $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \
+       $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \
+       $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \
+       $(top_srcdir)/config/kernel-blkdev-get.m4 \
+       $(top_srcdir)/config/kernel-block-device-operations-release-void.m4 \
+       $(top_srcdir)/config/kernel-check-disk-size-change.m4 \
+       $(top_srcdir)/config/kernel-clear-inode.m4 \
+       $(top_srcdir)/config/kernel-commit-metadata.m4 \
+       $(top_srcdir)/config/kernel-create-nameidata.m4 \
+       $(top_srcdir)/config/kernel-current_bio_tail.m4 \
+       $(top_srcdir)/config/kernel-d-make-root.m4 \
+       $(top_srcdir)/config/kernel-d-obtain-alias.m4 \
+       $(top_srcdir)/config/kernel-d-prune-aliases.m4 \
+       $(top_srcdir)/config/kernel-declare-event-class.m4 \
+       $(top_srcdir)/config/kernel-dentry-operations.m4 \
+       $(top_srcdir)/config/kernel-dirty-inode.m4 \
+       $(top_srcdir)/config/kernel-discard-granularity.m4 \
+       $(top_srcdir)/config/kernel-elevator-change.m4 \
+       $(top_srcdir)/config/kernel-encode-fh-inode.m4 \
+       $(top_srcdir)/config/kernel-evict-inode.m4 \
+       $(top_srcdir)/config/kernel-fallocate.m4 \
+       $(top_srcdir)/config/kernel-file-inode.m4 \
+       $(top_srcdir)/config/kernel-fmode-t.m4 \
+       $(top_srcdir)/config/kernel-follow-down-one.m4 \
+       $(top_srcdir)/config/kernel-follow-link-nameidata.m4 \
+       $(top_srcdir)/config/kernel-fsync.m4 \
+       $(top_srcdir)/config/kernel-generic_io_acct.m4 \
+       $(top_srcdir)/config/kernel-get-disk-ro.m4 \
+       $(top_srcdir)/config/kernel-get-gendisk.m4 \
+       $(top_srcdir)/config/kernel-insert-inode-locked.m4 \
+       $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \
+       $(top_srcdir)/config/kernel-is_owner_or_cap.m4 \
+       $(top_srcdir)/config/kernel-kmap-atomic-args.m4 \
+       $(top_srcdir)/config/kernel-kobj-name-len.m4 \
+       $(top_srcdir)/config/kernel-lookup-bdev.m4 \
+       $(top_srcdir)/config/kernel-lookup-nameidata.m4 \
+       $(top_srcdir)/config/kernel-lseek-execute.m4 \
+       $(top_srcdir)/config/kernel-mk-request-fn.m4 \
+       $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \
+       $(top_srcdir)/config/kernel-mount-nodev.m4 \
+       $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \
+       $(top_srcdir)/config/kernel-put-link-nameidata.m4 \
+       $(top_srcdir)/config/kernel-security-inode-init.m4 \
+       $(top_srcdir)/config/kernel-set-nlink.m4 \
+       $(top_srcdir)/config/kernel-sget-args.m4 \
+       $(top_srcdir)/config/kernel-show-options.m4 \
+       $(top_srcdir)/config/kernel-shrink.m4 \
+       $(top_srcdir)/config/kernel-truncate-range.m4 \
+       $(top_srcdir)/config/kernel-truncate-setsize.m4 \
+       $(top_srcdir)/config/kernel-vfs-iterate.m4 \
+       $(top_srcdir)/config/kernel-vfs-rw-iterate.m4 \
+       $(top_srcdir)/config/kernel-xattr-handler.m4 \
+       $(top_srcdir)/config/kernel.m4 $(top_srcdir)/config/libtool.m4 \
+       $(top_srcdir)/config/ltoptions.m4 \
+       $(top_srcdir)/config/ltsugar.m4 \
+       $(top_srcdir)/config/ltversion.m4 \
+       $(top_srcdir)/config/lt~obsolete.m4 \
+       $(top_srcdir)/config/mount-helper.m4 \
+       $(top_srcdir)/config/user-arch.m4 \
+       $(top_srcdir)/config/user-dracut.m4 \
+       $(top_srcdir)/config/user-frame-larger-than.m4 \
+       $(top_srcdir)/config/user-libblkid.m4 \
+       $(top_srcdir)/config/user-libuuid.m4 \
+       $(top_srcdir)/config/user-runstatedir.m4 \
+       $(top_srcdir)/config/user-systemd.m4 \
+       $(top_srcdir)/config/user-sysvinit.m4 \
+       $(top_srcdir)/config/user-udev.m4 \
+       $(top_srcdir)/config/user-zlib.m4 $(top_srcdir)/config/user.m4 \
+       $(top_srcdir)/config/zfs-build.m4 \
+       $(top_srcdir)/config/zfs-meta.m4 $(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+       $(ACLOCAL_M4)
+DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/zfs_config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
+SOURCES =
+DIST_SOURCES =
+RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
+       ctags-recursive dvi-recursive html-recursive info-recursive \
+       install-data-recursive install-dvi-recursive \
+       install-exec-recursive install-html-recursive \
+       install-info-recursive install-pdf-recursive \
+       install-ps-recursive install-recursive installcheck-recursive \
+       installdirs-recursive pdf-recursive ps-recursive \
+       tags-recursive uninstall-recursive
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive        \
+  distclean-recursive maintainer-clean-recursive
+am__recursive_targets = \
+  $(RECURSIVE_TARGETS) \
+  $(RECURSIVE_CLEAN_TARGETS) \
+  $(am__extra_recursive_targets)
+AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
+       distdir
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+# Read a list of newline-separated strings from the standard input,
+# and print each of them once, without duplicates.  Input order is
+# *not* preserved.
+am__uniquify_input = $(AWK) '\
+  BEGIN { nonempty = 0; } \
+  { items[$$0] = 1; nonempty = 1; } \
+  END { if (nonempty) { for (i in items) print i; }; } \
+'
+# Make sure the list of sources is unique.  This is necessary because,
+# e.g., the same source file might be shared among _SOURCES variables
+# for different programs/libraries.
+am__define_uniq_tagged_files = \
+  list='$(am__tagged_files)'; \
+  unique=`for i in $$list; do \
+    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+  done | $(am__uniquify_input)`
+ETAGS = etags
+CTAGS = ctags
+DIST_SUBDIRS = $(SUBDIRS)
+am__DIST_COMMON = $(srcdir)/Makefile.in
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+am__relativize = \
+  dir0=`pwd`; \
+  sed_first='s,^\([^/]*\)/.*$$,\1,'; \
+  sed_rest='s,^[^/]*/*,,'; \
+  sed_last='s,^.*/\([^/]*\)$$,\1,'; \
+  sed_butlast='s,/*[^/]*$$,,'; \
+  while test -n "$$dir1"; do \
+    first=`echo "$$dir1" | sed -e "$$sed_first"`; \
+    if test "$$first" != "."; then \
+      if test "$$first" = ".."; then \
+        dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
+        dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
+      else \
+        first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
+        if test "$$first2" = "$$first"; then \
+          dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
+        else \
+          dir2="../$$dir2"; \
+        fi; \
+        dir0="$$dir0"/"$$first"; \
+      fi; \
+    fi; \
+    dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
+  done; \
+  reldir="$$dir2"
+ACLOCAL = @ACLOCAL@
+ALIEN = @ALIEN@
+ALIEN_VERSION = @ALIEN_VERSION@
+AMTAR = @AMTAR@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AR = @AR@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEBUG_CFLAGS = @DEBUG_CFLAGS@
+DEBUG_DMU_TX = @DEBUG_DMU_TX@
+DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@
+DEBUG_ZFS = @DEBUG_ZFS@
+DEFAULT_INITCONF_DIR = @DEFAULT_INITCONF_DIR@
+DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@
+DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@
+DEFAULT_PACKAGE = @DEFAULT_PACKAGE@
+DEFINE_INITRAMFS = @DEFINE_INITRAMFS@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DPKG = @DPKG@
+DPKGBUILD = @DPKGBUILD@
+DPKGBUILD_VERSION = @DPKGBUILD_VERSION@
+DPKG_VERSION = @DPKG_VERSION@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+FGREP = @FGREP@
+FRAME_LARGER_THAN = @FRAME_LARGER_THAN@
+GREP = @GREP@
+HAVE_ALIEN = @HAVE_ALIEN@
+HAVE_DPKG = @HAVE_DPKG@
+HAVE_DPKGBUILD = @HAVE_DPKGBUILD@
+HAVE_RPM = @HAVE_RPM@
+HAVE_RPMBUILD = @HAVE_RPMBUILD@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+KERNELCPPFLAGS = @KERNELCPPFLAGS@
+KERNELMAKE_PARAMS = @KERNELMAKE_PARAMS@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBBLKID = @LIBBLKID@
+LIBOBJS = @LIBOBJS@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIBUUID = @LIBUUID@
+LINUX = @LINUX@
+LINUX_OBJ = @LINUX_OBJ@
+LINUX_SYMBOLS = @LINUX_SYMBOLS@
+LINUX_VERSION = @LINUX_VERSION@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+NM = @NM@
+NMEDIT = @NMEDIT@
+NO_BOOL_COMPARE = @NO_BOOL_COMPARE@
+NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+RANLIB = @RANLIB@
+RELEASE = @RELEASE@
+RPM = @RPM@
+RPMBUILD = @RPMBUILD@
+RPMBUILD_VERSION = @RPMBUILD_VERSION@
+RPM_DEFINE_COMMON = @RPM_DEFINE_COMMON@
+RPM_DEFINE_DKMS = @RPM_DEFINE_DKMS@
+RPM_DEFINE_KMOD = @RPM_DEFINE_KMOD@
+RPM_DEFINE_UTIL = @RPM_DEFINE_UTIL@
+RPM_SPEC_DIR = @RPM_SPEC_DIR@
+RPM_VERSION = @RPM_VERSION@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+SPL = @SPL@
+SPL_OBJ = @SPL_OBJ@
+SPL_SYMBOLS = @SPL_SYMBOLS@
+SPL_VERSION = @SPL_VERSION@
+SRPM_DEFINE_COMMON = @SRPM_DEFINE_COMMON@
+SRPM_DEFINE_DKMS = @SRPM_DEFINE_DKMS@
+SRPM_DEFINE_KMOD = @SRPM_DEFINE_KMOD@
+SRPM_DEFINE_UTIL = @SRPM_DEFINE_UTIL@
+STRIP = @STRIP@
+TARGET_ASM_DIR = @TARGET_ASM_DIR@
+VENDOR = @VENDOR@
+VERSION = @VERSION@
+ZFS_CONFIG = @ZFS_CONFIG@
+ZFS_INIT_SYSTEMD = @ZFS_INIT_SYSTEMD@
+ZFS_INIT_SYSV = @ZFS_INIT_SYSV@
+ZFS_META_ALIAS = @ZFS_META_ALIAS@
+ZFS_META_AUTHOR = @ZFS_META_AUTHOR@
+ZFS_META_DATA = @ZFS_META_DATA@
+ZFS_META_LICENSE = @ZFS_META_LICENSE@
+ZFS_META_LT_AGE = @ZFS_META_LT_AGE@
+ZFS_META_LT_CURRENT = @ZFS_META_LT_CURRENT@
+ZFS_META_LT_REVISION = @ZFS_META_LT_REVISION@
+ZFS_META_NAME = @ZFS_META_NAME@
+ZFS_META_RELEASE = @ZFS_META_RELEASE@
+ZFS_META_VERSION = @ZFS_META_VERSION@
+ZFS_MODULE_LOAD = @ZFS_MODULE_LOAD@
+ZLIB = @ZLIB@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dracutdir = @dracutdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+modulesloaddir = @modulesloaddir@
+mounthelperdir = @mounthelperdir@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+runstatedir = @runstatedir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+systemdpresetdir = @systemdpresetdir@
+systemdunitdir = @systemdunitdir@
+target = @target@
+target_alias = @target_alias@
+target_cpu = @target_cpu@
+target_os = @target_os@
+target_vendor = @target_vendor@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+udevdir = @udevdir@
+udevruledir = @udevruledir@
+SUBDIRS = 90zfs
+EXTRA_DIST = README.dracut.markdown
+all: all-recursive
+
+.SUFFIXES:
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+       @for dep in $?; do \
+         case '$(am__configure_deps)' in \
+           *$$dep*) \
+             ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+               && { if test -f $@; then exit 0; else break; fi; }; \
+             exit 1;; \
+         esac; \
+       done; \
+       echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu contrib/dracut/Makefile'; \
+       $(am__cd) $(top_srcdir) && \
+         $(AUTOMAKE) --gnu contrib/dracut/Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+       @case '$?' in \
+         *config.status*) \
+           cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+         *) \
+           echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+           cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+       esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+       cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+       cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+       cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+mostlyclean-libtool:
+       -rm -f *.lo
+
+clean-libtool:
+       -rm -rf .libs _libs
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run 'make' without going through this Makefile.
+# To change the values of 'make' variables: instead of editing Makefiles,
+# (1) if the variable is set in 'config.status', edit 'config.status'
+#     (which will cause the Makefiles to be regenerated when you run 'make');
+# (2) otherwise, pass the desired values on the 'make' command line.
+$(am__recursive_targets):
+       @fail=; \
+       if $(am__make_keepgoing); then \
+         failcom='fail=yes'; \
+       else \
+         failcom='exit 1'; \
+       fi; \
+       dot_seen=no; \
+       target=`echo $@ | sed s/-recursive//`; \
+       case "$@" in \
+         distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
+         *) list='$(SUBDIRS)' ;; \
+       esac; \
+       for subdir in $$list; do \
+         echo "Making $$target in $$subdir"; \
+         if test "$$subdir" = "."; then \
+           dot_seen=yes; \
+           local_target="$$target-am"; \
+         else \
+           local_target="$$target"; \
+         fi; \
+         ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+         || eval $$failcom; \
+       done; \
+       if test "$$dot_seen" = "no"; then \
+         $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+       fi; test -z "$$fail"
+
+ID: $(am__tagged_files)
+       $(am__define_uniq_tagged_files); mkid -fID $$unique
+tags: tags-recursive
+TAGS: tags
+
+tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+       set x; \
+       here=`pwd`; \
+       if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
+         include_option=--etags-include; \
+         empty_fix=.; \
+       else \
+         include_option=--include; \
+         empty_fix=; \
+       fi; \
+       list='$(SUBDIRS)'; for subdir in $$list; do \
+         if test "$$subdir" = .; then :; else \
+           test ! -f $$subdir/TAGS || \
+             set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
+         fi; \
+       done; \
+       $(am__define_uniq_tagged_files); \
+       shift; \
+       if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+         test -n "$$unique" || unique=$$empty_fix; \
+         if test $$# -gt 0; then \
+           $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+             "$$@" $$unique; \
+         else \
+           $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+             $$unique; \
+         fi; \
+       fi
+ctags: ctags-recursive
+
+CTAGS: ctags
+ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+       $(am__define_uniq_tagged_files); \
+       test -z "$(CTAGS_ARGS)$$unique" \
+         || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+            $$unique
+
+GTAGS:
+       here=`$(am__cd) $(top_builddir) && pwd` \
+         && $(am__cd) $(top_srcdir) \
+         && gtags -i $(GTAGS_ARGS) "$$here"
+cscopelist: cscopelist-recursive
+
+cscopelist-am: $(am__tagged_files)
+       list='$(am__tagged_files)'; \
+       case "$(srcdir)" in \
+         [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
+         *) sdir=$(subdir)/$(srcdir) ;; \
+       esac; \
+       for i in $$list; do \
+         if test -f "$$i"; then \
+           echo "$(subdir)/$$i"; \
+         else \
+           echo "$$sdir/$$i"; \
+         fi; \
+       done >> $(top_builddir)/cscope.files
+
+distclean-tags:
+       -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+       @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+       topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+       list='$(DISTFILES)'; \
+         dist_files=`for file in $$list; do echo $$file; done | \
+         sed -e "s|^$$srcdirstrip/||;t" \
+             -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+       case $$dist_files in \
+         */*) $(MKDIR_P) `echo "$$dist_files" | \
+                          sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+                          sort -u` ;; \
+       esac; \
+       for file in $$dist_files; do \
+         if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+         if test -d $$d/$$file; then \
+           dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+           if test -d "$(distdir)/$$file"; then \
+             find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+           fi; \
+           if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+             cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+             find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+           fi; \
+           cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+         else \
+           test -f "$(distdir)/$$file" \
+           || cp -p $$d/$$file "$(distdir)/$$file" \
+           || exit 1; \
+         fi; \
+       done
+       @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
+         if test "$$subdir" = .; then :; else \
+           $(am__make_dryrun) \
+             || test -d "$(distdir)/$$subdir" \
+             || $(MKDIR_P) "$(distdir)/$$subdir" \
+             || exit 1; \
+           dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
+           $(am__relativize); \
+           new_distdir=$$reldir; \
+           dir1=$$subdir; dir2="$(top_distdir)"; \
+           $(am__relativize); \
+           new_top_distdir=$$reldir; \
+           echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
+           echo "     am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
+           ($(am__cd) $$subdir && \
+             $(MAKE) $(AM_MAKEFLAGS) \
+               top_distdir="$$new_top_distdir" \
+               distdir="$$new_distdir" \
+               am__remove_distdir=: \
+               am__skip_length_check=: \
+               am__skip_mode_fix=: \
+               distdir) \
+             || exit 1; \
+         fi; \
+       done
+check-am: all-am
+check: check-recursive
+all-am: Makefile
+installdirs: installdirs-recursive
+installdirs-am:
+install: install-recursive
+install-exec: install-exec-recursive
+install-data: install-data-recursive
+uninstall: uninstall-recursive
+
+install-am: all-am
+       @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-recursive
+install-strip:
+       if test -z '$(STRIP)'; then \
+         $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+           install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+             install; \
+       else \
+         $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+           install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+           "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+       fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+       -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+       -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+       @echo "This command is intended for maintainers to use"
+       @echo "it deletes files that may require special tools to rebuild."
+clean: clean-recursive
+
+clean-am: clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-recursive
+       -rm -f Makefile
+distclean-am: clean-am distclean-generic distclean-tags
+
+dvi: dvi-recursive
+
+dvi-am:
+
+html: html-recursive
+
+html-am:
+
+info: info-recursive
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-recursive
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-recursive
+
+install-html-am:
+
+install-info: install-info-recursive
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-recursive
+
+install-pdf-am:
+
+install-ps: install-ps-recursive
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-recursive
+       -rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-recursive
+
+mostlyclean-am: mostlyclean-generic mostlyclean-libtool
+
+pdf: pdf-recursive
+
+pdf-am:
+
+ps: ps-recursive
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: $(am__recursive_targets) install-am install-strip
+
+.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am check \
+       check-am clean clean-generic clean-libtool cscopelist-am ctags \
+       ctags-am distclean distclean-generic distclean-libtool \
+       distclean-tags distdir dvi dvi-am html html-am info info-am \
+       install install-am install-data install-data-am install-dvi \
+       install-dvi-am install-exec install-exec-am install-html \
+       install-html-am install-info install-info-am install-man \
+       install-pdf install-pdf-am install-ps install-ps-am \
+       install-strip installcheck installcheck-am installdirs \
+       installdirs-am maintainer-clean maintainer-clean-generic \
+       mostlyclean mostlyclean-generic mostlyclean-libtool pdf pdf-am \
+       ps ps-am tags tags-am uninstall uninstall-am
+
+.PRECIOUS: Makefile
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/zfs/contrib/dracut/README.dracut.markdown b/zfs/contrib/dracut/README.dracut.markdown
new file mode 100644 (file)
index 0000000..46d032f
--- /dev/null
@@ -0,0 +1,207 @@
+How to set up a zfs root filesystem using dracut
+------------------------------------------------
+
+1) Install the zfs-dracut package.  This package adds a zfs dracut module
+to the /usr/share/dracut/modules.d/ directory which allows dracut to
+create an initramfs which is zfs aware.
+
+2) Set the bootfs property for the bootable dataset in the pool.  Then set
+the dataset mountpoint property to '/'.
+
+    $ zpool set bootfs=pool/dataset pool
+    $ zfs set mountpoint=/ pool/dataset
+
+It is also possible to set the bootfs property for an entire pool, just in
+case you are not using a dedicated dataset for '/'.
+
+    $ zpool set bootfs=pool pool
+    $ zfs set mountpoint=/ pool
+
+Alternately, legacy mountpoints can be used by setting the 'root=' option
+on the kernel line of your grub.conf/menu.lst configuration file.  Then
+set the dataset mountpoint property to 'legacy'.
+
+    $ grub.conf/menu.lst: kernel ... root=ZFS=pool/dataset
+    $ zfs set mountpoint=legacy pool/dataset
+
+3) To set zfs module options, put them in the /etc/modprobe.d/zfs.conf file.
+The complete list of zfs module options is available by running the
+_modinfo zfs_ command.  Commonly set options include: zfs_arc_min,
+zfs_arc_max, zfs_prefetch_disable, and zfs_vdev_max_pending.
+
+4) Finally, create your new initramfs by running dracut.
+
+    $ dracut --force /path/to/initramfs kernel_version
+
+Kernel Command Line
+-------------------
+
+The initramfs' behavior is influenced by the following kernel command line
+parameters passed in from the boot loader:
+
+* `root=...`: If not set, importable pools are searched for a bootfs
+attribute.  If an explicitly set root is desired, you may use
+`root=ZFS:pool/dataset`
+
+* `zfs_force=0`: If set to 1, the initramfs will run `zpool import -f` when
+attempting to import pools if the required pool isn't automatically imported
+by the zfs module.  This can save you a trip to a bootcd if hostid has
+changed, but is dangerous and can lead to zpool corruption, particularly in
+cases where storage is on a shared fabric such as iSCSI where multiple hosts
+can access storage devices concurrently.  _Please understand the implications
+of force-importing a pool before enabling this option!_
+
+* `spl_hostid`: By default, the hostid used by the SPL module is read from
+/etc/hostid inside the initramfs.  This file is placed there from the host
+system when the initramfs is built which effectively ties the ramdisk to the
+host which builds it.  If a different hostid is desired, one may be set in
+this attribute and will override any file present in the ramdisk.  The
+format should be hex exactly as found in the `/etc/hostid` file, e.g.
+`spl_hostid=0x00bab10c`.
+
+Note that changing the hostid between boots will most likely lead to an
+un-importable pool since the last importing hostid won't match.  In order
+to recover from this, you may use the `zfs_force` option or boot from a
+different filesystem and `zpool import -f` then `zpool export` the pool
+before rebooting with the new hostid.
+
+How it Works
+============
+
+The Dracut module consists of the following files (excluding Makefiles):
+
+* `module-setup.sh`: Script run by the initramfs builder to create the
+ramdisk.  Contains instructions on which files are required by the modules
+and z* programs.  Also triggers inclusion of `/etc/hostid` and the zpool
+cache.  This file is not included in the initramfs.
+
+* `90-zfs.rules`: udev rules which trigger loading of the ZFS modules at boot.
+
+* `zfs-lib.sh`: Utility functions used by the other files.
+
+* `parse-zfs.sh`: Run early in the initramfs boot process to parse kernel
+command line and determine if ZFS is the active root filesystem.
+
+* `mount-zfs.sh`: Run later in initramfs boot process after udev has settled
+to mount the root dataset.
+
+* `export-zfs.sh`: Run on shutdown after dracut has restored the initramfs
+and pivoted to it, allowing for a clean unmount and export of the ZFS root.
+
+`zfs-lib.sh`
+------------
+
+This file provides a few handy functions for working with ZFS. Those
+functions are used by the `mount-zfs.sh` and `export-zfs.sh` files.
+However, they could be used by any other file as well, as long as the file
+sources `/lib/dracut-zfs-lib.sh`.
+
+`module-setup.sh`
+-----------------
+
+This file is run by the Dracut script within the live system, not at boot
+time.  It's not included in the final initramfs.  Functions in this script
+describe which files are needed by ZFS at boot time.
+
+Currently all the various z* and spl modules are included, a dependency is
+asserted on udev-rules, and the various zfs, zpool, etc. helpers are included.
+Dracut provides library functions which automatically gather the shared libs
+necessary to run each of these binaries, so statically built binaries are
+not required.
+
+The zpool and zvol udev rules files are copied from where they are
+installed by the ZFS build.  __PACKAGERS TAKE NOTE__: If you move
+`/etc/udev/rules/60-z*.rules`, you'll need to update this file to match.
+
+Currently this file also includes `/etc/hostid` and `/etc/zfs/zpool.cache`
+which means the generated ramdisk is specific to the host system which built
+it.  If a generic initramfs is required, it may be preferable to omit these
+files and specify the `spl_hostid` from the boot loader instead.
+
+`parse-zfs.sh`
+--------------
+
+Run during the cmdline phase of the initramfs boot process, this script
+performs some basic sanity checks on kernel command line parameters to
+determine if booting from ZFS is likely to be what is desired.  Dracut
+requires this script to adjust the `root` variable if required and to set
+`rootok=1` if a mountable root filesystem is available.  Unfortunately this
+script must run before udev is settled and kernel modules are known to be
+loaded, so accessing the zpool and zfs commands is unsafe.
+
+If the root=ZFS... parameter is set on the command line, then it's at least
+certain that ZFS is what is desired, though this script is unable to
+determine if ZFS is in fact available.  This script will alter the `root`
+parameter to replace several historical forms of specifying the pool and
+dataset name with the canonical form of `zfs:pool/dataset`.
+
+If no root= parameter is set, the best this script can do is guess that
+ZFS is desired.  At present, no other known filesystems will work with no
+root= parameter, though this might possibly interfere with using the
+compiled-in default root in the kernel image.  It's considered unlikely
+that would ever be the case when an initramfs is in use, so this script
+sets `root=zfs:AUTO` and hopes for the best.
+
+Once the root=... (or lack thereof) parameter is parsed, a dummy symlink
+is created from `/dev/root` -> `/dev/null` to satisfy parts of the Dracut
+process which check for presence of a single root device node.
+
+Finally, an initqueue/finished hook is registered which causes the initqueue
+phase of Dracut to wait for `/dev/zfs` to become available before attempting
+to mount anything.
+
+`mount-zfs.sh`
+--------------
+
+This script is run after udev has settled and all tasks in the initqueue
+have succeeded.  This ensures that `/dev/zfs` is available and that the
+various ZFS modules are successfully loaded.  As it is now safe to call
+zpool and friends, we can proceed to find the bootfs attribute if necessary.
+
+If the root parameter was explicitly set on the command line, no parsing is
+necessary.  The list of imported pools is checked to see if the desired pool
+is already imported.  If it's not, an attempt is made to import the pool
+explicitly, though no force is attempted.  Finally the specified dataset
+is mounted on `$NEWROOT`, first using the `-o zfsutil` option to handle
+non-legacy mounts, then if that fails, without zfsutil to handle legacy
+mount points.
+
+If no root parameter was specified, this script attempts to find a pool with
+its bootfs attribute set.  First, already-imported pools are scanned and if
+an appropriate pool is found, no additional pools are imported.  If no pool
+with bootfs is found, any additional pools in the system are imported with
+`zpool import -N -a`, and the scan for bootfs is tried again.  If no bootfs
+is found with all pools imported, all pools are re-exported, and boot fails.
+Assuming a bootfs is found, an attempt is made to mount it to `$NEWROOT`,
+first with, then without the zfsutil option as above.
+
+Ordinarily pools are imported _without_ the force option which may cause
+boot to fail if the hostid has changed or a pool has been physically moved
+between servers.  The `zfs_force` kernel parameter is provided which when
+set to `1` causes `zpool import` to be run with the `-f` flag.  Forcing pool
+import can lead to serious data corruption and loss of pools, so this option
+should be used with extreme caution.  Note that even with this flag set, if
+the required zpool was auto-imported by the kernel module, no additional
+`zpool import` commands are run, so nothing is forced.
+
+`export-zfs.sh`
+---------------
+
+Normally the zpool containing the root dataset cannot be exported on
+shutdown as it is still in use by the init process. To work around this,
+Dracut is able to restore the initramfs on shutdown and pivot to it.
+All remaining processes are then running from a ramdisk, allowing for a
+clean unmount and export of the ZFS root. The theory of operation is
+described in detail in the [Dracut manual](https://www.kernel.org/pub/linux/utils/boot/dracut/dracut.html#_dracut_on_shutdown).
+
+This script will try to export all remaining zpools after Dracut has
+pivoted to the initramfs. If an initial regular export is not successful,
+Dracut will call this script once more with the `final` option,
+in which case a forceful export is attempted.
+
+Other Dracut modules include similar shutdown scripts and Dracut
+invokes these scripts round-robin until they succeed. In particular,
+the `90dm` module installs a script which tries to close and remove
+all device mapper targets. Thus, if there are ZVOLs containing
+dm-crypt volumes or if the zpool itself is backed by a dm-crypt
+volume, the shutdown scripts will try to untangle this.
diff --git a/zfs/contrib/initramfs/Makefile.am b/zfs/contrib/initramfs/Makefile.am
new file mode 100644 (file)
index 0000000..fa2b5a2
--- /dev/null
@@ -0,0 +1,27 @@
+initrddir = $(datarootdir)/initramfs-tools
+
+initrd_SCRIPTS = conf-hooks.d/zfs hooks/zfs scripts/zfs
+
+EXTRA_DIST = \
+       $(top_srcdir)/contrib/initramfs/conf-hooks.d/zfs \
+       $(top_srcdir)/contrib/initramfs/hooks/zfs \
+       $(top_srcdir)/contrib/initramfs/scripts/zfs \
+       $(top_srcdir)/contrib/initramfs/README.initramfs.markdown
+
+# Install each initramfs-tools script into its own subdirectory of
+# $(initrddir); the loop aborts on the first mkdir/cp failure.
+# The etc/init.d/zfs file is optional: it is copied into
+# $(DEFAULT_INITCONF_DIR) only when present.  The existence test must name
+# the same $(top_srcdir) path that cp reads; a bare relative path would be
+# resolved against the directory make runs this recipe in and never match.
+install-initrdSCRIPTS: $(EXTRA_DIST)
+       for d in conf-hooks.d hooks scripts; do \
+         $(MKDIR_P) $(DESTDIR)$(initrddir)/$$d || exit 1; \
+         cp $(top_srcdir)/contrib/initramfs/$$d/zfs \
+           $(DESTDIR)$(initrddir)/$$d/ || exit 1; \
+       done
+       if [ -f $(top_srcdir)/etc/init.d/zfs ]; then \
+         $(MKDIR_P) $(DESTDIR)$(DEFAULT_INITCONF_DIR) && \
+         cp $(top_srcdir)/etc/init.d/zfs \
+           $(DESTDIR)$(DEFAULT_INITCONF_DIR)/; \
+       fi
diff --git a/zfs/contrib/initramfs/Makefile.in b/zfs/contrib/initramfs/Makefile.in
new file mode 100644 (file)
index 0000000..8ead20e
--- /dev/null
@@ -0,0 +1,658 @@
+# Makefile.in generated by automake 1.15 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2014 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+VPATH = @srcdir@
+am__is_gnu_make = { \
+  if test -z '$(MAKELEVEL)'; then \
+    false; \
+  elif test -n '$(MAKE_HOST)'; then \
+    true; \
+  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+    true; \
+  else \
+    false; \
+  fi; \
+}
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
+    case $$MAKEFLAGS in \
+      *\\[\ \  ]*) \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs  ]*//g"`;; \
+    esac; \
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+target_triplet = @target@
+subdir = contrib/initramfs
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/config/always-no-bool-compare.m4 \
+       $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \
+       $(top_srcdir)/config/dkms.m4 \
+       $(top_srcdir)/config/kernel-acl.m4 \
+       $(top_srcdir)/config/kernel-automount.m4 \
+       $(top_srcdir)/config/kernel-bdev-block-device-operations.m4 \
+       $(top_srcdir)/config/kernel-bdev-logical-size.m4 \
+       $(top_srcdir)/config/kernel-bdev-physical-size.m4 \
+       $(top_srcdir)/config/kernel-bdi-setup-and-register.m4 \
+       $(top_srcdir)/config/kernel-bio-bvec-iter.m4 \
+       $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \
+       $(top_srcdir)/config/kernel-bio-failfast.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-barrier.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-discard.m4 \
+       $(top_srcdir)/config/kernel-blk-queue-flush.m4 \
+       $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \
+       $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \
+       $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \
+       $(top_srcdir)/config/kernel-blkdev-get.m4 \
+       $(top_srcdir)/config/kernel-block-device-operations-release-void.m4 \
+       $(top_srcdir)/config/kernel-check-disk-size-change.m4 \
+       $(top_srcdir)/config/kernel-clear-inode.m4 \
+       $(top_srcdir)/config/kernel-commit-metadata.m4 \
+       $(top_srcdir)/config/kernel-create-nameidata.m4 \
+       $(top_srcdir)/config/kernel-current_bio_tail.m4 \
+       $(top_srcdir)/config/kernel-d-make-root.m4 \
+       $(top_srcdir)/config/kernel-d-obtain-alias.m4 \
+       $(top_srcdir)/config/kernel-d-prune-aliases.m4 \
+       $(top_srcdir)/config/kernel-declare-event-class.m4 \
+       $(top_srcdir)/config/kernel-dentry-operations.m4 \
+       $(top_srcdir)/config/kernel-dirty-inode.m4 \
+       $(top_srcdir)/config/kernel-discard-granularity.m4 \
+       $(top_srcdir)/config/kernel-elevator-change.m4 \
+       $(top_srcdir)/config/kernel-encode-fh-inode.m4 \
+       $(top_srcdir)/config/kernel-evict-inode.m4 \
+       $(top_srcdir)/config/kernel-fallocate.m4 \
+       $(top_srcdir)/config/kernel-file-inode.m4 \
+       $(top_srcdir)/config/kernel-fmode-t.m4 \
+       $(top_srcdir)/config/kernel-follow-down-one.m4 \
+       $(top_srcdir)/config/kernel-follow-link-nameidata.m4 \
+       $(top_srcdir)/config/kernel-fsync.m4 \
+       $(top_srcdir)/config/kernel-generic_io_acct.m4 \
+       $(top_srcdir)/config/kernel-get-disk-ro.m4 \
+       $(top_srcdir)/config/kernel-get-gendisk.m4 \
+       $(top_srcdir)/config/kernel-insert-inode-locked.m4 \
+       $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \
+       $(top_srcdir)/config/kernel-is_owner_or_cap.m4 \
+       $(top_srcdir)/config/kernel-kmap-atomic-args.m4 \
+       $(top_srcdir)/config/kernel-kobj-name-len.m4 \
+       $(top_srcdir)/config/kernel-lookup-bdev.m4 \
+       $(top_srcdir)/config/kernel-lookup-nameidata.m4 \
+       $(top_srcdir)/config/kernel-lseek-execute.m4 \
+       $(top_srcdir)/config/kernel-mk-request-fn.m4 \
+       $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \
+       $(top_srcdir)/config/kernel-mount-nodev.m4 \
+       $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \
+       $(top_srcdir)/config/kernel-put-link-nameidata.m4 \
+       $(top_srcdir)/config/kernel-security-inode-init.m4 \
+       $(top_srcdir)/config/kernel-set-nlink.m4 \
+       $(top_srcdir)/config/kernel-sget-args.m4 \
+       $(top_srcdir)/config/kernel-show-options.m4 \
+       $(top_srcdir)/config/kernel-shrink.m4 \
+       $(top_srcdir)/config/kernel-truncate-range.m4 \
+       $(top_srcdir)/config/kernel-truncate-setsize.m4 \
+       $(top_srcdir)/config/kernel-vfs-iterate.m4 \
+       $(top_srcdir)/config/kernel-vfs-rw-iterate.m4 \
+       $(top_srcdir)/config/kernel-xattr-handler.m4 \
+       $(top_srcdir)/config/kernel.m4 $(top_srcdir)/config/libtool.m4 \
+       $(top_srcdir)/config/ltoptions.m4 \
+       $(top_srcdir)/config/ltsugar.m4 \
+       $(top_srcdir)/config/ltversion.m4 \
+       $(top_srcdir)/config/lt~obsolete.m4 \
+       $(top_srcdir)/config/mount-helper.m4 \
+       $(top_srcdir)/config/user-arch.m4 \
+       $(top_srcdir)/config/user-dracut.m4 \
+       $(top_srcdir)/config/user-frame-larger-than.m4 \
+       $(top_srcdir)/config/user-libblkid.m4 \
+       $(top_srcdir)/config/user-libuuid.m4 \
+       $(top_srcdir)/config/user-runstatedir.m4 \
+       $(top_srcdir)/config/user-systemd.m4 \
+       $(top_srcdir)/config/user-sysvinit.m4 \
+       $(top_srcdir)/config/user-udev.m4 \
+       $(top_srcdir)/config/user-zlib.m4 $(top_srcdir)/config/user.m4 \
+       $(top_srcdir)/config/zfs-build.m4 \
+       $(top_srcdir)/config/zfs-meta.m4 $(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+       $(ACLOCAL_M4)
+DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/zfs_config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+    *) f=$$p;; \
+  esac;
+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
+am__install_max = 40
+am__nobase_strip_setup = \
+  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
+am__nobase_strip = \
+  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
+am__nobase_list = $(am__nobase_strip_setup); \
+  for p in $$list; do echo "$$p $$p"; done | \
+  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
+  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
+    if (++n[$$2] == $(am__install_max)) \
+      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
+    END { for (dir in files) print dir, files[dir] }'
+am__base_list = \
+  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
+  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
+am__installdirs = "$(DESTDIR)$(initrddir)"
+SCRIPTS = $(initrd_SCRIPTS)
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
+SOURCES =
+DIST_SOURCES =
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+am__DIST_COMMON = $(srcdir)/Makefile.in
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALIEN = @ALIEN@
+ALIEN_VERSION = @ALIEN_VERSION@
+AMTAR = @AMTAR@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AR = @AR@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEBUG_CFLAGS = @DEBUG_CFLAGS@
+DEBUG_DMU_TX = @DEBUG_DMU_TX@
+DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@
+DEBUG_ZFS = @DEBUG_ZFS@
+DEFAULT_INITCONF_DIR = @DEFAULT_INITCONF_DIR@
+DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@
+DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@
+DEFAULT_PACKAGE = @DEFAULT_PACKAGE@
+DEFINE_INITRAMFS = @DEFINE_INITRAMFS@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DPKG = @DPKG@
+DPKGBUILD = @DPKGBUILD@
+DPKGBUILD_VERSION = @DPKGBUILD_VERSION@
+DPKG_VERSION = @DPKG_VERSION@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+FGREP = @FGREP@
+FRAME_LARGER_THAN = @FRAME_LARGER_THAN@
+GREP = @GREP@
+HAVE_ALIEN = @HAVE_ALIEN@
+HAVE_DPKG = @HAVE_DPKG@
+HAVE_DPKGBUILD = @HAVE_DPKGBUILD@
+HAVE_RPM = @HAVE_RPM@
+HAVE_RPMBUILD = @HAVE_RPMBUILD@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+KERNELCPPFLAGS = @KERNELCPPFLAGS@
+KERNELMAKE_PARAMS = @KERNELMAKE_PARAMS@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBBLKID = @LIBBLKID@
+LIBOBJS = @LIBOBJS@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIBUUID = @LIBUUID@
+LINUX = @LINUX@
+LINUX_OBJ = @LINUX_OBJ@
+LINUX_SYMBOLS = @LINUX_SYMBOLS@
+LINUX_VERSION = @LINUX_VERSION@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+NM = @NM@
+NMEDIT = @NMEDIT@
+NO_BOOL_COMPARE = @NO_BOOL_COMPARE@
+NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+RANLIB = @RANLIB@
+RELEASE = @RELEASE@
+RPM = @RPM@
+RPMBUILD = @RPMBUILD@
+RPMBUILD_VERSION = @RPMBUILD_VERSION@
+RPM_DEFINE_COMMON = @RPM_DEFINE_COMMON@
+RPM_DEFINE_DKMS = @RPM_DEFINE_DKMS@
+RPM_DEFINE_KMOD = @RPM_DEFINE_KMOD@
+RPM_DEFINE_UTIL = @RPM_DEFINE_UTIL@
+RPM_SPEC_DIR = @RPM_SPEC_DIR@
+RPM_VERSION = @RPM_VERSION@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+SPL = @SPL@
+SPL_OBJ = @SPL_OBJ@
+SPL_SYMBOLS = @SPL_SYMBOLS@
+SPL_VERSION = @SPL_VERSION@
+SRPM_DEFINE_COMMON = @SRPM_DEFINE_COMMON@
+SRPM_DEFINE_DKMS = @SRPM_DEFINE_DKMS@
+SRPM_DEFINE_KMOD = @SRPM_DEFINE_KMOD@
+SRPM_DEFINE_UTIL = @SRPM_DEFINE_UTIL@
+STRIP = @STRIP@
+TARGET_ASM_DIR = @TARGET_ASM_DIR@
+VENDOR = @VENDOR@
+VERSION = @VERSION@
+ZFS_CONFIG = @ZFS_CONFIG@
+ZFS_INIT_SYSTEMD = @ZFS_INIT_SYSTEMD@
+ZFS_INIT_SYSV = @ZFS_INIT_SYSV@
+ZFS_META_ALIAS = @ZFS_META_ALIAS@
+ZFS_META_AUTHOR = @ZFS_META_AUTHOR@
+ZFS_META_DATA = @ZFS_META_DATA@
+ZFS_META_LICENSE = @ZFS_META_LICENSE@
+ZFS_META_LT_AGE = @ZFS_META_LT_AGE@
+ZFS_META_LT_CURRENT = @ZFS_META_LT_CURRENT@
+ZFS_META_LT_REVISION = @ZFS_META_LT_REVISION@
+ZFS_META_NAME = @ZFS_META_NAME@
+ZFS_META_RELEASE = @ZFS_META_RELEASE@
+ZFS_META_VERSION = @ZFS_META_VERSION@
+ZFS_MODULE_LOAD = @ZFS_MODULE_LOAD@
+ZLIB = @ZLIB@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dracutdir = @dracutdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+modulesloaddir = @modulesloaddir@
+mounthelperdir = @mounthelperdir@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+runstatedir = @runstatedir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+systemdpresetdir = @systemdpresetdir@
+systemdunitdir = @systemdunitdir@
+target = @target@
+target_alias = @target_alias@
+target_cpu = @target_cpu@
+target_os = @target_os@
+target_vendor = @target_vendor@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+udevdir = @udevdir@
+udevruledir = @udevruledir@
+initrddir = $(datarootdir)/initramfs-tools
+initrd_SCRIPTS = conf-hooks.d/zfs hooks/zfs scripts/zfs
+EXTRA_DIST = \
+       $(top_srcdir)/contrib/initramfs/conf-hooks.d/zfs \
+       $(top_srcdir)/contrib/initramfs/hooks/zfs \
+       $(top_srcdir)/contrib/initramfs/scripts/zfs \
+       $(top_srcdir)/contrib/initramfs/README.initramfs.markdown
+
+all: all-am
+
+.SUFFIXES:
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+       @for dep in $?; do \
+         case '$(am__configure_deps)' in \
+           *$$dep*) \
+             ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+               && { if test -f $@; then exit 0; else break; fi; }; \
+             exit 1;; \
+         esac; \
+       done; \
+       echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu contrib/initramfs/Makefile'; \
+       $(am__cd) $(top_srcdir) && \
+         $(AUTOMAKE) --gnu contrib/initramfs/Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+       @case '$?' in \
+         *config.status*) \
+           cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+         *) \
+           echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+           cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+       esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+       cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+       cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+       cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+uninstall-initrdSCRIPTS:
+       @$(NORMAL_UNINSTALL)
+       @list='$(initrd_SCRIPTS)'; test -n "$(initrddir)" || exit 0; \
+       files=`for p in $$list; do echo "$$p"; done | \
+              sed -e 's,.*/,,;$(transform)'`; \
+       dir='$(DESTDIR)$(initrddir)'; $(am__uninstall_files_from_dir)
+
+mostlyclean-libtool:
+       -rm -f *.lo
+
+clean-libtool:
+       -rm -rf .libs _libs
+tags TAGS:
+
+ctags CTAGS:
+
+cscope cscopelist:
+
+
+distdir: $(DISTFILES)
+       @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+       topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+       list='$(DISTFILES)'; \
+         dist_files=`for file in $$list; do echo $$file; done | \
+         sed -e "s|^$$srcdirstrip/||;t" \
+             -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+       case $$dist_files in \
+         */*) $(MKDIR_P) `echo "$$dist_files" | \
+                          sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+                          sort -u` ;; \
+       esac; \
+       for file in $$dist_files; do \
+         if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+         if test -d $$d/$$file; then \
+           dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+           if test -d "$(distdir)/$$file"; then \
+             find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+           fi; \
+           if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+             cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+             find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+           fi; \
+           cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+         else \
+           test -f "$(distdir)/$$file" \
+           || cp -p $$d/$$file "$(distdir)/$$file" \
+           || exit 1; \
+         fi; \
+       done
+check-am: all-am
+check: check-am
+all-am: Makefile $(SCRIPTS)
+installdirs:
+       for dir in "$(DESTDIR)$(initrddir)"; do \
+         test -z "$$dir" || $(MKDIR_P) "$$dir"; \
+       done
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+       @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+       if test -z '$(STRIP)'; then \
+         $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+           install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+             install; \
+       else \
+         $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+           install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+           "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+       fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+       -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+       -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+       @echo "This command is intended for maintainers to use"
+       @echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-am
+       -rm -f Makefile
+distclean-am: clean-am distclean-generic
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am: install-initrdSCRIPTS
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+       -rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-generic mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am: uninstall-initrdSCRIPTS
+
+.MAKE: install-am install-strip
+
+.PHONY: all all-am check check-am clean clean-generic clean-libtool \
+       cscopelist-am ctags-am distclean distclean-generic \
+       distclean-libtool distdir dvi dvi-am html html-am info info-am \
+       install install-am install-data install-data-am install-dvi \
+       install-dvi-am install-exec install-exec-am install-html \
+       install-html-am install-info install-info-am \
+       install-initrdSCRIPTS install-man install-pdf install-pdf-am \
+       install-ps install-ps-am install-strip installcheck \
+       installcheck-am installdirs maintainer-clean \
+       maintainer-clean-generic mostlyclean mostlyclean-generic \
+       mostlyclean-libtool pdf pdf-am ps ps-am tags-am uninstall \
+       uninstall-am uninstall-initrdSCRIPTS
+
+.PRECIOUS: Makefile
+
+
+# Custom install rule: every script is copied into the sub-directory of
+# $(initrddir) that mirrors its location in the source tree.
+# $(top_srcdir)/etc/init.d/zfs is additionally copied into
+# $(DEFAULT_INITCONF_DIR), but only if it exists.  The existence test has to
+# look at the very path that is copied; a path relative to the current
+# directory would not guard the copy.
+install-initrdSCRIPTS: $(EXTRA_DIST)
+       for d in conf-hooks.d hooks scripts; do \
+         $(MKDIR_P) $(DESTDIR)$(initrddir)/$$d; \
+         cp $(top_srcdir)/contrib/initramfs/$$d/zfs \
+           $(DESTDIR)$(initrddir)/$$d/; \
+       done
+       if [ -f $(top_srcdir)/etc/init.d/zfs ]; then \
+         $(MKDIR_P) $(DESTDIR)$(DEFAULT_INITCONF_DIR); \
+         cp $(top_srcdir)/etc/init.d/zfs \
+           $(DESTDIR)$(DEFAULT_INITCONF_DIR)/; \
+       fi
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/zfs/contrib/initramfs/README.initramfs.markdown b/zfs/contrib/initramfs/README.initramfs.markdown
new file mode 100644 (file)
index 0000000..f6f3a79
--- /dev/null
@@ -0,0 +1,94 @@
+DESCRIPTION
+  These scripts are intended to be used with initramfs-tools, which is a similar
+  software product to "dracut" (which is more used in RedHat based distributions),
+  and is mainly used by Debian GNU/Linux and derivatives to create an initramfs so
+  that the system can be booted off a ZFS filesystem. If you have no need or
+  interest for this, then they can safely be ignored.
+
+  These scripts were written with the primary intention of being portable and
+  usable on as many systems as possible.
+
+  This is, in practice, usually not possible. But the intention is there.
+  And it is a good one.
+
+  They have been tested successfully on:
+
+    * Debian GNU/Linux Wheezy
+    * Debian GNU/Linux Jessie
+
+  They use some functionality in common with the SYSV init scripts, primarily
+  the "/etc/zfs/zfs-functions" script.
+
+FUNCTIONALITY
+  * Supports booting of a ZFS snapshot.
+    Do this by cloning the snapshot into a dataset (if the resulting dataset
+    already exists, it is destroyed first), then mount the clone as the root filesystem.
+    * If snapshot does not exist, use base dataset (the part before '@')
+      as boot filesystem instead.
+    * Clone with 'mountpoint=none' and 'canmount=noauto' - we mount manually
+      and explicitly.
+    * Allow rolling back the snapshot instead of cloning it and booting from the clone.
+    * If no snapshot is specified on the 'root=' kernel command line, but
+      there is an '@', then get a list of snapshots below that filesystem
+      and ask the user which to use.
+
+  * Support all currently used kernel command line arguments
+    * Core options:
+      All the different distributions have their own standard on what to specify
+      on the kernel command line to boot off a ZFS filesystem.
+
+      Supports the following kernel command line argument combinations
+      (in this order - first match wins):
+      * rpool=<pool>                   (tries to find bootfs automatically)
+      * bootfs=<pool>/<dataset>                (uses this for rpool - first part)
+      * rpool=<pool> bootfs=<pool>/<dataset>
+      * -B zfs-bootfs=<pool>/<fs>      (uses this for rpool - first part)
+      * rpool=rpool                    (default if none of the above is used)
+      * root=<pool>/<dataset>          (uses this for rpool - first part)
+      * root=ZFS=<pool>/<dataset>      (uses this for rpool - first part, without 'ZFS=')
+      * root=zfs:AUTO                  (tries to detect both pool and rootfs)
+      * root=zfs:<pool>/<dataset>      (uses this for rpool - first part, without 'zfs:')
+
+      Option <dataset> could also be <snapshot>
+    * Extra (control) options:
+      * zfsdebug=(on,yes,1)   Show extra debugging information
+      * zfsforce=(on,yes,1)   Force import the pool
+      * rollback=(on,yes,1)   Rollback (instead of clone) the snapshot
+
+  * 'Smarter' way to import pools. Don't just try cache file or /dev.
+    * Try to use /dev/disk/by-vdev (if /etc/zfs/vdev_id.conf exists),
+    * Try /dev/mapper (to be able to use LUKS backed pools as well as
+      multi-path devices).
+    * /dev/disk/by-id and any other /dev/disk/by-* directory that may exist.
+    * Use /dev as a last ditch attempt.
+    * Fall back to using the cache file, if it exists, if nothing else worked.
+    * Only try to import a pool if it hasn't already been imported
+      * This will negate the need to force import a pool that has not been
+        exported cleanly.
+      * Support exclusion of pools to import by setting ZFS_POOL_EXCEPTIONS
+         in /etc/default/zfs.
+
+    The order in which devices are searched is controlled by the
+    ZPOOL_IMPORT_PATH variable set in /etc/default/zfs.
+
+  * Support additional configuration variable ZFS_INITRD_ADDITIONAL_DATASETS
+    to mount additional filesystems not located under your root dataset.
+
+    For example, if the root fs is specified as 'rpool/ROOT/rootfs', it will
+    automatically and without specific configuration mount any filesystems
+    below this on the mount point specified in the 'mountpoint' property.
+    (Such as 'rpool/ROOT/rootfs/var', 'rpool/ROOT/rootfs/usr' etc.)
+
+    However, if one prefers to have separate filesystems, not located below
+    the root fs (such as 'rpool/var', 'rpool/ROOT/opt' etc), special
+    configuration needs to be done. This is what the variable, set in the
+    /etc/default/zfs file, is for. The 'mountpoint' property needs to be
+    correct for this to work though.
+
+  * Allows mounting a rootfs with mountpoint=legacy set.
+
+  * Include /etc/modprobe.d/{zfs,spl}.conf in the initrd if it/they exist.
+
+  * Include the udev rule to use by-vdev for pool imports.
+
+  * Include the /etc/default/zfs file in the initrd.
diff --git a/zfs/contrib/initramfs/conf-hooks.d/zfs b/zfs/contrib/initramfs/conf-hooks.d/zfs
new file mode 100644 (file)
index 0000000..29950ca
--- /dev/null
@@ -0,0 +1,2 @@
+# Force the inclusion of Busybox in the initramfs.
+BUSYBOX=y
diff --git a/zfs/contrib/initramfs/hooks/zfs b/zfs/contrib/initramfs/hooks/zfs
new file mode 100755 (executable)
index 0000000..53e876d
--- /dev/null
@@ -0,0 +1,103 @@
+#!/bin/sh
+#
+# Add ZoL filesystem capabilities to an initrd, usually for a native ZFS root.
+#
+
+# This hook installs udev rules for ZoL.
+PREREQ="zdev"
+
+# These prerequisites are provided by the zfsutils package. The zdb utility is
+# not strictly required, but it can be useful at the initramfs recovery prompt.
+COPY_EXEC_LIST="/sbin/zdb /sbin/zpool /sbin/zfs /sbin/mount.zfs"
+COPY_EXEC_LIST="$COPY_EXEC_LIST /usr/bin/dirname /lib/udev/vdev_id"
+COPY_FILE_LIST="/etc/hostid /etc/zfs/zpool.cache /etc/default/zfs"
+COPY_FILE_LIST="$COPY_FILE_LIST /etc/zfs/zfs-functions /etc/zfs/vdev_id.conf"
+COPY_FILE_LIST="$COPY_FILE_LIST /lib/udev/rules.d/69-vdev.rules"
+
+# These prerequisites are provided by the base system.
+COPY_EXEC_LIST="$COPY_EXEC_LIST /bin/hostname /sbin/blkid"
+
+# Explicitly specify all kernel modules because automatic dependency resolution
+# is unreliable on many systems.
+BASE_MODULES="zlib_deflate spl zavl zcommon znvpair zunicode zfs"
+CRPT_MODULES="sun-ccm sun-gcm sun-ctr"
+MANUAL_ADD_MODULES_LIST="$BASE_MODULES"
+
+# Generic result code.
+RC=0
+
+case $1 in
+prereqs)
+       echo "$PREREQ"
+       exit 0
+       ;;
+esac
+
+for ii in $COPY_EXEC_LIST
+do
+       if [ ! -x "$ii" ]
+       then
+               echo "Error: $ii is not executable."
+               RC=2
+       fi
+done
+
+if [ "$RC" -ne 0 ]
+then
+       exit "$RC"
+fi
+
+. /usr/share/initramfs-tools/hook-functions
+
+mkdir -p "$DESTDIR/etc/"
+
+# ZDB uses pthreads for some functions, but the library dependency is not
+# automatically detected. The `find` utility and extended `cp` options are
+# used here because libgcc_s.so could be in a subdirectory of /lib for
+# multi-arch installations.
+cp --target-directory="$DESTDIR" --parents $(find /lib -type f -name libgcc_s.so.1)
+
+for ii in $COPY_EXEC_LIST
+do
+       copy_exec "$ii"
+done
+
+for ii in $COPY_FILE_LIST
+do
+       dir=$(dirname "$ii")
+       [ -d "$dir" ] && mkdir -p "$DESTDIR/$dir"
+       [ -f "$ii" ] && cp -p "$ii" "$DESTDIR/$ii"
+done
+
+for ii in $MANUAL_ADD_MODULES_LIST
+do
+       manual_add_modules "$ii"
+done
+
+if [ -f "/etc/hostname" ]
+then
+       cp -p "/etc/hostname" "$DESTDIR/etc/"
+else
+       hostname >"$DESTDIR/etc/hostname"
+fi
+
+for ii in zfs zfs.conf spl spl.conf
+do  
+       if [ -f "/etc/modprobe.d/$ii" ]; then
+               if [ ! -d "$DESTDIR/etc/modprobe.d" ]; then
+                       mkdir -p $DESTDIR/etc/modprobe.d
+               fi
+               cp -p "/etc/modprobe.d/$ii" $DESTDIR/etc/modprobe.d/
+       fi
+done
+
+# With pull request #1476 (not yet merged) comes a verbose warning
+# if /usr/bin/net doesn't exist or isn't executable. Just create
+# a dummy...
+[ ! -d "$DESTDIR/usr/bin" ] && mkdir -p "$DESTDIR/usr/bin"
+if [ ! -x "$DESTDIR/usr/bin/net" ]; then
+    touch "$DESTDIR/usr/bin/net"
+    chmod +x "$DESTDIR/usr/bin/net"
+fi
+
+exit 0
diff --git a/zfs/contrib/initramfs/scripts/zfs b/zfs/contrib/initramfs/scripts/zfs
new file mode 100644 (file)
index 0000000..6a78a46
--- /dev/null
@@ -0,0 +1,971 @@
+# ZFS boot stub for initramfs-tools.
+#
+# In the initramfs environment, the /init script sources this stub to
+# override the default functions in the /scripts/local script.
+#
+# Enable this by passing boot=zfs on the kernel command line.
+#
+
+# Source the common init script
+. /etc/zfs/zfs-functions
+
+# Paths to what we need - in the initrd, these paths are hardcoded,
+# so override the defines in zfs-functions.
+ZFS="/sbin/zfs"
+ZPOOL="/sbin/zpool"
+ZPOOL_CACHE="/etc/zfs/zpool.cache"
+export ZFS ZPOOL ZPOOL_CACHE
+
+# Run the /scripts/local-top and /scripts/local-premount hooks (when the
+# run_scripts helper exists) before importing pools or mounting anything.
+pre_mountroot()
+{
+       if type run_scripts > /dev/null 2>&1 && \
+           [ -f "/scripts/local-top" -o -d "/scripts/local-top" ]
+       then
+               [ "$quiet" != "y" ] && \
+                   zfs_log_begin_msg "Running /scripts/local-top"
+               run_scripts /scripts/local-top
+               [ "$quiet" != "y" ] && zfs_log_end_msg
+       fi
+
+       if type run_scripts > /dev/null 2>&1 && \
+           [ -f "/scripts/local-premount" -o -d "/scripts/local-premount" ]
+       then
+               [ "$quiet" != "y" ] && \
+                   zfs_log_begin_msg "Running /scripts/local-premount"
+               run_scripts /scripts/local-premount
+               [ "$quiet" != "y" ] && zfs_log_end_msg
+       fi
+}
+
+# If plymouth is available and answers '--ping', hide the splash image.
+disable_plymouth()
+{
+       if [ -x /bin/plymouth ] && /bin/plymouth --ping
+       then
+               /bin/plymouth hide-splash >/dev/null 2>&1
+       fi
+}
+
+# Print the value of ZFS property $2 for dataset $1 (zfs errors are discarded).
+get_fs_value()
+{
+       local fs="$1"
+       local value=$2
+
+       "${ZFS}" get -H -ovalue $value "$fs" 2> /dev/null
+}
+
+# Find the 'bootfs' property on pool $1.
+# If that dataset's mountpoint is not exactly '/', then ignore this
+# pool by exporting it again. Returns 0 if a root fs is known, else 1.
+find_rootfs()
+{
+       local pool="$1"
+
+       # If 'POOL_IMPORTED' isn't set, no pool is imported and therefore
+       # we won't be able to find a root fs.
+       [ -z "${POOL_IMPORTED}" ] && return 1
+
+       # If it's already specified, just keep the pool imported and exit.
+       # User (kernel command line) must be correct.
+       [ -n "${ZFS_BOOTFS}" ] && return 0
+
+       # Not set, try to find it in the 'bootfs' property of the pool.
+       # NOTE: zpool does not support 'get -H -ovalue bootfs'...
+       ZFS_BOOTFS=$("${ZPOOL}" list -H -obootfs "$pool")
+
+       # Make sure it's not '-' and that its mountpoint is exactly '/'.
+       if [ "${ZFS_BOOTFS}" != "-" ] && \
+               get_fs_value "${ZFS_BOOTFS}" mountpoint | grep -q '^/$'
+       then
+               # Keep the pool imported
+               POOL_IMPORTED=1
+               return 0
+       fi
+
+       # No boot fs here: export the pool and forget its bootfs value, so
+       "${ZPOOL}" export "$pool"
+       POOL_IMPORTED=""
+       ZFS_BOOTFS=""   # ... the next pool is checked, not blindly accepted.
+       return 1
+}
+
+# Support function: run the given command and print the pools it lists, separated with ';'
+find_pools()
+{
+       local CMD="$*"
+       local pools pool
+
+       pools=$($CMD 2> /dev/null | \
+               grep -E "pool:|^[a-zA-Z0-9]" | \
+               sed 's@.*: @@' | \
+               while read pool; do \
+                   echo -n "$pool;"
+               done)
+
+       echo "${pools%%;}" # Return without the last ';'.
+}
+
+# Get a list of all available pools
+get_pools()
+{
+       local available_pools npools
+
+       if [ -n "${ZFS_POOL_IMPORT}" ]; then
+               echo "$ZFS_POOL_IMPORT"
+               return 0
+       fi
+
+       # Get the base list of available pools.
+       available_pools=$(find_pools "$ZPOOL" import)
+
+       # Just in case - seen it happen (that a pool isn't visible/found
+       # with a simple "zpool import" but only when using the "-d"
+       # option or setting ZPOOL_IMPORT_PATH).
+       if [ -d "/dev/disk/by-id" ]
+       then
+               npools=$(find_pools "$ZPOOL" import -d /dev/disk/by-id)
+               if [ -n "$npools" ]
+               then
+                       # Pools are visible via by-id, so force USE_DISK_BY_ID for
+                       # the later import. NOTE(review): mountroot calls this in
+                       # $(...), so the assignment may not reach it - confirm.
+                       USE_DISK_BY_ID='yes'
+
+                       if [ -n "$available_pools" ]
+                       then
+                               # Filter out duplicates (pools found with the simple
+                               # "zpool import" but which are also found with the
+                               # "zpool import -d ...").
+                               npools=$(echo "$npools" | sed "s,$available_pools,,")
+
+                               # Add the list to the existing list of
+                               # available pools
+                               available_pools="$available_pools;$npools"
+                       else
+                               available_pools="$npools"
+                       fi
+               fi
+       fi
+
+        # Filter out any exceptions...
+       if [ -n "$ZFS_POOL_EXCEPTIONS" ]
+       then
+               local found=""
+               local apools=""
+               local pool exception
+               OLD_IFS="$IFS" ; IFS=";"
+
+               for pool in $available_pools
+               do
+                       for exception in $ZFS_POOL_EXCEPTIONS
+                       do
+                               [ "$pool" = "$exception" ] && continue 2
+                               found="$pool"
+                       done
+
+                       if [ -n "$found" ]
+                       then
+                               if [ -n "$apools" ]
+                               then
+                                       apools="$apools;$pool"
+                               else
+                                       apools="$pool"
+                               fi
+                       fi
+               done
+
+               IFS="$OLD_IFS"
+               available_pools="$apools"
+       fi
+
+       # Return list of available pools.
+       echo "$available_pools"
+}
+
+# Import pool $1 with -N; on failure retry via the cachefile, then drop to a shell.
+import_pool()
+{
+       local pool="$1"
+       local dirs dir
+
+       # Verify that the pool isn't already imported
+       # Make as sure as we can to not require '-f' to import.
+       "${ZPOOL}" status "$pool" > /dev/null 2>&1 && return 0
+
+       # For backwards compatibility, make sure that ZPOOL_IMPORT_PATH is set
+       # to something we can use later with the real import(s). We want to
+       # make sure we find all by* dirs, BUT by-vdev should be first (if it
+       # exists).
+       if [ -n "$USE_DISK_BY_ID" -a -z "$ZPOOL_IMPORT_PATH" ]
+       then
+               dirs="$(for dir in $(echo /dev/disk/by-*)
+               do
+                       # Ignore by-vdev here - we want it first!
+                       echo "$dir" | grep -q /by-vdev && continue
+                       [ ! -d "$dir" ] && continue
+
+                       echo -n "$dir:"
+               done | sed 's,:$,,g')"
+
+               if [ -d "/dev/disk/by-vdev" ]
+               then
+                       # Add by-vdev at the beginning.
+                       ZPOOL_IMPORT_PATH="/dev/disk/by-vdev:"
+               fi
+
+               # ... and /dev at the very end, just for good measure.
+               ZPOOL_IMPORT_PATH="$ZPOOL_IMPORT_PATH$dirs:/dev"
+       fi
+
+       # Needs to be exported for "zpool" to catch it.
+       [ -n "$ZPOOL_IMPORT_PATH" ] && export ZPOOL_IMPORT_PATH
+
+
+       [ "$quiet" != "y" ] && zfs_log_begin_msg \
+               "Importing pool '${pool}' using defaults"
+
+       ZFS_CMD="${ZPOOL} import -N ${ZPOOL_FORCE} ${ZPOOL_IMPORT_OPTS}"
+       ZFS_STDERR="$($ZFS_CMD "$pool" 2>&1)"
+       ZFS_ERROR="$?"
+       if [ "${ZFS_ERROR}" != 0 ]
+       then
+               [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}"
+
+               if [ -f "${ZPOOL_CACHE}" ]
+               then
+                       [ "$quiet" != "y" ] && zfs_log_begin_msg \
+                               "Importing pool '${pool}' using cachefile."
+
+                       ZFS_CMD="${ZPOOL} import -c ${ZPOOL_CACHE} -N ${ZPOOL_FORCE} ${ZPOOL_IMPORT_OPTS}"
+                       ZFS_STDERR="$($ZFS_CMD "$pool" 2>&1)"
+                       ZFS_ERROR="$?"
+               fi
+
+               if [ "${ZFS_ERROR}" != 0 ]
+               then
+                       [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}"
+
+                       disable_plymouth
+                       echo ""
+                       echo "Command: ${ZFS_CMD} '$pool'"
+                       echo "Message: $ZFS_STDERR"
+                       echo "Error: $ZFS_ERROR"
+                       echo ""
+                       echo "Failed to import pool '$pool'."
+                       echo "Manually import the pool and exit."
+                       /bin/sh
+               fi
+       fi
+
+       [ "$quiet" != "y" ] && zfs_log_end_msg
+
+       POOL_IMPORTED=1
+       return 0
+}
+
+# Load ZFS modules
+# Loading a module in a initrd require a slightly different approach,
+# with more logging etc.
+load_module_initrd()
+{
+       if [ "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP" > 0 ]
+       then
+               if [ "$quiet" != "y" ]; then
+                       zfs_log_begin_msg "Sleeping for" \
+                               "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP seconds..."
+               fi
+               sleep "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP"
+               [ "$quiet" != "y" ] && zfs_log_end_msg
+       fi
+
+       # Wait for all of the /dev/{hd,sd}[a-z] device nodes to appear.
+       if type wait_for_udev > /dev/null 2>&1 ; then
+               wait_for_udev 10
+       elif type wait_for_dev > /dev/null 2>&1 ; then
+               wait_for_dev
+       fi
+
+       # zpool import refuse to import without a valid mtab
+       [ ! -f /proc/mounts ] && mount proc /proc
+       [ ! -f /etc/mtab ] && cat /proc/mounts > /etc/mtab
+
+       # Load the module
+       load_module "zfs" || return 1
+
+       if [ "$ZFS_INITRD_POST_MODPROBE_SLEEP" > 0 ]
+       then
+               if [ "$quiet" != "y" ]; then
+                       zfs_log_begin_msg "Sleeping for" \
+                               "$ZFS_INITRD_POST_MODPROBE_SLEEP seconds..."
+               fi
+               sleep "$ZFS_INITRD_POST_MODPROBE_SLEEP"
+               [ "$quiet" != "y" ] && zfs_log_end_msg
+       fi
+
+       return 0
+}
+
+# Mount filesystem $1 below ${rootmnt}; drops to a shell if the mount fails.
+mount_fs()
+{
+       local fs="$1"
+       local mountpoint
+
+       # Check that the filesystem exists
+       "${ZFS}" list -oname -tfilesystem -H "${fs}" > /dev/null 2>&1
+       [ "$?" -ne 0 ] && return 1
+
+       # Need the _original_ dataset's mountpoint!
+       mountpoint=$(get_fs_value "$fs" mountpoint)
+       if [ "$mountpoint" = "legacy" -o "$mountpoint" = "none" ]; then
+               # Can't use the mountpoint property. Might be one of our
+               # clones. Check the 'org.zol:mountpoint' property set in
+               # clone_snap() if that's usable.
+               mountpoint=$(get_fs_value "$fs" org.zol:mountpoint)
+               if [ "$mountpoint" = "legacy" -o \
+                   "$mountpoint" = "none" -o \
+                   "$mountpoint" = "-" ]
+               then
+                       if [ "$fs" != "${ZFS_BOOTFS}" ]; then
+                               # We don't have a proper mountpoint, this
+                               # isn't the root fs. So extract the root fs
+                               # value from the filesystem, and we should
+                               # (hopefully!) have a mountpoint we can use.
+                               mountpoint="${fs##$ZFS_BOOTFS}"
+                       else
+                               # Last hail-mary: Hope 'rootmnt' is set!
+                               mountpoint=""
+                       fi
+               fi
+
+               if [ "$mountpoint" = "legacy" ]; then
+                       ZFS_CMD="mount -t zfs"
+               else
+                       # If it's not a legacy filesystem, it can only be a
+                       # native one...
+                       ZFS_CMD="mount -o zfsutil -t zfs"
+               fi
+       else
+               ZFS_CMD="mount -o zfsutil -t zfs"
+       fi
+
+       # Possibly decrypt a filesystem using native encryption.
+       decrypt_fs "$fs"
+
+       [ "$quiet" != "y" ] && \
+           zfs_log_begin_msg "Mounting '${fs}' on '${rootmnt}/${mountpoint}'"
+       [ -n "${ZFS_DEBUG}" ] && \
+           zfs_log_begin_msg "CMD: '$ZFS_CMD ${fs} ${rootmnt}/${mountpoint}'"
+
+       ZFS_STDERR=$(${ZFS_CMD} "${fs}" "${rootmnt}/${mountpoint}" 2>&1)
+       ZFS_ERROR=$?
+       if [ "${ZFS_ERROR}" != 0 ]
+       then
+               [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}"
+
+               disable_plymouth
+               echo ""
+               echo "Command: ${ZFS_CMD} ${fs} ${rootmnt}/${mountpoint}"
+               echo "Message: $ZFS_STDERR"
+               echo "Error: $ZFS_ERROR"
+               echo ""
+               echo "Failed to mount ${fs} on ${rootmnt}/${mountpoint}."
+               echo "Manually mount the filesystem and exit."
+               /bin/sh
+       else
+               [ "$quiet" != "y" ] && zfs_log_end_msg
+       fi
+
+       return 0
+}
+
+# Unlock a natively encrypted ZFS filesystem ($1), if 'zfs key' is supported.
+decrypt_fs()
+{
+       local fs="$1"
+
+       # If the 'zfs key' command isn't available, exit right here.
+       "${ZFS}" 2>&1 | grep -q 'key -l ' || return 0
+
+       # Check if filesystem is encrypted. If not, exit right here.
+       [ "$(get_fs_value "$fs" encryption)" != "off" ] || return 0
+
+       [ "$quiet" != "y" ] && \
+           zfs_log_begin_msg "Loading crypto wrapper key for $fs"
+
+       # Just make sure that ALL crypto modules are loaded.
+       # Simplest just to load all...
+       for mod in sun-ccm sun-gcm sun-ctr
+       do
+               [ "$quiet" != "y" ] && zfs_log_progress_msg "${mod} "
+
+               ZFS_CMD="load_module $mod"
+               ZFS_STDERR="$(${ZFS_CMD} 2>&1)"
+               ZFS_ERROR="$?"
+
+               if [ "${ZFS_ERROR}" != 0 ]
+               then
+                       [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}"
+
+                       disable_plymouth
+                       echo ""
+                       echo "Command: $ZFS_CMD"
+                       echo "Message: $ZFS_STDERR"
+                       echo "Error: $ZFS_ERROR"
+                       echo ""
+                       echo "Failed to load $mod module."
+                       echo "Please verify that it is availible on the initrd image"
+                       echo "(without it it won't be possible to unlock the filesystem)"
+                       echo "and rerun:  $ZFS_CMD"
+                       /bin/sh
+               else
+                       [ "$quiet" != "y" ] && zfs_log_end_msg
+               fi
+       done
+
+       # If the key isn't available, then this will fail!
+       ZFS_CMD="${ZFS} key -l -r $fs"
+       ZFS_STDERR="$(${ZFS_CMD} 2>&1)"
+       ZFS_ERROR="$?"
+
+       if [ "${ZFS_ERROR}" != 0 ]
+       then
+               [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}"
+
+               disable_plymouth
+               echo ""
+               echo "Command: $ZFS_CMD"
+               echo "Message: $ZFS_STDERR"
+               echo "Error: $ZFS_ERROR"
+               echo ""
+               echo "Failed to load zfs encryption wrapper key (s)."
+               echo "Please verify dataset property 'keysource' for datasets"
+               echo "and rerun:  $ZFS_CMD"
+               /bin/sh
+       else
+               [ "$quiet" != "y" ] && zfs_log_end_msg
+       fi
+
+       return 0
+}
+
+# Destroy filesystem $1; drops to a shell if 'zfs destroy' fails. Always returns 0.
+destroy_fs()
+{
+       local fs="$1"
+
+       [ "$quiet" != "y" ] && \
+           zfs_log_begin_msg "Destroying '$fs'"
+
+       ZFS_CMD="${ZFS} destroy $fs"
+       ZFS_STDERR="$(${ZFS_CMD} 2>&1)"
+       ZFS_ERROR="$?"
+       if [ "${ZFS_ERROR}" != 0 ]
+       then
+               [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}"
+
+               disable_plymouth
+               echo ""
+               echo "Command: $ZFS_CMD"
+               echo "Message: $ZFS_STDERR"
+               echo "Error: $ZFS_ERROR"
+               echo ""
+               echo "Failed to destroy '$fs'. Please make sure that '$fs' is not availible."
+               echo "Hint: Try:  zfs destroy -Rfn $fs"
+               echo "If this dryrun looks good, then remove the 'n' from '-Rfn' and try again."
+               /bin/sh
+       else
+               [ "$quiet" != "y" ] && zfs_log_end_msg
+       fi
+
+       return 0
+}
+
+# Clone snapshot $1 to destination filesystem $2, remembering mountpoint $3.
+# Set 'canmount=noauto' and 'mountpoint=none' so that we get to keep
+# manual control over its mounting (i.e., make sure it's not automatically
+# mounted with a 'zfs mount -a' in the init/systemd scripts).
+clone_snap()
+{
+       local snap="$1"
+       local destfs="$2"
+       local mountpoint="$3"
+
+       [ "$quiet" != "y" ] && zfs_log_begin_msg "Cloning '$snap' to '$destfs'"
+
+       # Clone the snapshot into a dataset we can boot from
+       # + We don't want this filesystem to be automatically mounted, we
+       #   want control over this here and nowhere else.
+       # + We don't need any mountpoint set for the same reason.
+       # We use the 'org.zol:mountpoint' property to remember the mountpoint.
+       ZFS_CMD="${ZFS} clone -o canmount=noauto -o mountpoint=none"
+       ZFS_CMD="${ZFS_CMD} -o org.zol:mountpoint=${mountpoint}"
+       ZFS_CMD="${ZFS_CMD} $snap $destfs"
+       ZFS_STDERR="$(${ZFS_CMD} 2>&1)"
+       ZFS_ERROR="$?"
+       if [ "${ZFS_ERROR}" != 0 ]
+       then
+               [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}"
+
+               disable_plymouth
+               echo ""
+               echo "Command: $ZFS_CMD"
+               echo "Message: $ZFS_STDERR"
+               echo "Error: $ZFS_ERROR"
+               echo ""
+               echo "Failed to clone snapshot."
+               echo "Make sure that the any problems are corrected and then make sure"
+               echo "that the dataset '$destfs' exists and is bootable."
+               /bin/sh
+       else
+               [ "$quiet" != "y" ] && zfs_log_end_msg
+       fi
+
+       return 0
+}
+
+# Roll back to snapshot $1 ('zfs rollback -Rf'); drops to a shell on failure.
+rollback_snap()
+{
+       local snap="$1"
+
+       [ "$quiet" != "y" ] && zfs_log_begin_msg "Rollback $snap"
+
+       ZFS_CMD="${ZFS} rollback -Rf $snap"
+       ZFS_STDERR="$(${ZFS_CMD} 2>&1)"
+       ZFS_ERROR="$?"
+       if [ "${ZFS_ERROR}" != 0 ]
+       then
+               [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}"
+
+               disable_plymouth
+               echo ""
+               echo "Command: $ZFS_CMD"
+               echo "Message: $ZFS_STDERR"
+               echo "Error: $ZFS_ERROR"
+               echo ""
+               echo "Failed to rollback snapshot."
+               /bin/sh
+       else
+               [ "$quiet" != "y" ] && zfs_log_end_msg
+       fi
+
+       return 0
+}
+
+# Get a list of snapshots, give them as a numbered list
+# to the user to choose from.
+ask_user_snap()
+{
+       local fs="$1"
+       local i=1
+       local SNAP snapnr snap debug
+
+       # We need to temporarily disable debugging. Set 'debug' so we
+       # remember to enabled it again.
+       if [ -n "${ZFS_DEBUG}" ]; then
+               unset ZFS_DEBUG
+               set +x
+               debug=1
+       fi
+
+       # Because we need the resulting snapshot, which is sent on
+       # stdout to the caller, we use stderr for our questions.
+       echo "What snapshot do you want to boot from?" > /dev/stderr
+       while read snap; do
+           echo "  $i: ${snap}" > /dev/stderr
+           eval `echo SNAP_$i=$snap`
+           i=$((i + 1))
+       done <<EOT
+$("${ZFS}" list -H -oname -tsnapshot "${fs}")
+EOT
+
+       echo -n "  Snap nr [0-$((i-1))]? " > /dev/stderr
+       read snapnr
+
+       # Reenable debugging.
+       if [ -n "${debug}" ]; then
+               ZFS_DEBUG=1
+               set -x
+       fi
+
+       echo "$(eval echo "$"SNAP_$snapnr)"
+}
+
+setup_snapshot_booting()
+{
+       local snap="$1"
+       local s destfs subfs mountpoint retval=0 filesystems fs
+
+       # Make sure that the snapshot specified actually exists.
+       if [ ! $(get_fs_value "${snap}" type) ]
+       then
+               # Snapshot does not exist (...@<null> ?)
+               # ask the user for a snapshot to use.
+               snap="$(ask_user_snap "${snap%%@*}")"
+       fi
+
+       # Separate the full snapshot ('$snap') into its filesystem and
+       # snapshot names. Would have been nice with a split() function..
+       rootfs="${snap%%@*}"
+       snapname="${snap##*@}"
+       ZFS_BOOTFS="${rootfs}_${snapname}"
+
+       if ! grep -qiE '(^|[^\\](\\\\)* )(rollback)=(on|yes|1)( |$)' /proc/cmdline
+       then
+               # If the destination dataset for the clone
+               # already exists, destroy it, recursively.
+               if [ $(get_fs_value "${rootfs}_${snapname}" type) ]; then
+                       filesystems=$("${ZFS}" list -oname -tfilesystem -H \
+                           -r -Sname "${ZFS_BOOTFS}")
+                       for fs in $filesystems; do
+                               destroy_fs "${fs}"
+                       done
+               fi
+       fi
+
+       # Get all snapshots, recursively (might need to clone /usr, /var etc
+       # as well). Anchor the match so '@snap1' does not also pick '@snap10'.
+       for s in $("${ZFS}" list -H -oname -tsnapshot -r "${rootfs}" | \
+           grep "@${snapname}\$")
+       do
+               if grep -qiE '(^|[^\\](\\\\)* )(rollback)=(on|yes|1)( |$)' /proc/cmdline
+               then
+                       # Rollback snapshot
+                       rollback_snap "$s" || retval=$((retval + 1))
+               else
+                       # Setup a destination filesystem name.
+                       # Ex: Called with 'rpool/ROOT/debian@snap2'
+                       #       rpool/ROOT/debian@snap2         => rpool/ROOT/debian_snap2
+                       #       rpool/ROOT/debian/boot@snap2    => rpool/ROOT/debian_snap2/boot
+                       #       rpool/ROOT/debian/usr@snap2     => rpool/ROOT/debian_snap2/usr
+                       #       rpool/ROOT/debian/var@snap2     => rpool/ROOT/debian_snap2/var
+                       subfs="${s##$rootfs}"
+                       subfs="${subfs%%@$snapname}"
+
+                       destfs="${rootfs}_${snapname}" # base fs.
+                       [ -n "$subfs" ] && destfs="${destfs}$subfs" # + sub fs.
+
+                       # Get the mountpoint of the filesystem, to be used
+                       # with clone_snap(). If legacy or none, then use
+                       # the sub fs value.
+                       mountpoint=$(get_fs_value "${s%%@*}" mountpoint)
+                       if [ "$mountpoint" = "legacy" -o \
+                           "$mountpoint" = "none" ]
+                       then
+                               if [ -n "${subfs}" ]; then
+                                       mountpoint="${subfs}"
+                               else
+                                       mountpoint="/"
+                               fi
+                       fi
+
+                       # Clone the snapshot into its own
+                       # filesystem
+                       clone_snap "$s" "${destfs}" "${mountpoint}" || \
+                           retval=$((retval + 1))
+               fi
+       done
+
+       # Number of rollback/clone calls that reported failure (0 = all fine).
+       return "${retval}"
+}
+
+# ================================================================
+
+# This is the main function: import the root pool and mount the root fs.
+mountroot()
+{
+       local snaporig snapsub destfs pool POOLS
+
+       # ----------------------------------------------------------------
+       # I N I T I A L   S E T U P
+
+       # ------------
+       # Run the pre-mount scripts from /scripts/local-top.
+       pre_mountroot
+
+       # ------------
+       # Source the default setup variables.
+       [ -r '/etc/default/zfs' ] && . /etc/default/zfs
+
+       # ------------
+       # Support debug option
+       if grep -qiE '(^|[^\\](\\\\)* )(zfs_debug|zfs\.debug|zfsdebug)=(on|yes|1)( |$)' /proc/cmdline
+       then
+               ZFS_DEBUG=1
+               mkdir -p /var/log
+               #exec 2> /var/log/boot.debug
+               set -x
+       fi
+
+       # ------------
+       # Load ZFS module etc.
+       if ! load_module_initrd; then
+               disable_plymouth
+               echo ""
+               echo "Failed to load ZFS modules."
+               echo "Manually load the modules and exit."
+               /bin/sh
+       fi
+
+       # ------------
+       # Look for the cache file (if any).
+       [ ! -f "${ZPOOL_CACHE}" ] && unset ZPOOL_CACHE
+
+       # ------------
+       # Compatibility: 'ROOT' is for Debian GNU/Linux (etc),
+       #                'root' is for Redhat/Fedora (etc),
+       #                'REAL_ROOT' is for Gentoo
+       if [ -z "$ROOT" ]
+       then
+               [ -n "$root" ] && ROOT=${root}
+
+               [ -n "$REAL_ROOT" ] && ROOT=${REAL_ROOT}
+       fi
+
+       # ------------
+       # Where to mount the root fs in the initrd - set outside this script
+       # Compatibility: 'rootmnt' is for Debian GNU/Linux (etc),
+       #                'NEWROOT' is for RedHat/Fedora (etc),
+       #                'NEW_ROOT' is for Gentoo
+       if [ -z "$rootmnt" ]
+       then
+               [ -n "$NEWROOT" ] && rootmnt=${NEWROOT}
+
+               [ -n "$NEW_ROOT" ] && rootmnt=${NEW_ROOT}
+       fi
+
+       # ------------
+       # No longer set in the defaults file, but it could have been set in
+       # get_pools() in some circumstances. If it's something, but not 'yes',
+       # it's no good to us.
+       [ -n "$USE_DISK_BY_ID" -a "$USE_DISK_BY_ID" != 'yes' ] && \
+           unset USE_DISK_BY_ID
+
+       # ----------------------------------------------------------------
+       # P A R S E   C O M M A N D   L I N E   O P T I O N S
+
+       # This part is the really ugly part - there's so many options and permutations
+       # 'out there', and if we should make this the 'primary' source for ZFS initrd
+       # scripting, we need/should support them all.
+       #
+       # Supports the following kernel command line argument combinations
+       # (in this order - first match wins):
+       #
+       #       rpool=<pool>                    (tries to find bootfs automatically)
+       #       bootfs=<pool>/<dataset>         (uses this for rpool - first part)
+       #       rpool=<pool> bootfs=<pool>/<dataset>
+       #       -B zfs-bootfs=<pool>/<fs>       (uses this for rpool - first part)
+       #       rpool=rpool                     (default if none of the above is used)
+       #       root=<pool>/<dataset>           (uses this for rpool - first part)
+       #       root=ZFS=<pool>/<dataset>       (uses this for rpool - first part, without 'ZFS=')
+       #       root=zfs:AUTO                   (tries to detect both pool and rootfs)
+       #       root=zfs:<pool>/<dataset>       (uses this for rpool - first part, without 'zfs:')
+       #
+       # Option <dataset> could also be <snapshot>
+
+       # ------------
+       # Support force option
+       # In addition, setting one of zfs_force, zfs.force or zfsforce to
+       # 'yes', 'on' or '1' will make sure we force import the pool.
+       # This should (almost) never be needed, but it's here for
+       # completeness.
+       ZPOOL_FORCE=""
+       if grep -qiE '(^|[^\\](\\\\)* )(zfs_force|zfs\.force|zfsforce)=(on|yes|1)( |$)' /proc/cmdline
+       then
+               ZPOOL_FORCE="-f"
+       fi
+
+       # ------------
+       # Look for 'rpool' and 'bootfs' parameter
+       [ -n "$rpool" ] && ZFS_RPOOL="${rpool#rpool=}"
+       [ -n "$bootfs" ] && ZFS_BOOTFS="${bootfs#bootfs=}"
+
+       # ------------
+       # If we have 'ROOT' (see above), but not 'ZFS_BOOTFS', then use
+       # 'ROOT' (a literal 'zfs:AUTO' is cleared again in the AUTO branch).
+       [ -n "$ROOT" -a -z "${ZFS_BOOTFS}" ] && ZFS_BOOTFS="$ROOT"
+
+       # ------------
+       # Check for the `-B zfs-bootfs=%s/%u,...` kind of parameter.
+       # NOTE: Only use the pool name and dataset. The rest is not
+       #       supported by ZoL (whatever it's for).
+       if [ -z "$ZFS_RPOOL" ]
+       then
+               # The ${zfs-bootfs} variable is set at the kernel command
+               # line, usually by GRUB, but it cannot be referenced here
+               # directly because bourne variable names cannot contain a
+               # hyphen.
+               #
+               # Reassign the variable by dumping the environment and
+               # stripping the zfs-bootfs= prefix.  Let the shell handle
+               # quoting through the eval command.
+               eval ZFS_RPOOL=$(set | sed -n -e 's,^zfs-bootfs=,,p')
+       fi
+
+       # ------------
+       # No root fs or pool specified - do auto detect.
+       if [ -z "$ZFS_RPOOL" -a -z "${ZFS_BOOTFS}" ]
+       then
+               # Do auto detect. Do this by 'cheating' - set 'root=zfs:AUTO'
+               # which will be caught later
+               ROOT=zfs:AUTO
+       fi
+
+       # ----------------------------------------------------------------
+       # F I N D   A N D   I M P O R T   C O R R E C T   P O O L
+
+       # ------------
+       if [ "$ROOT" = "zfs:AUTO" ]
+       then
+               # Try to detect both pool and root fs ('zfs:AUTO' is no dataset).
+               [ "${ZFS_BOOTFS}" = "zfs:AUTO" ] && ZFS_BOOTFS=""
+               [ "$quiet" != "y" ] && \
+                   zfs_log_begin_msg "Attempting to import additional pools."
+
+               # Get a list of pools available for import
+               if [ -n "$ZFS_RPOOL" ]
+               then
+                       # We've specified a pool - check only that
+                       POOLS=$ZFS_RPOOL
+               else
+                       POOLS=$(get_pools)
+               fi
+
+               OLD_IFS="$IFS" ; IFS=";"
+               for pool in $POOLS
+               do
+                       [ -z "$pool" ] && continue
+
+                       import_pool "$pool"
+                       find_rootfs "$pool"
+               done
+               IFS="$OLD_IFS"
+
+               [ "$quiet" != "y" ] && zfs_log_end_msg $ZFS_ERROR
+       else
+               # No auto - use value from the command line option.
+
+               # Strip 'zfs:' and 'ZFS='.
+               ZFS_BOOTFS="${ROOT#*[:=]}"
+
+               # Strip everything after the first slash.
+               ZFS_RPOOL="${ZFS_BOOTFS%%/*}"
+       fi
+
+       # Import the pool (if not already done so in the AUTO check above).
+       if [ -n "$ZFS_RPOOL" -a -z "${POOL_IMPORTED}" ]
+       then
+               [ "$quiet" != "y" ] && \
+                   zfs_log_begin_msg "Importing ZFS root pool '$ZFS_RPOOL'"
+
+               import_pool "${ZFS_RPOOL}"
+               find_rootfs "${ZFS_RPOOL}"
+
+               [ "$quiet" != "y" ] && zfs_log_end_msg
+       fi
+
+       if [ -z "${POOL_IMPORTED}" ]
+       then
+               # No pool imported, this is serious!
+               disable_plymouth
+               echo ""
+               echo "Command: $ZFS_CMD"
+               echo "Message: $ZFS_STDERR"
+               echo "Error: $ZFS_ERROR"
+               echo ""
+               echo "No pool imported. Manually import the root pool"
+               echo "at the command prompt and then exit."
+               echo "Hint: Try:  zpool import -R ${rootmnt} -N ${ZFS_RPOOL}"
+               /bin/sh
+       fi
+
+       # ----------------------------------------------------------------
+       # P R E P A R E   R O O T   F I L E S Y S T E M
+
+       if [ -n "${ZFS_BOOTFS}" ]
+       then
+               # Booting from a snapshot?
+               # Will overwrite the ZFS_BOOTFS variable like so:
+               #   rpool/ROOT/debian@snap2 => rpool/ROOT/debian_snap2
+               echo "${ZFS_BOOTFS}" | grep -q '@' && \
+                   setup_snapshot_booting "${ZFS_BOOTFS}"
+       fi
+
+       if [ -z "${ZFS_BOOTFS}" ]
+       then
+               # Still nothing! Let the user sort this out.
+               disable_plymouth
+               echo ""
+               echo "Error: Unknown root filesystem - no 'bootfs' pool property and"
+               echo "       not specified on the kernel command line."
+               echo ""
+               echo "Manually mount the root filesystem on $rootmnt and then exit."
+               echo "Hint: Try:  mount -o zfsutil -t zfs ${ZFS_RPOOL-rpool}/ROOT/system $rootmnt"
+               /bin/sh
+       fi
+
+       # ----------------------------------------------------------------
+       # M O U N T   F I L E S Y S T E M S
+
+       # * Ideally, the root filesystem would be mounted like this:
+       #
+       #     zpool import -R "$rootmnt" -N "$ZFS_RPOOL"
+       #     zfs mount -o mountpoint=/ "${ZFS_BOOTFS}"
+       #
+       #   but the MOUNTPOINT prefix is preserved on descendent filesystem
+       #   after the pivot into the regular root, which later breaks things
+       #   like `zfs mount -a` and the /etc/mtab refresh.
+       #
+       # * Mount additional filesystems required
+       #   Such as /usr, /var, /usr/local etc.
+       #   NOTE: Mounted in the order specified in the
+       #         ZFS_INITRD_ADDITIONAL_DATASETS variable so take care!
+
+       # Go through the complete list (recursively) of all filesystems below
+       # the real root dataset
+       filesystems=$("${ZFS}" list -oname -tfilesystem -H -r "${ZFS_BOOTFS}")
+       for fs in $filesystems $ZFS_INITRD_ADDITIONAL_DATASETS
+       do
+               mount_fs "$fs"
+       done
+
+       # ------------
+       # Debugging information
+       if [ -n "${ZFS_DEBUG}" ]
+       then
+               #exec 2>&1-
+
+               echo "DEBUG: imported pools:"
+               "${ZPOOL}" list -H
+               echo
+
+               echo "DEBUG: mounted ZFS filesystems:"
+               mount | grep zfs
+               echo
+
+               echo "=> waiting for ENTER before continuing because of 'zfsdebug=1'. "
+               echo -n "   'c' for shell, 'r' for reboot, 'ENTER' to continue. "
+               read b
+
+               [ "$b" = "c" ] && /bin/sh
+               [ "$b" = "r" ] && reboot -f
+
+               set +x
+       fi
+
+       # ------------
+       # Run local bottom script
+       if type run_scripts > /dev/null 2>&1 && \
+           [ -f "/scripts/local-bottom" -o -d "/scripts/local-bottom" ]
+       then
+               [ "$quiet" != "y" ] && \
+                   zfs_log_begin_msg "Running /scripts/local-bottom"
+               run_scripts /scripts/local-bottom
+               [ "$quiet" != "y" ] && zfs_log_end_msg
+       fi
+}
index 94bd530bd3c9674459625dda93b06757cd6808d4..36816bc53e2f6d05e91b0a2d60b878c4d20a97b4 100644 (file)
@@ -1,7 +1,7 @@
 BUILD_DEPENDS[0]="spl"
 AUTOINSTALL="yes"
 PACKAGE_NAME="zfs"
-PACKAGE_VERSION="0.6.4.2"
+PACKAGE_VERSION="0.6.5.3"
 PRE_BUILD="configure
   --prefix=/usr
   --with-config=kernel
diff --git a/zfs/dracut/90zfs/Makefile.in b/zfs/dracut/90zfs/Makefile.in
deleted file mode 100644 (file)
index b3e8c13..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-%:
-       #
diff --git a/zfs/dracut/90zfs/export-zfs.sh.in b/zfs/dracut/90zfs/export-zfs.sh.in
deleted file mode 100644 (file)
index b3e8c13..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-%:
-       #
diff --git a/zfs/dracut/90zfs/module-setup.sh.in b/zfs/dracut/90zfs/module-setup.sh.in
deleted file mode 100644 (file)
index b3e8c13..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-%:
-       #
diff --git a/zfs/dracut/90zfs/mount-zfs.sh.in b/zfs/dracut/90zfs/mount-zfs.sh.in
deleted file mode 100644 (file)
index b3e8c13..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-%:
-       #
diff --git a/zfs/dracut/90zfs/parse-zfs.sh.in b/zfs/dracut/90zfs/parse-zfs.sh.in
deleted file mode 100644 (file)
index b3e8c13..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-%:
-       #
diff --git a/zfs/dracut/Makefile.in b/zfs/dracut/Makefile.in
deleted file mode 100644 (file)
index b3e8c13..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-%:
-       #
diff --git a/zfs/etc/init.d/zfs-functions.in b/zfs/etc/init.d/zfs-functions.in
new file mode 100644 (file)
index 0000000..b3e8c13
--- /dev/null
@@ -0,0 +1,2 @@
+%:
+       #
diff --git a/zfs/etc/init.d/zfs-import.in b/zfs/etc/init.d/zfs-import.in
new file mode 100644 (file)
index 0000000..b3e8c13
--- /dev/null
@@ -0,0 +1,2 @@
+%:
+       #
diff --git a/zfs/etc/init.d/zfs-mount.in b/zfs/etc/init.d/zfs-mount.in
new file mode 100644 (file)
index 0000000..b3e8c13
--- /dev/null
@@ -0,0 +1,2 @@
+%:
+       #
diff --git a/zfs/etc/init.d/zfs-share.in b/zfs/etc/init.d/zfs-share.in
new file mode 100644 (file)
index 0000000..b3e8c13
--- /dev/null
@@ -0,0 +1,2 @@
+%:
+       #
diff --git a/zfs/etc/init.d/zfs-zed.in b/zfs/etc/init.d/zfs-zed.in
new file mode 100644 (file)
index 0000000..b3e8c13
--- /dev/null
@@ -0,0 +1,2 @@
+%:
+       #
diff --git a/zfs/etc/init.d/zfs.fedora.in b/zfs/etc/init.d/zfs.fedora.in
deleted file mode 100644 (file)
index b3e8c13..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-%:
-       #
diff --git a/zfs/etc/init.d/zfs.gentoo.in b/zfs/etc/init.d/zfs.gentoo.in
deleted file mode 100644 (file)
index b3e8c13..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-%:
-       #
diff --git a/zfs/etc/init.d/zfs.in b/zfs/etc/init.d/zfs.in
new file mode 100644 (file)
index 0000000..b3e8c13
--- /dev/null
@@ -0,0 +1,2 @@
+%:
+       #
diff --git a/zfs/etc/init.d/zfs.lsb.in b/zfs/etc/init.d/zfs.lsb.in
deleted file mode 100644 (file)
index b3e8c13..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-%:
-       #
diff --git a/zfs/etc/init.d/zfs.lunar.in b/zfs/etc/init.d/zfs.lunar.in
deleted file mode 100644 (file)
index b3e8c13..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-%:
-       #
diff --git a/zfs/etc/init.d/zfs.redhat.in b/zfs/etc/init.d/zfs.redhat.in
deleted file mode 100644 (file)
index b3e8c13..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-%:
-       #
index 6cd483d999de07eb20af0797d7a4013570daea6e..4deef36cc4dc2bb1afebe387ac3f93e2320447b3 100644 (file)
@@ -91,7 +91,7 @@ host_triplet = @host@
 target_triplet = @target@
 subdir = include
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps =  \
+am__aclocal_m4_deps = $(top_srcdir)/config/always-no-bool-compare.m4 \
        $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \
        $(top_srcdir)/config/dkms.m4 \
        $(top_srcdir)/config/kernel-acl.m4 \
@@ -103,20 +103,11 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-bio-bvec-iter.m4 \
        $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \
        $(top_srcdir)/config/kernel-bio-failfast.m4 \
-       $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \
-       $(top_srcdir)/config/kernel-blk-end-request.m4 \
-       $(top_srcdir)/config/kernel-blk-fetch-request.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-discard.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-barrier.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-discard.m4 \
        $(top_srcdir)/config/kernel-blk-queue-flush.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \
-       $(top_srcdir)/config/kernel-blk-requeue-request.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-pos.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \
        $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \
        $(top_srcdir)/config/kernel-blkdev-get.m4 \
        $(top_srcdir)/config/kernel-block-device-operations-release-void.m4 \
@@ -124,6 +115,7 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-clear-inode.m4 \
        $(top_srcdir)/config/kernel-commit-metadata.m4 \
        $(top_srcdir)/config/kernel-create-nameidata.m4 \
+       $(top_srcdir)/config/kernel-current_bio_tail.m4 \
        $(top_srcdir)/config/kernel-d-make-root.m4 \
        $(top_srcdir)/config/kernel-d-obtain-alias.m4 \
        $(top_srcdir)/config/kernel-d-prune-aliases.m4 \
@@ -137,23 +129,25 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-fallocate.m4 \
        $(top_srcdir)/config/kernel-file-inode.m4 \
        $(top_srcdir)/config/kernel-fmode-t.m4 \
+       $(top_srcdir)/config/kernel-follow-down-one.m4 \
        $(top_srcdir)/config/kernel-follow-link-nameidata.m4 \
        $(top_srcdir)/config/kernel-fsync.m4 \
+       $(top_srcdir)/config/kernel-generic_io_acct.m4 \
        $(top_srcdir)/config/kernel-get-disk-ro.m4 \
        $(top_srcdir)/config/kernel-get-gendisk.m4 \
        $(top_srcdir)/config/kernel-insert-inode-locked.m4 \
        $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \
        $(top_srcdir)/config/kernel-is_owner_or_cap.m4 \
+       $(top_srcdir)/config/kernel-kmap-atomic-args.m4 \
        $(top_srcdir)/config/kernel-kobj-name-len.m4 \
        $(top_srcdir)/config/kernel-lookup-bdev.m4 \
        $(top_srcdir)/config/kernel-lookup-nameidata.m4 \
        $(top_srcdir)/config/kernel-lseek-execute.m4 \
+       $(top_srcdir)/config/kernel-mk-request-fn.m4 \
        $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \
        $(top_srcdir)/config/kernel-mount-nodev.m4 \
        $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \
        $(top_srcdir)/config/kernel-put-link-nameidata.m4 \
-       $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \
-       $(top_srcdir)/config/kernel-rq-is_sync.m4 \
        $(top_srcdir)/config/kernel-security-inode-init.m4 \
        $(top_srcdir)/config/kernel-set-nlink.m4 \
        $(top_srcdir)/config/kernel-sget-args.m4 \
@@ -346,9 +340,11 @@ DEBUG_CFLAGS = @DEBUG_CFLAGS@
 DEBUG_DMU_TX = @DEBUG_DMU_TX@
 DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@
 DEBUG_ZFS = @DEBUG_ZFS@
+DEFAULT_INITCONF_DIR = @DEFAULT_INITCONF_DIR@
 DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@
 DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@
 DEFAULT_PACKAGE = @DEFAULT_PACKAGE@
+DEFINE_INITRAMFS = @DEFINE_INITRAMFS@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
 DLLTOOL = @DLLTOOL@
@@ -398,6 +394,7 @@ MANIFEST_TOOL = @MANIFEST_TOOL@
 MKDIR_P = @MKDIR_P@
 NM = @NM@
 NMEDIT = @NMEDIT@
+NO_BOOL_COMPARE = @NO_BOOL_COMPARE@
 NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@
 OBJDUMP = @OBJDUMP@
 OBJEXT = @OBJEXT@
index e6a877214a642fd1e8ac1709e46ea79766cf9ab1..db8ee7167e1789b730efe133d3b19364c861be37 100644 (file)
@@ -203,6 +203,7 @@ extern void zfs_save_arguments(int argc, char **, char *, int);
 extern int zpool_log_history(libzfs_handle_t *, const char *);
 
 extern int libzfs_errno(libzfs_handle_t *);
+extern const char *libzfs_error_init(int);
 extern const char *libzfs_error_action(libzfs_handle_t *);
 extern const char *libzfs_error_description(libzfs_handle_t *);
 extern int zfs_standard_error(libzfs_handle_t *, int, const char *);
@@ -617,6 +618,9 @@ typedef struct sendflags {
        /* show progress (ie. -v) */
        boolean_t progress;
 
+       /* large blocks (>128K) are permitted */
+       boolean_t largeblock;
+
        /* WRITE_EMBEDDED records of type DATA are permitted */
        boolean_t embed_data;
 } sendflags_t;
@@ -746,7 +750,6 @@ extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *);
 #define        STDERR_VERBOSE  0x02
 
 int libzfs_run_process(const char *, char **, int flags);
-int libzfs_load_module(const char *);
 
 /*
  * Given a device or file, determine if it is part of a pool.
index d7d767055d3358173aa894488803ac4155b55f49..bdd6c951ee496dc1e21a297e7a69b1342aecf79b 100644 (file)
@@ -53,7 +53,8 @@ int lzc_release(nvlist_t *, nvlist_t **);
 int lzc_get_holds(const char *, nvlist_t **);
 
 enum lzc_send_flags {
-       LZC_SEND_FLAG_EMBED_DATA = 1 << 0
+       LZC_SEND_FLAG_EMBED_DATA = 1 << 0,
+       LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1
 };
 
 int lzc_send(const char *, const char *, int, enum lzc_send_flags);
index d00b1c8ad798c23b2642ce88be62555ccfb3bec7..595d1db01128d4176f8c2eb3869a66688e90b074 100644 (file)
@@ -5,7 +5,8 @@ KERNEL_H = \
        $(top_srcdir)/include/linux/xattr_compat.h \
        $(top_srcdir)/include/linux/vfs_compat.h \
        $(top_srcdir)/include/linux/blkdev_compat.h \
-       $(top_srcdir)/include/linux/utsname_compat.h
+       $(top_srcdir)/include/linux/utsname_compat.h \
+       $(top_srcdir)/include/linux/kmap_compat.h
 
 USER_H =
 
index abbaeff8d0085d983af5d62b2103cb7e7700d2ac..1e7710a6d68a955943c23f5819d17fe41cb1391b 100644 (file)
@@ -91,7 +91,7 @@ host_triplet = @host@
 target_triplet = @target@
 subdir = include/linux
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps =  \
+am__aclocal_m4_deps = $(top_srcdir)/config/always-no-bool-compare.m4 \
        $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \
        $(top_srcdir)/config/dkms.m4 \
        $(top_srcdir)/config/kernel-acl.m4 \
@@ -103,20 +103,11 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-bio-bvec-iter.m4 \
        $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \
        $(top_srcdir)/config/kernel-bio-failfast.m4 \
-       $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \
-       $(top_srcdir)/config/kernel-blk-end-request.m4 \
-       $(top_srcdir)/config/kernel-blk-fetch-request.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-discard.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-barrier.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-discard.m4 \
        $(top_srcdir)/config/kernel-blk-queue-flush.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \
-       $(top_srcdir)/config/kernel-blk-requeue-request.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-pos.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \
        $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \
        $(top_srcdir)/config/kernel-blkdev-get.m4 \
        $(top_srcdir)/config/kernel-block-device-operations-release-void.m4 \
@@ -124,6 +115,7 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-clear-inode.m4 \
        $(top_srcdir)/config/kernel-commit-metadata.m4 \
        $(top_srcdir)/config/kernel-create-nameidata.m4 \
+       $(top_srcdir)/config/kernel-current_bio_tail.m4 \
        $(top_srcdir)/config/kernel-d-make-root.m4 \
        $(top_srcdir)/config/kernel-d-obtain-alias.m4 \
        $(top_srcdir)/config/kernel-d-prune-aliases.m4 \
@@ -137,23 +129,25 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-fallocate.m4 \
        $(top_srcdir)/config/kernel-file-inode.m4 \
        $(top_srcdir)/config/kernel-fmode-t.m4 \
+       $(top_srcdir)/config/kernel-follow-down-one.m4 \
        $(top_srcdir)/config/kernel-follow-link-nameidata.m4 \
        $(top_srcdir)/config/kernel-fsync.m4 \
+       $(top_srcdir)/config/kernel-generic_io_acct.m4 \
        $(top_srcdir)/config/kernel-get-disk-ro.m4 \
        $(top_srcdir)/config/kernel-get-gendisk.m4 \
        $(top_srcdir)/config/kernel-insert-inode-locked.m4 \
        $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \
        $(top_srcdir)/config/kernel-is_owner_or_cap.m4 \
+       $(top_srcdir)/config/kernel-kmap-atomic-args.m4 \
        $(top_srcdir)/config/kernel-kobj-name-len.m4 \
        $(top_srcdir)/config/kernel-lookup-bdev.m4 \
        $(top_srcdir)/config/kernel-lookup-nameidata.m4 \
        $(top_srcdir)/config/kernel-lseek-execute.m4 \
+       $(top_srcdir)/config/kernel-mk-request-fn.m4 \
        $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \
        $(top_srcdir)/config/kernel-mount-nodev.m4 \
        $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \
        $(top_srcdir)/config/kernel-put-link-nameidata.m4 \
-       $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \
-       $(top_srcdir)/config/kernel-rq-is_sync.m4 \
        $(top_srcdir)/config/kernel-security-inode-init.m4 \
        $(top_srcdir)/config/kernel-set-nlink.m4 \
        $(top_srcdir)/config/kernel-sget-args.m4 \
@@ -213,7 +207,8 @@ am__kernel_HEADERS_DIST = $(top_srcdir)/include/linux/dcache_compat.h \
        $(top_srcdir)/include/linux/xattr_compat.h \
        $(top_srcdir)/include/linux/vfs_compat.h \
        $(top_srcdir)/include/linux/blkdev_compat.h \
-       $(top_srcdir)/include/linux/utsname_compat.h
+       $(top_srcdir)/include/linux/utsname_compat.h \
+       $(top_srcdir)/include/linux/kmap_compat.h
 am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
 am__vpath_adj = case $$p in \
     $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
@@ -287,9 +282,11 @@ DEBUG_CFLAGS = @DEBUG_CFLAGS@
 DEBUG_DMU_TX = @DEBUG_DMU_TX@
 DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@
 DEBUG_ZFS = @DEBUG_ZFS@
+DEFAULT_INITCONF_DIR = @DEFAULT_INITCONF_DIR@
 DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@
 DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@
 DEFAULT_PACKAGE = @DEFAULT_PACKAGE@
+DEFINE_INITRAMFS = @DEFINE_INITRAMFS@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
 DLLTOOL = @DLLTOOL@
@@ -339,6 +336,7 @@ MANIFEST_TOOL = @MANIFEST_TOOL@
 MKDIR_P = @MKDIR_P@
 NM = @NM@
 NMEDIT = @NMEDIT@
+NO_BOOL_COMPARE = @NO_BOOL_COMPARE@
 NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@
 OBJDUMP = @OBJDUMP@
 OBJEXT = @OBJEXT@
@@ -463,7 +461,8 @@ KERNEL_H = \
        $(top_srcdir)/include/linux/xattr_compat.h \
        $(top_srcdir)/include/linux/vfs_compat.h \
        $(top_srcdir)/include/linux/blkdev_compat.h \
-       $(top_srcdir)/include/linux/utsname_compat.h
+       $(top_srcdir)/include/linux/utsname_compat.h \
+       $(top_srcdir)/include/linux/kmap_compat.h
 
 USER_H = 
 EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H)
index 163e03d7bcd269b342b9e730fd3f6babce33786d..868b89c55cbbad13825bf07cb1b3c5ee09e31c11 100644 (file)
 typedef unsigned __bitwise__ fmode_t;
 #endif /* HAVE_FMODE_T */
 
-#ifndef HAVE_BLK_FETCH_REQUEST
-static inline struct request *
-blk_fetch_request(struct request_queue *q)
-{
-       struct request *req;
-
-       req = elv_next_request(q);
-       if (req)
-               blkdev_dequeue_request(req);
-
-       return (req);
-}
-#endif /* HAVE_BLK_FETCH_REQUEST */
-
-#ifndef HAVE_BLK_REQUEUE_REQUEST
-static inline void
-blk_requeue_request(request_queue_t *q, struct request *req)
-{
-       elv_requeue_request(q, req);
-}
-#endif /* HAVE_BLK_REQUEUE_REQUEST */
-
-#ifndef HAVE_BLK_END_REQUEST
-static inline bool
-__blk_end_request(struct request *req, int error, unsigned int nr_bytes)
-{
-       LIST_HEAD(list);
-
-       /*
-        * Request has already been dequeued but 2.6.18 version of
-        * end_request() unconditionally dequeues the request so we
-        * add it to a local list to prevent hitting the BUG_ON.
-        */
-       list_add(&req->queuelist, &list);
-
-       /*
-        * The old API required the driver to end each segment and not
-        * the entire request.  In our case we always need to end the
-        * entire request partial requests are not supported.
-        */
-       req->hard_cur_sectors = nr_bytes >> 9;
-       end_request(req, ((error == 0) ? 1 : error));
-
-       return (0);
-}
-
-static inline bool
-blk_end_request(struct request *req, int error, unsigned int nr_bytes)
-{
-       struct request_queue *q = req->q;
-       bool rc;
-
-       spin_lock_irq(q->queue_lock);
-       rc = __blk_end_request(req, error, nr_bytes);
-       spin_unlock_irq(q->queue_lock);
-
-       return (rc);
-}
-#else
-#ifdef HAVE_BLK_END_REQUEST_GPL_ONLY
-/*
- * Define required to avoid conflicting 2.6.29 non-static prototype for a
- * GPL-only version of the helper.  As of 2.6.31 the helper is available
- * to non-GPL modules and is not explicitly exported GPL-only.
- */
-#define        __blk_end_request __blk_end_request_x
-#define        blk_end_request blk_end_request_x
-
-static inline bool
-__blk_end_request_x(struct request *req, int error, unsigned int nr_bytes)
-{
-       /*
-        * The old API required the driver to end each segment and not
-        * the entire request.  In our case we always need to end the
-        * entire request partial requests are not supported.
-        */
-       req->hard_cur_sectors = nr_bytes >> 9;
-       end_request(req, ((error == 0) ? 1 : error));
-
-       return (0);
-}
-static inline bool
-blk_end_request_x(struct request *req, int error, unsigned int nr_bytes)
-{
-       struct request_queue *q = req->q;
-       bool rc;
-
-       spin_lock_irq(q->queue_lock);
-       rc = __blk_end_request_x(req, error, nr_bytes);
-       spin_unlock_irq(q->queue_lock);
-
-       return (rc);
-}
-#endif /* HAVE_BLK_END_REQUEST_GPL_ONLY */
-#endif /* HAVE_BLK_END_REQUEST */
-
 /*
  * 2.6.36 API change,
  * The blk_queue_flush() interface has replaced blk_queue_ordered()
@@ -148,37 +52,6 @@ __blk_queue_flush(struct request_queue *q, unsigned int flags)
        q->flush_flags = flags & (REQ_FLUSH | REQ_FUA);
 }
 #endif /* HAVE_BLK_QUEUE_FLUSH && HAVE_BLK_QUEUE_FLUSH_GPL_ONLY */
-
-#ifndef HAVE_BLK_RQ_POS
-static inline sector_t
-blk_rq_pos(struct request *req)
-{
-       return (req->sector);
-}
-#endif /* HAVE_BLK_RQ_POS */
-
-#ifndef HAVE_BLK_RQ_SECTORS
-static inline unsigned int
-blk_rq_sectors(struct request *req)
-{
-       return (req->nr_sectors);
-}
-#endif /* HAVE_BLK_RQ_SECTORS */
-
-#if !defined(HAVE_BLK_RQ_BYTES) || defined(HAVE_BLK_RQ_BYTES_GPL_ONLY)
-/*
- * Define required to avoid conflicting 2.6.29 non-static prototype for a
- * GPL-only version of the helper.  As of 2.6.31 the helper is available
- * to non-GPL modules in the form of a static inline in the header.
- */
-#define        blk_rq_bytes __blk_rq_bytes
-static inline unsigned int
-__blk_rq_bytes(struct request *req)
-{
-       return (blk_rq_sectors(req) << 9);
-}
-#endif /* !HAVE_BLK_RQ_BYTES || HAVE_BLK_RQ_BYTES_GPL_ONLY */
-
 /*
  * Most of the blk_* macros were removed in 2.6.36.  Ostensibly this was
  * done to improve readability and allow easier grepping.  However, from
@@ -228,25 +101,6 @@ __blk_queue_max_segments(struct request_queue *q, unsigned short max_segments)
 }
 #endif
 
-/*
- * 2.6.30 API change,
- * The blk_queue_physical_block_size() function was introduced to
- * indicate the smallest I/O the device can write without incurring
- * a read-modify-write penalty.  For older kernels this is a no-op.
- */
-#ifndef HAVE_BLK_QUEUE_PHYSICAL_BLOCK_SIZE
-#define        blk_queue_physical_block_size(q, x)     ((void)(0))
-#endif
-
-/*
- * 2.6.30 API change,
- * The blk_queue_io_opt() function was added to indicate the optimal
- * I/O size for the device.  For older kernels this is a no-op.
- */
-#ifndef HAVE_BLK_QUEUE_IO_OPT
-#define        blk_queue_io_opt(q, x)                  ((void)(0))
-#endif
-
 #ifndef HAVE_GET_DISK_RO
 static inline int
 get_disk_ro(struct gendisk *disk)
@@ -260,64 +114,20 @@ get_disk_ro(struct gendisk *disk)
 }
 #endif /* HAVE_GET_DISK_RO */
 
-#ifndef HAVE_RQ_IS_SYNC
-static inline bool
-rq_is_sync(struct request *req)
-{
-       return (req->flags & REQ_RW_SYNC);
-}
-#endif /* HAVE_RQ_IS_SYNC */
-
-#ifndef HAVE_RQ_FOR_EACH_SEGMENT
-struct req_iterator {
-       int i;
-       struct bio *bio;
-};
-
-#define        for_each_bio(_bio)              \
-       for (; _bio; _bio = _bio->bi_next)
-
-#define        __rq_for_each_bio(_bio, rq)     \
-       if ((rq->bio))                  \
-               for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
-
-#define        rq_for_each_segment(bvl, _rq, _iter)                    \
-       __rq_for_each_bio(_iter.bio, _rq)                       \
-               bio_for_each_segment(bvl, _iter.bio, _iter.i)
-
-#define        HAVE_RQ_FOR_EACH_SEGMENT_BVP 1
-#endif /* HAVE_RQ_FOR_EACH_SEGMENT */
-
-/*
- * 3.14 API change
- * rq_for_each_segment changed from taking bio_vec * to taking bio_vec.
- * We provide rq_for_each_segment4 which takes both.
- * You should not modify the fields in @bv and @bvp.
- *
- * Note: the if-else is just to inject the assignment before the loop body.
- */
-#ifdef HAVE_RQ_FOR_EACH_SEGMENT_BVP
-#define        rq_for_each_segment4(bv, bvp, rq, iter) \
-       rq_for_each_segment(bvp, rq, iter)      \
-               if ((bv = *bvp), 0)             \
-                       ;                       \
-               else
-#else
-#define        rq_for_each_segment4(bv, bvp, rq, iter) \
-       rq_for_each_segment(bv, rq, iter)       \
-               if ((bvp = &bv), 0)             \
-                       ;                       \
-               else
-#endif
-
 #ifdef HAVE_BIO_BVEC_ITER
 #define        BIO_BI_SECTOR(bio)      (bio)->bi_iter.bi_sector
 #define        BIO_BI_SIZE(bio)        (bio)->bi_iter.bi_size
 #define        BIO_BI_IDX(bio)         (bio)->bi_iter.bi_idx
+#define        bio_for_each_segment4(bv, bvp, b, i)    \
+       bio_for_each_segment((bv), (b), (i))
+typedef struct bvec_iter bvec_iterator_t;
 #else
 #define        BIO_BI_SECTOR(bio)      (bio)->bi_sector
 #define        BIO_BI_SIZE(bio)        (bio)->bi_size
 #define        BIO_BI_IDX(bio)         (bio)->bi_idx
+#define        bio_for_each_segment4(bv, bvp, b, i)    \
+       bio_for_each_segment((bvp), (b), (i))
+typedef int bvec_iterator_t;
 #endif
 
 /*
@@ -347,26 +157,21 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags)
 #endif /* BLOCK_EXT_MAJOR */
 #endif /* CONFIG_BUG */
 
-#ifdef HAVE_BIO_RW_FAILFAST_DTD
+#if defined(HAVE_BIO_RW_FAILFAST_DTD)
        /* BIO_RW_FAILFAST_* preferred interface from 2.6.28 - 2.6.35 */
        *flags |= (
            (1 << BIO_RW_FAILFAST_DEV) |
            (1 << BIO_RW_FAILFAST_TRANSPORT) |
            (1 << BIO_RW_FAILFAST_DRIVER));
-#else
-#ifdef HAVE_BIO_RW_FAILFAST
-       /* BIO_RW_FAILFAST preferred interface from 2.6.12 - 2.6.27 */
-       *flags |= (1 << BIO_RW_FAILFAST);
-#else
-#ifdef HAVE_REQ_FAILFAST_MASK
+#elif defined(HAVE_REQ_FAILFAST_MASK)
        /*
         * REQ_FAILFAST_* preferred interface from 2.6.36 - 2.6.xx,
         * the BIO_* and REQ_* flags were unified under REQ_* flags.
         */
        *flags |= REQ_FAILFAST_MASK;
-#endif /* HAVE_REQ_FAILFAST_MASK */
-#endif /* HAVE_BIO_RW_FAILFAST */
-#endif /* HAVE_BIO_RW_FAILFAST_DTD */
+#else
+#error "Undefined block IO FAILFAST interface."
+#endif
 }
 
 /*
@@ -478,17 +283,30 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags)
 #define        VDEV_REQ_FUA                    REQ_FUA
 #else
 #define        VDEV_WRITE_FLUSH_FUA            WRITE_BARRIER
+#ifdef HAVE_BIO_RW_BARRIER
+#define        VDEV_REQ_FLUSH                  (1 << BIO_RW_BARRIER)
+#define        VDEV_REQ_FUA                    (1 << BIO_RW_BARRIER)
+#else
 #define        VDEV_REQ_FLUSH                  REQ_HARDBARRIER
-#define        VDEV_REQ_FUA                    REQ_HARDBARRIER
+#define        VDEV_REQ_FUA                    REQ_FUA
+#endif
 #endif
 
 /*
  * 2.6.32 API change
  * Use the normal I/O patch for discards.
  */
-#ifdef REQ_DISCARD
+#ifdef QUEUE_FLAG_DISCARD
+#ifdef HAVE_BIO_RW_DISCARD
+#define        VDEV_REQ_DISCARD                (1 << BIO_RW_DISCARD)
+#else
 #define        VDEV_REQ_DISCARD                REQ_DISCARD
 #endif
+#else /* !QUEUE_FLAG_DISCARD */
+#error "Allowing the build will cause discard requests to become writes " \
+       "potentially triggering the DMU_MAX_ACCESS assertion. Please file " \
+       "an issue report at: https://github.com/zfsonlinux/zfs/issues/new"
+#endif /* QUEUE_FLAG_DISCARD */
 
 /*
  * 2.6.33 API change
@@ -522,4 +340,9 @@ blk_queue_discard_granularity(struct request_queue *q, unsigned int dg)
  */
 #define        VDEV_HOLDER                     ((void *)0x2401de7)
 
+#ifndef HAVE_GENERIC_IO_ACCT
+#define        generic_start_io_acct(rw, slen, part)           ((void)0)
+#define        generic_end_io_acct(rw, part, start_jiffies)    ((void)0)
+#endif
+
 #endif /* _ZFS_BLKDEV_H */
diff --git a/zfs/include/linux/kmap_compat.h b/zfs/include/linux/kmap_compat.h
new file mode 100644 (file)
index 0000000..59ae566
--- /dev/null
@@ -0,0 +1,40 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ */
+
+#ifndef _ZFS_KMAP_H
+#define        _ZFS_KMAP_H
+
+#include <linux/highmem.h>
+
+#ifdef HAVE_1ARG_KMAP_ATOMIC
+/* 2.6.37 API change */
+#define        zfs_kmap_atomic(page, km_type)          kmap_atomic(page)
+#define        zfs_kunmap_atomic(addr, km_type)        kunmap_atomic(addr)
+#else
+#define        zfs_kmap_atomic(page, km_type)          kmap_atomic(page, km_type)
+#define        zfs_kunmap_atomic(addr, km_type)        kunmap_atomic(addr, km_type)
+#endif
+
+#endif /* _ZFS_KMAP_H */
index 40832d9ddda770d8a80b053f93920be8f8c41728..bcec1146a0c4cdd6a17e51e6b65c28909d64460f 100644 (file)
@@ -352,4 +352,15 @@ static inline struct inode *file_inode(const struct file *f)
 }
 #endif /* HAVE_FILE_INODE */
 
+/*
+ * 2.6.38 API change
+ */
+#ifdef HAVE_FOLLOW_DOWN_ONE
+#define        zpl_follow_down_one(path)               follow_down_one(path)
+#define        zpl_follow_up(path)                     follow_up(path)
+#else
+#define        zpl_follow_down_one(path)               follow_down(path)
+#define        zpl_follow_up(path)                     follow_up(path)
+#endif
+
 #endif /* _ZFS_VFS_H */
index 5211e656456d10f341c3e27008a74d311d58cb17..77ecfb2dcf8e39eb4aafa0d38bf4cc9d9a121a7a 100644 (file)
@@ -33,6 +33,8 @@ COMMON_H = \
        $(top_srcdir)/include/sys/efi_partition.h \
        $(top_srcdir)/include/sys/metaslab.h \
        $(top_srcdir)/include/sys/metaslab_impl.h \
+       $(top_srcdir)/include/sys/mntent.h \
+       $(top_srcdir)/include/sys/multilist.h \
        $(top_srcdir)/include/sys/nvpair.h \
        $(top_srcdir)/include/sys/nvpair_impl.h \
        $(top_srcdir)/include/sys/range_tree.h \
@@ -53,6 +55,7 @@ COMMON_H = \
        $(top_srcdir)/include/sys/trace_dbuf.h \
        $(top_srcdir)/include/sys/trace_dmu.h \
        $(top_srcdir)/include/sys/trace_dnode.h \
+       $(top_srcdir)/include/sys/trace_multilist.h \
        $(top_srcdir)/include/sys/trace_txg.h \
        $(top_srcdir)/include/sys/trace_zil.h \
        $(top_srcdir)/include/sys/trace_zrlock.h \
index 6a48a0f3633800228a520395fe448286590f4654..d3abc396bf0a2f47947b52448b3c1fb2aab0e74c 100644 (file)
@@ -91,7 +91,7 @@ host_triplet = @host@
 target_triplet = @target@
 subdir = include/sys
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps =  \
+am__aclocal_m4_deps = $(top_srcdir)/config/always-no-bool-compare.m4 \
        $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \
        $(top_srcdir)/config/dkms.m4 \
        $(top_srcdir)/config/kernel-acl.m4 \
@@ -103,20 +103,11 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-bio-bvec-iter.m4 \
        $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \
        $(top_srcdir)/config/kernel-bio-failfast.m4 \
-       $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \
-       $(top_srcdir)/config/kernel-blk-end-request.m4 \
-       $(top_srcdir)/config/kernel-blk-fetch-request.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-discard.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-barrier.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-discard.m4 \
        $(top_srcdir)/config/kernel-blk-queue-flush.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \
-       $(top_srcdir)/config/kernel-blk-requeue-request.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-pos.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \
        $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \
        $(top_srcdir)/config/kernel-blkdev-get.m4 \
        $(top_srcdir)/config/kernel-block-device-operations-release-void.m4 \
@@ -124,6 +115,7 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-clear-inode.m4 \
        $(top_srcdir)/config/kernel-commit-metadata.m4 \
        $(top_srcdir)/config/kernel-create-nameidata.m4 \
+       $(top_srcdir)/config/kernel-current_bio_tail.m4 \
        $(top_srcdir)/config/kernel-d-make-root.m4 \
        $(top_srcdir)/config/kernel-d-obtain-alias.m4 \
        $(top_srcdir)/config/kernel-d-prune-aliases.m4 \
@@ -137,23 +129,25 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-fallocate.m4 \
        $(top_srcdir)/config/kernel-file-inode.m4 \
        $(top_srcdir)/config/kernel-fmode-t.m4 \
+       $(top_srcdir)/config/kernel-follow-down-one.m4 \
        $(top_srcdir)/config/kernel-follow-link-nameidata.m4 \
        $(top_srcdir)/config/kernel-fsync.m4 \
+       $(top_srcdir)/config/kernel-generic_io_acct.m4 \
        $(top_srcdir)/config/kernel-get-disk-ro.m4 \
        $(top_srcdir)/config/kernel-get-gendisk.m4 \
        $(top_srcdir)/config/kernel-insert-inode-locked.m4 \
        $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \
        $(top_srcdir)/config/kernel-is_owner_or_cap.m4 \
+       $(top_srcdir)/config/kernel-kmap-atomic-args.m4 \
        $(top_srcdir)/config/kernel-kobj-name-len.m4 \
        $(top_srcdir)/config/kernel-lookup-bdev.m4 \
        $(top_srcdir)/config/kernel-lookup-nameidata.m4 \
        $(top_srcdir)/config/kernel-lseek-execute.m4 \
+       $(top_srcdir)/config/kernel-mk-request-fn.m4 \
        $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \
        $(top_srcdir)/config/kernel-mount-nodev.m4 \
        $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \
        $(top_srcdir)/config/kernel-put-link-nameidata.m4 \
-       $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \
-       $(top_srcdir)/config/kernel-rq-is_sync.m4 \
        $(top_srcdir)/config/kernel-security-inode-init.m4 \
        $(top_srcdir)/config/kernel-set-nlink.m4 \
        $(top_srcdir)/config/kernel-sget-args.m4 \
@@ -249,6 +243,8 @@ am__kernel_HEADERS_DIST = $(top_srcdir)/include/sys/arc.h \
        $(top_srcdir)/include/sys/efi_partition.h \
        $(top_srcdir)/include/sys/metaslab.h \
        $(top_srcdir)/include/sys/metaslab_impl.h \
+       $(top_srcdir)/include/sys/mntent.h \
+       $(top_srcdir)/include/sys/multilist.h \
        $(top_srcdir)/include/sys/nvpair.h \
        $(top_srcdir)/include/sys/nvpair_impl.h \
        $(top_srcdir)/include/sys/range_tree.h \
@@ -269,6 +265,7 @@ am__kernel_HEADERS_DIST = $(top_srcdir)/include/sys/arc.h \
        $(top_srcdir)/include/sys/trace_dbuf.h \
        $(top_srcdir)/include/sys/trace_dmu.h \
        $(top_srcdir)/include/sys/trace_dnode.h \
+       $(top_srcdir)/include/sys/trace_multilist.h \
        $(top_srcdir)/include/sys/trace_txg.h \
        $(top_srcdir)/include/sys/trace_zil.h \
        $(top_srcdir)/include/sys/trace_zrlock.h \
@@ -374,6 +371,8 @@ am__libzfs_HEADERS_DIST = $(top_srcdir)/include/sys/arc.h \
        $(top_srcdir)/include/sys/efi_partition.h \
        $(top_srcdir)/include/sys/metaslab.h \
        $(top_srcdir)/include/sys/metaslab_impl.h \
+       $(top_srcdir)/include/sys/mntent.h \
+       $(top_srcdir)/include/sys/multilist.h \
        $(top_srcdir)/include/sys/nvpair.h \
        $(top_srcdir)/include/sys/nvpair_impl.h \
        $(top_srcdir)/include/sys/range_tree.h \
@@ -394,6 +393,7 @@ am__libzfs_HEADERS_DIST = $(top_srcdir)/include/sys/arc.h \
        $(top_srcdir)/include/sys/trace_dbuf.h \
        $(top_srcdir)/include/sys/trace_dmu.h \
        $(top_srcdir)/include/sys/trace_dnode.h \
+       $(top_srcdir)/include/sys/trace_multilist.h \
        $(top_srcdir)/include/sys/trace_txg.h \
        $(top_srcdir)/include/sys/trace_zil.h \
        $(top_srcdir)/include/sys/trace_zrlock.h \
@@ -514,9 +514,11 @@ DEBUG_CFLAGS = @DEBUG_CFLAGS@
 DEBUG_DMU_TX = @DEBUG_DMU_TX@
 DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@
 DEBUG_ZFS = @DEBUG_ZFS@
+DEFAULT_INITCONF_DIR = @DEFAULT_INITCONF_DIR@
 DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@
 DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@
 DEFAULT_PACKAGE = @DEFAULT_PACKAGE@
+DEFINE_INITRAMFS = @DEFINE_INITRAMFS@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
 DLLTOOL = @DLLTOOL@
@@ -566,6 +568,7 @@ MANIFEST_TOOL = @MANIFEST_TOOL@
 MKDIR_P = @MKDIR_P@
 NM = @NM@
 NMEDIT = @NMEDIT@
+NO_BOOL_COMPARE = @NO_BOOL_COMPARE@
 NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@
 OBJDUMP = @OBJDUMP@
 OBJEXT = @OBJEXT@
@@ -718,6 +721,8 @@ COMMON_H = \
        $(top_srcdir)/include/sys/efi_partition.h \
        $(top_srcdir)/include/sys/metaslab.h \
        $(top_srcdir)/include/sys/metaslab_impl.h \
+       $(top_srcdir)/include/sys/mntent.h \
+       $(top_srcdir)/include/sys/multilist.h \
        $(top_srcdir)/include/sys/nvpair.h \
        $(top_srcdir)/include/sys/nvpair_impl.h \
        $(top_srcdir)/include/sys/range_tree.h \
@@ -738,6 +743,7 @@ COMMON_H = \
        $(top_srcdir)/include/sys/trace_dbuf.h \
        $(top_srcdir)/include/sys/trace_dmu.h \
        $(top_srcdir)/include/sys/trace_dnode.h \
+       $(top_srcdir)/include/sys/trace_multilist.h \
        $(top_srcdir)/include/sys/trace_txg.h \
        $(top_srcdir)/include/sys/trace_zil.h \
        $(top_srcdir)/include/sys/trace_zrlock.h \
index 215c75b6dfa31519066802eae796ea75d0517784..db7a64aa2e22e43d7adccd3900e7e53abe51a6a3 100644 (file)
@@ -38,6 +38,12 @@ extern "C" {
 #include <sys/spa.h>
 #include <sys/refcount.h>
 
+/*
+ * Used by arc_flush() to inform arc_evict_state() that it should evict
+ * all available buffers from the arc state being passed in.
+ */
+#define        ARC_EVICT_ALL   -1ULL
+
 typedef struct arc_buf_hdr arc_buf_hdr_t;
 typedef struct arc_buf arc_buf_t;
 typedef struct arc_prune arc_prune_t;
@@ -45,6 +51,9 @@ typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
 typedef void arc_prune_func_t(int64_t bytes, void *private);
 typedef int arc_evict_func_t(void *private);
 
+/* Shared module parameters */
+extern int zfs_arc_average_blocksize;
+
 /* generic arc_done_func_t's which you can use */
 arc_done_func_t arc_bcopy_func;
 arc_done_func_t arc_getbuf_func;
@@ -53,10 +62,51 @@ arc_done_func_t arc_getbuf_func;
 struct arc_prune {
        arc_prune_func_t        *p_pfunc;
        void                    *p_private;
+       uint64_t                p_adjust;
        list_node_t             p_node;
        refcount_t              p_refcnt;
 };
 
+typedef enum arc_strategy {
+       ARC_STRATEGY_META_ONLY          = 0, /* Evict only meta data buffers */
+       ARC_STRATEGY_META_BALANCED      = 1, /* Evict data buffers if needed */
+} arc_strategy_t;
+
+typedef enum arc_flags
+{
+       /*
+        * Public flags that can be passed into the ARC by external consumers.
+        */
+       ARC_FLAG_NONE                   = 1 << 0,       /* No flags set */
+       ARC_FLAG_WAIT                   = 1 << 1,       /* perform sync I/O */
+       ARC_FLAG_NOWAIT                 = 1 << 2,       /* perform async I/O */
+       ARC_FLAG_PREFETCH               = 1 << 3,       /* I/O is a prefetch */
+       ARC_FLAG_CACHED                 = 1 << 4,       /* I/O was in cache */
+       ARC_FLAG_L2CACHE                = 1 << 5,       /* cache in L2ARC */
+       ARC_FLAG_L2COMPRESS             = 1 << 6,       /* compress in L2ARC */
+
+       /*
+        * Private ARC flags.  These flags are private ARC only flags that
+        * will show up in b_flags in the arc_buf_hdr_t. These flags should
+        * only be set by ARC code.
+        */
+       ARC_FLAG_IN_HASH_TABLE          = 1 << 7,       /* buffer is hashed */
+       ARC_FLAG_IO_IN_PROGRESS         = 1 << 8,       /* I/O in progress */
+       ARC_FLAG_IO_ERROR               = 1 << 9,       /* I/O failed for buf */
+       ARC_FLAG_FREED_IN_READ          = 1 << 10,      /* freed during read */
+       ARC_FLAG_BUF_AVAILABLE          = 1 << 11,      /* block not in use */
+       ARC_FLAG_INDIRECT               = 1 << 12,      /* indirect block */
+       ARC_FLAG_L2_WRITING             = 1 << 13,      /* write in progress */
+       ARC_FLAG_L2_EVICTED             = 1 << 14,      /* evicted during I/O */
+       ARC_FLAG_L2_WRITE_HEAD          = 1 << 15,      /* head of write list */
+       /* indicates that the buffer contains metadata (otherwise, data) */
+       ARC_FLAG_BUFC_METADATA          = 1 << 16,
+
+       /* Flags specifying whether optional hdr struct fields are defined */
+       ARC_FLAG_HAS_L1HDR              = 1 << 17,
+       ARC_FLAG_HAS_L2HDR              = 1 << 18,
+} arc_flags_t;
+
 struct arc_buf {
        arc_buf_hdr_t           *b_hdr;
        arc_buf_t               *b_next;
@@ -71,15 +121,6 @@ typedef enum arc_buf_contents {
        ARC_BUFC_METADATA,                      /* buffer contains metadata */
        ARC_BUFC_NUMTYPES
 } arc_buf_contents_t;
-/*
- * These are the flags we pass into calls to the arc
- */
-#define        ARC_WAIT        (1 << 1)        /* perform I/O synchronously */
-#define        ARC_NOWAIT      (1 << 2)        /* perform I/O asynchronously */
-#define        ARC_PREFETCH    (1 << 3)        /* I/O is a prefetch */
-#define        ARC_CACHED      (1 << 4)        /* I/O was already in cache */
-#define        ARC_L2CACHE     (1 << 5)        /* cache in L2ARC */
-#define        ARC_L2COMPRESS  (1 << 6)        /* compress in L2ARC */
 
 /*
  * The following breakdowns of arc_size exist for kstat only.
@@ -106,7 +147,6 @@ typedef enum arc_state_type {
 typedef struct arc_buf_info {
        arc_state_type_t        abi_state_type;
        arc_buf_contents_t      abi_state_contents;
-       uint64_t                abi_state_index;
        uint32_t                abi_flags;
        uint32_t                abi_datacnt;
        uint64_t                abi_size;
@@ -146,7 +186,7 @@ int arc_referenced(arc_buf_t *buf);
 
 int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     arc_done_func_t *done, void *private, zio_priority_t priority, int flags,
-    uint32_t *arc_flags, const zbookmark_phys_t *zb);
+    arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
 zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
     const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
@@ -160,7 +200,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp);
 void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
 boolean_t arc_clear_callback(arc_buf_t *buf);
 
-void arc_flush(spa_t *spa);
+void arc_flush(spa_t *spa, boolean_t retry);
 void arc_tempreserve_clear(uint64_t reserve);
 int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
 
index e7068ea188e37f65cca2bd4e6c90ed24f501836f..a9dbfc8dd73e29dbbbff3afc0aae8d096d7d2f06 100644 (file)
@@ -67,15 +67,25 @@ extern "C" {
  */
 
 typedef struct arc_state {
-       list_t  arcs_list[ARC_BUFC_NUMTYPES];   /* list of evictable buffers */
-       uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
-       uint64_t arcs_size;     /* total amount of data in this state */
-       kmutex_t arcs_mtx;
+       /*
+        * list of evictable buffers
+        */
+       multilist_t arcs_list[ARC_BUFC_NUMTYPES];
+       /*
+        * total amount of evictable data in this state
+        */
+       uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
+       /*
+        * total amount of data in this state; this includes: evictable,
+        * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
+        */
+       refcount_t arcs_size;
+       /*
+        * supports the "dbufs" kstat
+        */
        arc_state_type_t arcs_state;
 } arc_state_t;
 
-typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
-
 typedef struct arc_callback arc_callback_t;
 
 struct arc_callback {
@@ -96,31 +106,49 @@ struct arc_write_callback {
        arc_buf_t       *awcb_buf;
 };
 
-struct arc_buf_hdr {
-       /* protected by hash lock */
-       dva_t                   b_dva;
-       uint64_t                b_birth;
-       uint64_t                b_cksum0;
-
+/*
+ * ARC buffers are separated into multiple structs as a memory saving measure:
+ *   - Common fields struct, always defined, and embedded within it:
+ *       - L2-only fields, always allocated but undefined when not in L2ARC
+ *       - L1-only fields, only allocated when in L1ARC
+ *
+ *           Buffer in L1                     Buffer only in L2
+ *    +------------------------+          +------------------------+
+ *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
+ *    |                        |          |                        |
+ *    |                        |          |                        |
+ *    |                        |          |                        |
+ *    +------------------------+          +------------------------+
+ *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
+ *    | (undefined if L1-only) |          |                        |
+ *    +------------------------+          +------------------------+
+ *    | l1arc_buf_hdr_t        |
+ *    |                        |
+ *    |                        |
+ *    |                        |
+ *    |                        |
+ *    +------------------------+
+ *
+ * Because it's possible for the L2ARC to become extremely large, we can wind
+ * up eating a lot of memory in L2ARC buffer headers, so the size of a header
+ * is minimized by only allocating the fields necessary for an L1-cached buffer
+ * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
+ * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
+ * words in pointers. arc_hdr_realloc() is used to switch a header between
+ * these two allocation states.
+ */
+typedef struct l1arc_buf_hdr {
        kmutex_t                b_freeze_lock;
-       zio_cksum_t             *b_freeze_cksum;
 
-       arc_buf_hdr_t           *b_hash_next;
        arc_buf_t               *b_buf;
-       uint32_t                b_flags;
        uint32_t                b_datacnt;
-
-       arc_callback_t          *b_acb;
+       /* for waiting on writes to complete */
        kcondvar_t              b_cv;
 
-       /* immutable */
-       arc_buf_contents_t      b_type;
-       uint64_t                b_size;
-       uint64_t                b_spa;
 
        /* protected by arc state mutex */
        arc_state_t             *b_state;
-       list_node_t             b_arc_node;
+       multilist_node_t        b_arc_node;
 
        /* updated atomically */
        clock_t                 b_arc_access;
@@ -133,9 +161,10 @@ struct arc_buf_hdr {
        /* self protecting */
        refcount_t              b_refcnt;
 
-       l2arc_buf_hdr_t         *b_l2hdr;
-       list_node_t             b_l2node;
-};
+       arc_callback_t          *b_acb;
+       /* temporary buffer holder for in-flight compressed data */
+       void                    *b_tmp_cdata;
+} l1arc_buf_hdr_t;
 
 typedef struct l2arc_dev {
        vdev_t                  *l2ad_vdev;     /* vdev */
@@ -143,18 +172,55 @@ typedef struct l2arc_dev {
        uint64_t                l2ad_hand;      /* next write location */
        uint64_t                l2ad_start;     /* first addr on device */
        uint64_t                l2ad_end;       /* last addr on device */
-       uint64_t                l2ad_evict;     /* last addr eviction reached */
        boolean_t               l2ad_first;     /* first sweep through */
        boolean_t               l2ad_writing;   /* currently writing */
-       list_t                  *l2ad_buflist;  /* buffer list */
+       kmutex_t                l2ad_mtx;       /* lock for buffer list */
+       list_t                  l2ad_buflist;   /* buffer list */
        list_node_t             l2ad_node;      /* device list node */
+       refcount_t              l2ad_alloc;     /* allocated bytes */
 } l2arc_dev_t;
 
+typedef struct l2arc_buf_hdr {
+       /* protected by arc_buf_hdr mutex */
+       l2arc_dev_t             *b_dev;         /* L2ARC device */
+       uint64_t                b_daddr;        /* disk address, offset byte */
+       /* real alloc'd buffer size depending on b_compress applied */
+       uint32_t                b_hits;
+       int32_t                 b_asize;
+       uint8_t                 b_compress;
+
+       list_node_t             b_l2node;
+} l2arc_buf_hdr_t;
+
 typedef struct l2arc_write_callback {
        l2arc_dev_t     *l2wcb_dev;             /* device info */
        arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 } l2arc_write_callback_t;
 
+struct arc_buf_hdr {
+       /* protected by hash lock */
+       dva_t                   b_dva;
+       uint64_t                b_birth;
+       /*
+        * Even though this checksum is only set/verified when a buffer is in
+        * the L1 cache, it needs to be in the set of common fields because it
+        * must be preserved from the time before a buffer is written out to
+        * L2ARC until after it is read back in.
+        */
+       zio_cksum_t             *b_freeze_cksum;
+
+       arc_buf_hdr_t           *b_hash_next;
+       arc_flags_t             b_flags;
+
+       /* immutable */
+       int32_t                 b_size;
+       uint64_t                b_spa;
+
+       /* L2ARC fields. Undefined when not in L2ARC. */
+       l2arc_buf_hdr_t         b_l2hdr;
+       /* L1ARC fields. Undefined when in l2arc_only state */
+       l1arc_buf_hdr_t         b_l1hdr;
+};
 #ifdef __cplusplus
 }
 #endif
index ba305c9082392ee7b970b2032704c8ea4a74df9c..10e0ddaeef884244ac23b84399962651609ec14b 100644 (file)
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
 #ifndef        _AVL_H
 #define        _AVL_H
 
@@ -39,7 +43,7 @@ extern "C" {
 #include <sys/avl_impl.h>
 
 /*
- * This is a generic implemenatation of AVL trees for use in the Solaris kernel.
+ * This is a generic implementation of AVL trees for use in the Solaris kernel.
  * The interfaces provide an efficient way of implementing an ordered set of
  * data structures.
  *
@@ -175,7 +179,7 @@ extern void avl_insert(avl_tree_t *tree, void *node, avl_index_t where);
  * Insert "new_data" in "tree" in the given "direction" either after
  * or before the data "here".
  *
- * This might be usefull for avl clients caching recently accessed
+ * This might be useful for avl clients caching recently accessed
  * data to avoid doing avl_find() again for insertion.
  *
  * new_data    - new data to insert
@@ -259,6 +263,11 @@ extern boolean_t avl_update(avl_tree_t *, void *);
 extern boolean_t avl_update_lt(avl_tree_t *, void *);
 extern boolean_t avl_update_gt(avl_tree_t *, void *);
 
+/*
+ * Swaps the contents of the two trees.
+ */
+extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2);
+
 /*
  * Return the number of nodes in the tree
  */
index af975c734560d122977048026d84ad58354f7795..2a365199ce449c793a89682bf95a3889a38241b5 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #ifndef        _SYS_BPOBJ_H
@@ -77,7 +77,6 @@ void bpobj_close(bpobj_t *bpo);
 
 int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
 int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
-int bpobj_iterate_dbg(bpobj_t *bpo, uint64_t *itorp, blkptr_t *bp);
 
 void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
 void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
index 1eabfd7daccfbd37df7524691a7e21f885e05204..0d262e87b5bc9fca09073783719b75eab25a83c5 100644 (file)
@@ -20,8 +20,9 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef        _SYS_DBUF_H
@@ -66,8 +67,13 @@ extern "C" {
  *             |                        |
  *             |                        |
  *             +--------> NOFILL -------+
+ *
+ * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range
+ * to find all dbufs in a range of a dnode and must be less than any other
+ * dbuf_states_t (see comment on dn_dbufs in dnode.h).
  */
 typedef enum dbuf_states {
+       DB_SEARCH = -1,
        DB_UNCACHED,
        DB_FILL,
        DB_NOFILL,
@@ -217,18 +223,32 @@ typedef struct dmu_buf_impl {
         * Our link on the owner dnodes's dn_dbufs list.
         * Protected by its dn_dbufs_mtx.
         */
-       list_node_t db_link;
+       avl_node_t db_link;
 
        /* Data which is unique to data (leaf) blocks: */
 
-       /* stuff we store for the user (see dmu_buf_set_user) */
-       void *db_user_ptr;
-       void **db_user_data_ptr_ptr;
-       dmu_buf_evict_func_t *db_evict_func;
+       /* User callback information. */
+       dmu_buf_user_t *db_user;
 
-       uint8_t db_immediate_evict;
+       /*
+        * Evict user data as soon as the dirty and reference
+        * counts are equal.
+        */
+       uint8_t db_user_immediate_evict;
+
+       /*
+        * This block was freed while a read or write was
+        * active.
+        */
        uint8_t db_freed_in_flight;
 
+       /*
+        * dnode_evict_dbufs() or dnode_evict_bonus() tried to
+        * evict this dbuf, but couldn't due to outstanding
+        * references.  Evict once the refcount drops to 0.
+        */
+       uint8_t db_pending_evict;
+
        uint8_t db_dirtycnt;
 } dmu_buf_impl_t;
 
@@ -258,12 +278,15 @@ int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
 void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio);
 
 void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
+boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
+    uint64_t blkid, void *tag);
 uint64_t dbuf_refcount(dmu_buf_impl_t *db);
 
 void dbuf_rele(dmu_buf_impl_t *db, void *tag);
 void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
 
-dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
+dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
+    uint64_t blkid);
 
 int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
 void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
@@ -280,7 +303,7 @@ void dbuf_clear(dmu_buf_impl_t *db);
 void dbuf_evict(dmu_buf_impl_t *db);
 
 void dbuf_unoverride(dbuf_dirty_record_t *dr);
-void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
 void dbuf_release_bp(dmu_buf_impl_t *db);
 
 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
index c9c687b5aa62ce5941b5590c58cf02b3cd0d53a0..d9434db463831df9b04f2e4214c07cc084e01732 100644 (file)
@@ -24,6 +24,7 @@
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright 2014 HybridCluster. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
  * dmu_spa.h.
  */
 
+#include <sys/zfs_context.h>
 #include <sys/inttypes.h>
-#include <sys/types.h>
-#include <sys/param.h>
 #include <sys/cred.h>
-#include <sys/time.h>
 #include <sys/fs/zfs.h>
 #include <sys/uio.h>
 
@@ -241,12 +240,13 @@ void zfs_znode_byteswap(void *buf, size_t size);
 
 #define        DS_FIND_SNAPSHOTS       (1<<0)
 #define        DS_FIND_CHILDREN        (1<<1)
+#define        DS_FIND_SERIALIZE       (1<<2)
 
 /*
  * The maximum number of bytes that can be accessed as part of one
  * operation, including metadata.
  */
-#define        DMU_MAX_ACCESS (10<<20) /* 10MB */
+#define        DMU_MAX_ACCESS (64 * 1024 * 1024) /* 64MB */
 #define        DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
 
 #define        DMU_USERUSED_OBJECT     (-1ULL)
@@ -288,8 +288,6 @@ typedef struct dmu_buf {
        void *db_data;                  /* data in buffer */
 } dmu_buf_t;
 
-typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
-
 /*
  * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
  */
@@ -457,7 +455,23 @@ int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
  */
 int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     void *tag, dmu_buf_t **, int flags);
+
+/*
+ * Add a reference to a dmu buffer that has already been held via
+ * dmu_buf_hold() in the current context.
+ */
 void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
+
+/*
+ * Attempt to add a reference to a dmu buffer that is in an unknown state,
+ * using a pointer that may have been invalidated by eviction processing.
+ * The request will succeed if the passed in dbuf still represents the
+ * same os/object/blkid, is ineligible for eviction, and has at least
+ * one hold by a user other than the syncer.
+ */
+boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object,
+    uint64_t blkid, void *tag);
+
 void dmu_buf_rele(dmu_buf_t *db, void *tag);
 uint64_t dmu_buf_refcount(dmu_buf_t *db);
 
@@ -475,43 +489,127 @@ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
     uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
 void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
 
+typedef void dmu_buf_evict_func_t(void *user_ptr);
+
+/*
+ * A DMU buffer user object may be associated with a dbuf for the
+ * duration of its lifetime.  This allows the user of a dbuf (client)
+ * to attach private data to a dbuf (e.g. in-core only data such as a
+ * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
+ * when that dbuf has been evicted.  Clients typically respond to the
+ * eviction notification by freeing their private data, thus ensuring
+ * the same lifetime for both dbuf and private data.
+ *
+ * The mapping from a dmu_buf_user_t to any client private data is the
+ * client's responsibility.  All current consumers of the API with private
+ * data embed a dmu_buf_user_t as the first member of the structure for
+ * their private data.  This allows conversions between the two types
+ * with a simple cast.  Since the DMU buf user API never needs access
+ * to the private data, other strategies can be employed if necessary
+ * or convenient for the client (e.g. using container_of() to do the
+ * conversion for private data that cannot have the dmu_buf_user_t as
+ * its first member).
+ *
+ * Eviction callbacks are executed without the dbuf mutex held or any
+ * other type of mechanism to guarantee that the dbuf is still available.
+ * For this reason, users must assume the dbuf has already been freed
+ * and not reference the dbuf from the callback context.
+ *
+ * Users requesting "immediate eviction" are notified as soon as the dbuf
+ * is only referenced by dirty records (dirties == holds).  Otherwise the
+ * notification occurs after eviction processing for the dbuf begins.
+ */
+typedef struct dmu_buf_user {
+       /*
+        * Asynchronous user eviction callback state.
+        */
+       taskq_ent_t     dbu_tqent;
+
+       /* This instance's eviction function pointer. */
+       dmu_buf_evict_func_t *dbu_evict_func;
+#ifdef ZFS_DEBUG
+       /*
+        * Pointer to user's dbuf pointer.  NULL for clients that do
+        * not associate a dbuf with their user data.
+        *
+        * The dbuf pointer is cleared upon eviction so as to catch
+        * use-after-evict bugs in clients.
+        */
+       dmu_buf_t **dbu_clear_on_evict_dbufp;
+#endif
+} dmu_buf_user_t;
+
+/*
+ * Initialize the given dmu_buf_user_t instance with the eviction function
+ * evict_func, to be called when the user is evicted.
+ *
+ * NOTE: This function should only be called once on a given dmu_buf_user_t.
+ *       To allow enforcement of this, dbu must already be zeroed on entry.
+ */
+#ifdef __lint
+/* Very ugly, but it beats issuing suppression directives in many Makefiles. */
+extern void
+dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func,
+    dmu_buf_t **clear_on_evict_dbufp);
+#else /* __lint */
+static inline void
+dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func,
+    dmu_buf_t **clear_on_evict_dbufp)
+{
+       ASSERT(dbu->dbu_evict_func == NULL);
+       ASSERT(evict_func != NULL);
+       dbu->dbu_evict_func = evict_func;
+       taskq_init_ent(&dbu->dbu_tqent);
+#ifdef ZFS_DEBUG
+       dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
+#endif
+}
+#endif /* __lint */
+
 /*
- * Returns NULL on success, or the existing user ptr if it's already
- * been set.
+ * Attach user data to a dbuf and mark it for normal (when the dbuf's
+ * data is cleared or its reference count goes to zero) eviction processing.
  *
- * user_ptr is for use by the user and can be obtained via dmu_buf_get_user().
- *
- * user_data_ptr_ptr should be NULL, or a pointer to a pointer which
- * will be set to db->db_data when you are allowed to access it.  Note
- * that db->db_data (the pointer) can change when you do dmu_buf_read(),
- * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill().
- * *user_data_ptr_ptr will be set to the new value when it changes.
+ * Returns NULL on success, or the existing user if another user currently
+ * owns the buffer.
+ */
+void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);
+
+/*
+ * Attach user data to a dbuf and mark it for immediate (its dirty and
+ * reference counts are equal) eviction processing.
  *
- * If non-NULL, pageout func will be called when this buffer is being
- * excised from the cache, so that you can clean up the data structure
- * pointed to by user_ptr.
+ * Returns NULL on success, or the existing user if another user currently
+ * owns the buffer.
+ */
+void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);
+
+/*
+ * Replace the current user of a dbuf.
  *
- * dmu_evict_user() will call the pageout func for all buffers in a
- * objset with a given pageout func.
+ * If given the current user of a dbuf, replaces the dbuf's user with
+ * "new_user" and returns the user data pointer that was replaced.
+ * Otherwise returns the current, and unmodified, dbuf user pointer.
  */
-void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr,
-    dmu_buf_evict_func_t *pageout_func);
+void *dmu_buf_replace_user(dmu_buf_t *db,
+    dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);
+
 /*
- * set_user_ie is the same as set_user, but request immediate eviction
- * when hold count goes to zero.
+ * Remove the specified user data for a DMU buffer.
+ *
+ * Returns the user that was removed on success, or the current user if
+ * another user currently owns the buffer.
  */
-void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr,
-    void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func);
-void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
-    void *user_ptr, void *user_data_ptr_ptr,
-    dmu_buf_evict_func_t *pageout_func);
-void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
+void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
 
 /*
- * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set.
+ * Returns the user data (dmu_buf_user_t *) associated with this dbuf.
  */
 void *dmu_buf_get_user(dmu_buf_t *db);
 
+/* Block until any in-progress dmu buf user evictions complete. */
+void dmu_buf_user_evict_wait(void);
+
 /*
  * Returns the blkptr associated with this dbuf, or NULL if not set.
  */
@@ -612,10 +710,11 @@ void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
        dmu_tx_t *tx);
 #ifdef _KERNEL
 #include <linux/blkdev_compat.h>
-int dmu_read_req(objset_t *os, uint64_t object, struct request *req);
-int dmu_write_req(objset_t *os, uint64_t object, struct request *req,
+int dmu_read_bio(objset_t *os, uint64_t object, struct bio *bio);
+int dmu_write_bio(objset_t *os, uint64_t object, struct bio *bio,
        dmu_tx_t *tx);
 int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
+int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
 int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
        dmu_tx_t *tx);
 int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
@@ -636,6 +735,7 @@ void xuio_stat_wbuf_copied(void);
 void xuio_stat_wbuf_nocopy(void);
 
 extern int zfs_prefetch_disable;
+extern int zfs_max_recordsize;
 
 /*
  * Asynchronously try to read in the data.
index cbf0394e6db173b3b4a3c8c6a23de283c6a204a7..837a0d5107b7f1aa636445a0ec1ebb281bd6167a 100644 (file)
@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -74,16 +75,18 @@ struct objset {
        arc_buf_t *os_phys_buf;
        objset_phys_t *os_phys;
        /*
-        * The following "special" dnodes have no parent and are exempt from
-        * dnode_move(), but they root their descendents in this objset using
-        * handles anyway, so that all access to dnodes from dbufs consistently
-        * uses handles.
+        * The following "special" dnodes have no parent, are exempt
+        * from dnode_move(), and are not recorded in os_dnodes, but they
+        * root their descendants in this objset using handles anyway, so
+        * that all access to dnodes from dbufs consistently uses handles.
         */
        dnode_handle_t os_meta_dnode;
        dnode_handle_t os_userused_dnode;
        dnode_handle_t os_groupused_dnode;
        zilog_t *os_zil;
 
+       list_node_t os_evicting_node;
+
        /* can change, under dsl_dir's locks: */
        enum zio_checksum os_checksum;
        enum zio_compress os_compress;
@@ -95,6 +98,7 @@ struct objset {
        zfs_cache_type_t os_secondary_cache;
        zfs_sync_type_t os_sync;
        zfs_redundant_metadata_type_t os_redundant_metadata;
+       int os_recordsize;
 
        /* no lock needed: */
        struct dmu_tx *os_synctx; /* XXX sketchy */
@@ -137,6 +141,8 @@ struct objset {
 int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
 int dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp);
+int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj,
+    dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp);
 void dmu_objset_refresh_ownership(objset_t *os, void *tag);
 void dmu_objset_rele(objset_t *os, void *tag);
 void dmu_objset_disown(objset_t *os, void *tag);
@@ -168,6 +174,8 @@ int dmu_objset_userspace_upgrade(objset_t *os);
 boolean_t dmu_objset_userspace_present(objset_t *os);
 int dmu_fsname(const char *snapname, char *buf);
 
+void dmu_objset_evict_done(objset_t *os);
+
 void dmu_objset_init(void);
 void dmu_objset_fini(void);
 
index de590f1d503ba94125227cfe32f8b9a3088fa01b..2442a1f8aab10b86a5eff4d89e02f1f3db4bb4b9 100644 (file)
@@ -21,9 +21,9 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 #ifndef _DMU_SEND_H
@@ -37,12 +37,16 @@ struct dsl_dataset;
 struct drr_begin;
 struct avl_tree;
 
-int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+int dmu_send(const char *tosnap, const char *fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
     int outfd, struct vnode *vp, offset_t *off);
 int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
     uint64_t *sizep);
+int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg,
+    uint64_t *sizep);
 int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
-    boolean_t embedok, int outfd, vnode_t *vp, offset_t *off);
+    boolean_t embedok, boolean_t large_block_ok,
+    int outfd, struct vnode *vp, offset_t *off);
 
 typedef struct dmu_recv_cookie {
        struct dsl_dataset *drc_ds;
@@ -56,6 +60,7 @@ typedef struct dmu_recv_cookie {
        zio_cksum_t drc_cksum;
        uint64_t drc_newsnapobj;
        void *drc_owner;
+       cred_t *drc_cred;
 } dmu_recv_cookie_t;
 
 int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
index b63549b4675320199e3644f87d2e6f2c081e49eb..50e01155903aa3e2b7b1be879dd31e62ddcba5e6 100644 (file)
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef        _SYS_DNODE_H
@@ -233,7 +234,18 @@ typedef struct dnode {
        refcount_t dn_holds;
 
        kmutex_t dn_dbufs_mtx;
-       list_t dn_dbufs;                /* descendent dbufs */
+       /*
+        * Descendent dbufs, ordered by dbuf_compare. Note that dn_dbufs
+        * can contain multiple dbufs of the same (level, blkid) when a
+        * dbuf is marked DB_EVICTING without being removed from
+        * dn_dbufs. To maintain the avl invariant that there cannot be
+        * duplicate entries, we order the dbufs by an arbitrary value -
+        * their address in memory. This means that dn_dbufs cannot be used to
+        * directly look up a dbuf. Instead, callers must use avl_walk, have
+        * a reference to the dbuf, or look up a non-existent node with
+        * db_state = DB_SEARCH (see dbuf_free_range for an example).
+        */
+       avl_tree_t dn_dbufs;
 
        /* protected by dn_struct_rwlock */
        struct dmu_buf_impl *dn_bonus;  /* bonus buffer dbuf */
@@ -266,8 +278,9 @@ typedef struct dnode_handle {
 } dnode_handle_t;
 
 typedef struct dnode_children {
+       dmu_buf_user_t dnc_dbu;         /* User evict data */
        size_t dnc_count;               /* number of children */
-       dnode_handle_t dnc_children[1]; /* sized dynamically */
+       dnode_handle_t dnc_children[];  /* sized dynamically */
 } dnode_children_t;
 
 typedef struct free_range {
@@ -276,7 +289,7 @@ typedef struct free_range {
        uint64_t fr_nblks;
 } free_range_t;
 
-dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
+void dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
     uint64_t object, dnode_handle_t *dnh);
 void dnode_special_close(dnode_handle_t *dnh);
 
index 4ef70adc27cfa7ad15754546d7a6b69287081ebd..d6da5dcfdb9f8472ae3eff9f6d3bb09c7dce5688 100644 (file)
@@ -21,8 +21,9 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef        _SYS_DSL_DATASET_H
@@ -48,7 +49,7 @@ struct dsl_pool;
 
 #define        DS_FLAG_INCONSISTENT    (1ULL<<0)
 #define        DS_IS_INCONSISTENT(ds)  \
-       ((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT)
+       (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT)
 
 /*
  * Do not allow this dataset to be promoted.
@@ -68,7 +69,7 @@ struct dsl_pool;
  */
 #define        DS_FLAG_DEFER_DESTROY   (1ULL<<3)
 #define        DS_IS_DEFER_DESTROY(ds) \
-       ((ds)->ds_phys->ds_flags & DS_FLAG_DEFER_DESTROY)
+       (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_DEFER_DESTROY)
 
 /*
  * DS_FIELD_* are strings that are used in the "extensified" dataset zap object.
@@ -82,6 +83,13 @@ struct dsl_pool;
  */
 #define        DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
 
+/*
+ * This field is present (with value=0) if this dataset may contain large
+ * blocks (>128KB).  If it is present, then this dataset
+ * is counted in the refcount of the SPA_FEATURE_LARGE_BLOCKS feature.
+ */
+#define        DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks"
+
 /*
  * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
  * name lookups should be performed case-insensitively.
@@ -125,16 +133,20 @@ typedef struct dsl_dataset_phys {
 } dsl_dataset_phys_t;
 
 typedef struct dsl_dataset {
+       dmu_buf_user_t ds_dbu;
+
        /* Immutable: */
        struct dsl_dir *ds_dir;
-       dsl_dataset_phys_t *ds_phys;
        dmu_buf_t *ds_dbuf;
        uint64_t ds_object;
        uint64_t ds_fsid_guid;
+       boolean_t ds_is_snapshot;
 
        /* only used in syncing context, only valid for non-snapshots: */
        struct dsl_dataset *ds_prev;
        uint64_t ds_bookmarks;  /* DMU_OTN_ZAP_METADATA */
+       boolean_t ds_large_blocks;
+       boolean_t ds_need_large_blocks;
 
        /* has internal locking: */
        dsl_deadlist_t ds_deadlist;
@@ -177,6 +189,12 @@ typedef struct dsl_dataset {
        char ds_snapname[MAXNAMELEN];
 } dsl_dataset_t;
 
+static inline dsl_dataset_phys_t *
+dsl_dataset_phys(dsl_dataset_t *ds)
+{
+       return (ds->ds_dbuf->db_data);
+}
+
 /*
  * The max length of a temporary tag prefix is the number of hex digits
  * required to express UINT64_MAX plus one for the hyphen.
@@ -184,13 +202,15 @@ typedef struct dsl_dataset {
 #define        MAX_TAG_PREFIX_LEN      17
 
 #define        dsl_dataset_is_snapshot(ds) \
-       ((ds)->ds_phys->ds_num_children != 0)
+       (dsl_dataset_phys(ds)->ds_num_children != 0)
 
 #define        DS_UNIQUE_IS_ACCURATE(ds)       \
-       (((ds)->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
+       ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
 
 int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag,
     dsl_dataset_t **dsp);
+boolean_t dsl_dataset_try_add_ref(struct dsl_pool *dp, dsl_dataset_t *ds,
+    void *tag);
 int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag,
     dsl_dataset_t **);
 void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
@@ -244,6 +264,8 @@ int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
 int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
 boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);
+int dsl_dataset_activate_large_blocks(const char *dsname);
+void dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx);
 
 int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
 
@@ -266,7 +288,7 @@ int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
 void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
     dsl_dataset_t *origin_head, dmu_tx_t *tx);
 int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
-    dmu_tx_t *tx, boolean_t recv);
+    dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr);
 void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
     dmu_tx_t *tx);
 
@@ -276,7 +298,8 @@ void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds);
 int dsl_dataset_get_snapname(dsl_dataset_t *ds);
 int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name,
     uint64_t *value);
-int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx);
+int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
+    boolean_t adj_cnt);
 void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
     zprop_source_t source, uint64_t value, dmu_tx_t *tx);
 void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx);
index a0a3ef1ded1969378adffe3e99d162a7a86157c6..55f3a8e5baa9317a7ea99f8455dabe8d20844136 100644 (file)
@@ -21,6 +21,8 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef        _SYS_DSL_DIR_H
@@ -38,6 +40,14 @@ extern "C" {
 
 struct dsl_dataset;
 
+/*
+ * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object.
+ * They should be of the format <reverse-dns>:<field>.
+ */
+
+#define        DD_FIELD_FILESYSTEM_COUNT       "com.joyent:filesystem_count"
+#define        DD_FIELD_SNAPSHOT_COUNT         "com.joyent:snapshot_count"
+
 typedef enum dd_used {
        DD_USED_HEAD,
        DD_USED_SNAP,
@@ -75,12 +85,15 @@ typedef struct dsl_dir_phys {
 } dsl_dir_phys_t;
 
 struct dsl_dir {
+       dmu_buf_user_t dd_dbu;
+
        /* These are immutable; no lock needed: */
        uint64_t dd_object;
-       dsl_dir_phys_t *dd_phys;
-       dmu_buf_t *dd_dbuf;
        dsl_pool_t *dd_pool;
 
+       /* Stable until user eviction; no lock needed: */
+       dmu_buf_t *dd_dbuf;
+
        /* protected by lock on pool's dp_dirty_dirs list */
        txg_node_t dd_dirty_link;
 
@@ -102,7 +115,14 @@ struct dsl_dir {
        char dd_myname[MAXNAMELEN];
 };
 
+static inline dsl_dir_phys_t *
+dsl_dir_phys(dsl_dir_t *dd)
+{
+       return (dd->dd_dbuf->db_data);
+}
+
 void dsl_dir_rele(dsl_dir_t *dd, void *tag);
+void dsl_dir_async_rele(dsl_dir_t *dd, void *tag);
 int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
     dsl_dir_t **, const char **tail);
 int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
@@ -129,8 +149,13 @@ int dsl_dir_set_quota(const char *ddname, zprop_source_t source,
     uint64_t quota);
 int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
     uint64_t reservation);
+int dsl_dir_activate_fs_ss_limit(const char *);
+int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *,
+    cred_t *);
+void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *);
 int dsl_dir_rename(const char *oldname, const char *newname);
-int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space);
+int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
+    uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *);
 boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
 void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds,
     uint64_t reservation, cred_t *cr, dmu_tx_t *tx);
@@ -138,6 +163,8 @@ void dsl_dir_snap_cmtime_update(dsl_dir_t *dd);
 timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
 void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
     dmu_tx_t *tx);
+void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx);
+boolean_t dsl_dir_is_zapified(dsl_dir_t *dd);
 
 /* internal reserved dir name */
 #define        MOS_DIR_NAME "$MOS"
index 34dc65ba40ea25c02b8b3c8b66c3d05f7ee56033..48b12e8eb1346236288f71cf198f6c45023456c7 100644 (file)
@@ -156,8 +156,10 @@ void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
     int64_t used, int64_t comp, int64_t uncomp);
 boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp);
 void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
+void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag);
 void dsl_pool_config_exit(dsl_pool_t *dp, void *tag);
 boolean_t dsl_pool_config_held(dsl_pool_t *dp);
+boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp);
 
 taskq_t *dsl_pool_iput_taskq(dsl_pool_t *dp);
 
index ef86fb64cf0c58d62fcef3a17491f4903ddf4220..6139303c1564847cbff4526325b9eb19063775e1 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #ifndef        _SYS_DSL_SYNCTASK_H
@@ -38,11 +38,41 @@ struct dsl_pool;
 typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *);
 typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *);
 
+typedef enum zfs_space_check {
+       /*
+        * Normal space check: if there is less than 3.2% free space,
+        * the operation will fail.  Operations which are logically
+        * creating things should use this (e.g. "zfs create", "zfs snapshot").
+        * User writes (via the ZPL / ZVOL) also fail at this point.
+        */
+       ZFS_SPACE_CHECK_NORMAL,
+
+       /*
+        * Space check allows use of half the slop space.  If there
+        * is less than 1.6% free space, the operation will fail.  Most
+        * operations should use this (e.g. "zfs set", "zfs rename"),
+        * because we want them to succeed even after user writes are failing,
+        * so that they can be used as part of the space recovery process.
+        */
+       ZFS_SPACE_CHECK_RESERVED,
+
+       /*
+        * No space check is performed.  Only operations which we expect to
+        * result in a net reduction in space should use this
+        * (e.g. "zfs destroy").  Setting quotas & reservations also uses
+        * this because it needs to circumvent the quota/reservation checks.
+        *
+        * See also the comments above spa_slop_shift.
+        */
+       ZFS_SPACE_CHECK_NONE,
+} zfs_space_check_t;
+
 typedef struct dsl_sync_task {
        txg_node_t dst_node;
        struct dsl_pool *dst_pool;
        uint64_t dst_txg;
        int dst_space;
+       zfs_space_check_t dst_space_check;
        dsl_checkfunc_t *dst_checkfunc;
        dsl_syncfunc_t *dst_syncfunc;
        void *dst_arg;
@@ -50,11 +80,11 @@ typedef struct dsl_sync_task {
        boolean_t dst_nowaiter;
 } dsl_sync_task_t;
 
-void dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx);
-int dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
-    dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified);
-void dsl_sync_task_nowait(struct dsl_pool *dp, dsl_syncfunc_t *syncfunc,
-    void *arg, int blocks_modified, dmu_tx_t *tx);
+void dsl_sync_task_sync(dsl_sync_task_t *, dmu_tx_t *);
+int dsl_sync_task(const char *, dsl_checkfunc_t *,
+    dsl_syncfunc_t *, void *, int, zfs_space_check_t);
+void dsl_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
+    void *, int, zfs_space_check_t, dmu_tx_t *);
 
 #ifdef __cplusplus
 }
index ed9a3e4e132de66f65267ea23c4e0ddc97b2b24c..1f344e94f12b571623e44f94c29a79cb47ba811b 100644 (file)
@@ -91,7 +91,7 @@ host_triplet = @host@
 target_triplet = @target@
 subdir = include/sys/fm
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps =  \
+am__aclocal_m4_deps = $(top_srcdir)/config/always-no-bool-compare.m4 \
        $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \
        $(top_srcdir)/config/dkms.m4 \
        $(top_srcdir)/config/kernel-acl.m4 \
@@ -103,20 +103,11 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-bio-bvec-iter.m4 \
        $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \
        $(top_srcdir)/config/kernel-bio-failfast.m4 \
-       $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \
-       $(top_srcdir)/config/kernel-blk-end-request.m4 \
-       $(top_srcdir)/config/kernel-blk-fetch-request.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-discard.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-barrier.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-discard.m4 \
        $(top_srcdir)/config/kernel-blk-queue-flush.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \
-       $(top_srcdir)/config/kernel-blk-requeue-request.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-pos.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \
        $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \
        $(top_srcdir)/config/kernel-blkdev-get.m4 \
        $(top_srcdir)/config/kernel-block-device-operations-release-void.m4 \
@@ -124,6 +115,7 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-clear-inode.m4 \
        $(top_srcdir)/config/kernel-commit-metadata.m4 \
        $(top_srcdir)/config/kernel-create-nameidata.m4 \
+       $(top_srcdir)/config/kernel-current_bio_tail.m4 \
        $(top_srcdir)/config/kernel-d-make-root.m4 \
        $(top_srcdir)/config/kernel-d-obtain-alias.m4 \
        $(top_srcdir)/config/kernel-d-prune-aliases.m4 \
@@ -137,23 +129,25 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-fallocate.m4 \
        $(top_srcdir)/config/kernel-file-inode.m4 \
        $(top_srcdir)/config/kernel-fmode-t.m4 \
+       $(top_srcdir)/config/kernel-follow-down-one.m4 \
        $(top_srcdir)/config/kernel-follow-link-nameidata.m4 \
        $(top_srcdir)/config/kernel-fsync.m4 \
+       $(top_srcdir)/config/kernel-generic_io_acct.m4 \
        $(top_srcdir)/config/kernel-get-disk-ro.m4 \
        $(top_srcdir)/config/kernel-get-gendisk.m4 \
        $(top_srcdir)/config/kernel-insert-inode-locked.m4 \
        $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \
        $(top_srcdir)/config/kernel-is_owner_or_cap.m4 \
+       $(top_srcdir)/config/kernel-kmap-atomic-args.m4 \
        $(top_srcdir)/config/kernel-kobj-name-len.m4 \
        $(top_srcdir)/config/kernel-lookup-bdev.m4 \
        $(top_srcdir)/config/kernel-lookup-nameidata.m4 \
        $(top_srcdir)/config/kernel-lseek-execute.m4 \
+       $(top_srcdir)/config/kernel-mk-request-fn.m4 \
        $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \
        $(top_srcdir)/config/kernel-mount-nodev.m4 \
        $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \
        $(top_srcdir)/config/kernel-put-link-nameidata.m4 \
-       $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \
-       $(top_srcdir)/config/kernel-rq-is_sync.m4 \
        $(top_srcdir)/config/kernel-security-inode-init.m4 \
        $(top_srcdir)/config/kernel-set-nlink.m4 \
        $(top_srcdir)/config/kernel-sget-args.m4 \
@@ -328,9 +322,11 @@ DEBUG_CFLAGS = @DEBUG_CFLAGS@
 DEBUG_DMU_TX = @DEBUG_DMU_TX@
 DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@
 DEBUG_ZFS = @DEBUG_ZFS@
+DEFAULT_INITCONF_DIR = @DEFAULT_INITCONF_DIR@
 DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@
 DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@
 DEFAULT_PACKAGE = @DEFAULT_PACKAGE@
+DEFINE_INITRAMFS = @DEFINE_INITRAMFS@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
 DLLTOOL = @DLLTOOL@
@@ -380,6 +376,7 @@ MANIFEST_TOOL = @MANIFEST_TOOL@
 MKDIR_P = @MKDIR_P@
 NM = @NM@
 NMEDIT = @NMEDIT@
+NO_BOOL_COMPARE = @NO_BOOL_COMPARE@
 NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@
 OBJDUMP = @OBJDUMP@
 OBJEXT = @OBJEXT@
index c8490c0ecf8eaf36c7f2d40b12731428cf2971d4..58330de54187a7adae89354070d6f9d5aacb0386 100644 (file)
@@ -91,7 +91,7 @@ host_triplet = @host@
 target_triplet = @target@
 subdir = include/sys/fm/fs
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps =  \
+am__aclocal_m4_deps = $(top_srcdir)/config/always-no-bool-compare.m4 \
        $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \
        $(top_srcdir)/config/dkms.m4 \
        $(top_srcdir)/config/kernel-acl.m4 \
@@ -103,20 +103,11 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-bio-bvec-iter.m4 \
        $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \
        $(top_srcdir)/config/kernel-bio-failfast.m4 \
-       $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \
-       $(top_srcdir)/config/kernel-blk-end-request.m4 \
-       $(top_srcdir)/config/kernel-blk-fetch-request.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-discard.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-barrier.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-discard.m4 \
        $(top_srcdir)/config/kernel-blk-queue-flush.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \
-       $(top_srcdir)/config/kernel-blk-requeue-request.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-pos.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \
        $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \
        $(top_srcdir)/config/kernel-blkdev-get.m4 \
        $(top_srcdir)/config/kernel-block-device-operations-release-void.m4 \
@@ -124,6 +115,7 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-clear-inode.m4 \
        $(top_srcdir)/config/kernel-commit-metadata.m4 \
        $(top_srcdir)/config/kernel-create-nameidata.m4 \
+       $(top_srcdir)/config/kernel-current_bio_tail.m4 \
        $(top_srcdir)/config/kernel-d-make-root.m4 \
        $(top_srcdir)/config/kernel-d-obtain-alias.m4 \
        $(top_srcdir)/config/kernel-d-prune-aliases.m4 \
@@ -137,23 +129,25 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-fallocate.m4 \
        $(top_srcdir)/config/kernel-file-inode.m4 \
        $(top_srcdir)/config/kernel-fmode-t.m4 \
+       $(top_srcdir)/config/kernel-follow-down-one.m4 \
        $(top_srcdir)/config/kernel-follow-link-nameidata.m4 \
        $(top_srcdir)/config/kernel-fsync.m4 \
+       $(top_srcdir)/config/kernel-generic_io_acct.m4 \
        $(top_srcdir)/config/kernel-get-disk-ro.m4 \
        $(top_srcdir)/config/kernel-get-gendisk.m4 \
        $(top_srcdir)/config/kernel-insert-inode-locked.m4 \
        $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \
        $(top_srcdir)/config/kernel-is_owner_or_cap.m4 \
+       $(top_srcdir)/config/kernel-kmap-atomic-args.m4 \
        $(top_srcdir)/config/kernel-kobj-name-len.m4 \
        $(top_srcdir)/config/kernel-lookup-bdev.m4 \
        $(top_srcdir)/config/kernel-lookup-nameidata.m4 \
        $(top_srcdir)/config/kernel-lseek-execute.m4 \
+       $(top_srcdir)/config/kernel-mk-request-fn.m4 \
        $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \
        $(top_srcdir)/config/kernel-mount-nodev.m4 \
        $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \
        $(top_srcdir)/config/kernel-put-link-nameidata.m4 \
-       $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \
-       $(top_srcdir)/config/kernel-rq-is_sync.m4 \
        $(top_srcdir)/config/kernel-security-inode-init.m4 \
        $(top_srcdir)/config/kernel-set-nlink.m4 \
        $(top_srcdir)/config/kernel-sget-args.m4 \
@@ -284,9 +278,11 @@ DEBUG_CFLAGS = @DEBUG_CFLAGS@
 DEBUG_DMU_TX = @DEBUG_DMU_TX@
 DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@
 DEBUG_ZFS = @DEBUG_ZFS@
+DEFAULT_INITCONF_DIR = @DEFAULT_INITCONF_DIR@
 DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@
 DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@
 DEFAULT_PACKAGE = @DEFAULT_PACKAGE@
+DEFINE_INITRAMFS = @DEFINE_INITRAMFS@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
 DLLTOOL = @DLLTOOL@
@@ -336,6 +332,7 @@ MANIFEST_TOOL = @MANIFEST_TOOL@
 MKDIR_P = @MKDIR_P@
 NM = @NM@
 NMEDIT = @NMEDIT@
+NO_BOOL_COMPARE = @NO_BOOL_COMPARE@
 NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@
 OBJDUMP = @OBJDUMP@
 OBJEXT = @OBJEXT@
index d541b07a37295d4c1a182d1bf54013fda86895a3..0d7eadd4f445903b6af4e2f046406252daa5ff3f 100644 (file)
@@ -96,7 +96,6 @@ extern "C" {
 #define        FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE     "zio_pipeline"
 #define        FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY        "zio_delay"
 #define        FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP    "zio_timestamp"
-#define        FM_EREPORT_PAYLOAD_ZFS_ZIO_DEADLINE     "zio_deadline"
 #define        FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA        "zio_delta"
 #define        FM_EREPORT_PAYLOAD_ZFS_PREV_STATE       "prev_state"
 #define        FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED   "cksum_expected"
index df0ff318080c7e80a73792b81c3fa7a9bd6a1b3a..17b07a38a047d209a2d62e7ad6c46e5da5f889b3 100644 (file)
@@ -91,7 +91,7 @@ host_triplet = @host@
 target_triplet = @target@
 subdir = include/sys/fs
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps =  \
+am__aclocal_m4_deps = $(top_srcdir)/config/always-no-bool-compare.m4 \
        $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \
        $(top_srcdir)/config/dkms.m4 \
        $(top_srcdir)/config/kernel-acl.m4 \
@@ -103,20 +103,11 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-bio-bvec-iter.m4 \
        $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \
        $(top_srcdir)/config/kernel-bio-failfast.m4 \
-       $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \
-       $(top_srcdir)/config/kernel-blk-end-request.m4 \
-       $(top_srcdir)/config/kernel-blk-fetch-request.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-discard.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-barrier.m4 \
+       $(top_srcdir)/config/kernel-bio-rw-discard.m4 \
        $(top_srcdir)/config/kernel-blk-queue-flush.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \
        $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \
-       $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \
-       $(top_srcdir)/config/kernel-blk-requeue-request.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-pos.m4 \
-       $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \
        $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \
        $(top_srcdir)/config/kernel-blkdev-get.m4 \
        $(top_srcdir)/config/kernel-block-device-operations-release-void.m4 \
@@ -124,6 +115,7 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-clear-inode.m4 \
        $(top_srcdir)/config/kernel-commit-metadata.m4 \
        $(top_srcdir)/config/kernel-create-nameidata.m4 \
+       $(top_srcdir)/config/kernel-current_bio_tail.m4 \
        $(top_srcdir)/config/kernel-d-make-root.m4 \
        $(top_srcdir)/config/kernel-d-obtain-alias.m4 \
        $(top_srcdir)/config/kernel-d-prune-aliases.m4 \
@@ -137,23 +129,25 @@ am__aclocal_m4_deps =  \
        $(top_srcdir)/config/kernel-fallocate.m4 \
        $(top_srcdir)/config/kernel-file-inode.m4 \
        $(top_srcdir)/config/kernel-fmode-t.m4 \
+       $(top_srcdir)/config/kernel-follow-down-one.m4 \
        $(top_srcdir)/config/kernel-follow-link-nameidata.m4 \
        $(top_srcdir)/config/kernel-fsync.m4 \
+       $(top_srcdir)/config/kernel-generic_io_acct.m4 \
        $(top_srcdir)/config/kernel-get-disk-ro.m4 \
        $(top_srcdir)/config/kernel-get-gendisk.m4 \
        $(top_srcdir)/config/kernel-insert-inode-locked.m4 \
        $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \
        $(top_srcdir)/config/kernel-is_owner_or_cap.m4 \
+       $(top_srcdir)/config/kernel-kmap-atomic-args.m4 \
        $(top_srcdir)/config/kernel-kobj-name-len.m4 \
        $(top_srcdir)/config/kernel-lookup-bdev.m4 \
        $(top_srcdir)/config/kernel-lookup-nameidata.m4 \
        $(top_srcdir)/config/kernel-lseek-execute.m4 \
+       $(top_srcdir)/config/kernel-mk-request-fn.m4 \
        $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \
        $(top_srcdir)/config/kernel-mount-nodev.m4 \
        $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \
        $(top_srcdir)/config/kernel-put-link-nameidata.m4 \
-       $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \
-       $(top_srcdir)/config/kernel-rq-is_sync.m4 \
        $(top_srcdir)/config/kernel-security-inode-init.m4 \
        $(top_srcdir)/config/kernel-set-nlink.m4 \
        $(top_srcdir)/config/kernel-sget-args.m4 \
@@ -284,9 +278,11 @@ DEBUG_CFLAGS = @DEBUG_CFLAGS@
 DEBUG_DMU_TX = @DEBUG_DMU_TX@
 DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@
 DEBUG_ZFS = @DEBUG_ZFS@
+DEFAULT_INITCONF_DIR = @DEFAULT_INITCONF_DIR@
 DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@
 DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@
 DEFAULT_PACKAGE = @DEFAULT_PACKAGE@
+DEFINE_INITRAMFS = @DEFINE_INITRAMFS@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
 DLLTOOL = @DLLTOOL@
@@ -336,6 +332,7 @@ MANIFEST_TOOL = @MANIFEST_TOOL@
 MKDIR_P = @MKDIR_P@
 NM = @NM@
 NMEDIT = @NMEDIT@
+NO_BOOL_COMPARE = @NO_BOOL_COMPARE@
 NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@
 OBJDUMP = @OBJDUMP@
 OBJEXT = @OBJEXT@
index 477d98daa173338a70d638259428c6117a5b8af7..4da144c724abcadd6091caf17b50a1184fbe0042 100644 (file)
@@ -23,7 +23,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -142,6 +142,10 @@ typedef enum {
        ZFS_PROP_LOGICALUSED,
        ZFS_PROP_LOGICALREFERENCED,
        ZFS_PROP_INCONSISTENT,          /* not exposed to the user */
+       ZFS_PROP_FILESYSTEM_LIMIT,
+       ZFS_PROP_SNAPSHOT_LIMIT,
+       ZFS_PROP_FILESYSTEM_COUNT,
+       ZFS_PROP_SNAPSHOT_COUNT,
        ZFS_PROP_SNAPDEV,
        ZFS_PROP_ACLTYPE,
        ZFS_PROP_SELINUX_CONTEXT,
@@ -196,6 +200,7 @@ typedef enum {
        ZPOOL_PROP_FREEING,
        ZPOOL_PROP_FRAGMENTATION,
        ZPOOL_PROP_LEAKED,
+       ZPOOL_PROP_MAXBLOCKSIZE,
        ZPOOL_PROP_TNAME,
        ZPOOL_NUM_PROPS
 } zpool_prop_t;
diff --git a/zfs/include/sys/mntent.h b/zfs/include/sys/mntent.h
new file mode 100644 (file)
index 0000000..7284f05
--- /dev/null
@@ -0,0 +1,102 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ *
+ *     Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T
+ *             All Rights Reserved
+ */
+
+#ifndef _SYS_MNTENT_H
+#define        _SYS_MNTENT_H
+
+#define        MNTTYPE_ZFS     "zfs"           /* ZFS file system */
+
+#define        MOUNT_SUCCESS   0x00            /* Success */
+#define        MOUNT_USAGE     0x01            /* Invalid invocation or permissions */
+#define        MOUNT_SYSERR    0x02            /* System error (ENOMEM, etc) */
+#define        MOUNT_SOFTWARE  0x04            /* Internal mount bug */
+#define        MOUNT_USER      0x08            /* Interrupted by user (EINTR) */
+#define        MOUNT_FILEIO    0x10            /* Error updating/locking /etc/mtab */
+#define        MOUNT_FAIL      0x20            /* Mount failed */
+#define        MOUNT_SOMEOK    0x40            /* At least one mount succeeded */
+#define        MOUNT_BUSY      0x80            /* Mount failed due to EBUSY */
+
+#define        MNTOPT_ASYNC    "async"         /* all I/O is asynchronous */
+#define        MNTOPT_ATIME    "atime"         /* update atime for files */
+#define        MNTOPT_NOATIME  "noatime"       /* do not update atime for files */
+#define        MNTOPT_AUTO     "auto"          /* automount */
+#define        MNTOPT_NOAUTO   "noauto"        /* do not automount */
+#define        MNTOPT_CONTEXT  "context"       /* selinux context */
+#define        MNTOPT_FSCONTEXT "fscontext"    /* selinux fscontext */
+#define        MNTOPT_DEFCONTEXT "defcontext"  /* selinux defcontext */
+#define        MNTOPT_ROOTCONTEXT "rootcontext" /* selinux rootcontext */
+#define        MNTOPT_DEFAULTS "defaults"      /* defaults */
+#define        MNTOPT_DEVICES  "dev"           /* device-special allowed */
+#define        MNTOPT_NODEVICES "nodev"        /* device-special disallowed */
+#define        MNTOPT_DIRATIME "diratime"      /* update atime for dirs */
+#define        MNTOPT_NODIRATIME "nodiratime"  /* do not update atime for dirs */
+#define        MNTOPT_DIRSYNC  "dirsync"       /* do dir updates synchronously */
+#define        MNTOPT_EXEC     "exec"          /* enable executables */
+#define        MNTOPT_NOEXEC   "noexec"        /* disable executables */
+#define        MNTOPT_GROUP    "group"         /* allow group mount */
+#define        MNTOPT_NOGROUP  "nogroup"       /* do not allow group mount */
+#define        MNTOPT_IVERSION "iversion"      /* update inode version */
+#define        MNTOPT_NOIVERSION "noiversion"  /* do not update inode version */
+#define        MNTOPT_NBMAND   "mand"          /* allow non-blocking mandatory locks */
+#define        MNTOPT_NONBMAND "nomand"        /* deny non-blocking mandatory locks */
+#define        MNTOPT_NETDEV   "_netdev"       /* network device */
+#define        MNTOPT_NOFAIL   "nofail"        /* no failure */
+#define        MNTOPT_RELATIME "relatime"      /* allow relative time updates */
+#define        MNTOPT_NORELATIME "norelatime"  /* do not allow relative time updates */
+#define        MNTOPT_DFRATIME "strictatime"   /* Deferred access time updates */
+#define        MNTOPT_NODFRATIME "nostrictatime" /* No Deferred access time updates */
+#define        MNTOPT_SETUID   "suid"          /* Both setuid and devices allowed */
+#define        MNTOPT_NOSETUID "nosuid"        /* Neither setuid nor devices allowed */
+#define        MNTOPT_OWNER    "owner"         /* allow owner mount */
+#define        MNTOPT_NOOWNER  "noowner"       /* do not allow owner mount */
+#define        MNTOPT_REMOUNT  "remount"       /* change mount options */
+#define        MNTOPT_RO       "ro"            /* read only */
+#define        MNTOPT_RW       "rw"            /* read/write */
+#define        MNTOPT_SYNC     "sync"          /* all I/O is synchronous */
+#define        MNTOPT_USER     "user"          /* allow user mount */
+#define        MNTOPT_NOUSER   "nouser"        /* do not allow user mount */
+#define        MNTOPT_USERS    "users"         /* allow user mount */
+#define        MNTOPT_NOUSERS  "nousers"       /* do not allow user mount */
+#define        MNTOPT_SUB      "sub"           /* allow mounts on subdirs */
+#define        MNTOPT_NOSUB    "nosub"         /* do not allow mounts on subdirs */
+#define        MNTOPT_QUIET    "quiet"         /* quiet mount */
+#define        MNTOPT_LOUD     "loud"          /* verbose mount */
+#define        MNTOPT_BIND     "bind"          /* remount part of a tree */
+#define        MNTOPT_RBIND    "rbind"         /* include subtrees */
+#define        MNTOPT_DIRXATTR "dirxattr"      /* enable directory xattrs */
+#define        MNTOPT_SAXATTR  "saxattr"       /* enable system-attribute xattrs */
+#define        MNTOPT_XATTR    "xattr"         /* enable extended attributes */
+#define        MNTOPT_NOXATTR  "noxattr"       /* disable extended attributes */
+#define        MNTOPT_COMMENT  "comment"       /* comment */
+#define        MNTOPT_ZFSUTIL  "zfsutil"       /* called by zfs utility */
+#define        MNTOPT_ACL      "acl"           /* passed by util-linux-2.24 mount */
+#define        MNTOPT_NOACL    "noacl"         /* likewise */
+#define        MNTOPT_POSIXACL "posixacl"      /* likewise */
+#define        MNTOPT_MNTPOINT "mntpoint"      /* mount point hint */
+
+#endif /* _SYS_MNTENT_H */
diff --git a/zfs/include/sys/multilist.h b/zfs/include/sys/multilist.h
new file mode 100644 (file)
index 0000000..98d707d
--- /dev/null
@@ -0,0 +1,105 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef        _SYS_MULTILIST_H
+#define        _SYS_MULTILIST_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef list_node_t multilist_node_t;
+typedef struct multilist multilist_t;
+typedef struct multilist_sublist multilist_sublist_t;
+typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *);
+
+struct multilist_sublist {
+       /*
+        * The mutex used internally to implement thread safe insertions
+        * and removals to this individual sublist. It can also be locked
+        * by a consumer using multilist_sublist_{lock,unlock}, which is
+        * useful if a consumer needs to traverse the list in a thread
+        * safe manner.
+        */
+       kmutex_t        mls_lock;
+       /*
+        * The actual list object containing all objects in this sublist.
+        */
+       list_t          mls_list;
+       /*
+        * Pad to cache line, in an effort to try and prevent cache line
+        * contention.
+        */
+} ____cacheline_aligned;
+
+struct multilist {
+       /*
+        * This is used to get to the multilist_node_t structure given
+        * the void *object contained on the list.
+        */
+       size_t                          ml_offset;
+       /*
+        * The number of sublists used internally by this multilist.
+        */
+       uint64_t                        ml_num_sublists;
+       /*
+        * Pointer to the array of actual sublists.
+        */
+       multilist_sublist_t             *ml_sublists;
+       /*
+        * Pointer to function which determines the sublist to use
+        * when inserting and removing objects from this multilist.
+        * Please see the comment above multilist_create for details.
+        */
+       multilist_sublist_index_func_t  *ml_index_func;
+};
+
+void multilist_destroy(multilist_t *);
+void multilist_create(multilist_t *, size_t, size_t, unsigned int,
+    multilist_sublist_index_func_t *);
+
+void multilist_insert(multilist_t *, void *);
+void multilist_remove(multilist_t *, void *);
+int  multilist_is_empty(multilist_t *);
+
+unsigned int multilist_get_num_sublists(multilist_t *);
+unsigned int multilist_get_random_index(multilist_t *);
+
+multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
+void multilist_sublist_unlock(multilist_sublist_t *);
+
+void multilist_sublist_insert_head(multilist_sublist_t *, void *);
+void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
+void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
+void multilist_sublist_remove(multilist_sublist_t *, void *);
+
+void *multilist_sublist_head(multilist_sublist_t *);
+void *multilist_sublist_tail(multilist_sublist_t *);
+void *multilist_sublist_next(multilist_sublist_t *, void *);
+void *multilist_sublist_prev(multilist_sublist_t *, void *);
+
+void multilist_link_init(multilist_node_t *);
+int  multilist_link_active(multilist_node_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MULTILIST_H */
index 25c8a52467e724284f2271e49b9945a00bd73378..7a328fd680305bd0689977a066aac700da91b024 100644 (file)
@@ -72,6 +72,7 @@ void rrw_init(rrwlock_t *rrl, boolean_t track_all);
 void rrw_destroy(rrwlock_t *rrl);
 void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag);
 void rrw_enter_read(rrwlock_t *rrl, void *tag);
+void rrw_enter_read_prio(rrwlock_t *rrl, void *tag);
 void rrw_enter_write(rrwlock_t *rrl);
 void rrw_exit(rrwlock_t *rrl, void *tag);
 boolean_t rrw_held(rrwlock_t *rrl, krw_t rw);
@@ -82,6 +83,31 @@ void rrw_tsd_destroy(void *arg);
 #define        RRW_LOCK_HELD(x) \
        (rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER))
 
+/*
+ * A reader-mostly lock implementation, tuning above reader-writer locks
+ * for highly parallel read acquisitions, pessimizing write acquisitions.
+ *
+ * This should be a prime number.  See comment in rrwlock.c near
+ * RRM_TD_LOCK() for details.
+ */
+#define        RRM_NUM_LOCKS           17
+typedef struct rrmlock {
+       rrwlock_t       locks[RRM_NUM_LOCKS];
+} rrmlock_t;
+
+void rrm_init(rrmlock_t *rrl, boolean_t track_all);
+void rrm_destroy(rrmlock_t *rrl);
+void rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag);
+void rrm_enter_read(rrmlock_t *rrl, void *tag);
+void rrm_enter_write(rrmlock_t *rrl);
+void rrm_exit(rrmlock_t *rrl, void *tag);
+boolean_t rrm_held(rrmlock_t *rrl, krw_t rw);
+
+#define        RRM_READ_HELD(x)        rrm_held(x, RW_READER)
+#define        RRM_WRITE_HELD(x)       rrm_held(x, RW_WRITER)
+#define        RRM_LOCK_HELD(x) \
+       (rrm_held(x, RW_WRITER) || rrm_held(x, RW_READER))
+
 #ifdef __cplusplus
 }
 #endif
index 7b5b03a5629ffda3fd93ea66dd151a294a8f568d..48e3bcd7cdf373e4547a5441aff10278b9569aa5 100644 (file)
@@ -133,7 +133,6 @@ int sa_update_from_cb(sa_handle_t *, sa_attr_type_t,
     uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *);
 void sa_object_info(sa_handle_t *, dmu_object_info_t *);
 void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *);
-void sa_update_user(sa_handle_t *, sa_handle_t *);
 void *sa_get_userdata(sa_handle_t *);
 void sa_set_userp(sa_handle_t *, void *);
 dmu_buf_t *sa_get_db(sa_handle_t *);
index fcbd8eb34e9171061aef49ae7abbdcfdd1751908..6f2f1db6dcf9fdce2d7c6e6769305c94ad5afc6c 100644 (file)
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef        _SYS_SA_IMPL_H
@@ -208,11 +209,12 @@ typedef enum sa_data_op {
  */
 
 struct sa_handle {
+       dmu_buf_user_t  sa_dbu;
        kmutex_t        sa_lock;
        dmu_buf_t       *sa_bonus;
        dmu_buf_t       *sa_spill;
        objset_t        *sa_os;
-       void            *sa_userp;
+       void            *sa_userp;
        sa_idx_tab_t    *sa_bonus_tab;   /* idx of bonus */
        sa_idx_tab_t    *sa_spill_tab; /* only present if spill activated */
 };
index 83b6723a4f739c99921eeda2efaeedca9754f6e0..5dc9084dad6b5a5479abe399072db0d300e6ce33 100644 (file)
@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef _SYS_SPA_H
@@ -97,17 +98,26 @@ _NOTE(CONSTCOND) } while (0)
 _NOTE(CONSTCOND) } while (0)
 
 /*
- * We currently support nine block sizes, from 512 bytes to 128K.
- * We could go higher, but the benefits are near-zero and the cost
- * of COWing a giant block to modify one byte would become excessive.
+ * We currently support block sizes from 512 bytes to 16MB.
+ * The benefits of larger blocks, and thus larger IO, need to be weighed
+ * against the cost of COWing a giant block to modify one byte, and the
+ * large latency of reading or writing a large block.
+ *
+ * Note that although blocks up to 16MB are supported, the recordsize
+ * property can not be set larger than zfs_max_recordsize (default 1MB).
+ * See the comment near zfs_max_recordsize in dsl_dataset.c for details.
+ *
+ * Note that although the LSIZE field of the blkptr_t can store sizes up
+ * to 32MB, the dnode's dn_datablkszsec can only store sizes up to
+ * 32MB - 512 bytes.  Therefore, we limit SPA_MAXBLOCKSIZE to 16MB.
  */
 #define        SPA_MINBLOCKSHIFT       9
-#define        SPA_MAXBLOCKSHIFT       17
+#define        SPA_OLD_MAXBLOCKSHIFT   17
+#define        SPA_MAXBLOCKSHIFT       24
 #define        SPA_MINBLOCKSIZE        (1ULL << SPA_MINBLOCKSHIFT)
+#define        SPA_OLD_MAXBLOCKSIZE    (1ULL << SPA_OLD_MAXBLOCKSHIFT)
 #define        SPA_MAXBLOCKSIZE        (1ULL << SPA_MAXBLOCKSHIFT)
 
-#define        SPA_BLOCKSIZES          (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
-
 /*
  * Size of block to hold the configuration data (a packed nvlist)
  */
@@ -680,6 +690,7 @@ extern spa_t *spa_next(spa_t *prev);
 /* Refcount functions */
 extern void spa_open_ref(spa_t *spa, void *tag);
 extern void spa_close(spa_t *spa, void *tag);
+extern void spa_async_close(spa_t *spa, void *tag);
 extern boolean_t spa_refcount_zero(spa_t *spa);
 
 #define        SCL_NONE        0x00
@@ -784,14 +795,17 @@ extern spa_load_state_t spa_load_state(spa_t *spa);
 extern uint64_t spa_freeze_txg(spa_t *spa);
 extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
 extern uint64_t spa_get_dspace(spa_t *spa);
+extern uint64_t spa_get_slop_space(spa_t *spa);
 extern void spa_update_dspace(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
 extern boolean_t spa_deflate(spa_t *spa);
 extern metaslab_class_t *spa_normal_class(spa_t *spa);
 extern metaslab_class_t *spa_log_class(spa_t *spa);
+extern void spa_evicting_os_register(spa_t *, objset_t *os);
+extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
+extern void spa_evicting_os_wait(spa_t *spa);
 extern int spa_max_replication(spa_t *spa);
 extern int spa_prev_software_version(spa_t *spa);
-extern int spa_busy(void);
 extern uint8_t spa_get_failmode(spa_t *spa);
 extern boolean_t spa_suspended(spa_t *spa);
 extern uint64_t spa_bootfs(spa_t *spa);
@@ -825,6 +839,8 @@ extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
 extern boolean_t spa_has_pending_synctask(spa_t *spa);
+extern int spa_maxblocksize(spa_t *spa);
+extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
 
 extern int spa_mode(spa_t *spa);
 extern uint64_t strtonum(const char *str, char **nptr);
index 19ba1153741f93af4f776c54520571fb23194ed8..0b49c7147b1086c840c06b1bcbfb93bcfd28fed9 100644 (file)
@@ -20,8 +20,9 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef _SYS_SPA_IMPL_H
@@ -144,8 +145,13 @@ struct spa {
        uint64_t        spa_claim_max_txg;      /* highest claimed birth txg */
        timespec_t      spa_loaded_ts;          /* 1st successful open time */
        objset_t        *spa_meta_objset;       /* copy of dp->dp_meta_objset */
+       kmutex_t        spa_evicting_os_lock;   /* Evicting objset list lock */
+       list_t          spa_evicting_os_list;   /* Objsets being evicted. */
+       kcondvar_t      spa_evicting_os_cv;     /* Objset Eviction Completion */
        txg_list_t      spa_vdev_txg_list;      /* per-txg dirty vdev list */
        vdev_t          *spa_root_vdev;         /* top-level vdev container */
+       int             spa_min_ashift;         /* of vdevs in normal class */
+       int             spa_max_ashift;         /* of vdevs in normal class */
        uint64_t        spa_config_guid;        /* config pool guid */
        uint64_t        spa_load_guid;          /* spa_load initialized guid */
        uint64_t        spa_last_synced_guid;   /* last synced guid */
index 780a131da27d223992402b29ae492293e940f874..b94cb79c656a82232268ae3c92192574c861c37f 100644 (file)
@@ -24,6 +24,9 @@
 #undef TRACE_SYSTEM
 #define        TRACE_SYSTEM zfs
 
+#undef TRACE_SYSTEM_VAR
+#define        TRACE_SYSTEM_VAR zfs_acl
+
 #if !defined(_TRACE_ACL_H) || defined(TRACE_HEADER_MULTI_READ)
 #define        _TRACE_ACL_H
 
index 8b885eff73a76bafd2bf4e362c67815351e6fb7e..31c3cdcb9b21f088e59aff7efbc91939b7f73528 100644 (file)
@@ -26,6 +26,9 @@
 #undef TRACE_SYSTEM
 #define        TRACE_SYSTEM zfs
 
+#undef TRACE_SYSTEM_VAR
+#define        TRACE_SYSTEM_VAR zfs_arc
+
 #if !defined(_TRACE_ARC_H) || defined(TRACE_HEADER_MULTI_READ)
 #define        _TRACE_ARC_H
 
@@ -45,7 +48,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
        TP_STRUCT__entry(
            __array(uint64_t,           hdr_dva_word, 2)
            __field(uint64_t,           hdr_birth)
-           __field(uint64_t,           hdr_cksum0)
            __field(uint32_t,           hdr_flags)
            __field(uint32_t,           hdr_datacnt)
            __field(arc_buf_contents_t, hdr_type)
@@ -64,27 +66,25 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
            __entry->hdr_dva_word[0]    = ab->b_dva.dva_word[0];
            __entry->hdr_dva_word[1]    = ab->b_dva.dva_word[1];
            __entry->hdr_birth          = ab->b_birth;
-           __entry->hdr_cksum0         = ab->b_cksum0;
            __entry->hdr_flags          = ab->b_flags;
-           __entry->hdr_datacnt        = ab->b_datacnt;
-           __entry->hdr_type           = ab->b_type;
+           __entry->hdr_datacnt        = ab->b_l1hdr.b_datacnt;
            __entry->hdr_size           = ab->b_size;
            __entry->hdr_spa            = ab->b_spa;
-           __entry->hdr_state_type     = ab->b_state->arcs_state;
-           __entry->hdr_access         = ab->b_arc_access;
-           __entry->hdr_mru_hits       = ab->b_mru_hits;
-           __entry->hdr_mru_ghost_hits = ab->b_mru_ghost_hits;
-           __entry->hdr_mfu_hits       = ab->b_mfu_hits;
-           __entry->hdr_mfu_ghost_hits = ab->b_mfu_ghost_hits;
-           __entry->hdr_l2_hits        = ab->b_l2_hits;
-           __entry->hdr_refcount       = ab->b_refcnt.rc_count;
+           __entry->hdr_state_type     = ab->b_l1hdr.b_state->arcs_state;
+           __entry->hdr_access         = ab->b_l1hdr.b_arc_access;
+           __entry->hdr_mru_hits       = ab->b_l1hdr.b_mru_hits;
+           __entry->hdr_mru_ghost_hits = ab->b_l1hdr.b_mru_ghost_hits;
+           __entry->hdr_mfu_hits       = ab->b_l1hdr.b_mfu_hits;
+           __entry->hdr_mfu_ghost_hits = ab->b_l1hdr.b_mfu_ghost_hits;
+           __entry->hdr_l2_hits        = ab->b_l1hdr.b_l2_hits;
+           __entry->hdr_refcount       = ab->b_l1hdr.b_refcnt.rc_count;
        ),
-       TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx "
+       TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
            "flags 0x%x datacnt %u type %u size %llu spa %llu "
            "state_type %u access %lu mru_hits %u mru_ghost_hits %u "
            "mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }",
            __entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
-           __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags,
+           __entry->hdr_birth, __entry->hdr_flags,
            __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size,
            __entry->hdr_spa, __entry->hdr_state_type,
            __entry->hdr_access, __entry->hdr_mru_hits,
@@ -261,7 +261,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
        TP_STRUCT__entry(
            __array(uint64_t,           hdr_dva_word, 2)
            __field(uint64_t,           hdr_birth)
-           __field(uint64_t,           hdr_cksum0)
            __field(uint32_t,           hdr_flags)
            __field(uint32_t,           hdr_datacnt)
            __field(arc_buf_contents_t, hdr_type)
@@ -292,20 +291,18 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
            __entry->hdr_dva_word[0]    = hdr->b_dva.dva_word[0];
            __entry->hdr_dva_word[1]    = hdr->b_dva.dva_word[1];
            __entry->hdr_birth          = hdr->b_birth;
-           __entry->hdr_cksum0         = hdr->b_cksum0;
            __entry->hdr_flags          = hdr->b_flags;
-           __entry->hdr_datacnt        = hdr->b_datacnt;
-           __entry->hdr_type           = hdr->b_type;
+           __entry->hdr_datacnt        = hdr->b_l1hdr.b_datacnt;
            __entry->hdr_size           = hdr->b_size;
            __entry->hdr_spa            = hdr->b_spa;
-           __entry->hdr_state_type     = hdr->b_state->arcs_state;
-           __entry->hdr_access         = hdr->b_arc_access;
-           __entry->hdr_mru_hits       = hdr->b_mru_hits;
-           __entry->hdr_mru_ghost_hits = hdr->b_mru_ghost_hits;
-           __entry->hdr_mfu_hits       = hdr->b_mfu_hits;
-           __entry->hdr_mfu_ghost_hits = hdr->b_mfu_ghost_hits;
-           __entry->hdr_l2_hits        = hdr->b_l2_hits;
-           __entry->hdr_refcount       = hdr->b_refcnt.rc_count;
+           __entry->hdr_state_type     = hdr->b_l1hdr.b_state->arcs_state;
+           __entry->hdr_access         = hdr->b_l1hdr.b_arc_access;
+           __entry->hdr_mru_hits       = hdr->b_l1hdr.b_mru_hits;
+           __entry->hdr_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
+           __entry->hdr_mfu_hits       = hdr->b_l1hdr.b_mfu_hits;
+           __entry->hdr_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
+           __entry->hdr_l2_hits        = hdr->b_l1hdr.b_l2_hits;
+           __entry->hdr_refcount       = hdr->b_l1hdr.b_refcnt.rc_count;
 
            __entry->bp_dva0[0]         = bp->blk_dva[0].dva_word[0];
            __entry->bp_dva0[1]         = bp->blk_dva[0].dva_word[1];
@@ -325,8 +322,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
            __entry->zb_level           = zb->zb_level;
            __entry->zb_blkid           = zb->zb_blkid;
        ),
-       TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx "
-           "flags 0x%x datacnt %u type %u size %llu spa %llu state_type %u "
+       TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
+           "flags 0x%x datacnt %u size %llu spa %llu state_type %u "
            "access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u "
            "mfu_ghost_hits %u l2_hits %u refcount %lli } "
            "bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 "
@@ -334,8 +331,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
            "lsize %llu } zb { objset %llu object %llu level %lli "
            "blkid %llu }",
            __entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
-           __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags,
-           __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size,
+           __entry->hdr_birth, __entry->hdr_flags,
+           __entry->hdr_datacnt, __entry->hdr_size,
            __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access,
            __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits,
            __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits,
@@ -378,7 +375,6 @@ DECLARE_EVENT_CLASS(zfs_l2arc_evict_class,
            __field(uint64_t,           l2ad_hand)
            __field(uint64_t,           l2ad_start)
            __field(uint64_t,           l2ad_end)
-           __field(uint64_t,           l2ad_evict)
            __field(boolean_t,          l2ad_first)
            __field(boolean_t,          l2ad_writing)
 
@@ -393,7 +389,6 @@ DECLARE_EVENT_CLASS(zfs_l2arc_evict_class,
            __entry->l2ad_hand          = dev->l2ad_hand;
            __entry->l2ad_start         = dev->l2ad_start;
            __entry->l2ad_end           = dev->l2ad_end;
-           __entry->l2ad_evict         = dev->l2ad_evict;
            __entry->l2ad_first         = dev->l2ad_first;
            __entry->l2ad_writing       = dev->l2ad_writing;
 
@@ -401,12 +396,11 @@ DECLARE_EVENT_CLASS(zfs_l2arc_evict_class,
            __entry->all                = all;
        ),
        TP_printk("l2ad { vdev { id %llu guid %llu state %llu } "
-           "hand %llu start %llu end %llu evict %llu "
+           "hand %llu start %llu end %llu "
            "first %d writing %d } taddr %llu all %d",
            __entry->vdev_id, __entry->vdev_guid, __entry->vdev_state,
            __entry->l2ad_hand, __entry->l2ad_start,
-           __entry->l2ad_end, __entry->l2ad_evict,
-           __entry->l2ad_first, __entry->l2ad_writing,
+           __entry->l2ad_end, __entry->l2ad_first, __entry->l2ad_writing,
            __entry->taddr, __entry->all)
 );
 
index 24b34bcf3e65fa050356ce4e041d6bb30107c488..e493a45802ed8cb8646e0e51b6e384c8a9d608d3 100644 (file)
  * be guarded separately.
  */
 
-/*
- * Generic support for one argument tracepoints of the form:
- *
- * DTRACE_PROBE1(...,
- *     const char *, ...);
- */
-
-DECLARE_EVENT_CLASS(zfs_dbgmsg_class,
-       TP_PROTO(const char *msg),
-       TP_ARGS(msg),
-       TP_STRUCT__entry(
-           __string(msg, msg)
-       ),
-       TP_fast_assign(
-           __assign_str(msg, msg);
-       ),
-       TP_printk("%s", __get_str(msg))
-);
-
-#define        DEFINE_DBGMSG_EVENT(name) \
-DEFINE_EVENT(zfs_dbgmsg_class, name, \
-       TP_PROTO(const char *msg), \
-       TP_ARGS(msg))
-DEFINE_DBGMSG_EVENT(zfs_zfs__dbgmsg);
-
-
 /*
  * Generic support for four argument tracepoints of the form:
  *
index 34bc74e1e60d80290534ed92e9261a4b8aec70f3..49e35e3dcbfe2bd32a54deea7480b135d9f9dba2 100644 (file)
@@ -24,6 +24,9 @@
 #undef TRACE_SYSTEM
 #define        TRACE_SYSTEM zfs
 
+#undef TRACE_SYSTEM_VAR
+#define        TRACE_SYSTEM_VAR zfs_dbuf
+
 #if !defined(_TRACE_DBUF_H) || defined(TRACE_HEADER_MULTI_READ)
 #define        _TRACE_DBUF_H
 
index 154e32096c83562c4045dd858a18945814450e49..e070997bca432b9b479144ec67a9ac707e8f62eb 100644 (file)
@@ -24,6 +24,9 @@
 #undef TRACE_SYSTEM
 #define        TRACE_SYSTEM zfs
 
+#undef TRACE_SYSTEM_VAR
+#define        TRACE_SYSTEM_VAR zfs_dmu
+
 #if !defined(_TRACE_DMU_H) || defined(TRACE_HEADER_MULTI_READ)
 #define        _TRACE_DMU_H
 
index 2f874d596fc712486a29f21f2ac5b5671b2d88e6..ee63c29cd119d2d436738496142f2ef34ef610e4 100644 (file)
@@ -24,6 +24,9 @@
 #undef TRACE_SYSTEM
 #define        TRACE_SYSTEM zfs
 
+#undef TRACE_SYSTEM_VAR
+#define        TRACE_SYSTEM_VAR zfs_dnode
+
 #if !defined(_TRACE_DNODE_H) || defined(TRACE_HEADER_MULTI_READ)
 #define        _TRACE_DNODE_H
 
diff --git a/zfs/include/sys/trace_multilist.h b/zfs/include/sys/trace_multilist.h
new file mode 100644 (file)
index 0000000..08e27a9
--- /dev/null
@@ -0,0 +1,79 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS)
+
+#undef TRACE_SYSTEM
+#define        TRACE_SYSTEM zfs
+
+#undef TRACE_SYSTEM_VAR
+#define        TRACE_SYSTEM_VAR zfs_multilist
+
+#if !defined(_TRACE_MULTILIST_H) || defined(TRACE_HEADER_MULTI_READ)
+#define        _TRACE_MULTILIST_H
+
+#include <linux/tracepoint.h>
+#include <sys/types.h>
+
+/*
+ * Generic support for three argument tracepoints of the form:
+ *
+ * DTRACE_PROBE3(..., multilist_t *, ..., unsigned int, ..., void *, ...);
+ *
+ * Only the multilist's offset and sublist count and the sublist index
+ * are recorded; the void * object argument is accepted but not traced.
+ */
+
+DECLARE_EVENT_CLASS(zfs_multilist_insert_remove_class,
+       TP_PROTO(multilist_t *ml, unsigned int sublist_idx, void *obj),
+       TP_ARGS(ml, sublist_idx, obj),
+       TP_STRUCT__entry(
+           __field(size_t,             ml_offset)
+           __field(uint64_t,           ml_num_sublists)
+
+           __field(unsigned int,       sublist_idx)
+       ),
+       TP_fast_assign(
+           __entry->ml_offset          = ml->ml_offset;
+           __entry->ml_num_sublists    = ml->ml_num_sublists;
+
+           __entry->sublist_idx        = sublist_idx;
+       ),
+       TP_printk("ml { offset %zu numsublists %llu sublistidx %u } ",
+           __entry->ml_offset, __entry->ml_num_sublists, __entry->sublist_idx)
+);
+
+#define        DEFINE_MULTILIST_INSERT_REMOVE_EVENT(name) \
+DEFINE_EVENT(zfs_multilist_insert_remove_class, name, \
+       TP_PROTO(multilist_t *ml, unsigned int sublist_idx, void *obj), \
+       TP_ARGS(ml, sublist_idx, obj))
+DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__insert);
+DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__remove);
+
+#endif /* _TRACE_MULTILIST_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define        TRACE_INCLUDE_PATH sys
+#define        TRACE_INCLUDE_FILE trace_multilist
+#include <trace/define_trace.h>
+
+#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */
index e977b25a8dd66b8a19c9c7f11f8be40caa15922d..61d650984cd349357a4b92bbd45fc2bc89f64c9f 100644 (file)
@@ -24,6 +24,9 @@
 #undef TRACE_SYSTEM
 #define        TRACE_SYSTEM zfs
 
+#undef TRACE_SYSTEM_VAR
+#define        TRACE_SYSTEM_VAR zfs_txg
+
 #if !defined(_TRACE_TXG_H) || defined(TRACE_HEADER_MULTI_READ)
 #define        _TRACE_TXG_H
 
index 3ff68fb2a82f6355ccbf0202de4ed2d2b6d8a9b5..e97466fde5e3689bcd6ba8760f2e0c16aba6d68a 100644 (file)
@@ -24,6 +24,9 @@
 #undef TRACE_SYSTEM
 #define        TRACE_SYSTEM zfs
 
+#undef TRACE_SYSTEM_VAR
+#define        TRACE_SYSTEM_VAR zfs_zil
+
 #if !defined(_TRACE_ZIL_H) || defined(TRACE_HEADER_MULTI_READ)
 #define        _TRACE_ZIL_H
 
index 8c811c2aa0b33781d4a522deb69785518165c910..e1399c468a74ad6e19b8621d14175aa926f4824f 100644 (file)
@@ -24,6 +24,9 @@
 #undef TRACE_SYSTEM
 #define        TRACE_SYSTEM zfs
 
+#undef TRACE_SYSTEM_VAR
+#define        TRACE_SYSTEM_VAR zfs_zrlock
+
 #if !defined(_TRACE_ZRLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
 #define        _TRACE_ZRLOCK_H
 
index b5bb91573145273eeb6c22cf2896690aa119b0f8..21e7ae0de7a7cfa76c63d8985a0b8be682f4f1da 100644 (file)
@@ -22,6 +22,9 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
 
 #ifndef _SYS_UBERBLOCK_H
 #define        _SYS_UBERBLOCK_H
@@ -36,8 +39,8 @@ extern "C" {
 
 typedef struct uberblock uberblock_t;
 
-extern int uberblock_verify(uberblock_t *ub);
-extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg);
+extern int uberblock_verify(uberblock_t *);
+extern boolean_t uberblock_update(uberblock_t *, vdev_t *, uint64_t);
 
 #ifdef __cplusplus
 }
index bef6aaf8f11bd9ac6f1a92f6e4686d6848e04189..365789e524d6d3375f831f253a7bee20cc1bb65c 100644 (file)
@@ -59,6 +59,7 @@ extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
 extern boolean_t vdev_is_bootable(vdev_t *vd);
 extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
 extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
+extern int vdev_count_leaves(spa_t *spa);
 extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
     uint64_t txg, uint64_t size);
 extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
index cf38a86afeabae1ec5c3587ed5ac247c41a88f4c..aebcf55cfd9a9fab9cbaf9dbd78923be7ec2f656 100644 (file)
@@ -27,8 +27,6 @@
 #ifndef _SYS_VDEV_FILE_H
 #define        _SYS_VDEV_FILE_H
 
-
-
 #include <sys/vdev.h>
 
 #ifdef __cplusplus
@@ -39,9 +37,6 @@ typedef struct vdev_file {
        vnode_t         *vf_vnode;
 } vdev_file_t;
 
-extern void vdev_file_init(void);
-extern void vdev_file_fini(void);
-
 #ifdef __cplusplus
 }
 #endif
index a8dc9510e3e93955c724ad63c750015b048b826e..1371a3f0391f17c1e3fd072bbdc67d40405040e0 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_IMPL_H
@@ -60,7 +60,7 @@ typedef int   vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
     uint64_t *ashift);
 typedef void   vdev_close_func_t(vdev_t *vd);
 typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
-typedef int    vdev_io_start_func_t(zio_t *zio);
+typedef void   vdev_io_start_func_t(zio_t *zio);
 typedef void   vdev_io_done_func_t(zio_t *zio);
 typedef void   vdev_state_change_func_t(vdev_t *vd, int, int);
 typedef void   vdev_hold_func_t(vdev_t *vd);
@@ -113,6 +113,8 @@ struct vdev_queue {
        vdev_t          *vq_vdev;
        vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
        avl_tree_t      vq_active_tree;
+       avl_tree_t      vq_read_offset_tree;
+       avl_tree_t      vq_write_offset_tree;
        uint64_t        vq_last_offset;
        hrtime_t        vq_io_complete_ts; /* time last i/o completed */
        hrtime_t        vq_io_delta_ts;
@@ -149,6 +151,7 @@ struct vdev {
        vdev_stat_t     vdev_stat;      /* virtual device statistics    */
        boolean_t       vdev_expanding; /* expand the vdev?             */
        boolean_t       vdev_reopening; /* reopen in progress?          */
+       boolean_t       vdev_nonrot;    /* true if solid state          */
        int             vdev_open_error; /* error on last open          */
        kthread_t       *vdev_open_thread; /* thread opening children   */
        uint64_t        vdev_crtxg;     /* txg when top-level was added */
@@ -208,7 +211,7 @@ struct vdev {
        boolean_t       vdev_isl2cache; /* was a l2cache device         */
        vdev_queue_t    vdev_queue;     /* I/O deadline schedule queue  */
        vdev_cache_t    vdev_cache;     /* physical block cache         */
-       spa_aux_vdev_t  *vdev_aux;      /* for l2cache vdevs            */
+       spa_aux_vdev_t  *vdev_aux;      /* for l2cache and spares vdevs */
        zio_t           *vdev_probe_zio; /* root of current probe       */
        vdev_aux_t      vdev_label_aux; /* on-disk aux state            */
 
index fbd513098846a977b4f74cef190a55b81cb91c56..bc15237bfca2863f4974a6c491b7466311638356 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef        _SYS_ZAP_H
@@ -380,11 +380,6 @@ void zap_cursor_advance(zap_cursor_t *zc);
  */
 uint64_t zap_cursor_serialize(zap_cursor_t *zc);
 
-/*
- * Advance the cursor to the attribute having the given key.
- */
-int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt);
-
 /*
  * Initialize a zap cursor pointing to the position recorded by
  * zap_cursor_serialize (in the "serialized" argument).  You can also
index 1dc322e02f6f25419a79d16294a1c4ddd5a755be..bfd43e31da80043dfb93c0d23dc3da4117f22339 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef        _SYS_ZAP_IMPL_H
@@ -41,8 +42,7 @@ extern int fzap_default_block_shift;
 
 #define        MZAP_ENT_LEN            64
 #define        MZAP_NAME_LEN           (MZAP_ENT_LEN - 8 - 4 - 2)
-#define        MZAP_MAX_BLKSHIFT       SPA_MAXBLOCKSHIFT
-#define        MZAP_MAX_BLKSZ          (1 << MZAP_MAX_BLKSHIFT)
+#define        MZAP_MAX_BLKSZ          SPA_OLD_MAXBLOCKSIZE
 
 #define        ZAP_NEED_CD             (-1U)
 
@@ -70,7 +70,7 @@ typedef struct mzap_ent {
 } mzap_ent_t;
 
 #define        MZE_PHYS(zap, mze) \
-       (&(zap)->zap_m.zap_phys->mz_chunk[(mze)->mze_chunkid])
+       (&zap_m_phys(zap)->mz_chunk[(mze)->mze_chunkid])
 
 /*
  * The (fat) zap is stored in one object. It is an array of
@@ -104,7 +104,7 @@ struct zap_leaf;
  * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
  */
 #define        ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
-       ((uint64_t *)(zap)->zap_f.zap_phys) \
+       ((uint64_t *)zap_f_phys(zap)) \
        [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]
 
 /*
@@ -140,6 +140,7 @@ typedef struct zap_phys {
 typedef struct zap_table_phys zap_table_phys_t;
 
 typedef struct zap {
+       dmu_buf_user_t zap_dbu;
        objset_t *zap_objset;
        uint64_t zap_object;
        struct dmu_buf *zap_dbuf;
@@ -149,8 +150,6 @@ typedef struct zap {
        uint64_t zap_salt;
        union {
                struct {
-                       zap_phys_t *zap_phys;
-
                        /*
                         * zap_num_entries_mtx protects
                         * zap_num_entries
@@ -159,7 +158,6 @@ typedef struct zap {
                        int zap_block_shift;
                } zap_fat;
                struct {
-                       mzap_phys_t *zap_phys;
                        int16_t zap_num_entries;
                        int16_t zap_num_chunks;
                        int16_t zap_alloc_next;
@@ -168,6 +166,18 @@ typedef struct zap {
        } zap_u;
 } zap_t;
 
+static inline zap_phys_t *
+zap_f_phys(zap_t *zap)         /* zap_dbuf->db_data viewed as zap_phys_t */
+{
+       return (zap->zap_dbuf->db_data);        /* not cached in zap_t */
+}
+
+static inline mzap_phys_t *
+zap_m_phys(zap_t *zap)         /* zap_dbuf->db_data viewed as mzap_phys_t */
+{
+       return (zap->zap_dbuf->db_data);        /* not cached in zap_t */
+}
+
 typedef struct zap_name {
        zap_t *zn_zap;
        int zn_key_intlen;
@@ -187,7 +197,7 @@ boolean_t zap_match(zap_name_t *zn, const char *matchname);
 int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
 void zap_unlockdir(zap_t *zap);
-void zap_evict(dmu_buf_t *db, void *vmzap);
+void zap_evict(void *dbu);
 zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
 void zap_name_free(zap_name_t *zn);
 int zap_hashbits(zap_t *zap);
@@ -219,7 +229,6 @@ int fzap_add_cd(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers,
     const void *val, uint32_t cd, dmu_tx_t *tx);
 void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
-int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn);
 
 #ifdef __cplusplus
 }
index f6947a72d70e947c4aece66425cb16dd743ecee8..e784c5963b2e426c2d0f14f4ee86a700e73f2caf 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef        _SYS_ZAP_LEAF_H
@@ -83,7 +84,7 @@ struct zap_stats;
  */
 #define        ZAP_LEAF_CHUNK(l, idx) \
        ((zap_leaf_chunk_t *) \
-       ((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
+       (zap_leaf_phys(l)->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
 #define        ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry)
 
 typedef enum zap_chunk_type {
@@ -152,13 +153,18 @@ typedef union zap_leaf_chunk {
 } zap_leaf_chunk_t;
 
 typedef struct zap_leaf {
+       dmu_buf_user_t l_dbu;
        krwlock_t l_rwlock;
        uint64_t l_blkid;               /* 1<<ZAP_BLOCK_SHIFT byte block off */
        int l_bs;                       /* block size shift */
        dmu_buf_t *l_dbuf;
-       zap_leaf_phys_t *l_phys;
 } zap_leaf_t;
 
+static inline zap_leaf_phys_t *
+zap_leaf_phys(zap_leaf_t *l)   /* l_dbuf->db_data viewed as zap_leaf_phys_t */
+{
+       return (l->l_dbuf->db_data);    /* not cached in zap_leaf_t */
+}
 
 typedef struct zap_entry_handle {
        /* Set by zap_leaf and public to ZAP */
index 3dc54f1d7d90ada20e94bc001acd9427b1c794ae..4f7e3287f3da3fea051acdab27943771f6b1e28c 100644 (file)
@@ -233,6 +233,7 @@ typedef struct kthread {
        kt_did_t        t_tid;
        thread_func_t   t_func;
        void *          t_arg;
+       pri_t           t_pri;
 } kthread_t;
 
 #define        curthread                       zk_thread_current()
@@ -349,8 +350,8 @@ extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
     hrtime_t res, int flag);
 extern void cv_signal(kcondvar_t *cv);
 extern void cv_broadcast(kcondvar_t *cv);
-#define        cv_timedwait_interruptible(cv, mp, at)  cv_timedwait(cv, mp, at)
-#define        cv_wait_interruptible(cv, mp)           cv_wait(cv, mp)
+#define        cv_timedwait_sig(cv, mp, at)            cv_timedwait(cv, mp, at)
+#define        cv_wait_sig(cv, mp)                     cv_wait(cv, mp)
 #define        cv_wait_io(cv, mp)                      cv_wait(cv, mp)
 
 /*
@@ -408,11 +409,14 @@ extern void kstat_set_raw_ops(kstat_t *ksp,
 #define        kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f)
 #define        kmem_cache_free(_c, _b) umem_cache_free(_c, _b)
 #define        kmem_debugging()        0
-#define        kmem_cache_reap_now(_c)         /* nothing */
+#define        kmem_cache_reap_now(_c) umem_cache_reap_now(_c) /* no trailing ';' */
 #define        kmem_cache_set_move(_c, _cb)    /* nothing */
+#define        vmem_qcache_reap(_v)            /* nothing */
 #define        POINTER_INVALIDATE(_pp)         /* nothing */
 #define        POINTER_IS_VALID(_p)    0
 
+extern vmem_t *zio_arena;
+
 typedef umem_cache_t kmem_cache_t;
 
 typedef enum kmem_cbrc {
@@ -468,6 +472,7 @@ extern void taskq_init_ent(taskq_ent_t *);
 extern void    taskq_destroy(taskq_t *);
 extern void    taskq_wait(taskq_t *);
 extern void    taskq_wait_id(taskq_t *, taskqid_t);
+extern void    taskq_wait_outstanding(taskq_t *, taskqid_t);
 extern int     taskq_member(taskq_t *, kthread_t *);
 extern int     taskq_cancel_id(taskq_t *, taskqid_t);
 extern void    system_taskq_init(void);
@@ -609,9 +614,14 @@ extern void delay(clock_t ticks);
        } while (0);
 
 #define        max_ncpus       64
+#define        boot_ncpus      (sysconf(_SC_NPROCESSORS_ONLN))
 
-#define        minclsyspri     60
-#define        maxclsyspri     99
+/*
+ * Process priorities as defined by setpriority(2) and getpriority(2).
+ */
+#define        minclsyspri     19
+#define        maxclsyspri     -20
+#define        defclsyspri     0
 
 #define        CPU_SEQID       (pthread_self() & (max_ncpus - 1))
 
index 1ff23a298856dbae7afbd7dd44fb07ead86399f4..960a9a6291463706cbd41fe15107926c703c3577 100644 (file)
@@ -32,6 +32,7 @@
 #define        _ZFS_CTLDIR_H
 
 #include <sys/vnode.h>
+#include <sys/pathname.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_znode.h>
 
        (zfs_has_ctldir(zdp) && \
        (ZTOZSB(zdp)->z_show_ctldir))
 
-typedef struct {
-       char                    *se_name;
-       char                    *se_path;
-       struct inode            *se_inode;
-       taskqid_t               se_taskqid;
-       avl_node_t              se_node;
-} zfs_snapentry_t;
+extern int zfs_expire_snapshot;
 
 /* zfsctl generic functions */
-extern int snapentry_compare(const void *a, const void *b);
-extern boolean_t zfsctl_is_node(struct inode *ip);
-extern boolean_t zfsctl_is_snapdir(struct inode *ip);
-extern void zfsctl_inode_inactive(struct inode *ip);
-extern void zfsctl_inode_destroy(struct inode *ip);
 extern int zfsctl_create(zfs_sb_t *zsb);
 extern void zfsctl_destroy(zfs_sb_t *zsb);
 extern struct inode *zfsctl_root(znode_t *zp);
+extern void zfsctl_init(void);
+extern void zfsctl_fini(void);
+extern boolean_t zfsctl_is_node(struct inode *ip);
+extern boolean_t zfsctl_is_snapdir(struct inode *ip);
 extern int zfsctl_fid(struct inode *ip, fid_t *fidp);
 
 /* zfsctl '.zfs' functions */
@@ -81,9 +75,9 @@ extern int zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr,
 extern int zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap,
     struct inode **ipp, cred_t *cr, int flags);
 extern void zfsctl_snapdir_inactive(struct inode *ip);
-extern int zfsctl_unmount_snapshot(zfs_sb_t *zsb, char *name, int flags);
-extern int zfsctl_unmount_snapshots(zfs_sb_t *zsb, int flags, int *count);
-extern int zfsctl_mount_snapshot(struct path *path, int flags);
+extern int zfsctl_snapshot_mount(struct path *path, int flags);
+extern int zfsctl_snapshot_unmount(char *snapname, int flags);
+extern int zfsctl_snapshot_unmount_delay(uint64_t objsetid, int delay);
 extern int zfsctl_lookup_objset(struct super_block *sb, uint64_t objsetid,
     zfs_sb_t **zsb);
 
@@ -92,10 +86,6 @@ extern int zfsctl_shares_lookup(struct inode *dip, char *name,
     struct inode **ipp, int flags, cred_t *cr, int *direntflags,
     pathname_t *realpnp);
 
-/* zfsctl_init/fini functions */
-extern void zfsctl_init(void);
-extern void zfsctl_fini(void);
-
 /*
  * These inodes numbers are reserved for the .zfs control directory.
  * It is important that they be no larger that 48-bits because only
index 1a7062408e04a878cd118d1f694234393f881052..2f0064ee045bd45f52364ade0e07081855e56e0e 100644 (file)
@@ -51,28 +51,24 @@ extern int zfs_free_leak_on_eio;
 #define        ZFS_DEBUG_ZIO_FREE              (1<<6)
 #define        ZFS_DEBUG_HISTOGRAM_VERIFY      (1<<7)
 
-#if defined(HAVE_DECLARE_EVENT_CLASS) || !defined(_KERNEL)
 extern void __dprintf(const char *file, const char *func,
     int line, const char *fmt, ...);
 #define        dprintf(...) \
-       if (zfs_flags & ZFS_DEBUG_DPRINTF) \
-               __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__)
-#else
-#define        dprintf(...) ((void)0)
-#endif /* HAVE_DECLARE_EVENT_CLASS || !_KERNEL */
+       __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__)
+#define        zfs_dbgmsg(...) \
+       __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__)
 
 extern void zfs_panic_recover(const char *fmt, ...);
 
 typedef struct zfs_dbgmsg {
        list_node_t zdm_node;
        time_t zdm_timestamp;
+       int zdm_size;
        char zdm_msg[1]; /* variable length allocation */
 } zfs_dbgmsg_t;
 
 extern void zfs_dbgmsg_init(void);
 extern void zfs_dbgmsg_fini(void);
-extern void zfs_dbgmsg(const char *fmt, ...);
-extern void zfs_dbgmsg_print(const char *tag);
 
 #ifndef _KERNEL
 extern int dprintf_find_string(const char *string);
index 5cfdcc50fda4f840da74c92f997f0af71047a42c..09a96c043bf0e234f7973fc051f535d215135616 100644 (file)
@@ -96,13 +96,16 @@ typedef enum drr_headertype {
 /* flags #3 - #15 are reserved for incompatible closed-source implementations */
 #define        DMU_BACKUP_FEATURE_EMBED_DATA           (1<<16)
 #define        DMU_BACKUP_FEATURE_EMBED_DATA_LZ4       (1<<17)
+/* flag #18 is reserved for a Delphix feature */
+#define        DMU_BACKUP_FEATURE_LARGE_BLOCKS         (1<<19)
 
 /*
  * Mask of all supported backup features
  */
 #define        DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
     DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
-    DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)
+    DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
+    DMU_BACKUP_FEATURE_LARGE_BLOCKS)
 
 /* Are all features in the given flag word currently supported? */
 #define        DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
@@ -404,7 +407,7 @@ typedef struct zfsdev_state {
 } zfsdev_state_t;
 
 extern void *zfsdev_get_state(minor_t minor, enum zfsdev_state_type which);
-extern minor_t zfsdev_getminor(struct file *filp);
+extern int zfsdev_getminor(struct file *filp, minor_t *minorp);
 extern minor_t zfsdev_minor_alloc(void);
 
 #endif /* _KERNEL */
index 735d4b32ad48f97e3816d66dcde850ca2d00d9d1..06c4d589aa791d1efef2c930b2e7abf3f44b8c80 100644 (file)
@@ -129,7 +129,7 @@ typedef struct znode_phys {
 #ifdef _KERNEL
 
 #define        DXATTR_MAX_ENTRY_SIZE   (32768)
-#define        DXATTR_MAX_SA_SIZE      (SPA_MAXBLOCKSIZE >> 1)
+#define        DXATTR_MAX_SA_SIZE      (SPA_OLD_MAXBLOCKSIZE >> 1)
 
 int zfs_sa_readlink(struct znode *, uio_t *);
 void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *);
index c4db2a911d3e7fb0ea3b342be1b1b8ab78e16b01..28407c6f76fe1c725b3a56aff10e6d7bccaa0129 100644 (file)
@@ -41,11 +41,33 @@ extern "C" {
 struct zfs_sb;
 struct znode;
 
+typedef struct zfs_mntopts {
+       char            *z_osname;      /* Objset name */
+       char            *z_mntpoint;    /* Primary mount point */
+       uint64_t        z_xattr;
+       boolean_t       z_readonly;
+       boolean_t       z_do_readonly;
+       boolean_t       z_setuid;
+       boolean_t       z_do_setuid;
+       boolean_t       z_exec;
+       boolean_t       z_do_exec;
+       boolean_t       z_devices;
+       boolean_t       z_do_devices;
+       boolean_t       z_do_xattr;
+       boolean_t       z_atime;
+       boolean_t       z_do_atime;
+       boolean_t       z_relatime;
+       boolean_t       z_do_relatime;
+       boolean_t       z_nbmand;
+       boolean_t       z_do_nbmand;
+} zfs_mntopts_t;
+
 typedef struct zfs_sb {
        struct super_block *z_sb;       /* generic super_block */
        struct backing_dev_info z_bdi;  /* generic backing dev info */
        struct zfs_sb   *z_parent;      /* parent fs */
        objset_t        *z_os;          /* objset reference */
+       zfs_mntopts_t   *z_mntopts;     /* passed mount options */
        uint64_t        z_flags;        /* super_block flags */
        uint64_t        z_root;         /* id of root znode */
        uint64_t        z_unlinkedobj;  /* id of unlinked zapobj */
@@ -67,16 +89,15 @@ typedef struct zfs_sb {
        boolean_t       z_atime;        /* enable atimes mount option */
        boolean_t       z_relatime;     /* enable relatime mount option */
        boolean_t       z_unmounted;    /* unmounted */
-       rrwlock_t       z_teardown_lock;
+       rrmlock_t       z_teardown_lock;
        krwlock_t       z_teardown_inactive_lock;
        list_t          z_all_znodes;   /* all znodes in the fs */
        uint64_t        z_nr_znodes;    /* number of znodes in the fs */
        unsigned long   z_rollback_time; /* last online rollback time */
+       unsigned long   z_snap_defer_time; /* last snapshot unmount deferral */
        kmutex_t        z_znodes_lock;  /* lock for z_all_znodes */
        arc_prune_t     *z_arc_prune;   /* called by ARC to prune caches */
        struct inode    *z_ctldir;      /* .zfs directory inode */
-       avl_tree_t      z_ctldir_snaps; /* .zfs/snapshot entries */
-       kmutex_t        z_ctldir_lock;  /* .zfs ctldir lock */
        boolean_t       z_show_ctldir;  /* expose .zfs in the root dir */
        boolean_t       z_issnap;       /* true if this is a snapshot */
        boolean_t       z_vscan;        /* virus scan on/off */
@@ -171,7 +192,10 @@ extern boolean_t zfs_fuid_overquota(zfs_sb_t *zsb, boolean_t isgroup,
 extern int zfs_set_version(zfs_sb_t *zsb, uint64_t newvers);
 extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop,
     uint64_t *value);
-extern int zfs_sb_create(const char *name, zfs_sb_t **zsbp);
+extern zfs_mntopts_t *zfs_mntopts_alloc(void);
+extern void zfs_mntopts_free(zfs_mntopts_t *zmo);
+extern int zfs_sb_create(const char *name, zfs_mntopts_t *zmo,
+    zfs_sb_t **zsbp);
 extern int zfs_sb_setup(zfs_sb_t *zsb, boolean_t mounting);
 extern void zfs_sb_free(zfs_sb_t *zsb);
 extern int zfs_sb_prune(struct super_block *sb, unsigned long nr_to_scan,
@@ -182,10 +206,10 @@ extern boolean_t zfs_is_readonly(zfs_sb_t *zsb);
 
 extern int zfs_register_callbacks(zfs_sb_t *zsb);
 extern void zfs_unregister_callbacks(zfs_sb_t *zsb);
-extern int zfs_domount(struct super_block *sb, void *data, int silent);
+extern int zfs_domount(struct super_block *sb, zfs_mntopts_t *zmo, int silent);
 extern void zfs_preumount(struct super_block *sb);
 extern int zfs_umount(struct super_block *sb);
-extern int zfs_remount(struct super_block *sb, int *flags, char *data);
+extern int zfs_remount(struct super_block *sb, int *flags, zfs_mntopts_t *zmo);
 extern int zfs_root(zfs_sb_t *zsb, struct inode **ipp);
 extern int zfs_statvfs(struct dentry *dentry, struct kstatfs *statp);
 extern int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp);
index 4bb8a77617f973ccd3d539e2cc6a2f61e8839bc7..7e73cf99182e50ea936c9f340324e270ea350233 100644 (file)
@@ -137,8 +137,6 @@ extern "C" {
 #define        ZFS_SHARES_DIR          "SHARES"
 #define        ZFS_SA_ATTRS            "SA_ATTRS"
 
-#define        ZFS_MAX_BLOCKSIZE       (SPA_MAXBLOCKSIZE)
-
 /*
  * Path component length
  *
@@ -252,7 +250,7 @@ typedef struct znode {
 /* Called on entry to each ZFS vnode and vfs operation  */
 #define        ZFS_ENTER(zsb) \
        { \
-               rrw_enter_read(&(zsb)->z_teardown_lock, FTAG); \
+               rrm_enter_read(&(zsb)->z_teardown_lock, FTAG); \
                if ((zsb)->z_unmounted) { \
                        ZFS_EXIT(zsb); \
                        return (EIO); \
@@ -262,8 +260,7 @@ typedef struct znode {
 /* Must be called before exiting the vop */
 #define        ZFS_EXIT(zsb) \
        { \
-               rrw_exit(&(zsb)->z_teardown_lock, FTAG); \
-               tsd_exit(); \
+               rrm_exit(&(zsb)->z_teardown_lock, FTAG); \
        }
 
 /* Verifies the znode is valid */
index 9c806964d5dabc0cb4e87b4add0913b7bfe18874..65b14f1cd6a23cc72ecab7b6cbcbcad07a4e7546 100644 (file)
@@ -37,6 +37,9 @@
 extern "C" {
 #endif
 
+struct dsl_pool;
+struct dsl_dataset;
+
 /*
  * Intent log format:
  *
@@ -90,7 +93,6 @@ typedef struct zil_chain {
 } zil_chain_t;
 
 #define        ZIL_MIN_BLKSZ   4096ULL
-#define        ZIL_MAX_BLKSZ   SPA_MAXBLOCKSIZE
 
 /*
  * The words of a log block checksum.
@@ -467,8 +469,10 @@ extern void        zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
 extern void    zil_commit(zilog_t *zilog, uint64_t oid);
 
 extern int     zil_vdev_offline(const char *osname, void *txarg);
-extern int     zil_claim(const char *osname, void *txarg);
-extern int     zil_check_log_chain(const char *osname, void *txarg);
+extern int     zil_claim(struct dsl_pool *dp,
+    struct dsl_dataset *ds, void *txarg);
+extern int     zil_check_log_chain(struct dsl_pool *dp,
+    struct dsl_dataset *ds, void *tx);
 extern void    zil_sync(zilog_t *zilog, dmu_tx_t *tx);
 extern void    zil_clean(zilog_t *zilog, uint64_t synced_txg);
 
index 0db4b525cd2bc4ed7750b34ff15dd5a84fbe8ff3..0c426a15dd06e573746ab4765424160215f4d9a7 100644 (file)
@@ -140,7 +140,7 @@ typedef struct zil_bp_node {
        avl_node_t      zn_node;
 } zil_bp_node_t;
 
-#define        ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
+#define        ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
     sizeof (lr_write_t))
 
 #ifdef __cplusplus
index 18e7a40a308085f60ebb70d2c62c38bce1949a2e..278b6e0868a9038fa27afc1aee185229bc4b071c 100644 (file)
@@ -123,14 +123,19 @@ enum zio_compress {
  */
 #define        ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
 
-#define        ZIO_COMPRESS_ON_VALUE   ZIO_COMPRESS_LZJB
-#define        ZIO_COMPRESS_DEFAULT    ZIO_COMPRESS_OFF
+/*
+ * The meaning of "compress = on" selected by the compression features enabled
+ * on a given pool.
+ */
+#define        ZIO_COMPRESS_LEGACY_ON_VALUE    ZIO_COMPRESS_LZJB
+#define        ZIO_COMPRESS_LZ4_ON_VALUE       ZIO_COMPRESS_LZ4
+
+#define        ZIO_COMPRESS_DEFAULT            ZIO_COMPRESS_OFF
 
 #define        BOOTFS_COMPRESS_VALID(compress)                 \
        ((compress) == ZIO_COMPRESS_LZJB ||             \
        (compress) == ZIO_COMPRESS_LZ4 ||               \
-       ((compress) == ZIO_COMPRESS_ON &&               \
-       ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) ||  \
+       (compress) == ZIO_COMPRESS_ON ||                \
        (compress) == ZIO_COMPRESS_OFF)
 
 /*
@@ -153,9 +158,6 @@ typedef enum zio_priority {
        ZIO_PRIORITY_NOW                /* non-queued i/os (e.g. free) */
 } zio_priority_t;
 
-#define        ZIO_PIPELINE_CONTINUE           0x100
-#define        ZIO_PIPELINE_STOP               0x101
-
 enum zio_flag {
        /*
         * Flags inherited by gang, ddt, and vdev children,
@@ -427,6 +429,7 @@ struct zio {
        hrtime_t        io_delta;       /* vdev queue service delta */
        uint64_t        io_delay;       /* vdev disk service delta (ticks) */
        avl_node_t      io_queue_node;
+       avl_node_t      io_offset_node;
 
        /* Internal pipeline state */
        enum zio_flag   io_flags;
@@ -545,8 +548,8 @@ extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
     enum zio_checksum parent);
 extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
     enum zio_checksum child, enum zio_checksum parent);
-extern enum zio_compress zio_compress_select(enum zio_compress child,
-    enum zio_compress parent);
+extern enum zio_compress zio_compress_select(spa_t *spa,
+    enum zio_compress child, enum zio_compress parent);
 
 extern void zio_suspend(spa_t *spa, zio_t *zio);
 extern int zio_resume(spa_t *spa);
index c7701aae57d09f0516ea88c8bea5c8ef7864d32e..54b35e02de74f194e50aabbc0f0e874b9158e71d 100644 (file)
 #ifndef        _SYS_ZPL_H
 #define        _SYS_ZPL_H
 
+#include <sys/mntent.h>
 #include <sys/vfs.h>
 #include <linux/aio.h>
 #include <linux/dcache_compat.h>
 #include <linux/exportfs.h>
 #include <linux/falloc.h>
 #include <linux/file_compat.h>
+#include <linux/parser.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/vfs_compat.h>
 #include <linux/writeback.h>
@@ -65,11 +67,6 @@ extern const struct file_operations zpl_dir_file_operations;
 /* zpl_super.c */
 extern void zpl_prune_sb(int64_t nr_to_scan, void *arg);
 
-typedef struct zpl_mount_data {
-       const char *z_osname;   /* Dataset name */
-       void *z_data;           /* Mount options string */
-} zpl_mount_data_t;
-
 extern const struct super_operations zpl_super_operations;
 extern const struct export_operations zpl_export_operations;
 extern struct file_system_type zpl_fs_type;
index 04e0996570c9598e8c12d025dfa4156b4a0b9056..898e2352156b29ff3f1d4d4805031dd688c85901 100644 (file)
@@ -34,7 +34,7 @@
 #ifdef _KERNEL
 
 extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize);
-extern int zvol_check_volblocksize(uint64_t volblocksize);
+extern int zvol_check_volblocksize(const char *name, uint64_t volblocksize);
 extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
 extern boolean_t zvol_is_zvol(const char *);
 extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
index 80074db4fbcc83787735f4cf26e53bce8effd25c..e383c4ff7887a7bb6b0d585e3fe47174d4379882 100644 (file)
@@ -48,6 +48,8 @@ typedef enum spa_feature {
        SPA_FEATURE_EXTENSIBLE_DATASET,
        SPA_FEATURE_EMBEDDED_DATA,
        SPA_FEATURE_BOOKMARKS,
+       SPA_FEATURE_FS_SS_LIMIT,
+       SPA_FEATURE_LARGE_BLOCKS,
        SPA_FEATURES
 } spa_feature_t;
 
index 686402b1fc4c62d1273969ee8977447a33519fc1..e8d34c894b3d9ece2d1e8f5319e0903b79c6fb8f 100644 (file)
@@ -63,6 +63,10 @@ modules_uninstall:
        done
 
 distdir:
+       list='$(subdir-m)'; for subdir in $$list; do \
+               (find @top_srcdir@/module/$$subdir -name '*.c' -o -name '*.h' |\
+               xargs /bin/cp -t $$distdir/$$subdir); \
+       done
 
 distclean maintainer-clean: clean
 install: modules_install
index 27f9e4af4993153d0055483ebe4e9d08bd25b132..98c011e8aa8168c2083e41320ad5623c227e5218 100644 (file)
@@ -1,7 +1,10 @@
+src = @abs_top_srcdir@/module/avl
+obj = @abs_builddir@
+
 MODULE := zavl
 
 EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@
 
 obj-$(CONFIG_ZFS) := $(MODULE).o
 
-$(MODULE)-objs += @top_srcdir@/module/avl/avl.o
+$(MODULE)-objs += avl.o
index f9971da20a0fb50e1fdf98c513aa5ed91c187fd0..abf74bf7242ff2ec99147a8aa82a8af3e38e5890 100644 (file)
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
 /*
  * AVL - generic AVL tree implementation for kernel use
  *
@@ -37,7 +41,7 @@
  * insertion and deletion relatively efficiently. Searching the tree is
  * still a fast operation, roughly O(log(N)).
  *
- * The key to insertion and deletion is a set of tree maniuplations called
+ * The key to insertion and deletion is a set of tree manipulations called
  * rotations, which bring unbalanced subtrees back into the semi-balanced state.
  *
  * This implementation of AVL trees has the following peculiarities:
@@ -45,7 +49,7 @@
  *     - The AVL specific data structures are physically embedded as fields
  *       in the "using" data structures.  To maintain generality the code
  *       must constantly translate between "avl_node_t *" and containing
- *       data structure "void *"s by adding/subracting the avl_offset.
+ *       data structure "void *"s by adding/subtracting the avl_offset.
  *
  *     - Since the AVL data is always embedded in other structures, there is
  *       no locking or memory allocation in the AVL routines. This must be
  *       is a modified "avl_node_t *".  The bottom bit (normally 0 for a
  *       pointer) is set to indicate if that the new node has a value greater
  *       than the value of the indicated "avl_node_t *".
+ *
+ * Note - in addition to userland (e.g. libavl and libutil) and the kernel
+ * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module,
+ * which each have their own compilation environments and subsequent
+ * requirements. Each of these environments must be considered when adding
+ * dependencies from avl.c.
  */
 
 #include <sys/types.h>
 #include <sys/cmn_err.h>
 
 /*
- * Small arrays to translate between balance (or diff) values and child indeces.
+ * Small arrays to translate between balance (or diff) values and child indices.
  *
  * Code that deals with binary tree data structures will randomly use
  * left and right children when examining a tree.  C "if()" statements
@@ -114,7 +124,8 @@ static const int  avl_balance2child[]       = {0, 0, 1};
  *
  * - If there is a left child, go to it, then to its rightmost descendant.
  *
- * - otherwise we return thru parent nodes until we've come from a right child.
+ * - otherwise we return through parent nodes until we've come from a right
+ *   child.
  *
  * Return Value:
  * NULL - if at the end of the nodes
@@ -863,6 +874,24 @@ avl_update(avl_tree_t *t, void *obj)
        return (B_FALSE);
 }
 
+void
+avl_swap(avl_tree_t *tree1, avl_tree_t *tree2)
+{
+       avl_node_t *temp_node;
+       ulong_t temp_numnodes;
+       /*
+        * Only avl_root and avl_numnodes are exchanged below, so the two
+        * trees must already agree on avl_compar, avl_offset and avl_size;
+        * the assertions check exactly those three fields.
+        */
+       ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar);
+       ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset);
+       ASSERT3U(tree1->avl_size, ==, tree2->avl_size);
+       /* Exchange root pointers and node counts through the temporaries. */
+       temp_node = tree1->avl_root;
+       temp_numnodes = tree1->avl_numnodes;
+       tree1->avl_root = tree2->avl_root;
+       tree1->avl_numnodes = tree2->avl_numnodes;
+       tree2->avl_root = temp_node;
+       tree2->avl_numnodes = temp_numnodes;
+}
+
 /*
  * initialize a new AVL tree
  */
@@ -919,7 +948,7 @@ avl_is_empty(avl_tree_t *tree)
 
 /*
  * Post-order tree walk used to visit all tree nodes and destroy the tree
- * in post order. This is used for destroying a tree w/o paying any cost
+ * in post order. This is used for destroying a tree without paying any cost
  * for rebalancing it.
  *
  * example:
@@ -1058,6 +1087,8 @@ EXPORT_SYMBOL(avl_first);
 EXPORT_SYMBOL(avl_last);
 EXPORT_SYMBOL(avl_nearest);
 EXPORT_SYMBOL(avl_add);
+EXPORT_SYMBOL(avl_swap);
+EXPORT_SYMBOL(avl_is_empty);
 EXPORT_SYMBOL(avl_remove);
 EXPORT_SYMBOL(avl_numnodes);
 EXPORT_SYMBOL(avl_destroy_nodes);
index 211fc726dfab14080ffab7c8a58ca6d44d7e1246..a8144452a4b3e63b2e3e600f4c22fb7802ed5602 100644 (file)
@@ -1,10 +1,13 @@
+src = @abs_top_srcdir@/module/nvpair
+obj = @abs_builddir@
+
 MODULE := znvpair
 
 EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@
 
 obj-$(CONFIG_ZFS) := $(MODULE).o
 
-$(MODULE)-objs += @top_srcdir@/module/nvpair/nvpair.o
-$(MODULE)-objs += @top_srcdir@/module/nvpair/fnvpair.o
-$(MODULE)-objs += @top_srcdir@/module/nvpair/nvpair_alloc_spl.o
-$(MODULE)-objs += @top_srcdir@/module/nvpair/nvpair_alloc_fixed.o
+$(MODULE)-objs += nvpair.o
+$(MODULE)-objs += fnvpair.o
+$(MODULE)-objs += nvpair_alloc_spl.o
+$(MODULE)-objs += nvpair_alloc_fixed.o
index 226e23baa8d8a567675e410c114569c22a4280fa..b26e669274bed89ef2f8ea9a424f9183b8c75702 100644 (file)
@@ -1,8 +1,11 @@
+src = @abs_top_srcdir@/module/unicode
+obj = @abs_builddir@
+
 MODULE := zunicode
 
 EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@
 
 obj-$(CONFIG_ZFS) := $(MODULE).o
 
-$(MODULE)-objs += @top_srcdir@/module/unicode/u8_textprep.o
-$(MODULE)-objs += @top_srcdir@/module/unicode/uconv.o
+$(MODULE)-objs += u8_textprep.o
+$(MODULE)-objs += uconv.o
index d4f5ba7ec70e700836bcdd4fec9dde3cd9e86555..67e474ee089d29320e7191304f3c5d11cfd80599 100644 (file)
@@ -1,14 +1,17 @@
+src = @abs_top_srcdir@/module/zcommon
+obj = @abs_builddir@
+
 MODULE := zcommon
 
 EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@
 
 obj-$(CONFIG_ZFS) := $(MODULE).o
 
-$(MODULE)-objs += @top_srcdir@/module/zcommon/zfs_deleg.o
-$(MODULE)-objs += @top_srcdir@/module/zcommon/zfs_prop.o
-$(MODULE)-objs += @top_srcdir@/module/zcommon/zprop_common.o
-$(MODULE)-objs += @top_srcdir@/module/zcommon/zfs_namecheck.o
-$(MODULE)-objs += @top_srcdir@/module/zcommon/zfs_comutil.o
-$(MODULE)-objs += @top_srcdir@/module/zcommon/zfs_fletcher.o
-$(MODULE)-objs += @top_srcdir@/module/zcommon/zfs_uio.o
-$(MODULE)-objs += @top_srcdir@/module/zcommon/zpool_prop.o
+$(MODULE)-objs += zfs_deleg.o
+$(MODULE)-objs += zfs_prop.o
+$(MODULE)-objs += zprop_common.o
+$(MODULE)-objs += zfs_namecheck.o
+$(MODULE)-objs += zfs_comutil.o
+$(MODULE)-objs += zfs_fletcher.o
+$(MODULE)-objs += zfs_uio.o
+$(MODULE)-objs += zpool_prop.o
index 192f8f2210becccd90769d2fbbb036fd5a880f66..aaebab444cfa409432f41117643ddc962415af5e 100644 (file)
@@ -411,11 +411,23 @@ zfs_prop_init(void)
        zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
            PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
            "<size> | none", "REFRESERV");
+       zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit",
+           UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
+           "<count> | none", "FSLIMIT");
+       zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit",
+           UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+           "<count> | none", "SSLIMIT");
+       zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count",
+           UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
+           "<count>", "FSCOUNT");
+       zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count",
+           UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+           "<count>", "SSCOUNT");
 
        /* inherit number properties */
        zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
-           SPA_MAXBLOCKSIZE, PROP_INHERIT,
-           ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE");
+           SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
+           ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
 
        /* hidden properties */
        zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
index 90376f2acf57882670d9e4fe022aa94062f1ae91..f78db68e4ea640eafce35504b783b62f622855b9 100644 (file)
@@ -35,6 +35,9 @@
  * software developed by the University of California, Berkeley, and its
  * contributors.
  */
+/*
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ */
 
 /*
  * The uio support from OpenSolaris has been added as a short term
@@ -46,6 +49,7 @@
 
 #include <sys/types.h>
 #include <sys/uio_impl.h>
+#include <linux/kmap_compat.h>
 
 /*
  * Move "n" bytes at byte address "p"; "rw" indicates the direction
  * update to reflect the data which was moved.  Returns 0 on success or
  * a non-zero errno on failure.
  */
-int
-uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio)
+static int
+uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio)
 {
-       struct iovec *iov;
+       const struct iovec *iov = uio->uio_iov;
+       size_t skip = uio->uio_skip;
        ulong_t cnt;
 
        while (n && uio->uio_resid) {
-               iov = uio->uio_iov;
-               cnt = MIN(iov->iov_len, n);
-               if (cnt == 0l) {
-                       uio->uio_iov++;
-                       uio->uio_iovcnt--;
-                       continue;
-               }
+               cnt = MIN(iov->iov_len - skip, n);
                switch (uio->uio_segflg) {
                case UIO_USERSPACE:
                case UIO_USERISPACE:
@@ -75,22 +74,62 @@ uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio)
                         * iov->iov_base = user data pointer
                         */
                        if (rw == UIO_READ) {
-                               if (copy_to_user(iov->iov_base, p, cnt))
+                               if (copy_to_user(iov->iov_base+skip, p, cnt))
                                        return (EFAULT);
                        } else {
-                               if (copy_from_user(p, iov->iov_base, cnt))
+                               if (copy_from_user(p, iov->iov_base+skip, cnt))
                                        return (EFAULT);
                        }
                        break;
                case UIO_SYSSPACE:
                        if (rw == UIO_READ)
-                               bcopy(p, iov->iov_base, cnt);
+                               bcopy(p, iov->iov_base + skip, cnt);
                        else
-                               bcopy(iov->iov_base, p, cnt);
+                               bcopy(iov->iov_base + skip, p, cnt);
                        break;
+               default:
+                       ASSERT(0);
+               }
+               skip += cnt;
+               if (skip == iov->iov_len) {
+                       skip = 0;
+                       uio->uio_iov = (++iov);
+                       uio->uio_iovcnt--;
+               }
+               uio->uio_skip = skip;
+               uio->uio_resid -= cnt;
+               uio->uio_loffset += cnt;
+               p = (caddr_t)p + cnt;
+               n -= cnt;
+       }
+       return (0);
+}
+
+static int
+uiomove_bvec(void *p, size_t n, enum uio_rw rw, struct uio *uio)
+{
+       const struct bio_vec *bv = uio->uio_bvec;
+       size_t skip = uio->uio_skip;
+       ulong_t cnt;
+
+       while (n && uio->uio_resid) {
+               void *paddr;
+               cnt = MIN(bv->bv_len - skip, n);
+
+               paddr = zfs_kmap_atomic(bv->bv_page, KM_USER1);
+               if (rw == UIO_READ)
+                       bcopy(p, paddr + bv->bv_offset + skip, cnt);
+               else
+                       bcopy(paddr + bv->bv_offset + skip, p, cnt);
+               zfs_kunmap_atomic(paddr, KM_USER1);
+
+               skip += cnt;
+               if (skip == bv->bv_len) {
+                       skip = 0;
+                       uio->uio_bvec = (++bv);
+                       uio->uio_iovcnt--;
                }
-               iov->iov_base += cnt;
-               iov->iov_len -= cnt;
+               uio->uio_skip = skip;
                uio->uio_resid -= cnt;
                uio->uio_loffset += cnt;
                p = (caddr_t)p + cnt;
@@ -98,6 +137,15 @@ uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio)
        }
        return (0);
 }
+/*
+ * Exported uio copy entry point.  Dispatches on uio->uio_segflg:
+ * UIO_BVEC requests are handled by uiomove_bvec(), every other segment
+ * type by uiomove_iov().  All arguments are forwarded untouched and the
+ * helper's return value is passed straight back to the caller.
+ */
+int
+uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio)
+{
+       if (uio->uio_segflg != UIO_BVEC)
+               return (uiomove_iov(p, n, rw, uio));
+       else
+               return (uiomove_bvec(p, n, rw, uio));
+}
 EXPORT_SYMBOL(uiomove);
 
 #define        fuword8(uptr, vptr)     get_user((*vptr), (uptr))
@@ -111,39 +159,38 @@ EXPORT_SYMBOL(uiomove);
 void
 uio_prefaultpages(ssize_t n, struct uio *uio)
 {
-       struct iovec *iov;
+       const struct iovec *iov;
        ulong_t cnt, incr;
        caddr_t p;
        uint8_t tmp;
        int iovcnt;
+       size_t skip = uio->uio_skip;
+
+       /* no need to fault in kernel pages */
+       switch (uio->uio_segflg) {
+               case UIO_SYSSPACE:
+               case UIO_BVEC:
+                       return;
+               case UIO_USERSPACE:
+               case UIO_USERISPACE:
+                       break;
+               default:
+                       ASSERT(0);
+       }
 
        iov = uio->uio_iov;
        iovcnt = uio->uio_iovcnt;
 
        while ((n > 0) && (iovcnt > 0)) {
-               cnt = MIN(iov->iov_len, n);
-               if (cnt == 0) {
-                       /* empty iov entry */
-                       iov++;
-                       iovcnt--;
-                       continue;
-               }
+               cnt = MIN(iov->iov_len - skip, n);
                n -= cnt;
                /*
                 * touch each page in this segment.
                 */
-               p = iov->iov_base;
+               p = iov->iov_base + skip;
                while (cnt) {
-                       switch (uio->uio_segflg) {
-                       case UIO_USERSPACE:
-                       case UIO_USERISPACE:
-                               if (fuword8((uint8_t *) p, &tmp))
-                                       return;
-                               break;
-                       case UIO_SYSSPACE:
-                               bcopy(p, &tmp, 1);
-                               break;
-                       }
+                       if (fuword8((uint8_t *) p, &tmp))
+                               return;
                        incr = MIN(cnt, PAGESIZE);
                        p += incr;
                        cnt -= incr;
@@ -152,18 +199,11 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
                 * touch the last byte in case it straddles a page.
                 */
                p--;
-               switch (uio->uio_segflg) {
-               case UIO_USERSPACE:
-               case UIO_USERISPACE:
-                       if (fuword8((uint8_t *) p, &tmp))
-                               return;
-                       break;
-               case UIO_SYSSPACE:
-                       bcopy(p, &tmp, 1);
-                       break;
-               }
+               if (fuword8((uint8_t *) p, &tmp))
+                       return;
                iov++;
                iovcnt--;
+               skip = 0;
        }
 }
 EXPORT_SYMBOL(uio_prefaultpages);
@@ -175,49 +215,13 @@ EXPORT_SYMBOL(uio_prefaultpages);
 int
 uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes)
 {
-       struct iovec *iov;
-       ulong_t cnt;
-       int iovcnt;
+       struct uio uio_copy;    /* private copy that uiomove() advances */
+       int ret;                /* uiomove() result, returned unchanged */
 
-       iovcnt = uio->uio_iovcnt;
-       *cbytes = 0;
-
-       for (iov = uio->uio_iov; n && iovcnt; iov++, iovcnt--) {
-               cnt = MIN(iov->iov_len, n);
-               if (cnt == 0)
-                       continue;
-
-               switch (uio->uio_segflg) {
-
-               case UIO_USERSPACE:
-               case UIO_USERISPACE:
-                       /*
-                        * p = kernel data pointer
-                        * iov->iov_base = user data pointer
-                        */
-                       if (rw == UIO_READ) {
-                               /* UIO_READ = copy data from kernel to user */
-                               if (copy_to_user(iov->iov_base, p, cnt))
-                                       return (EFAULT);
-                       } else {
-                               /* UIO_WRITE = copy data from user to kernel */
-                               if (copy_from_user(p, iov->iov_base, cnt))
-                                       return (EFAULT);
-                       }
-                       break;
-
-               case UIO_SYSSPACE:
-                       if (rw == UIO_READ)
-                               bcopy(p, iov->iov_base, cnt);
-                       else
-                               bcopy(iov->iov_base, p, cnt);
-                       break;
-               }
-               p = (caddr_t)p + cnt;
-               n -= cnt;
-               *cbytes += cnt;
-       }
-       return (0);
+       bcopy(uio, &uio_copy, sizeof (struct uio)); /* snapshot the uio */
+       ret = uiomove(p, n, rw, &uio_copy);     /* moves the copy only */
+       *cbytes = uio->uio_resid - uio_copy.uio_resid; /* resid consumed */
+       return (ret);
 }
 EXPORT_SYMBOL(uiocopy);
 
@@ -229,21 +233,25 @@ uioskip(uio_t *uiop, size_t n)
 {
        if (n > uiop->uio_resid)
                return;
-       while (n != 0) {
-               iovec_t *iovp = uiop->uio_iov;
-               size_t          niovb = MIN(iovp->iov_len, n);
 
-               if (niovb == 0) {
+       uiop->uio_skip += n;
+       if (uiop->uio_segflg != UIO_BVEC) {
+               while (uiop->uio_iovcnt &&
+                   uiop->uio_skip >= uiop->uio_iov->iov_len) {
+                       uiop->uio_skip -= uiop->uio_iov->iov_len;
                        uiop->uio_iov++;
                        uiop->uio_iovcnt--;
-                       continue;
                }
-               iovp->iov_base += niovb;
-               uiop->uio_loffset += niovb;
-               iovp->iov_len -= niovb;
-               uiop->uio_resid -= niovb;
-               n -= niovb;
+       } else {
+               while (uiop->uio_iovcnt &&
+                   uiop->uio_skip >= uiop->uio_bvec->bv_len) {
+                       uiop->uio_skip -= uiop->uio_bvec->bv_len;
+                       uiop->uio_bvec++;
+                       uiop->uio_iovcnt--;
+               }
        }
+       uiop->uio_loffset += n;
+       uiop->uio_resid -= n;
 }
 EXPORT_SYMBOL(uioskip);
 #endif /* _KERNEL */
index e5f69c8152d46a9d32ad6d5489a1ae6d2a934b93..910c56dcc2a9226851e9f877e6aaba2a3d01a6b6 100644 (file)
@@ -131,6 +131,8 @@ zpool_prop_init(void)
        /* hidden properties */
        zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
            PROP_READONLY, ZFS_TYPE_POOL, "NAME");
+       zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize",
+           PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE");
        zprop_register_hidden(ZPOOL_PROP_TNAME, "tname", PROP_TYPE_STRING,
            PROP_ONETIME, ZFS_TYPE_POOL, "TNAME");
 }
index 954841f33137f51d7d19d97be9ad19a7a1425f7a..55f8cef16b6d5cdbb2e15ebee53de263e8de4da3 100644 (file)
+src = @abs_top_srcdir@/module/zfs
+obj = @abs_builddir@
+
 MODULE := zfs
 
 EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@
 
 obj-$(CONFIG_ZFS) := $(MODULE).o
 
-$(MODULE)-objs += @top_srcdir@/module/zfs/arc.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/blkptr.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dbuf_stats.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/bptree.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/ddt.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/ddt_zap.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dmu.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dmu_diff.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dmu_object.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dmu_objset.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dmu_send.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dmu_traverse.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dmu_tx.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dmu_zfetch.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dnode.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dnode_sync.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_dataset.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_deadlist.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_deleg.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_bookmark.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_dir.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_pool.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_prop.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_scan.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_synctask.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/fm.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/gzip.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/lzjb.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/lz4.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/metaslab.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/range_tree.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/refcount.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/rrwlock.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/sa.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/sha256.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/spa.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/spa_boot.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/spa_config.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/spa_errlog.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/spa_stats.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/space_reftree.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/txg.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/trace.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/uberblock.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/unique.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/vdev.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_cache.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_disk.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_file.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_label.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_mirror.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_missing.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_queue.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_raidz.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_root.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zap.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zap_leaf.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zap_micro.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfeature.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfeature_common.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_acl.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_byteswap.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_ctldir.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_debug.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_dir.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_fm.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_fuid.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_ioctl.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_log.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_onexit.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_replay.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_rlock.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_sa.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_vfsops.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_vnops.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_znode.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zil.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zio.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zio_checksum.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zio_compress.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zio_inject.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zle.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zpl_ctldir.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zpl_export.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zpl_file.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zpl_inode.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zpl_super.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zpl_xattr.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zrlock.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/zvol.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_destroy.o
-$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_userhold.o
+$(MODULE)-objs += arc.o
+$(MODULE)-objs += blkptr.o
+$(MODULE)-objs += bplist.o
+$(MODULE)-objs += bpobj.o
+$(MODULE)-objs += dbuf.o
+$(MODULE)-objs += dbuf_stats.o
+$(MODULE)-objs += bptree.o
+$(MODULE)-objs += ddt.o
+$(MODULE)-objs += ddt_zap.o
+$(MODULE)-objs += dmu.o
+$(MODULE)-objs += dmu_diff.o
+$(MODULE)-objs += dmu_object.o
+$(MODULE)-objs += dmu_objset.o
+$(MODULE)-objs += dmu_send.o
+$(MODULE)-objs += dmu_traverse.o
+$(MODULE)-objs += dmu_tx.o
+$(MODULE)-objs += dmu_zfetch.o
+$(MODULE)-objs += dnode.o
+$(MODULE)-objs += dnode_sync.o
+$(MODULE)-objs += dsl_dataset.o
+$(MODULE)-objs += dsl_deadlist.o
+$(MODULE)-objs += dsl_deleg.o
+$(MODULE)-objs += dsl_bookmark.o
+$(MODULE)-objs += dsl_dir.o
+$(MODULE)-objs += dsl_pool.o
+$(MODULE)-objs += dsl_prop.o
+$(MODULE)-objs += dsl_scan.o
+$(MODULE)-objs += dsl_synctask.o
+$(MODULE)-objs += fm.o
+$(MODULE)-objs += gzip.o
+$(MODULE)-objs += lzjb.o
+$(MODULE)-objs += lz4.o
+$(MODULE)-objs += metaslab.o
+$(MODULE)-objs += multilist.o
+$(MODULE)-objs += range_tree.o
+$(MODULE)-objs += refcount.o
+$(MODULE)-objs += rrwlock.o
+$(MODULE)-objs += sa.o
+$(MODULE)-objs += sha256.o
+$(MODULE)-objs += spa.o
+$(MODULE)-objs += spa_boot.o
+$(MODULE)-objs += spa_config.o
+$(MODULE)-objs += spa_errlog.o
+$(MODULE)-objs += spa_history.o
+$(MODULE)-objs += spa_misc.o
+$(MODULE)-objs += spa_stats.o
+$(MODULE)-objs += space_map.o
+$(MODULE)-objs += space_reftree.o
+$(MODULE)-objs += txg.o
+$(MODULE)-objs += trace.o
+$(MODULE)-objs += uberblock.o
+$(MODULE)-objs += unique.o
+$(MODULE)-objs += vdev.o
+$(MODULE)-objs += vdev_cache.o
+$(MODULE)-objs += vdev_disk.o
+$(MODULE)-objs += vdev_file.o
+$(MODULE)-objs += vdev_label.o
+$(MODULE)-objs += vdev_mirror.o
+$(MODULE)-objs += vdev_missing.o
+$(MODULE)-objs += vdev_queue.o
+$(MODULE)-objs += vdev_raidz.o
+$(MODULE)-objs += vdev_root.o
+$(MODULE)-objs += zap.o
+$(MODULE)-objs += zap_leaf.o
+$(MODULE)-objs += zap_micro.o
+$(MODULE)-objs += zfeature.o
+$(MODULE)-objs += zfeature_common.o
+$(MODULE)-objs += zfs_acl.o
+$(MODULE)-objs += zfs_byteswap.o
+$(MODULE)-objs += zfs_ctldir.o
+$(MODULE)-objs += zfs_debug.o
+$(MODULE)-objs += zfs_dir.o
+$(MODULE)-objs += zfs_fm.o
+$(MODULE)-objs += zfs_fuid.o
+$(MODULE)-objs += zfs_ioctl.o
+$(MODULE)-objs += zfs_log.o
+$(MODULE)-objs += zfs_onexit.o
+$(MODULE)-objs += zfs_replay.o
+$(MODULE)-objs += zfs_rlock.o
+$(MODULE)-objs += zfs_sa.o
+$(MODULE)-objs += zfs_vfsops.o
+$(MODULE)-objs += zfs_vnops.o
+$(MODULE)-objs += zfs_znode.o
+$(MODULE)-objs += zil.o
+$(MODULE)-objs += zio.o
+$(MODULE)-objs += zio_checksum.o
+$(MODULE)-objs += zio_compress.o
+$(MODULE)-objs += zio_inject.o
+$(MODULE)-objs += zle.o
+$(MODULE)-objs += zpl_ctldir.o
+$(MODULE)-objs += zpl_export.o
+$(MODULE)-objs += zpl_file.o
+$(MODULE)-objs += zpl_inode.o
+$(MODULE)-objs += zpl_super.o
+$(MODULE)-objs += zpl_xattr.o
+$(MODULE)-objs += zrlock.o
+$(MODULE)-objs += zvol.o
+$(MODULE)-objs += dsl_destroy.o
+$(MODULE)-objs += dsl_userhold.o
index 421c81e1cfe9f09a831efb28a0948ad4f3d11103..b759e6483d53a26beecae94286cad6e72d7dea2a 100644 (file)
@@ -20,8 +20,9 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  */
 
  * Note that the majority of the performance stats are manipulated
  * with atomic operations.
  *
- * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
+ * The L2ARC uses the l2ad_mtx on each vdev for the following:
  *
  *     - L2ARC buflist creation
  *     - L2ARC buflist eviction
 #include <sys/zio_compress.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
+#include <sys/refcount.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/dsl_pool.h>
+#include <sys/multilist.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
 #include <vm/anon.h>
 boolean_t arc_watch = B_FALSE;
 #endif
 
-static kmutex_t                arc_reclaim_thr_lock;
-static kcondvar_t      arc_reclaim_thr_cv;     /* used to signal reclaim thr */
-static uint8_t         arc_thread_exit;
+static kmutex_t                arc_reclaim_lock;
+static kcondvar_t      arc_reclaim_thread_cv;
+static boolean_t       arc_reclaim_thread_exit;
+static kcondvar_t      arc_reclaim_waiters_cv;
 
-/* number of objects to prune from caches when arc_meta_limit is reached */
-int zfs_arc_meta_prune = 10000;
+static kmutex_t                arc_user_evicts_lock;
+static kcondvar_t      arc_user_evicts_cv;
+static boolean_t       arc_user_evicts_thread_exit;
 
-typedef enum arc_reclaim_strategy {
-       ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
-       ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
-} arc_reclaim_strategy_t;
+/*
+ * The number of headers to evict in arc_evict_state_impl() before
+ * dropping the sublist lock and evicting from another sublist. A lower
+ * value means we're more likely to evict the "correct" header (i.e. the
+ * oldest header in the arc state), but comes with higher overhead
+ * (i.e. more invocations of arc_evict_state_impl()).
+ */
+int zfs_arc_evict_batch_limit = 10;
 
 /*
- * The number of iterations through arc_evict_*() before we
- * drop & reacquire the lock.
+ * The number of sublists used for each of the arc state lists. If this
+ * is not set to a suitable value by the user, it will be configured to
+ * the number of CPUs on the system in arc_init().
  */
-int arc_evict_iterations = 100;
+int zfs_arc_num_sublists_per_state = 0;
 
 /* number of seconds before growing cache again */
-int zfs_arc_grow_retry = 5;
+static int             arc_grow_retry = 5;
 
-/* disable anon data aggressively growing arc_p */
-int zfs_arc_p_aggressive_disable = 1;
+/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
+int            zfs_arc_overflow_shift = 8;
 
-/* disable arc_p adapt dampener in arc_adapt */
-int zfs_arc_p_dampener_disable = 1;
+/* shift of arc_c for calculating both min and max arc_p */
+static int             arc_p_min_shift = 4;
 
 /* log2(fraction of arc to reclaim) */
-int zfs_arc_shrink_shift = 5;
+static int             arc_shrink_shift = 7;
 
 /*
- * minimum lifespan of a prefetch block in clock ticks
- * (initialized in arc_init())
+ * log2(fraction of ARC which must be free to allow growing).
+ * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
+ * when reading a new block into the ARC, we will evict an equal-sized block
+ * from the ARC.
+ *
+ * This must be less than arc_shrink_shift, so that when we shrink the ARC,
+ * we will still not allow it to grow.
  */
-int zfs_arc_min_prefetch_lifespan = HZ;
+int                    arc_no_grow_shift = 5;
 
-/* disable arc proactive arc throttle due to low memory */
-int zfs_arc_memory_throttle_disable = 1;
 
-/* disable duplicate buffer eviction */
-int zfs_disable_dup_eviction = 0;
-
-/* average block used to size buf_hash_table */
-int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
+/*
+ * minimum lifespan of a prefetch block in clock ticks
+ * (initialized in arc_init())
+ */
+static int             arc_min_prefetch_lifespan;
 
 /*
  * If this percent of memory is free, don't throttle.
@@ -206,9 +219,6 @@ int arc_lotsfree_percent = 10;
 
 static int arc_dead;
 
-/* expiration time for arc_no_grow */
-static clock_t arc_grow_time = 0;
-
 /*
  * The arc has filled available memory and has now warmed up.
  */
@@ -220,11 +230,24 @@ static boolean_t arc_warm;
 unsigned long zfs_arc_max = 0;
 unsigned long zfs_arc_min = 0;
 unsigned long zfs_arc_meta_limit = 0;
+unsigned long zfs_arc_meta_min = 0;
+int zfs_arc_grow_retry = 0;
+int zfs_arc_shrink_shift = 0;
+int zfs_arc_p_min_shift = 0;
+int zfs_disable_dup_eviction = 0;
+int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 
 /*
- * Limit the number of restarts in arc_adjust_meta()
+ * These tunables are Linux specific
  */
-unsigned long zfs_arc_meta_adjust_restarts = 4096;
+unsigned long zfs_arc_sys_free = 0;
+int zfs_arc_min_prefetch_lifespan = 0;
+int zfs_arc_p_aggressive_disable = 1;
+int zfs_arc_p_dampener_disable = 1;
+int zfs_arc_meta_prune = 10000;
+int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
+int zfs_arc_meta_adjust_restarts = 4096;
+int zfs_arc_lotsfree_percent = 10;
 
 /* The 6 states: */
 static arc_state_t ARC_anon;
@@ -250,7 +273,6 @@ typedef struct arc_stats {
        kstat_named_t arcstat_mfu_hits;
        kstat_named_t arcstat_mfu_ghost_hits;
        kstat_named_t arcstat_deleted;
-       kstat_named_t arcstat_recycle_miss;
        /*
         * Number of buffers that could not be evicted because the hash lock
         * was held by another thread.  The lock may not necessarily be held
@@ -264,9 +286,15 @@ typedef struct arc_stats {
         * not from the spa we're trying to evict from.
         */
        kstat_named_t arcstat_evict_skip;
+       /*
+        * Number of times arc_evict_state() was unable to evict enough
+        * buffers to reach its target amount.
+        */
+       kstat_named_t arcstat_evict_not_enough;
        kstat_named_t arcstat_evict_l2_cached;
        kstat_named_t arcstat_evict_l2_eligible;
        kstat_named_t arcstat_evict_l2_ineligible;
+       kstat_named_t arcstat_evict_l2_skip;
        kstat_named_t arcstat_hash_elements;
        kstat_named_t arcstat_hash_elements_max;
        kstat_named_t arcstat_hash_collisions;
@@ -277,25 +305,137 @@ typedef struct arc_stats {
        kstat_named_t arcstat_c_min;
        kstat_named_t arcstat_c_max;
        kstat_named_t arcstat_size;
+       /*
+        * Number of bytes consumed by internal ARC structures necessary
+        * for tracking purposes; these structures are not actually
+        * backed by ARC buffers. This includes arc_buf_hdr_t structures
+        * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
+        * caches), and arc_buf_t structures (allocated via arc_buf_t
+        * cache).
+        */
        kstat_named_t arcstat_hdr_size;
+       /*
+        * Number of bytes consumed by ARC buffers of type equal to
+        * ARC_BUFC_DATA. This is generally consumed by buffers backing
+        * on disk user data (e.g. plain file contents).
+        */
        kstat_named_t arcstat_data_size;
-       kstat_named_t arcstat_meta_size;
+       /*
+        * Number of bytes consumed by ARC buffers of type equal to
+        * ARC_BUFC_METADATA. This is generally consumed by buffers
+        * backing on disk data that is used for internal ZFS
+        * structures (e.g. ZAP, dnode, indirect blocks, etc).
+        */
+       kstat_named_t arcstat_metadata_size;
+       /*
+        * Number of bytes consumed by various buffers and structures
+        * not actually backed with ARC buffers. This includes bonus
+        * buffers (allocated directly via zio_buf_* functions),
+        * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
+        * cache), and dnode_t structures (allocated via dnode_t cache).
+        */
        kstat_named_t arcstat_other_size;
+       /*
+        * Total number of bytes consumed by ARC buffers residing in the
+        * arc_anon state. This includes *all* buffers in the arc_anon
+        * state; e.g. data, metadata, evictable, and unevictable buffers
+        * are all included in this value.
+        */
        kstat_named_t arcstat_anon_size;
-       kstat_named_t arcstat_anon_evict_data;
-       kstat_named_t arcstat_anon_evict_metadata;
+       /*
+        * Number of bytes consumed by ARC buffers that meet the
+        * following criteria: backing buffers of type ARC_BUFC_DATA,
+        * residing in the arc_anon state, and are eligible for eviction
+        * (e.g. have no outstanding holds on the buffer).
+        */
+       kstat_named_t arcstat_anon_evictable_data;
+       /*
+        * Number of bytes consumed by ARC buffers that meet the
+        * following criteria: backing buffers of type ARC_BUFC_METADATA,
+        * residing in the arc_anon state, and are eligible for eviction
+        * (e.g. have no outstanding holds on the buffer).
+        */
+       kstat_named_t arcstat_anon_evictable_metadata;
+       /*
+        * Total number of bytes consumed by ARC buffers residing in the
+        * arc_mru state. This includes *all* buffers in the arc_mru
+        * state; e.g. data, metadata, evictable, and unevictable buffers
+        * are all included in this value.
+        */
        kstat_named_t arcstat_mru_size;
-       kstat_named_t arcstat_mru_evict_data;
-       kstat_named_t arcstat_mru_evict_metadata;
+       /*
+        * Number of bytes consumed by ARC buffers that meet the
+        * following criteria: backing buffers of type ARC_BUFC_DATA,
+        * residing in the arc_mru state, and are eligible for eviction
+        * (e.g. have no outstanding holds on the buffer).
+        */
+       kstat_named_t arcstat_mru_evictable_data;
+       /*
+        * Number of bytes consumed by ARC buffers that meet the
+        * following criteria: backing buffers of type ARC_BUFC_METADATA,
+        * residing in the arc_mru state, and are eligible for eviction
+        * (e.g. have no outstanding holds on the buffer).
+        */
+       kstat_named_t arcstat_mru_evictable_metadata;
+       /*
+        * Total number of bytes that *would have been* consumed by ARC
+        * buffers in the arc_mru_ghost state. The key thing to note
+        * here, is the fact that this size doesn't actually indicate
+        * RAM consumption. The ghost lists only consist of headers and
+        * don't actually have ARC buffers linked off of these headers.
+        * Thus, *if* the headers had associated ARC buffers, these
+        * buffers *would have* consumed this number of bytes.
+        */
        kstat_named_t arcstat_mru_ghost_size;
-       kstat_named_t arcstat_mru_ghost_evict_data;
-       kstat_named_t arcstat_mru_ghost_evict_metadata;
+       /*
+        * Number of bytes that *would have been* consumed by ARC
+        * buffers that are eligible for eviction, of type
+        * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
+        */
+       kstat_named_t arcstat_mru_ghost_evictable_data;
+       /*
+        * Number of bytes that *would have been* consumed by ARC
+        * buffers that are eligible for eviction, of type
+        * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
+        */
+       kstat_named_t arcstat_mru_ghost_evictable_metadata;
+       /*
+        * Total number of bytes consumed by ARC buffers residing in the
+        * arc_mfu state. This includes *all* buffers in the arc_mfu
+        * state; e.g. data, metadata, evictable, and unevictable buffers
+        * are all included in this value.
+        */
        kstat_named_t arcstat_mfu_size;
-       kstat_named_t arcstat_mfu_evict_data;
-       kstat_named_t arcstat_mfu_evict_metadata;
+       /*
+        * Number of bytes consumed by ARC buffers that are eligible for
+        * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
+        * state.
+        */
+       kstat_named_t arcstat_mfu_evictable_data;
+       /*
+        * Number of bytes consumed by ARC buffers that are eligible for
+        * eviction, of type ARC_BUFC_METADATA, and reside in the
+        * arc_mfu state.
+        */
+       kstat_named_t arcstat_mfu_evictable_metadata;
+       /*
+        * Total number of bytes that *would have been* consumed by ARC
+        * buffers in the arc_mfu_ghost state. See the comment above
+        * arcstat_mru_ghost_size for more details.
+        */
        kstat_named_t arcstat_mfu_ghost_size;
-       kstat_named_t arcstat_mfu_ghost_evict_data;
-       kstat_named_t arcstat_mfu_ghost_evict_metadata;
+       /*
+        * Number of bytes that *would have been* consumed by ARC
+        * buffers that are eligible for eviction, of type
+        * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
+        */
+       kstat_named_t arcstat_mfu_ghost_evictable_data;
+       /*
+        * Number of bytes that *would have been* consumed by ARC
+        * buffers that are eligible for eviction, of type
+        * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
+        */
+       kstat_named_t arcstat_mfu_ghost_evictable_metadata;
        kstat_named_t arcstat_l2_hits;
        kstat_named_t arcstat_l2_misses;
        kstat_named_t arcstat_l2_feeds;
@@ -305,9 +445,10 @@ typedef struct arc_stats {
        kstat_named_t arcstat_l2_writes_sent;
        kstat_named_t arcstat_l2_writes_done;
        kstat_named_t arcstat_l2_writes_error;
-       kstat_named_t arcstat_l2_writes_hdr_miss;
+       kstat_named_t arcstat_l2_writes_lock_retry;
        kstat_named_t arcstat_l2_evict_lock_retry;
        kstat_named_t arcstat_l2_evict_reading;
+       kstat_named_t arcstat_l2_evict_l1cached;
        kstat_named_t arcstat_l2_free_on_write;
        kstat_named_t arcstat_l2_cdata_free_on_write;
        kstat_named_t arcstat_l2_abort_lowmem;
@@ -332,6 +473,9 @@ typedef struct arc_stats {
        kstat_named_t arcstat_meta_used;
        kstat_named_t arcstat_meta_limit;
        kstat_named_t arcstat_meta_max;
+       kstat_named_t arcstat_meta_min;
+       kstat_named_t arcstat_need_free;
+       kstat_named_t arcstat_sys_free;
 } arc_stats_t;
 
 static arc_stats_t arc_stats = {
@@ -350,12 +494,13 @@ static arc_stats_t arc_stats = {
        { "mfu_hits",                   KSTAT_DATA_UINT64 },
        { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
        { "deleted",                    KSTAT_DATA_UINT64 },
-       { "recycle_miss",               KSTAT_DATA_UINT64 },
        { "mutex_miss",                 KSTAT_DATA_UINT64 },
        { "evict_skip",                 KSTAT_DATA_UINT64 },
+       { "evict_not_enough",           KSTAT_DATA_UINT64 },
        { "evict_l2_cached",            KSTAT_DATA_UINT64 },
        { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
        { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
+       { "evict_l2_skip",              KSTAT_DATA_UINT64 },
        { "hash_elements",              KSTAT_DATA_UINT64 },
        { "hash_elements_max",          KSTAT_DATA_UINT64 },
        { "hash_collisions",            KSTAT_DATA_UINT64 },
@@ -368,23 +513,23 @@ static arc_stats_t arc_stats = {
        { "size",                       KSTAT_DATA_UINT64 },
        { "hdr_size",                   KSTAT_DATA_UINT64 },
        { "data_size",                  KSTAT_DATA_UINT64 },
-       { "meta_size",                  KSTAT_DATA_UINT64 },
+       { "metadata_size",              KSTAT_DATA_UINT64 },
        { "other_size",                 KSTAT_DATA_UINT64 },
        { "anon_size",                  KSTAT_DATA_UINT64 },
-       { "anon_evict_data",            KSTAT_DATA_UINT64 },
-       { "anon_evict_metadata",        KSTAT_DATA_UINT64 },
+       { "anon_evictable_data",        KSTAT_DATA_UINT64 },
+       { "anon_evictable_metadata",    KSTAT_DATA_UINT64 },
        { "mru_size",                   KSTAT_DATA_UINT64 },
-       { "mru_evict_data",             KSTAT_DATA_UINT64 },
-       { "mru_evict_metadata",         KSTAT_DATA_UINT64 },
+       { "mru_evictable_data",         KSTAT_DATA_UINT64 },
+       { "mru_evictable_metadata",     KSTAT_DATA_UINT64 },
        { "mru_ghost_size",             KSTAT_DATA_UINT64 },
-       { "mru_ghost_evict_data",       KSTAT_DATA_UINT64 },
-       { "mru_ghost_evict_metadata",   KSTAT_DATA_UINT64 },
+       { "mru_ghost_evictable_data",   KSTAT_DATA_UINT64 },
+       { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
        { "mfu_size",                   KSTAT_DATA_UINT64 },
-       { "mfu_evict_data",             KSTAT_DATA_UINT64 },
-       { "mfu_evict_metadata",         KSTAT_DATA_UINT64 },
+       { "mfu_evictable_data",         KSTAT_DATA_UINT64 },
+       { "mfu_evictable_metadata",     KSTAT_DATA_UINT64 },
        { "mfu_ghost_size",             KSTAT_DATA_UINT64 },
-       { "mfu_ghost_evict_data",       KSTAT_DATA_UINT64 },
-       { "mfu_ghost_evict_metadata",   KSTAT_DATA_UINT64 },
+       { "mfu_ghost_evictable_data",   KSTAT_DATA_UINT64 },
+       { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
        { "l2_hits",                    KSTAT_DATA_UINT64 },
        { "l2_misses",                  KSTAT_DATA_UINT64 },
        { "l2_feeds",                   KSTAT_DATA_UINT64 },
@@ -394,9 +539,10 @@ static arc_stats_t arc_stats = {
        { "l2_writes_sent",             KSTAT_DATA_UINT64 },
        { "l2_writes_done",             KSTAT_DATA_UINT64 },
        { "l2_writes_error",            KSTAT_DATA_UINT64 },
-       { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
+       { "l2_writes_lock_retry",       KSTAT_DATA_UINT64 },
        { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
        { "l2_evict_reading",           KSTAT_DATA_UINT64 },
+       { "l2_evict_l1cached",          KSTAT_DATA_UINT64 },
        { "l2_free_on_write",           KSTAT_DATA_UINT64 },
        { "l2_cdata_free_on_write",     KSTAT_DATA_UINT64 },
        { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
@@ -421,6 +567,9 @@ static arc_stats_t arc_stats = {
        { "arc_meta_used",              KSTAT_DATA_UINT64 },
        { "arc_meta_limit",             KSTAT_DATA_UINT64 },
        { "arc_meta_max",               KSTAT_DATA_UINT64 },
+       { "arc_meta_min",               KSTAT_DATA_UINT64 },
+       { "arc_need_free",              KSTAT_DATA_UINT64 },
+       { "arc_sys_free",               KSTAT_DATA_UINT64 }
 };
 
 #define        ARCSTAT(stat)   (arc_stats.stat.value.ui64)
@@ -486,69 +635,54 @@ static arc_state_t        *arc_l2c_only;
 #define        arc_tempreserve ARCSTAT(arcstat_tempreserve)
 #define        arc_loaned_bytes        ARCSTAT(arcstat_loaned_bytes)
 #define        arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
+#define        arc_meta_min    ARCSTAT(arcstat_meta_min) /* min size for metadata */
 #define        arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 #define        arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
+#define        arc_need_free   ARCSTAT(arcstat_need_free) /* bytes to be freed */
+#define        arc_sys_free    ARCSTAT(arcstat_sys_free) /* target system free bytes */
 
 #define        L2ARC_IS_VALID_COMPRESS(_c_) \
        ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
 
 static list_t arc_prune_list;
 static kmutex_t arc_prune_mtx;
+static taskq_t *arc_prune_taskq;
 static arc_buf_t *arc_eviction_list;
-static kmutex_t arc_eviction_mtx;
 static arc_buf_hdr_t arc_eviction_hdr;
-static void arc_get_data_buf(arc_buf_t *buf);
-static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
-static int arc_evict_needed(arc_buf_contents_t type);
-static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
-    arc_buf_contents_t type);
-static void arc_buf_watch(arc_buf_t *buf);
-
-static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 
 #define        GHOST_STATE(state)      \
        ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
        (state) == arc_l2c_only)
 
-/*
- * Private ARC flags.  These flags are private ARC only flags that will show up
- * in b_flags in the arc_hdr_buf_t.  Some flags are publicly declared, and can
- * be passed in as arc_flags in things like arc_read.  However, these flags
- * should never be passed and should only be set by ARC code.  When adding new
- * public flags, make sure not to smash the private ones.
- */
-
-#define        ARC_IN_HASH_TABLE       (1 << 9)        /* this buffer is hashed */
-#define        ARC_IO_IN_PROGRESS      (1 << 10)       /* I/O in progress for buf */
-#define        ARC_IO_ERROR            (1 << 11)       /* I/O failed for buf */
-#define        ARC_FREED_IN_READ       (1 << 12)       /* buf freed while in read */
-#define        ARC_BUF_AVAILABLE       (1 << 13)       /* block not in active use */
-#define        ARC_INDIRECT            (1 << 14)       /* this is an indirect block */
-#define        ARC_FREE_IN_PROGRESS    (1 << 15)       /* hdr about to be freed */
-#define        ARC_L2_WRITING          (1 << 16)       /* L2ARC write in progress */
-#define        ARC_L2_EVICTED          (1 << 17)       /* evicted during I/O */
-#define        ARC_L2_WRITE_HEAD       (1 << 18)       /* head of write list */
-
-#define        HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
-#define        HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
-#define        HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
-#define        HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_PREFETCH)
-#define        HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
-#define        HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
-#define        HDR_FREE_IN_PROGRESS(hdr)       ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
-#define        HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_L2CACHE)
-#define        HDR_L2_READING(hdr)     ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
-                                   (hdr)->b_l2hdr != NULL)
-#define        HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_L2_WRITING)
-#define        HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_L2_EVICTED)
-#define        HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
+#define        HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
+#define        HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
+#define        HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
+#define        HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_FLAG_PREFETCH)
+#define        HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
+#define        HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
+
+#define        HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_FLAG_L2CACHE)
+#define        HDR_L2COMPRESS(hdr)     ((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
+#define        HDR_L2_READING(hdr)     \
+           (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&      \
+           ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
+#define        HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
+#define        HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
+#define        HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
+
+#define        HDR_ISTYPE_METADATA(hdr)        \
+           ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
+#define        HDR_ISTYPE_DATA(hdr)    (!HDR_ISTYPE_METADATA(hdr))
+
+#define        HDR_HAS_L1HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
+#define        HDR_HAS_L2HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
 
 /*
  * Other sizes
  */
 
-#define        HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
-#define        L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
+#define        HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
+#define        HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
 
 /*
  * Hash table routines
@@ -596,6 +730,16 @@ uint64_t zfs_crc64_table[256];
 #define        L2ARC_FEED_SECS         1               /* caching interval secs */
 #define        L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 
+/*
+ * Used to distinguish headers that are being processed by
+ * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk
+ * address. This can happen when the header is added to the l2arc's list
+ * of buffers to write in the first stage of l2arc_write_buffers(), but
+ * has not yet been written out which happens in the second stage of
+ * l2arc_write_buffers().
+ */
+#define        L2ARC_ADDR_UNSET        ((uint64_t)(-1))
+
 #define        l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 #define        l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 
@@ -618,7 +762,6 @@ static list_t L2ARC_dev_list;                       /* device list */
 static list_t *l2arc_dev_list;                 /* device list pointer */
 static kmutex_t l2arc_dev_mtx;                 /* device list mutex */
 static l2arc_dev_t *l2arc_dev_last;            /* last device used */
-static kmutex_t l2arc_buflist_mtx;             /* mutex for all buflists */
 static list_t L2ARC_free_on_write;             /* free after write buf list */
 static list_t *l2arc_free_on_write;            /* free after write list ptr */
 static kmutex_t l2arc_free_on_write_mtx;       /* mutex for list */
@@ -633,19 +776,6 @@ typedef struct l2arc_read_callback {
        enum zio_compress       l2rcb_compress;         /* applied compress */
 } l2arc_read_callback_t;
 
-struct l2arc_buf_hdr {
-       /* protected by arc_buf_hdr  mutex */
-       l2arc_dev_t             *b_dev;         /* L2ARC device */
-       uint64_t                b_daddr;        /* disk address, offset byte */
-       /* compression applied to buffer data */
-       enum zio_compress       b_compress;
-       /* real alloc'd buffer size depending on b_compress applied */
-       uint32_t                b_hits;
-       uint64_t                b_asize;
-       /* temporary buffer holder for in-flight compressed data */
-       void                    *b_tmp_cdata;
-};
-
 typedef struct l2arc_data_free {
        /* protected by l2arc_free_on_write_mtx */
        void            *l2df_data;
@@ -658,14 +788,21 @@ static kmutex_t l2arc_feed_thr_lock;
 static kcondvar_t l2arc_feed_thr_cv;
 static uint8_t l2arc_thread_exit;
 
-static void l2arc_read_done(zio_t *zio);
-static void l2arc_hdr_stat_add(void);
-static void l2arc_hdr_stat_remove(void);
+static void arc_get_data_buf(arc_buf_t *);
+static void arc_access(arc_buf_hdr_t *, kmutex_t *);
+static boolean_t arc_is_overflowing(void);
+static void arc_buf_watch(arc_buf_t *);
+static void arc_tuning_update(void);
+
+static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
+static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
+
+static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
+static void l2arc_read_done(zio_t *);
 
-static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
-static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
-    enum zio_compress c);
-static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
+static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
+static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
+static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
 
 static uint64_t
 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
@@ -686,8 +823,7 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 
 #define        BUF_EMPTY(buf)                                          \
        ((buf)->b_dva.dva_word[0] == 0 &&                       \
-       (buf)->b_dva.dva_word[1] == 0 &&                        \
-       (buf)->b_cksum0 == 0)
+       (buf)->b_dva.dva_word[1] == 0)
 
 #define        BUF_EQUAL(spa, dva, birth, buf)                         \
        ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&     \
@@ -700,7 +836,6 @@ buf_discard_identity(arc_buf_hdr_t *hdr)
        hdr->b_dva.dva_word[0] = 0;
        hdr->b_dva.dva_word[1] = 0;
        hdr->b_birth = 0;
-       hdr->b_cksum0 = 0;
 }
 
 static arc_buf_hdr_t *
@@ -710,14 +845,14 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
        uint64_t birth = BP_PHYSICAL_BIRTH(bp);
        uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
        kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
-       arc_buf_hdr_t *buf;
+       arc_buf_hdr_t *hdr;
 
        mutex_enter(hash_lock);
-       for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
-           buf = buf->b_hash_next) {
-               if (BUF_EQUAL(spa, dva, birth, buf)) {
+       for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
+           hdr = hdr->b_hash_next) {
+               if (BUF_EQUAL(spa, dva, birth, hdr)) {
                        *lockp = hash_lock;
-                       return (buf);
+                       return (hdr);
                }
        }
        mutex_exit(hash_lock);
@@ -730,29 +865,36 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
  * equal to elem in the hash table, then the already existing element
  * will be returned and the new element will not be inserted.
  * Otherwise returns NULL.
+ * If lockp == NULL, the caller is assumed to already hold the hash lock.
  */
 static arc_buf_hdr_t *
-buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
+buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
 {
-       uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+       uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
        kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
-       arc_buf_hdr_t *fbuf;
+       arc_buf_hdr_t *fhdr;
        uint32_t i;
 
-       ASSERT(!DVA_IS_EMPTY(&buf->b_dva));
-       ASSERT(buf->b_birth != 0);
-       ASSERT(!HDR_IN_HASH_TABLE(buf));
-       *lockp = hash_lock;
-       mutex_enter(hash_lock);
-       for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
-           fbuf = fbuf->b_hash_next, i++) {
-               if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
-                       return (fbuf);
+       ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
+       ASSERT(hdr->b_birth != 0);
+       ASSERT(!HDR_IN_HASH_TABLE(hdr));
+
+       if (lockp != NULL) {
+               *lockp = hash_lock;
+               mutex_enter(hash_lock);
+       } else {
+               ASSERT(MUTEX_HELD(hash_lock));
+       }
+
+       for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
+           fhdr = fhdr->b_hash_next, i++) {
+               if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
+                       return (fhdr);
        }
 
-       buf->b_hash_next = buf_hash_table.ht_table[idx];
-       buf_hash_table.ht_table[idx] = buf;
-       buf->b_flags |= ARC_IN_HASH_TABLE;
+       hdr->b_hash_next = buf_hash_table.ht_table[idx];
+       buf_hash_table.ht_table[idx] = hdr;
+       hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
 
        /* collect some hash table performance data */
        if (i > 0) {
@@ -770,22 +912,22 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 }
 
 static void
-buf_hash_remove(arc_buf_hdr_t *buf)
+buf_hash_remove(arc_buf_hdr_t *hdr)
 {
-       arc_buf_hdr_t *fbuf, **bufp;
-       uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+       arc_buf_hdr_t *fhdr, **hdrp;
+       uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 
        ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
-       ASSERT(HDR_IN_HASH_TABLE(buf));
+       ASSERT(HDR_IN_HASH_TABLE(hdr));
 
-       bufp = &buf_hash_table.ht_table[idx];
-       while ((fbuf = *bufp) != buf) {
-               ASSERT(fbuf != NULL);
-               bufp = &fbuf->b_hash_next;
+       hdrp = &buf_hash_table.ht_table[idx];
+       while ((fhdr = *hdrp) != hdr) {
+               ASSERT(fhdr != NULL);
+               hdrp = &fhdr->b_hash_next;
        }
-       *bufp = buf->b_hash_next;
-       buf->b_hash_next = NULL;
-       buf->b_flags &= ~ARC_IN_HASH_TABLE;
+       *hdrp = hdr->b_hash_next;
+       hdr->b_hash_next = NULL;
+       hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
 
        /* collect some hash table performance data */
        ARCSTAT_BUMPDOWN(arcstat_hash_elements);
@@ -798,9 +940,9 @@ buf_hash_remove(arc_buf_hdr_t *buf)
 /*
  * Global data structures and functions for the buf kmem cache.
  */
-static kmem_cache_t *hdr_cache;
+static kmem_cache_t *hdr_full_cache;
+static kmem_cache_t *hdr_l2only_cache;
 static kmem_cache_t *buf_cache;
-static kmem_cache_t *l2arc_hdr_cache;
 
 static void
 buf_fini(void)
@@ -820,9 +962,9 @@ buf_fini(void)
 #endif
        for (i = 0; i < BUF_LOCKS; i++)
                mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
-       kmem_cache_destroy(hdr_cache);
+       kmem_cache_destroy(hdr_full_cache);
+       kmem_cache_destroy(hdr_l2only_cache);
        kmem_cache_destroy(buf_cache);
-       kmem_cache_destroy(l2arc_hdr_cache);
 }
 
 /*
@@ -831,17 +973,30 @@ buf_fini(void)
  */
 /* ARGSUSED */
 static int
-hdr_cons(void *vbuf, void *unused, int kmflag)
+hdr_full_cons(void *vbuf, void *unused, int kmflag)
+{
+       arc_buf_hdr_t *hdr = vbuf;
+
+       bzero(hdr, HDR_FULL_SIZE);
+       cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
+       refcount_create(&hdr->b_l1hdr.b_refcnt);
+       mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+       list_link_init(&hdr->b_l1hdr.b_arc_node);
+       list_link_init(&hdr->b_l2hdr.b_l2node);
+       multilist_link_init(&hdr->b_l1hdr.b_arc_node);
+       arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
+
+       return (0);
+}
+
+/* ARGSUSED */
+static int
+hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
 {
-       arc_buf_hdr_t *buf = vbuf;
+       arc_buf_hdr_t *hdr = vbuf;
 
-       bzero(buf, sizeof (arc_buf_hdr_t));
-       refcount_create(&buf->b_refcnt);
-       cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
-       mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
-       list_link_init(&buf->b_arc_node);
-       list_link_init(&buf->b_l2node);
-       arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
+       bzero(hdr, HDR_L2ONLY_SIZE);
+       arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 
        return (0);
 }
@@ -865,15 +1020,26 @@ buf_cons(void *vbuf, void *unused, int kmflag)
  */
 /* ARGSUSED */
 static void
-hdr_dest(void *vbuf, void *unused)
+hdr_full_dest(void *vbuf, void *unused)
+{
+       arc_buf_hdr_t *hdr = vbuf;
+
+       ASSERT(BUF_EMPTY(hdr));
+       cv_destroy(&hdr->b_l1hdr.b_cv);
+       refcount_destroy(&hdr->b_l1hdr.b_refcnt);
+       mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
+       ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+       arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
+}
+
+/* ARGSUSED */
+static void
+hdr_l2only_dest(void *vbuf, void *unused)
 {
-       arc_buf_hdr_t *buf = vbuf;
+       ASSERTV(arc_buf_hdr_t *hdr = vbuf);
 
-       ASSERT(BUF_EMPTY(buf));
-       refcount_destroy(&buf->b_refcnt);
-       cv_destroy(&buf->b_cv);
-       mutex_destroy(&buf->b_freeze_lock);
-       arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
+       ASSERT(BUF_EMPTY(hdr));
+       arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 }
 
 /* ARGSUSED */
@@ -886,6 +1052,22 @@ buf_dest(void *vbuf, void *unused)
        arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 }
 
+/*
+ * Reclaim callback -- invoked when memory is low.
+ */
+/* ARGSUSED */
+static void
+hdr_recl(void *unused)
+{
+       dprintf("hdr_recl called\n");
+       /*
+        * umem calls the reclaim func when we destroy the buf cache,
+        * which is after we do arc_fini().
+        */
+       if (!arc_dead)
+               cv_signal(&arc_reclaim_thread_cv);
+}
+
 static void
 buf_init(void)
 {
@@ -920,12 +1102,13 @@ retry:
                goto retry;
        }
 
-       hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
-           0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0);
+       hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
+           0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
+       hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
+           HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
+           NULL, NULL, 0);
        buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
            0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
-       l2arc_hdr_cache = kmem_cache_create("l2arc_buf_hdr_t", L2HDR_SIZE,
-           0, NULL, NULL, NULL, NULL, NULL, 0);
 
        for (i = 0; i < 256; i++)
                for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
@@ -937,6 +1120,109 @@ retry:
        }
 }
 
+/*
+ * Transition between the two allocation states for the arc_buf_hdr struct.
+ * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
+ * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
+ * version is used when a cache buffer is only in the L2ARC in order to reduce
+ * memory usage.
+ */
+static arc_buf_hdr_t *
+arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
+{
+       arc_buf_hdr_t *nhdr;
+       l2arc_dev_t *dev;
+
+       ASSERT(HDR_HAS_L2HDR(hdr));
+       ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
+           (old == hdr_l2only_cache && new == hdr_full_cache));
+
+       dev = hdr->b_l2hdr.b_dev;
+       nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
+
+       ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
+       buf_hash_remove(hdr);
+
+       bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
+
+       if (new == hdr_full_cache) {
+               nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
+               /*
+                * arc_access and arc_change_state need to be aware that a
+                * header has just come out of L2ARC, so we set its state to
+                * l2c_only even though it's about to change.
+                */
+               nhdr->b_l1hdr.b_state = arc_l2c_only;
+
+               /* Verify previous threads set to NULL before freeing */
+               ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+       } else {
+               ASSERT(hdr->b_l1hdr.b_buf == NULL);
+               ASSERT0(hdr->b_l1hdr.b_datacnt);
+
+               /*
+                * If we've reached here, we must have been called from
+                * arc_evict_hdr(), as such we should have already been
+                * removed from any ghost list we were previously on
+                * (which protects us from racing with arc_evict_state),
+                * thus no locking is needed during this check.
+                */
+               ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+
+               /*
+                * A buffer must not be moved into the arc_l2c_only
+                * state if it's not finished being written out to the
+                * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
+                * might try to be accessed, even though it was removed.
+                */
+               VERIFY(!HDR_L2_WRITING(hdr));
+               VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+
+               nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
+       }
+       /*
+        * The header has been reallocated so we need to re-insert it into any
+        * lists it was on.
+        */
+       (void) buf_hash_insert(nhdr, NULL);
+
+       ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
+
+       mutex_enter(&dev->l2ad_mtx);
+
+       /*
+        * We must place the realloc'ed header back into the list at
+        * the same spot. Otherwise, if it's placed earlier in the list,
+        * l2arc_write_buffers() could find it during the function's
+        * write phase, and try to write it out to the l2arc.
+        */
+       list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
+       list_remove(&dev->l2ad_buflist, hdr);
+
+       mutex_exit(&dev->l2ad_mtx);
+
+       /*
+        * Since we're using the pointer address as the tag when
+        * incrementing and decrementing the l2ad_alloc refcount, we
+        * must remove the old pointer (that we're about to destroy) and
+        * add the new pointer to the refcount. Otherwise we'd remove
+        * the wrong pointer address when calling arc_hdr_destroy() later.
+        */
+
+       (void) refcount_remove_many(&dev->l2ad_alloc,
+           hdr->b_l2hdr.b_asize, hdr);
+
+       (void) refcount_add_many(&dev->l2ad_alloc,
+           nhdr->b_l2hdr.b_asize, nhdr);
+
+       buf_discard_identity(hdr);
+       hdr->b_freeze_cksum = NULL;
+       kmem_cache_free(old, hdr);
+
+       return (nhdr);
+}
+
+
 #define        ARC_MINTIME     (hz>>4) /* 62 ms */
 
 static void
@@ -947,16 +1233,15 @@ arc_cksum_verify(arc_buf_t *buf)
        if (!(zfs_flags & ZFS_DEBUG_MODIFY))
                return;
 
-       mutex_enter(&buf->b_hdr->b_freeze_lock);
-       if (buf->b_hdr->b_freeze_cksum == NULL ||
-           (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
-               mutex_exit(&buf->b_hdr->b_freeze_lock);
+       mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+       if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
+               mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
                return;
        }
        fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
        if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
                panic("buffer modified while frozen!");
-       mutex_exit(&buf->b_hdr->b_freeze_lock);
+       mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
 }
 
 static int
@@ -965,10 +1250,10 @@ arc_cksum_equal(arc_buf_t *buf)
        zio_cksum_t zc;
        int equal;
 
-       mutex_enter(&buf->b_hdr->b_freeze_lock);
+       mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
        fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
        equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
-       mutex_exit(&buf->b_hdr->b_freeze_lock);
+       mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
 
        return (equal);
 }
@@ -979,16 +1264,15 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force)
        if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
                return;
 
-       mutex_enter(&buf->b_hdr->b_freeze_lock);
+       mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
        if (buf->b_hdr->b_freeze_cksum != NULL) {
-               mutex_exit(&buf->b_hdr->b_freeze_lock);
+               mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
                return;
        }
-       buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
-           KM_SLEEP);
+       buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
        fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
            buf->b_hdr->b_freeze_cksum);
-       mutex_exit(&buf->b_hdr->b_freeze_lock);
+       mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
        arc_buf_watch(buf);
 }
 
@@ -1022,24 +1306,50 @@ arc_buf_watch(arc_buf_t *buf)
 #endif
 }
 
+static arc_buf_contents_t
+arc_buf_type(arc_buf_hdr_t *hdr)
+{
+       if (HDR_ISTYPE_METADATA(hdr)) {
+               return (ARC_BUFC_METADATA);
+       } else {
+               return (ARC_BUFC_DATA);
+       }
+}
+
+static uint32_t
+arc_bufc_to_flags(arc_buf_contents_t type)
+{
+       switch (type) {
+       case ARC_BUFC_DATA:
+               /* metadata field is 0 if buffer contains normal data */
+               return (0);
+       case ARC_BUFC_METADATA:
+               return (ARC_FLAG_BUFC_METADATA);
+       default:
+               break;
+       }
+       panic("undefined ARC buffer type!");
+       return ((uint32_t)-1);
+}
+
 void
 arc_buf_thaw(arc_buf_t *buf)
 {
        if (zfs_flags & ZFS_DEBUG_MODIFY) {
-               if (buf->b_hdr->b_state != arc_anon)
+               if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
                        panic("modifying non-anon buffer!");
-               if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
+               if (HDR_IO_IN_PROGRESS(buf->b_hdr))
                        panic("modifying buffer while i/o in progress!");
                arc_cksum_verify(buf);
        }
 
-       mutex_enter(&buf->b_hdr->b_freeze_lock);
+       mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
        if (buf->b_hdr->b_freeze_cksum != NULL) {
                kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
                buf->b_hdr->b_freeze_cksum = NULL;
        }
 
-       mutex_exit(&buf->b_hdr->b_freeze_lock);
+       mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
 
        arc_buf_unwatch(buf);
 }
@@ -1056,62 +1366,72 @@ arc_buf_freeze(arc_buf_t *buf)
        mutex_enter(hash_lock);
 
        ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
-           buf->b_hdr->b_state == arc_anon);
+           buf->b_hdr->b_l1hdr.b_state == arc_anon);
        arc_cksum_compute(buf, B_FALSE);
        mutex_exit(hash_lock);
 
 }
 
 static void
-add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
 {
+       arc_state_t *state;
+
+       ASSERT(HDR_HAS_L1HDR(hdr));
        ASSERT(MUTEX_HELD(hash_lock));
 
-       if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
-           (ab->b_state != arc_anon)) {
-               uint64_t delta = ab->b_size * ab->b_datacnt;
-               list_t *list = &ab->b_state->arcs_list[ab->b_type];
-               uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
-
-               ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
-               mutex_enter(&ab->b_state->arcs_mtx);
-               ASSERT(list_link_active(&ab->b_arc_node));
-               list_remove(list, ab);
-               if (GHOST_STATE(ab->b_state)) {
-                       ASSERT0(ab->b_datacnt);
-                       ASSERT3P(ab->b_buf, ==, NULL);
-                       delta = ab->b_size;
+       state = hdr->b_l1hdr.b_state;
+
+       if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
+           (state != arc_anon)) {
+               /* We don't use the L2-only state list. */
+               if (state != arc_l2c_only) {
+                       arc_buf_contents_t type = arc_buf_type(hdr);
+                       uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
+                       multilist_t *list = &state->arcs_list[type];
+                       uint64_t *size = &state->arcs_lsize[type];
+
+                       multilist_remove(list, hdr);
+
+                       if (GHOST_STATE(state)) {
+                               ASSERT0(hdr->b_l1hdr.b_datacnt);
+                               ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+                               delta = hdr->b_size;
+                       }
+                       ASSERT(delta > 0);
+                       ASSERT3U(*size, >=, delta);
+                       atomic_add_64(size, -delta);
                }
-               ASSERT(delta > 0);
-               ASSERT3U(*size, >=, delta);
-               atomic_add_64(size, -delta);
-               mutex_exit(&ab->b_state->arcs_mtx);
                /* remove the prefetch flag if we get a reference */
-               if (ab->b_flags & ARC_PREFETCH)
-                       ab->b_flags &= ~ARC_PREFETCH;
+               hdr->b_flags &= ~ARC_FLAG_PREFETCH;
        }
 }
 
 static int
-remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
 {
        int cnt;
-       arc_state_t *state = ab->b_state;
+       arc_state_t *state = hdr->b_l1hdr.b_state;
 
+       ASSERT(HDR_HAS_L1HDR(hdr));
        ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
        ASSERT(!GHOST_STATE(state));
 
-       if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
+       /*
+        * arc_l2c_only counts as a ghost state so we don't need to explicitly
+        * check to prevent usage of the arc_l2c_only list.
+        */
+       if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
            (state != arc_anon)) {
-               uint64_t *size = &state->arcs_lsize[ab->b_type];
+               arc_buf_contents_t type = arc_buf_type(hdr);
+               multilist_t *list = &state->arcs_list[type];
+               uint64_t *size = &state->arcs_lsize[type];
+
+               multilist_insert(list, hdr);
 
-               ASSERT(!MUTEX_HELD(&state->arcs_mtx));
-               mutex_enter(&state->arcs_mtx);
-               ASSERT(!list_link_active(&ab->b_arc_node));
-               list_insert_head(&state->arcs_list[ab->b_type], ab);
-               ASSERT(ab->b_datacnt > 0);
-               atomic_add_64(size, ab->b_size * ab->b_datacnt);
-               mutex_exit(&state->arcs_mtx);
+               ASSERT(hdr->b_l1hdr.b_datacnt > 0);
+               atomic_add_64(size, hdr->b_size *
+                   hdr->b_l1hdr.b_datacnt);
        }
        return (cnt);
 }
@@ -1127,132 +1447,209 @@ void
 arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
 {
        arc_buf_hdr_t *hdr = ab->b_hdr;
-       arc_state_t *state = hdr->b_state;
+       l1arc_buf_hdr_t *l1hdr = NULL;
+       l2arc_buf_hdr_t *l2hdr = NULL;
+       arc_state_t *state = NULL;
+
+       if (HDR_HAS_L1HDR(hdr)) {
+               l1hdr = &hdr->b_l1hdr;
+               state = l1hdr->b_state;
+       }
+       if (HDR_HAS_L2HDR(hdr))
+               l2hdr = &hdr->b_l2hdr;
 
        memset(abi, 0, sizeof (arc_buf_info_t));
        abi->abi_flags = hdr->b_flags;
-       abi->abi_datacnt = hdr->b_datacnt;
+
+       if (l1hdr) {
+               abi->abi_datacnt = l1hdr->b_datacnt;
+               abi->abi_access = l1hdr->b_arc_access;
+               abi->abi_mru_hits = l1hdr->b_mru_hits;
+               abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
+               abi->abi_mfu_hits = l1hdr->b_mfu_hits;
+               abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
+               abi->abi_holds = refcount_count(&l1hdr->b_refcnt);
+       }
+
+       if (l2hdr) {
+               abi->abi_l2arc_dattr = l2hdr->b_daddr;
+               abi->abi_l2arc_asize = l2hdr->b_asize;
+               abi->abi_l2arc_compress = l2hdr->b_compress;
+               abi->abi_l2arc_hits = l2hdr->b_hits;
+       }
+
        abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
-       abi->abi_state_contents = hdr->b_type;
-       abi->abi_state_index = -1;
+       abi->abi_state_contents = arc_buf_type(hdr);
        abi->abi_size = hdr->b_size;
-       abi->abi_access = hdr->b_arc_access;
-       abi->abi_mru_hits = hdr->b_mru_hits;
-       abi->abi_mru_ghost_hits = hdr->b_mru_ghost_hits;
-       abi->abi_mfu_hits = hdr->b_mfu_hits;
-       abi->abi_mfu_ghost_hits = hdr->b_mfu_ghost_hits;
-       abi->abi_holds = refcount_count(&hdr->b_refcnt);
-
-       if (hdr->b_l2hdr) {
-               abi->abi_l2arc_dattr = hdr->b_l2hdr->b_daddr;
-               abi->abi_l2arc_asize = hdr->b_l2hdr->b_asize;
-               abi->abi_l2arc_compress = hdr->b_l2hdr->b_compress;
-               abi->abi_l2arc_hits = hdr->b_l2hdr->b_hits;
-       }
-
-       if (state && state_index && list_link_active(&hdr->b_arc_node)) {
-               list_t *list = &state->arcs_list[hdr->b_type];
-               arc_buf_hdr_t *h;
-
-               mutex_enter(&state->arcs_mtx);
-               for (h = list_head(list); h != NULL; h = list_next(list, h)) {
-                       abi->abi_state_index++;
-                       if (h == hdr)
-                               break;
-               }
-               mutex_exit(&state->arcs_mtx);
-       }
 }
 
 /*
- * Move the supplied buffer to the indicated state.  The mutex
+ * Move the supplied buffer to the indicated state. The hash lock
  * for the buffer must be held by the caller.
  */
 static void
-arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
+    kmutex_t *hash_lock)
 {
-       arc_state_t *old_state = ab->b_state;
-       int64_t refcnt = refcount_count(&ab->b_refcnt);
+       arc_state_t *old_state;
+       int64_t refcnt;
+       uint32_t datacnt;
        uint64_t from_delta, to_delta;
+       arc_buf_contents_t buftype = arc_buf_type(hdr);
+
+       /*
+        * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
+        * in arc_read() when bringing a buffer out of the L2ARC.  However, the
+        * L1 hdr doesn't always exist when we change state to arc_anon before
+        * destroying a header, in which case reallocating to add the L1 hdr is
+        * pointless.
+        */
+       if (HDR_HAS_L1HDR(hdr)) {
+               old_state = hdr->b_l1hdr.b_state;
+               refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
+               datacnt = hdr->b_l1hdr.b_datacnt;
+       } else {
+               old_state = arc_l2c_only;
+               refcnt = 0;
+               datacnt = 0;
+       }
 
        ASSERT(MUTEX_HELD(hash_lock));
        ASSERT3P(new_state, !=, old_state);
-       ASSERT(refcnt == 0 || ab->b_datacnt > 0);
-       ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
-       ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
+       ASSERT(refcnt == 0 || datacnt > 0);
+       ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
+       ASSERT(old_state != arc_anon || datacnt <= 1);
 
-       from_delta = to_delta = ab->b_datacnt * ab->b_size;
+       from_delta = to_delta = datacnt * hdr->b_size;
 
        /*
         * If this buffer is evictable, transfer it from the
         * old state list to the new state list.
         */
        if (refcnt == 0) {
-               if (old_state != arc_anon) {
-                       int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
-                       uint64_t *size = &old_state->arcs_lsize[ab->b_type];
+               if (old_state != arc_anon && old_state != arc_l2c_only) {
+                       uint64_t *size = &old_state->arcs_lsize[buftype];
 
-                       if (use_mutex)
-                               mutex_enter(&old_state->arcs_mtx);
-
-                       ASSERT(list_link_active(&ab->b_arc_node));
-                       list_remove(&old_state->arcs_list[ab->b_type], ab);
+                       ASSERT(HDR_HAS_L1HDR(hdr));
+                       multilist_remove(&old_state->arcs_list[buftype], hdr);
 
                        /*
                         * If prefetching out of the ghost cache,
                         * we will have a non-zero datacnt.
                         */
-                       if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
+                       if (GHOST_STATE(old_state) && datacnt == 0) {
                                /* ghost elements have a ghost size */
-                               ASSERT(ab->b_buf == NULL);
-                               from_delta = ab->b_size;
+                               ASSERT(hdr->b_l1hdr.b_buf == NULL);
+                               from_delta = hdr->b_size;
                        }
                        ASSERT3U(*size, >=, from_delta);
                        atomic_add_64(size, -from_delta);
-
-                       if (use_mutex)
-                               mutex_exit(&old_state->arcs_mtx);
                }
-               if (new_state != arc_anon) {
-                       int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
-                       uint64_t *size = &new_state->arcs_lsize[ab->b_type];
+               if (new_state != arc_anon && new_state != arc_l2c_only) {
+                       uint64_t *size = &new_state->arcs_lsize[buftype];
 
-                       if (use_mutex)
-                               mutex_enter(&new_state->arcs_mtx);
-
-                       list_insert_head(&new_state->arcs_list[ab->b_type], ab);
+                       /*
+                        * An L1 header always exists here, since if we're
+                        * moving to some L1-cached state (i.e. not l2c_only or
+                        * anonymous), we realloc the header to add an L1hdr
+                        * beforehand.
+                        */
+                       ASSERT(HDR_HAS_L1HDR(hdr));
+                       multilist_insert(&new_state->arcs_list[buftype], hdr);
 
                        /* ghost elements have a ghost size */
                        if (GHOST_STATE(new_state)) {
-                               ASSERT(ab->b_datacnt == 0);
-                               ASSERT(ab->b_buf == NULL);
-                               to_delta = ab->b_size;
+                               ASSERT0(datacnt);
+                               ASSERT(hdr->b_l1hdr.b_buf == NULL);
+                               to_delta = hdr->b_size;
                        }
                        atomic_add_64(size, to_delta);
+               }
+       }
+
+       ASSERT(!BUF_EMPTY(hdr));
+       if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
+               buf_hash_remove(hdr);
+
+       /* adjust state sizes (ignore arc_l2c_only) */
+
+       if (to_delta && new_state != arc_l2c_only) {
+               ASSERT(HDR_HAS_L1HDR(hdr));
+               if (GHOST_STATE(new_state)) {
+                       ASSERT0(datacnt);
+
+                       /*
+                        * When moving a header to a ghost state, we first
+                        * remove all arc buffers. Thus, we'll have a
+                        * datacnt of zero, and no arc buffer to use for
+                        * the reference. As a result, we use the arc
+                        * header pointer for the reference.
+                        */
+                       (void) refcount_add_many(&new_state->arcs_size,
+                           hdr->b_size, hdr);
+               } else {
+                       arc_buf_t *buf;
+                       ASSERT3U(datacnt, !=, 0);
 
-                       if (use_mutex)
-                               mutex_exit(&new_state->arcs_mtx);
+                       /*
+                        * Each individual buffer holds a unique reference,
+                        * thus we must add each of these references one
+                        * at a time.
+                        */
+                       for (buf = hdr->b_l1hdr.b_buf; buf != NULL;
+                           buf = buf->b_next) {
+                               (void) refcount_add_many(&new_state->arcs_size,
+                                   hdr->b_size, buf);
+                       }
                }
        }
 
-       ASSERT(!BUF_EMPTY(ab));
-       if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
-               buf_hash_remove(ab);
+       if (from_delta && old_state != arc_l2c_only) {
+               ASSERT(HDR_HAS_L1HDR(hdr));
+               if (GHOST_STATE(old_state)) {
+                       /*
+                        * When moving a header off of a ghost state,
+                        * there's the possibility for datacnt to be
+                        * non-zero. This is because we first add the
+                        * arc buffer to the header prior to changing
+                        * the header's state. Since we used the header
+                        * for the reference when putting the header on
+                        * the ghost state, we must balance that and use
+                        * the header when removing off the ghost state
+                        * (even though datacnt is non zero).
+                        */
+
+                       IMPLY(datacnt == 0, new_state == arc_anon ||
+                           new_state == arc_l2c_only);
 
-       /* adjust state sizes */
-       if (to_delta)
-               atomic_add_64(&new_state->arcs_size, to_delta);
-       if (from_delta) {
-               ASSERT3U(old_state->arcs_size, >=, from_delta);
-               atomic_add_64(&old_state->arcs_size, -from_delta);
+                       (void) refcount_remove_many(&old_state->arcs_size,
+                           hdr->b_size, hdr);
+               } else {
+                       arc_buf_t *buf;
+                       ASSERT3U(datacnt, !=, 0);
+
+                       /*
+                        * Each individual buffer holds a unique reference,
+                        * thus we must remove each of these references one
+                        * at a time.
+                        */
+                       for (buf = hdr->b_l1hdr.b_buf; buf != NULL;
+                           buf = buf->b_next) {
+                               (void) refcount_remove_many(
+                                   &old_state->arcs_size, hdr->b_size, buf);
+                       }
+               }
        }
-       ab->b_state = new_state;
 
-       /* adjust l2arc hdr stats */
-       if (new_state == arc_l2c_only)
-               l2arc_hdr_stat_add();
-       else if (old_state == arc_l2c_only)
-               l2arc_hdr_stat_remove();
+       if (HDR_HAS_L1HDR(hdr))
+               hdr->b_l1hdr.b_state = new_state;
+
+       /*
+        * L2 headers should never be on the L2 state list since they don't
+        * have L1 headers allocated.
+        */
+       ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
+           multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
 }
 
 void
@@ -1267,7 +1664,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
                ARCSTAT_INCR(arcstat_data_size, space);
                break;
        case ARC_SPACE_META:
-               ARCSTAT_INCR(arcstat_meta_size, space);
+               ARCSTAT_INCR(arcstat_metadata_size, space);
                break;
        case ARC_SPACE_OTHER:
                ARCSTAT_INCR(arcstat_other_size, space);
@@ -1280,11 +1677,8 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
                break;
        }
 
-       if (type != ARC_SPACE_DATA) {
+       if (type != ARC_SPACE_DATA)
                ARCSTAT_INCR(arcstat_meta_used, space);
-               if (arc_meta_max < arc_meta_used)
-                       arc_meta_max = arc_meta_used;
-       }
 
        atomic_add_64(&arc_size, space);
 }
@@ -1301,7 +1695,7 @@ arc_space_return(uint64_t space, arc_space_type_t type)
                ARCSTAT_INCR(arcstat_data_size, -space);
                break;
        case ARC_SPACE_META:
-               ARCSTAT_INCR(arcstat_meta_size, -space);
+               ARCSTAT_INCR(arcstat_metadata_size, -space);
                break;
        case ARC_SPACE_OTHER:
                ARCSTAT_INCR(arcstat_other_size, -space);
@@ -1316,6 +1710,8 @@ arc_space_return(uint64_t space, arc_space_type_t type)
 
        if (type != ARC_SPACE_DATA) {
                ASSERT(arc_meta_used >= space);
+               if (arc_meta_max < arc_meta_used)
+                       arc_meta_max = arc_meta_used;
                ARCSTAT_INCR(arcstat_meta_used, -space);
        }
 
@@ -1329,31 +1725,37 @@ arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type)
        arc_buf_hdr_t *hdr;
        arc_buf_t *buf;
 
-       VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
-       hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+       VERIFY3U(size, <=, spa_maxblocksize(spa));
+       hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
        ASSERT(BUF_EMPTY(hdr));
+       ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
        hdr->b_size = size;
-       hdr->b_type = type;
        hdr->b_spa = spa_load_guid(spa);
-       hdr->b_state = arc_anon;
-       hdr->b_arc_access = 0;
-       hdr->b_mru_hits = 0;
-       hdr->b_mru_ghost_hits = 0;
-       hdr->b_mfu_hits = 0;
-       hdr->b_mfu_ghost_hits = 0;
-       hdr->b_l2_hits = 0;
+       hdr->b_l1hdr.b_mru_hits = 0;
+       hdr->b_l1hdr.b_mru_ghost_hits = 0;
+       hdr->b_l1hdr.b_mfu_hits = 0;
+       hdr->b_l1hdr.b_mfu_ghost_hits = 0;
+       hdr->b_l1hdr.b_l2_hits = 0;
+
        buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
        buf->b_hdr = hdr;
        buf->b_data = NULL;
        buf->b_efunc = NULL;
        buf->b_private = NULL;
        buf->b_next = NULL;
-       hdr->b_buf = buf;
+
+       hdr->b_flags = arc_bufc_to_flags(type);
+       hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
+
+       hdr->b_l1hdr.b_buf = buf;
+       hdr->b_l1hdr.b_state = arc_anon;
+       hdr->b_l1hdr.b_arc_access = 0;
+       hdr->b_l1hdr.b_datacnt = 1;
+       hdr->b_l1hdr.b_tmp_cdata = NULL;
+
        arc_get_data_buf(buf);
-       hdr->b_datacnt = 1;
-       hdr->b_flags = 0;
-       ASSERT(refcount_is_zero(&hdr->b_refcnt));
-       (void) refcount_add(&hdr->b_refcnt, tag);
+       ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+       (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
 
        return (buf);
 }
@@ -1386,8 +1788,9 @@ arc_return_buf(arc_buf_t *buf, void *tag)
        arc_buf_hdr_t *hdr = buf->b_hdr;
 
        ASSERT(buf->b_data != NULL);
-       (void) refcount_add(&hdr->b_refcnt, tag);
-       (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
+       ASSERT(HDR_HAS_L1HDR(hdr));
+       (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
+       (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
 
        atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
 }
@@ -1396,12 +1799,12 @@ arc_return_buf(arc_buf_t *buf, void *tag)
 void
 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
 {
-       arc_buf_hdr_t *hdr;
+       arc_buf_hdr_t *hdr = buf->b_hdr;
 
        ASSERT(buf->b_data != NULL);
-       hdr = buf->b_hdr;
-       (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
-       (void) refcount_remove(&hdr->b_refcnt, tag);
+       ASSERT(HDR_HAS_L1HDR(hdr));
+       (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
+       (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
        buf->b_efunc = NULL;
        buf->b_private = NULL;
 
@@ -1415,15 +1818,16 @@ arc_buf_clone(arc_buf_t *from)
        arc_buf_hdr_t *hdr = from->b_hdr;
        uint64_t size = hdr->b_size;
 
-       ASSERT(hdr->b_state != arc_anon);
+       ASSERT(HDR_HAS_L1HDR(hdr));
+       ASSERT(hdr->b_l1hdr.b_state != arc_anon);
 
        buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
        buf->b_hdr = hdr;
        buf->b_data = NULL;
        buf->b_efunc = NULL;
        buf->b_private = NULL;
-       buf->b_next = hdr->b_buf;
-       hdr->b_buf = buf;
+       buf->b_next = hdr->b_l1hdr.b_buf;
+       hdr->b_l1hdr.b_buf = buf;
        arc_get_data_buf(buf);
        bcopy(from->b_data, buf->b_data, size);
 
@@ -1433,11 +1837,11 @@ arc_buf_clone(arc_buf_t *from)
         * then track the size and number of duplicates.  These stats will be
         * updated as duplicate buffers are created and destroyed.
         */
-       if (hdr->b_type == ARC_BUFC_DATA) {
+       if (HDR_ISTYPE_DATA(hdr)) {
                ARCSTAT_BUMP(arcstat_duplicate_buffers);
                ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
        }
-       hdr->b_datacnt += 1;
+       hdr->b_l1hdr.b_datacnt += 1;
        return (buf);
 }
 
@@ -1460,17 +1864,20 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
        hash_lock = HDR_LOCK(buf->b_hdr);
        mutex_enter(hash_lock);
        hdr = buf->b_hdr;
+       ASSERT(HDR_HAS_L1HDR(hdr));
        ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
        mutex_exit(&buf->b_evict_lock);
 
-       ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+       ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+           hdr->b_l1hdr.b_state == arc_mfu);
+
        add_reference(hdr, hash_lock, tag);
        DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
        arc_access(hdr, hash_lock);
        mutex_exit(hash_lock);
        ARCSTAT_BUMP(arcstat_hits);
-       ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
-           demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+       ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+           demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
            data, metadata, hits);
 }
 
@@ -1480,7 +1887,7 @@ arc_buf_free_on_write(void *data, size_t size,
 {
        l2arc_data_free_t *df;
 
-       df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
+       df = kmem_alloc(sizeof (*df), KM_SLEEP);
        df->l2df_data = data;
        df->l2df_size = size;
        df->l2df_func = free_func;
@@ -1506,75 +1913,114 @@ arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
        }
 }
 
-/*
- * Free up buf->b_data and if 'remove' is set, then pull the
- * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
- */
 static void
 arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
 {
-       l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
+       ASSERT(HDR_HAS_L2HDR(hdr));
+       ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
+
+       /*
+        * The b_tmp_cdata field is linked off of the b_l1hdr, so if
+        * that doesn't exist, the header is in the arc_l2c_only state,
+        * and there isn't anything to free (it's already been freed).
+        */
+       if (!HDR_HAS_L1HDR(hdr))
+               return;
+
+       /*
+        * The header isn't being written to the l2arc device, thus it
+        * shouldn't have a b_tmp_cdata to free.
+        */
+       if (!HDR_L2_WRITING(hdr)) {
+               ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+               return;
+       }
 
-       ASSERT(MUTEX_HELD(&l2arc_buflist_mtx));
+       /*
+        * The header does not have compression enabled. This can be due
+        * to the buffer not being compressible, or because we're
+        * freeing the buffer before the second phase of
+        * l2arc_write_buffers() has started (which does the compression
+        * step). In either case, b_tmp_cdata does not point to a
+        * separately compressed buffer, so there's nothing to free (it
+        * points to the same buffer as the arc_buf_t's b_data field).
+        */
+       if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_OFF) {
+               hdr->b_l1hdr.b_tmp_cdata = NULL;
+               return;
+       }
 
-       if (l2hdr->b_tmp_cdata == NULL)
+       /*
+        * There's nothing to free since the buffer was all zeros and
+        * compressed to a zero length buffer.
+        */
+       if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) {
+               ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
                return;
+       }
+
+       ASSERT(L2ARC_IS_VALID_COMPRESS(hdr->b_l2hdr.b_compress));
+
+       arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
+           hdr->b_size, zio_data_buf_free);
 
-       ASSERT(HDR_L2_WRITING(hdr));
-       arc_buf_free_on_write(l2hdr->b_tmp_cdata, hdr->b_size,
-           zio_data_buf_free);
        ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
-       l2hdr->b_tmp_cdata = NULL;
+       hdr->b_l1hdr.b_tmp_cdata = NULL;
 }
 
+/*
+ * Free up buf->b_data and if 'remove' is set, then pull the
+ * arc_buf_t off of the arc_buf_hdr_t's list and free it.
+ */
 static void
-arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
+arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
 {
        arc_buf_t **bufp;
 
        /* free up data associated with the buf */
-       if (buf->b_data) {
-               arc_state_t *state = buf->b_hdr->b_state;
+       if (buf->b_data != NULL) {
+               arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
                uint64_t size = buf->b_hdr->b_size;
-               arc_buf_contents_t type = buf->b_hdr->b_type;
+               arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
 
                arc_cksum_verify(buf);
                arc_buf_unwatch(buf);
 
-               if (!recycle) {
-                       if (type == ARC_BUFC_METADATA) {
-                               arc_buf_data_free(buf, zio_buf_free);
-                               arc_space_return(size, ARC_SPACE_META);
-                       } else {
-                               ASSERT(type == ARC_BUFC_DATA);
-                               arc_buf_data_free(buf, zio_data_buf_free);
-                               arc_space_return(size, ARC_SPACE_DATA);
-                       }
+               if (type == ARC_BUFC_METADATA) {
+                       arc_buf_data_free(buf, zio_buf_free);
+                       arc_space_return(size, ARC_SPACE_META);
+               } else {
+                       ASSERT(type == ARC_BUFC_DATA);
+                       arc_buf_data_free(buf, zio_data_buf_free);
+                       arc_space_return(size, ARC_SPACE_DATA);
                }
-               if (list_link_active(&buf->b_hdr->b_arc_node)) {
+
+               /* protected by hash lock, if in the hash table */
+               if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
                        uint64_t *cnt = &state->arcs_lsize[type];
 
-                       ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
-                       ASSERT(state != arc_anon);
+                       ASSERT(refcount_is_zero(
+                           &buf->b_hdr->b_l1hdr.b_refcnt));
+                       ASSERT(state != arc_anon && state != arc_l2c_only);
 
                        ASSERT3U(*cnt, >=, size);
                        atomic_add_64(cnt, -size);
                }
-               ASSERT3U(state->arcs_size, >=, size);
-               atomic_add_64(&state->arcs_size, -size);
+
+               (void) refcount_remove_many(&state->arcs_size, size, buf);
                buf->b_data = NULL;
 
                /*
                 * If we're destroying a duplicate buffer make sure
                 * that the appropriate statistics are updated.
                 */
-               if (buf->b_hdr->b_datacnt > 1 &&
-                   buf->b_hdr->b_type == ARC_BUFC_DATA) {
+               if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
+                   HDR_ISTYPE_DATA(buf->b_hdr)) {
                        ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
                        ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
                }
-               ASSERT(buf->b_hdr->b_datacnt > 0);
-               buf->b_hdr->b_datacnt -= 1;
+               ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
+               buf->b_hdr->b_l1hdr.b_datacnt -= 1;
        }
 
        /* only remove the buf if requested */
@@ -1582,7 +2028,8 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
                return;
 
        /* remove the buf from the hdr list */
-       for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
+       for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
+           bufp = &(*bufp)->b_next)
                continue;
        *bufp = buf->b_next;
        buf->b_next = NULL;
@@ -1595,87 +2042,136 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
 }
 
 static void
-arc_hdr_destroy(arc_buf_hdr_t *hdr)
+arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
 {
-       l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
+       l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
+       l2arc_dev_t *dev = l2hdr->b_dev;
+
+       ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
+       ASSERT(HDR_HAS_L2HDR(hdr));
+
+       list_remove(&dev->l2ad_buflist, hdr);
+
+       /*
+        * We don't want to leak the b_tmp_cdata buffer that was
+        * allocated in l2arc_write_buffers()
+        */
+       arc_buf_l2_cdata_free(hdr);
+
+       /*
+        * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then
+        * this header is being processed by l2arc_write_buffers() (i.e.
+        * it's in the first stage of l2arc_write_buffers()).
+        * Re-affirming that truth here, just to serve as a reminder. If
+        * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or
+        * may not have its HDR_L2_WRITING flag set. (the write may have
+        * completed, in which case HDR_L2_WRITING will be false and the
+        * b_daddr field will point to the address of the buffer on disk).
+        */
+       IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr));
+
+       /*
+        * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with
+        * l2arc_write_buffers(). Since we've just removed this header
+        * from the l2arc buffer list, this header will never reach the
+        * second stage of l2arc_write_buffers(), which increments the
+        * accounting stats for this header. Thus, we must be careful
+        * not to decrement them for this header either.
+        */
+       if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) {
+               ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
+               ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+
+               vdev_space_update(dev->l2ad_vdev,
+                   -l2hdr->b_asize, 0, 0);
 
-       ASSERT(refcount_is_zero(&hdr->b_refcnt));
-       ASSERT3P(hdr->b_state, ==, arc_anon);
+               (void) refcount_remove_many(&dev->l2ad_alloc,
+                   l2hdr->b_asize, hdr);
+       }
+
+       hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
+}
+
+static void
+arc_hdr_destroy(arc_buf_hdr_t *hdr)
+{
+       if (HDR_HAS_L1HDR(hdr)) {
+               ASSERT(hdr->b_l1hdr.b_buf == NULL ||
+                   hdr->b_l1hdr.b_datacnt > 0);
+               ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+               ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+       }
        ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+       ASSERT(!HDR_IN_HASH_TABLE(hdr));
+
+       if (HDR_HAS_L2HDR(hdr)) {
+               l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+               boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
+
+               if (!buflist_held)
+                       mutex_enter(&dev->l2ad_mtx);
 
-       if (l2hdr != NULL) {
-               boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
                /*
-                * To prevent arc_free() and l2arc_evict() from
-                * attempting to free the same buffer at the same time,
-                * a FREE_IN_PROGRESS flag is given to arc_free() to
-                * give it priority.  l2arc_evict() can't destroy this
-                * header while we are waiting on l2arc_buflist_mtx.
-                *
-                * The hdr may be removed from l2ad_buflist before we
-                * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
+                * Even though we checked this conditional above, we
+                * need to check this again now that we have the
+                * l2ad_mtx. This is because we could be racing with
+                * another thread calling l2arc_evict() which might have
+                * destroyed this header's L2 portion as we were waiting
+                * to acquire the l2ad_mtx. If that happens, we don't
+                * want to re-destroy the header's L2 portion.
                 */
-               if (!buflist_held) {
-                       mutex_enter(&l2arc_buflist_mtx);
-                       l2hdr = hdr->b_l2hdr;
-               }
-
-               if (l2hdr != NULL) {
-                       list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
-                       arc_buf_l2_cdata_free(hdr);
-                       ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
-                       ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
-                       vdev_space_update(l2hdr->b_dev->l2ad_vdev,
-                           -l2hdr->b_asize, 0, 0);
-                       kmem_cache_free(l2arc_hdr_cache, l2hdr);
-                       arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
-                       if (hdr->b_state == arc_l2c_only)
-                               l2arc_hdr_stat_remove();
-                       hdr->b_l2hdr = NULL;
-               }
+               if (HDR_HAS_L2HDR(hdr))
+                       arc_hdr_l2hdr_destroy(hdr);
 
                if (!buflist_held)
-                       mutex_exit(&l2arc_buflist_mtx);
+                       mutex_exit(&dev->l2ad_mtx);
        }
 
-       if (!BUF_EMPTY(hdr)) {
-               ASSERT(!HDR_IN_HASH_TABLE(hdr));
+       if (!BUF_EMPTY(hdr))
                buf_discard_identity(hdr);
-       }
-       while (hdr->b_buf) {
-               arc_buf_t *buf = hdr->b_buf;
 
-               if (buf->b_efunc) {
-                       mutex_enter(&arc_eviction_mtx);
-                       mutex_enter(&buf->b_evict_lock);
-                       ASSERT(buf->b_hdr != NULL);
-                       arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
-                       hdr->b_buf = buf->b_next;
-                       buf->b_hdr = &arc_eviction_hdr;
-                       buf->b_next = arc_eviction_list;
-                       arc_eviction_list = buf;
-                       mutex_exit(&buf->b_evict_lock);
-                       mutex_exit(&arc_eviction_mtx);
-               } else {
-                       arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
-               }
-       }
        if (hdr->b_freeze_cksum != NULL) {
                kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
                hdr->b_freeze_cksum = NULL;
        }
 
-       ASSERT(!list_link_active(&hdr->b_arc_node));
+       if (HDR_HAS_L1HDR(hdr)) {
+               while (hdr->b_l1hdr.b_buf) {
+                       arc_buf_t *buf = hdr->b_l1hdr.b_buf;
+
+                       if (buf->b_efunc != NULL) {
+                               mutex_enter(&arc_user_evicts_lock);
+                               mutex_enter(&buf->b_evict_lock);
+                               ASSERT(buf->b_hdr != NULL);
+                               arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE);
+                               hdr->b_l1hdr.b_buf = buf->b_next;
+                               buf->b_hdr = &arc_eviction_hdr;
+                               buf->b_next = arc_eviction_list;
+                               arc_eviction_list = buf;
+                               mutex_exit(&buf->b_evict_lock);
+                               cv_signal(&arc_user_evicts_cv);
+                               mutex_exit(&arc_user_evicts_lock);
+                       } else {
+                               arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE);
+                       }
+               }
+       }
+
        ASSERT3P(hdr->b_hash_next, ==, NULL);
-       ASSERT3P(hdr->b_acb, ==, NULL);
-       kmem_cache_free(hdr_cache, hdr);
+       if (HDR_HAS_L1HDR(hdr)) {
+               ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+               ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+               kmem_cache_free(hdr_full_cache, hdr);
+       } else {
+               kmem_cache_free(hdr_l2only_cache, hdr);
+       }
 }
 
 void
 arc_buf_free(arc_buf_t *buf, void *tag)
 {
        arc_buf_hdr_t *hdr = buf->b_hdr;
-       int hashed = hdr->b_state != arc_anon;
+       int hashed = hdr->b_l1hdr.b_state != arc_anon;
 
        ASSERT(buf->b_efunc == NULL);
        ASSERT(buf->b_data != NULL);
@@ -1688,12 +2184,12 @@ arc_buf_free(arc_buf_t *buf, void *tag)
                ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
                (void) remove_reference(hdr, hash_lock, tag);
-               if (hdr->b_datacnt > 1) {
-                       arc_buf_destroy(buf, FALSE, TRUE);
+               if (hdr->b_l1hdr.b_datacnt > 1) {
+                       arc_buf_destroy(buf, TRUE);
                } else {
-                       ASSERT(buf == hdr->b_buf);
+                       ASSERT(buf == hdr->b_l1hdr.b_buf);
                        ASSERT(buf->b_efunc == NULL);
-                       hdr->b_flags |= ARC_BUF_AVAILABLE;
+                       hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
                }
                mutex_exit(hash_lock);
        } else if (HDR_IO_IN_PROGRESS(hdr)) {
@@ -1703,16 +2199,16 @@ arc_buf_free(arc_buf_t *buf, void *tag)
                 * this buffer unless the write completes before we finish
                 * decrementing the reference count.
                 */
-               mutex_enter(&arc_eviction_mtx);
+               mutex_enter(&arc_user_evicts_lock);
                (void) remove_reference(hdr, NULL, tag);
-               ASSERT(refcount_is_zero(&hdr->b_refcnt));
+               ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
                destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
-               mutex_exit(&arc_eviction_mtx);
+               mutex_exit(&arc_user_evicts_lock);
                if (destroy_hdr)
                        arc_hdr_destroy(hdr);
        } else {
                if (remove_reference(hdr, NULL, tag) > 0)
-                       arc_buf_destroy(buf, FALSE, TRUE);
+                       arc_buf_destroy(buf, TRUE);
                else
                        arc_hdr_destroy(hdr);
        }
@@ -1722,33 +2218,33 @@ boolean_t
 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
 {
        arc_buf_hdr_t *hdr = buf->b_hdr;
-       kmutex_t *hash_lock = NULL;
+       kmutex_t *hash_lock = HDR_LOCK(hdr);
        boolean_t no_callback = (buf->b_efunc == NULL);
 
-       if (hdr->b_state == arc_anon) {
-               ASSERT(hdr->b_datacnt == 1);
+       if (hdr->b_l1hdr.b_state == arc_anon) {
+               ASSERT(hdr->b_l1hdr.b_datacnt == 1);
                arc_buf_free(buf, tag);
                return (no_callback);
        }
 
-       hash_lock = HDR_LOCK(hdr);
        mutex_enter(hash_lock);
        hdr = buf->b_hdr;
+       ASSERT(hdr->b_l1hdr.b_datacnt > 0);
        ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
-       ASSERT(hdr->b_state != arc_anon);
+       ASSERT(hdr->b_l1hdr.b_state != arc_anon);
        ASSERT(buf->b_data != NULL);
 
        (void) remove_reference(hdr, hash_lock, tag);
-       if (hdr->b_datacnt > 1) {
+       if (hdr->b_l1hdr.b_datacnt > 1) {
                if (no_callback)
-                       arc_buf_destroy(buf, FALSE, TRUE);
+                       arc_buf_destroy(buf, TRUE);
        } else if (no_callback) {
-               ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+               ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
                ASSERT(buf->b_efunc == NULL);
-               hdr->b_flags |= ARC_BUF_AVAILABLE;
+               hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
        }
-       ASSERT(no_callback || hdr->b_datacnt > 1 ||
-           refcount_is_zero(&hdr->b_refcnt));
+       ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
+           refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
        mutex_exit(hash_lock);
        return (no_callback);
 }
@@ -1794,7 +2290,7 @@ arc_buf_eviction_needed(arc_buf_t *buf)
                return (B_TRUE);
        }
 
-       if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
+       if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
                evict_needed = B_TRUE;
 
        mutex_exit(&buf->b_evict_lock);
@@ -1802,401 +2298,464 @@ arc_buf_eviction_needed(arc_buf_t *buf)
 }
 
 /*
- * Evict buffers from list until we've removed the specified number of
- * bytes.  Move the removed buffers to the appropriate evict state.
- * If the recycle flag is set, then attempt to "recycle" a buffer:
- * - look for a buffer to evict that is `bytes' long.
- * - return the data block from this buffer rather than freeing it.
- * This flag is used by callers that are trying to make space for a
- * new buffer in a full arc cache.
+ * Evict the arc_buf_hdr that is provided as a parameter. The resultant
+ * state of the header is dependent on its state prior to entering this
+ * function. The following transitions are possible:
  *
- * This function makes a "best effort".  It skips over any buffers
- * it can't get a hash_lock on, and so may not catch all candidates.
- * It may also return without evicting as much space as requested.
+ *    - arc_mru -> arc_mru_ghost
+ *    - arc_mfu -> arc_mfu_ghost
+ *    - arc_mru_ghost -> arc_l2c_only
+ *    - arc_mru_ghost -> deleted
+ *    - arc_mfu_ghost -> arc_l2c_only
+ *    - arc_mfu_ghost -> deleted
  */
-static void *
-arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
-    arc_buf_contents_t type)
+static int64_t
+arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 {
-       arc_state_t *evicted_state;
-       uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
-       arc_buf_hdr_t *ab, *ab_prev = NULL;
-       list_t *list = &state->arcs_list[type];
-       kmutex_t *hash_lock;
-       boolean_t have_lock;
-       void *stolen = NULL;
-       arc_buf_hdr_t marker = {{{ 0 }}};
-       int count = 0;
-
-       ASSERT(state == arc_mru || state == arc_mfu);
-
-       evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+       arc_state_t *evicted_state, *state;
+       int64_t bytes_evicted = 0;
 
-top:
-       mutex_enter(&state->arcs_mtx);
-       mutex_enter(&evicted_state->arcs_mtx);
-
-       for (ab = list_tail(list); ab; ab = ab_prev) {
-               ab_prev = list_prev(list, ab);
-               /* prefetch buffers have a minimum lifespan */
-               if (HDR_IO_IN_PROGRESS(ab) ||
-                   (spa && ab->b_spa != spa) ||
-                   (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
-                   ddi_get_lbolt() - ab->b_arc_access <
-                   zfs_arc_min_prefetch_lifespan)) {
-                       skipped++;
-                       continue;
-               }
-               /* "lookahead" for better eviction candidate */
-               if (recycle && ab->b_size != bytes &&
-                   ab_prev && ab_prev->b_size == bytes)
-                       continue;
+       ASSERT(MUTEX_HELD(hash_lock));
+       ASSERT(HDR_HAS_L1HDR(hdr));
 
-               /* ignore markers */
-               if (ab->b_spa == 0)
-                       continue;
+       state = hdr->b_l1hdr.b_state;
+       if (GHOST_STATE(state)) {
+               ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+               ASSERT(hdr->b_l1hdr.b_buf == NULL);
 
                /*
-                * It may take a long time to evict all the bufs requested.
-                * To avoid blocking all arc activity, periodically drop
-                * the arcs_mtx and give other threads a chance to run
-                * before reacquiring the lock.
-                *
-                * If we are looking for a buffer to recycle, we are in
-                * the hot code path, so don't sleep.
+                * l2arc_write_buffers() relies on a header's L1 portion
+                * (i.e. its b_tmp_cdata field) during its write phase.
+                * Thus, we cannot push a header onto the arc_l2c_only
+                * state (removing its L1 piece) until the header is
+                * done being written to the l2arc.
                 */
-               if (!recycle && count++ > arc_evict_iterations) {
-                       list_insert_after(list, ab, &marker);
-                       mutex_exit(&evicted_state->arcs_mtx);
-                       mutex_exit(&state->arcs_mtx);
-                       kpreempt(KPREEMPT_SYNC);
-                       mutex_enter(&state->arcs_mtx);
-                       mutex_enter(&evicted_state->arcs_mtx);
-                       ab_prev = list_prev(list, &marker);
-                       list_remove(list, &marker);
-                       count = 0;
-                       continue;
+               if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
+                       ARCSTAT_BUMP(arcstat_evict_l2_skip);
+                       return (bytes_evicted);
                }
 
-               hash_lock = HDR_LOCK(ab);
-               have_lock = MUTEX_HELD(hash_lock);
-               if (have_lock || mutex_tryenter(hash_lock)) {
-                       ASSERT0(refcount_count(&ab->b_refcnt));
-                       ASSERT(ab->b_datacnt > 0);
-                       while (ab->b_buf) {
-                               arc_buf_t *buf = ab->b_buf;
-                               if (!mutex_tryenter(&buf->b_evict_lock)) {
-                                       missed += 1;
-                                       break;
-                               }
-                               if (buf->b_data) {
-                                       bytes_evicted += ab->b_size;
-                                       if (recycle && ab->b_type == type &&
-                                           ab->b_size == bytes &&
-                                           !HDR_L2_WRITING(ab)) {
-                                               stolen = buf->b_data;
-                                               recycle = FALSE;
-                                       }
-                               }
-                               if (buf->b_efunc) {
-                                       mutex_enter(&arc_eviction_mtx);
-                                       arc_buf_destroy(buf,
-                                           buf->b_data == stolen, FALSE);
-                                       ab->b_buf = buf->b_next;
-                                       buf->b_hdr = &arc_eviction_hdr;
-                                       buf->b_next = arc_eviction_list;
-                                       arc_eviction_list = buf;
-                                       mutex_exit(&arc_eviction_mtx);
-                                       mutex_exit(&buf->b_evict_lock);
-                               } else {
-                                       mutex_exit(&buf->b_evict_lock);
-                                       arc_buf_destroy(buf,
-                                           buf->b_data == stolen, TRUE);
-                               }
-                       }
+               ARCSTAT_BUMP(arcstat_deleted);
+               bytes_evicted += hdr->b_size;
 
-                       if (ab->b_l2hdr) {
-                               ARCSTAT_INCR(arcstat_evict_l2_cached,
-                                   ab->b_size);
-                       } else {
-                               if (l2arc_write_eligible(ab->b_spa, ab)) {
-                                       ARCSTAT_INCR(arcstat_evict_l2_eligible,
-                                           ab->b_size);
-                               } else {
-                                       ARCSTAT_INCR(
-                                           arcstat_evict_l2_ineligible,
-                                           ab->b_size);
-                               }
-                       }
+               DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
 
-                       if (ab->b_datacnt == 0) {
-                               arc_change_state(evicted_state, ab, hash_lock);
-                               ASSERT(HDR_IN_HASH_TABLE(ab));
-                               ab->b_flags |= ARC_IN_HASH_TABLE;
-                               ab->b_flags &= ~ARC_BUF_AVAILABLE;
-                               DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
-                       }
-                       if (!have_lock)
-                               mutex_exit(hash_lock);
-                       if (bytes >= 0 && bytes_evicted >= bytes)
-                               break;
+               if (HDR_HAS_L2HDR(hdr)) {
+                       /*
+                        * This buffer is cached on the 2nd Level ARC;
+                        * don't destroy the header.
+                        */
+                       arc_change_state(arc_l2c_only, hdr, hash_lock);
+                       /*
+                        * dropping from L1+L2 cached to L2-only,
+                        * realloc to remove the L1 header.
+                        */
+                       hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+                           hdr_l2only_cache);
                } else {
-                       missed += 1;
+                       arc_change_state(arc_anon, hdr, hash_lock);
+                       arc_hdr_destroy(hdr);
                }
+               return (bytes_evicted);
        }
 
-       mutex_exit(&evicted_state->arcs_mtx);
-       mutex_exit(&state->arcs_mtx);
+       ASSERT(state == arc_mru || state == arc_mfu);
+       evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
 
-       if (list == &state->arcs_list[ARC_BUFC_DATA] &&
-           (bytes < 0 || bytes_evicted < bytes)) {
-               /* Prevent second pass from recycling metadata into data */
-               recycle = FALSE;
-               type = ARC_BUFC_METADATA;
-               list = &state->arcs_list[type];
-               goto top;
+       /* prefetch buffers have a minimum lifespan */
+       if (HDR_IO_IN_PROGRESS(hdr) ||
+           ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
+           ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
+           arc_min_prefetch_lifespan)) {
+               ARCSTAT_BUMP(arcstat_evict_skip);
+               return (bytes_evicted);
        }
 
-       if (bytes_evicted < bytes)
-               dprintf("only evicted %lld bytes from %x\n",
-                   (longlong_t)bytes_evicted, state->arcs_state);
-
-       if (skipped)
-               ARCSTAT_INCR(arcstat_evict_skip, skipped);
+       ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
+       ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
+       while (hdr->b_l1hdr.b_buf) {
+               arc_buf_t *buf = hdr->b_l1hdr.b_buf;
+               if (!mutex_tryenter(&buf->b_evict_lock)) {
+                       ARCSTAT_BUMP(arcstat_mutex_miss);
+                       break;
+               }
+               if (buf->b_data != NULL)
+                       bytes_evicted += hdr->b_size;
+               if (buf->b_efunc != NULL) {
+                       mutex_enter(&arc_user_evicts_lock);
+                       arc_buf_destroy(buf, FALSE);
+                       hdr->b_l1hdr.b_buf = buf->b_next;
+                       buf->b_hdr = &arc_eviction_hdr;
+                       buf->b_next = arc_eviction_list;
+                       arc_eviction_list = buf;
+                       cv_signal(&arc_user_evicts_cv);
+                       mutex_exit(&arc_user_evicts_lock);
+                       mutex_exit(&buf->b_evict_lock);
+               } else {
+                       mutex_exit(&buf->b_evict_lock);
+                       arc_buf_destroy(buf, TRUE);
+               }
+       }
 
-       if (missed)
-               ARCSTAT_INCR(arcstat_mutex_miss, missed);
+       if (HDR_HAS_L2HDR(hdr)) {
+               ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size);
+       } else {
+               if (l2arc_write_eligible(hdr->b_spa, hdr))
+                       ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size);
+               else
+                       ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size);
+       }
 
-       /*
-        * Note: we have just evicted some data into the ghost state,
-        * potentially putting the ghost size over the desired size.  Rather
-        * that evicting from the ghost list in this hot code path, leave
-        * this chore to the arc_reclaim_thread().
-        */
+       if (hdr->b_l1hdr.b_datacnt == 0) {
+               arc_change_state(evicted_state, hdr, hash_lock);
+               ASSERT(HDR_IN_HASH_TABLE(hdr));
+               hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
+               hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
+               DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
+       }
 
-       return (stolen);
+       return (bytes_evicted);
 }
 
-/*
- * Remove buffers from list until we've removed the specified number of
- * bytes.  Destroy the buffers that are removed.
- */
-static void
-arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
-    arc_buf_contents_t type)
+static uint64_t
+arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
+    uint64_t spa, int64_t bytes)
 {
-       arc_buf_hdr_t *ab, *ab_prev;
-       arc_buf_hdr_t marker;
-       list_t *list = &state->arcs_list[type];
+       multilist_sublist_t *mls;
+       uint64_t bytes_evicted = 0;
+       arc_buf_hdr_t *hdr;
        kmutex_t *hash_lock;
-       uint64_t bytes_deleted = 0;
-       uint64_t bufs_skipped = 0;
-       int count = 0;
+       int evict_count = 0;
 
-       ASSERT(GHOST_STATE(state));
-       bzero(&marker, sizeof (marker));
-top:
-       mutex_enter(&state->arcs_mtx);
-       for (ab = list_tail(list); ab; ab = ab_prev) {
-               ab_prev = list_prev(list, ab);
-               if (ab->b_type > ARC_BUFC_NUMTYPES)
-                       panic("invalid ab=%p", (void *)ab);
-               if (spa && ab->b_spa != spa)
-                       continue;
+       ASSERT3P(marker, !=, NULL);
+       IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
 
-               /* ignore markers */
-               if (ab->b_spa == 0)
-                       continue;
+       mls = multilist_sublist_lock(ml, idx);
 
-               hash_lock = HDR_LOCK(ab);
-               /* caller may be trying to modify this buffer, skip it */
-               if (MUTEX_HELD(hash_lock))
-                       continue;
+       for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
+           hdr = multilist_sublist_prev(mls, marker)) {
+               if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
+                   (evict_count >= zfs_arc_evict_batch_limit))
+                       break;
 
                /*
-                * It may take a long time to evict all the bufs requested.
-                * To avoid blocking all arc activity, periodically drop
-                * the arcs_mtx and give other threads a chance to run
-                * before reacquiring the lock.
+                * To keep our iteration location, move the marker
+                * forward. Since we're not holding hdr's hash lock, we
+                * must be very careful and not remove 'hdr' from the
+                * sublist. Otherwise, other consumers might mistake the
+                * 'hdr' as not being on a sublist when they call the
+                * multilist_link_active() function (they all rely on
+                * the hash lock protecting concurrent insertions and
+                * removals). multilist_sublist_move_forward() was
+                * specifically implemented to ensure this is the case
+                * (only 'marker' will be removed and re-inserted).
                 */
-               if (count++ > arc_evict_iterations) {
-                       list_insert_after(list, ab, &marker);
-                       mutex_exit(&state->arcs_mtx);
-                       kpreempt(KPREEMPT_SYNC);
-                       mutex_enter(&state->arcs_mtx);
-                       ab_prev = list_prev(list, &marker);
-                       list_remove(list, &marker);
-                       count = 0;
+               multilist_sublist_move_forward(mls, marker);
+
+               /*
+                * The only case where the b_spa field should ever be
+                * zero, is the marker headers inserted by
+                * arc_evict_state(). It's possible for multiple threads
+                * to be calling arc_evict_state() concurrently (e.g.
+                * dsl_pool_close() and zio_inject_fault()), so we must
+                * skip any markers we see from these other threads.
+                */
+               if (hdr->b_spa == 0)
+                       continue;
+
+               /* we're only interested in evicting buffers of a certain spa */
+               if (spa != 0 && hdr->b_spa != spa) {
+                       ARCSTAT_BUMP(arcstat_evict_skip);
                        continue;
                }
+
+               hash_lock = HDR_LOCK(hdr);
+
+               /*
+                * We aren't calling this function from any code path
+                * that would already be holding a hash lock, so we're
+                * asserting on this assumption to be defensive in case
+                * this ever changes. Without this check, it would be
+                * possible to incorrectly increment arcstat_mutex_miss
+                * below (e.g. if the code changed such that we called
+                * this function with a hash lock held).
+                */
+               ASSERT(!MUTEX_HELD(hash_lock));
+
                if (mutex_tryenter(hash_lock)) {
-                       ASSERT(!HDR_IO_IN_PROGRESS(ab));
-                       ASSERT(ab->b_buf == NULL);
-                       ARCSTAT_BUMP(arcstat_deleted);
-                       bytes_deleted += ab->b_size;
+                       uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
+                       mutex_exit(hash_lock);
 
-                       if (ab->b_l2hdr != NULL) {
-                               /*
-                                * This buffer is cached on the 2nd Level ARC;
-                                * don't destroy the header.
-                                */
-                               arc_change_state(arc_l2c_only, ab, hash_lock);
-                               mutex_exit(hash_lock);
-                       } else {
-                               arc_change_state(arc_anon, ab, hash_lock);
-                               mutex_exit(hash_lock);
-                               arc_hdr_destroy(ab);
-                       }
+                       bytes_evicted += evicted;
 
-                       DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
-                       if (bytes >= 0 && bytes_deleted >= bytes)
-                               break;
-               } else if (bytes < 0) {
                        /*
-                        * Insert a list marker and then wait for the
-                        * hash lock to become available. Once its
-                        * available, restart from where we left off.
+                        * If evicted is zero, arc_evict_hdr() must have
+                        * decided to skip this header, don't increment
+                        * evict_count in this case.
                         */
-                       list_insert_after(list, ab, &marker);
-                       mutex_exit(&state->arcs_mtx);
-                       mutex_enter(hash_lock);
-                       mutex_exit(hash_lock);
-                       mutex_enter(&state->arcs_mtx);
-                       ab_prev = list_prev(list, &marker);
-                       list_remove(list, &marker);
+                       if (evicted != 0)
+                               evict_count++;
+
+                       /*
+                        * If arc_size isn't overflowing, signal any
+                        * threads that might happen to be waiting.
+                        *
+                        * For each header evicted, we wake up a single
+                        * thread. If we used cv_broadcast, we could
+                        * wake up "too many" threads causing arc_size
+                        * to significantly overflow arc_c; since
+                        * arc_get_data_buf() doesn't check for overflow
+                        * when it's woken up (it doesn't because it's
+                        * possible for the ARC to be overflowing while
+                        * full of un-evictable buffers, and the
+                        * function should proceed in this case).
+                        *
+                        * If threads are left sleeping, due to not
+                        * using cv_broadcast, they will be woken up
+                        * just before arc_reclaim_thread() sleeps.
+                        */
+                       mutex_enter(&arc_reclaim_lock);
+                       if (!arc_is_overflowing())
+                               cv_signal(&arc_reclaim_waiters_cv);
+                       mutex_exit(&arc_reclaim_lock);
                } else {
-                       bufs_skipped += 1;
+                       ARCSTAT_BUMP(arcstat_mutex_miss);
                }
        }
-       mutex_exit(&state->arcs_mtx);
-
-       if (list == &state->arcs_list[ARC_BUFC_DATA] &&
-           (bytes < 0 || bytes_deleted < bytes)) {
-               list = &state->arcs_list[ARC_BUFC_METADATA];
-               goto top;
-       }
 
-       if (bufs_skipped) {
-               ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
-               ASSERT(bytes >= 0);
-       }
+       multilist_sublist_unlock(mls);
 
-       if (bytes_deleted < bytes)
-               dprintf("only deleted %lld bytes from %p\n",
-                   (longlong_t)bytes_deleted, state);
+       return (bytes_evicted);
 }
 
-static void
-arc_adjust(void)
+/*
+ * Evict buffers from the given arc state, until we've removed the
+ * specified number of bytes. Move the removed buffers to the
+ * appropriate evict state.
+ *
+ * This function makes a "best effort". It skips over any buffers
+ * it can't get a hash_lock on, and so, may not catch all candidates.
+ * It may also return without evicting as much space as requested.
+ *
+ * If bytes is specified using the special value ARC_EVICT_ALL, this
+ * will evict all available (i.e. unlocked and evictable) buffers from
+ * the given arc state; which is used by arc_flush().
+ */
+static uint64_t
+arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
+    arc_buf_contents_t type)
 {
-       int64_t adjustment, delta;
+       uint64_t total_evicted = 0;
+       multilist_t *ml = &state->arcs_list[type];
+       int num_sublists;
+       arc_buf_hdr_t **markers;
+       int i;
+
+       IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
+
+       num_sublists = multilist_get_num_sublists(ml);
 
        /*
-        * Adjust MRU size
+        * If we've tried to evict from each sublist, made some
+        * progress, but still have not hit the target number of bytes
+        * to evict, we want to keep trying. The markers allow us to
+        * pick up where we left off for each individual sublist, rather
+        * than starting from the tail each time.
         */
+       markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
+       for (i = 0; i < num_sublists; i++) {
+               multilist_sublist_t *mls;
 
-       adjustment = MIN((int64_t)(arc_size - arc_c),
-           (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p));
+               markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
+
+               /*
+                * A b_spa of 0 is used to indicate that this header is
+                * a marker. This fact is used in arc_adjust_type() and
+                * arc_evict_state_impl().
+                */
+               markers[i]->b_spa = 0;
 
-       if (adjustment > 0 && arc_mru->arcs_size > 0) {
-               delta = MIN(arc_mru->arcs_size, adjustment);
-               (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
+               mls = multilist_sublist_lock(ml, i);
+               multilist_sublist_insert_tail(mls, markers[i]);
+               multilist_sublist_unlock(mls);
        }
 
        /*
-        * Adjust MFU size
+        * While we haven't hit our target number of bytes to evict, or
+        * we're evicting all available buffers.
         */
+       while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
+               /*
+                * Start eviction using a randomly selected sublist,
+                * this is to try and evenly balance eviction across all
+                * sublists. Always starting at the same sublist
+                * (e.g. index 0) would cause evictions to favor certain
+                * sublists over others.
+                */
+               int sublist_idx = multilist_get_random_index(ml);
+               uint64_t scan_evicted = 0;
 
-       adjustment = arc_size - arc_c;
+               for (i = 0; i < num_sublists; i++) {
+                       uint64_t bytes_remaining;
+                       uint64_t bytes_evicted;
 
-       if (adjustment > 0 && arc_mfu->arcs_size > 0) {
-               delta = MIN(arc_mfu->arcs_size, adjustment);
-               (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
-       }
+                       if (bytes == ARC_EVICT_ALL)
+                               bytes_remaining = ARC_EVICT_ALL;
+                       else if (total_evicted < bytes)
+                               bytes_remaining = bytes - total_evicted;
+                       else
+                               break;
 
-       /*
-        * Adjust ghost lists
-        */
+                       bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
+                           markers[sublist_idx], spa, bytes_remaining);
+
+                       scan_evicted += bytes_evicted;
+                       total_evicted += bytes_evicted;
 
-       adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
+                       /* we've reached the end, wrap to the beginning */
+                       if (++sublist_idx >= num_sublists)
+                               sublist_idx = 0;
+               }
+
+               /*
+                * If we didn't evict anything during this scan, we have
+                * no reason to believe we'll evict more during another
+                * scan, so break the loop.
+                */
+               if (scan_evicted == 0) {
+                       /* This isn't possible, let's make that obvious */
+                       ASSERT3S(bytes, !=, 0);
+
+                       /*
+                        * When bytes is ARC_EVICT_ALL, the only way to
+                        * break the loop is when scan_evicted is zero.
+                        * In that case, we actually have evicted enough,
+                        * so we don't want to increment the kstat.
+                        */
+                       if (bytes != ARC_EVICT_ALL) {
+                               ASSERT3S(total_evicted, <, bytes);
+                               ARCSTAT_BUMP(arcstat_evict_not_enough);
+                       }
 
-       if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
-               delta = MIN(arc_mru_ghost->arcs_size, adjustment);
-               arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_DATA);
+                       break;
+               }
        }
 
-       adjustment =
-           arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
+       for (i = 0; i < num_sublists; i++) {
+               multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+               multilist_sublist_remove(mls, markers[i]);
+               multilist_sublist_unlock(mls);
 
-       if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
-               delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
-               arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_DATA);
+               kmem_cache_free(hdr_full_cache, markers[i]);
        }
+       kmem_free(markers, sizeof (*markers) * num_sublists);
+
+       return (total_evicted);
 }
 
 /*
- * Request that arc user drop references so that N bytes can be released
- * from the cache.  This provides a mechanism to ensure the arc can honor
- * the arc_meta_limit and reclaim buffers which are pinned in the cache
- * by higher layers.  (i.e. the zpl)
+ * Flush all "evictable" data of the given type from the arc state
+ * specified. This will not evict any "active" buffers (i.e. referenced).
+ *
+ * When 'retry' is set to FALSE, the function will make a single pass
+ * over the state and evict any buffers that it can. Since it doesn't
+ * continually retry the eviction, it might end up leaving some buffers
+ * in the ARC due to lock misses.
+ *
+ * When 'retry' is set to TRUE, the function will continually retry the
+ * eviction until *all* evictable buffers have been removed from the
+ * state. As a result, if concurrent insertions into the state are
+ * allowed (e.g. if the ARC isn't shutting down), this function might
+ * wind up in an infinite loop, continually trying to evict buffers.
  */
-static void
-arc_do_user_prune(int64_t adjustment)
+static uint64_t
+arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
+    boolean_t retry)
 {
-       arc_prune_func_t *func;
-       void *private;
-       arc_prune_t *cp, *np;
+       uint64_t evicted = 0;
 
-       mutex_enter(&arc_prune_mtx);
+       while (state->arcs_lsize[type] != 0) {
+               evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
 
-       cp = list_head(&arc_prune_list);
-       while (cp != NULL) {
-               func = cp->p_pfunc;
-               private = cp->p_private;
-               np = list_next(&arc_prune_list, cp);
-               refcount_add(&cp->p_refcnt, func);
-               mutex_exit(&arc_prune_mtx);
+               if (!retry)
+                       break;
+       }
 
-               if (func != NULL)
-                       func(adjustment, private);
+       return (evicted);
+}
 
-               mutex_enter(&arc_prune_mtx);
+/*
+ * Helper function for arc_prune_async(); it is responsible for safely
+ * handling the execution of a registered arc_prune_func_t.
+ */
+static void
+arc_prune_task(void *ptr)
+{
+       arc_prune_t *ap = (arc_prune_t *)ptr;
+       arc_prune_func_t *func = ap->p_pfunc;
 
-               /* User removed prune callback concurrently with execution */
-               if (refcount_remove(&cp->p_refcnt, func) == 0) {
-                       ASSERT(!list_link_active(&cp->p_node));
-                       refcount_destroy(&cp->p_refcnt);
-                       kmem_free(cp, sizeof (*cp));
-               }
+       if (func != NULL)
+               func(ap->p_adjust, ap->p_private);
 
-               cp = np;
+       /* Callback unregistered concurrently with execution */
+       if (refcount_remove(&ap->p_refcnt, func) == 0) {
+               ASSERT(!list_link_active(&ap->p_node));
+               refcount_destroy(&ap->p_refcnt);
+               kmem_free(ap, sizeof (*ap));
        }
-
-       ARCSTAT_BUMP(arcstat_prune);
-       mutex_exit(&arc_prune_mtx);
 }
 
+/*
+ * Notify registered consumers they must drop holds on a portion of the ARC
+ * buffers they reference.  This provides a mechanism to ensure the ARC can
+ * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers.  This
+ * is analogous to dnlc_reduce_cache() but more generic.
+ *
+ * This operation is performed asynchronously so it may be safely called
+ * in the context of the arc_reclaim_thread().  A reference is taken here
+ * for each registered arc_prune_t and the arc_prune_task() is responsible
+ * for releasing it once the registered arc_prune_func_t has completed.
+ */
 static void
-arc_do_user_evicts(void)
+arc_prune_async(int64_t adjust)
 {
-       mutex_enter(&arc_eviction_mtx);
-       while (arc_eviction_list != NULL) {
-               arc_buf_t *buf = arc_eviction_list;
-               arc_eviction_list = buf->b_next;
-               mutex_enter(&buf->b_evict_lock);
-               buf->b_hdr = NULL;
-               mutex_exit(&buf->b_evict_lock);
-               mutex_exit(&arc_eviction_mtx);
+       arc_prune_t *ap;
 
-               if (buf->b_efunc != NULL)
-                       VERIFY0(buf->b_efunc(buf->b_private));
+       mutex_enter(&arc_prune_mtx);
+       for (ap = list_head(&arc_prune_list); ap != NULL;
+           ap = list_next(&arc_prune_list, ap)) {
 
-               buf->b_efunc = NULL;
-               buf->b_private = NULL;
-               kmem_cache_free(buf_cache, buf);
-               mutex_enter(&arc_eviction_mtx);
+               if (refcount_count(&ap->p_refcnt) >= 2)
+                       continue;
+
+               refcount_add(&ap->p_refcnt, ap->p_pfunc);
+               ap->p_adjust = adjust;
+               taskq_dispatch(arc_prune_taskq, arc_prune_task, ap, TQ_SLEEP);
+               ARCSTAT_BUMP(arcstat_prune);
        }
-       mutex_exit(&arc_eviction_mtx);
+       mutex_exit(&arc_prune_mtx);
+}
+
+/*
+ * Evict the specified number of bytes from the state specified,
+ * restricting eviction to the spa and type given. This function
+ * prevents us from trying to evict more from a state's list than
+ * is "evictable", and skips evicting altogether when passed a
+ * negative value for "bytes". In contrast, arc_evict_state() will
+ * evict everything it can, when passed a negative value for "bytes".
+ */
+static uint64_t
+arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
+    arc_buf_contents_t type)
+{
+       int64_t delta;
+
+       if (bytes > 0 && state->arcs_lsize[type] > 0) {
+               delta = MIN(state->arcs_lsize[type], bytes);
+               return (arc_evict_state(state, spa, delta, type));
+       }
+
+       return (0);
 }
 
 /*
@@ -2216,12 +2775,13 @@ arc_do_user_evicts(void)
  * be dropped from the VFS cache.  This will make dnode meta data buffers
  * available for reclaim.
  */
-static void
-arc_adjust_meta(void)
+static uint64_t
+arc_adjust_meta_balanced(void)
 {
        int64_t adjustmnt, delta, prune = 0;
+       uint64_t total_evicted = 0;
        arc_buf_contents_t type = ARC_BUFC_DATA;
-       unsigned long restarts = zfs_arc_meta_adjust_restarts;
+       int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
 
 restart:
        /*
@@ -2236,7 +2796,7 @@ restart:
 
        if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) {
                delta = MIN(arc_mru->arcs_lsize[type], adjustmnt);
-               arc_evict(arc_mru, 0, delta, FALSE, type);
+               total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
                adjustmnt -= delta;
        }
 
@@ -2252,7 +2812,7 @@ restart:
 
        if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) {
                delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt);
-               arc_evict(arc_mfu, 0, delta, FALSE, type);
+               total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
        }
 
        adjustmnt = arc_meta_used - arc_meta_limit;
@@ -2260,14 +2820,14 @@ restart:
        if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
                delta = MIN(adjustmnt,
                    arc_mru_ghost->arcs_lsize[type]);
-               arc_evict_ghost(arc_mru_ghost, 0, delta, type);
+               total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type);
                adjustmnt -= delta;
        }
 
        if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) {
                delta = MIN(adjustmnt,
                    arc_mfu_ghost->arcs_lsize[type]);
-               arc_evict_ghost(arc_mfu_ghost, 0, delta, type);
+               total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type);
        }
 
        /*
@@ -2285,7 +2845,7 @@ restart:
 
                        if (zfs_arc_meta_prune) {
                                prune += zfs_arc_meta_prune;
-                               arc_do_user_prune(prune);
+                               arc_prune_async(prune);
                        }
                }
 
@@ -2294,70 +2854,339 @@ restart:
                        goto restart;
                }
        }
+       return (total_evicted);
 }
 
 /*
- * Flush all *evictable* data from the cache for the given spa.
- * NOTE: this will not touch "active" (i.e. referenced) data.
+ * Evict metadata buffers from the cache, such that arc_meta_used is
+ * capped by the arc_meta_limit tunable.
  */
-void
-arc_flush(spa_t *spa)
+static uint64_t
+arc_adjust_meta_only(void)
 {
-       uint64_t guid = 0;
+       uint64_t total_evicted = 0;
+       int64_t target;
 
-       if (spa)
-               guid = spa_load_guid(spa);
+       /*
+        * If we're over the meta limit, we want to evict enough
+        * metadata to get back under the meta limit. We don't want to
+        * evict so much that we drop the MRU below arc_p, though. If
+        * we're over the meta limit more than we're over arc_p, we
+        * evict some from the MRU here, and some from the MFU below.
+        */
+       target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+           (int64_t)(refcount_count(&arc_anon->arcs_size) +
+           refcount_count(&arc_mru->arcs_size) - arc_p));
+
+       total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+
+       /*
+        * Similar to the above, we want to evict enough bytes to get us
+        * below the meta limit, but not so much as to drop us below the
+        * space allotted to the MFU (which is defined as arc_c - arc_p).
+        */
+       target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+           (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
+
+       total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+
+       return (total_evicted);
+}
+
+static uint64_t
+arc_adjust_meta(void)
+{
+       if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
+               return (arc_adjust_meta_only());
+       else
+               return (arc_adjust_meta_balanced());
+}
+
+/*
+ * Return the type of the oldest buffer in the given arc state
+ *
+ * This function will select a random sublist of type ARC_BUFC_DATA and
+ * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
+ * is compared, and the type which contains the "older" buffer will be
+ * returned.
+ */
+static arc_buf_contents_t
+arc_adjust_type(arc_state_t *state)
+{
+       multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
+       multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
+       int data_idx = multilist_get_random_index(data_ml);
+       int meta_idx = multilist_get_random_index(meta_ml);
+       multilist_sublist_t *data_mls;
+       multilist_sublist_t *meta_mls;
+       arc_buf_contents_t type;
+       arc_buf_hdr_t *data_hdr;
+       arc_buf_hdr_t *meta_hdr;
+
+       /*
+        * We keep the sublist lock until we're finished, to prevent
+        * the headers from being destroyed via arc_evict_state().
+        */
+       data_mls = multilist_sublist_lock(data_ml, data_idx);
+       meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
+
+       /*
+        * These two loops are to ensure we skip any markers that
+        * might be at the tail of the lists due to arc_evict_state().
+        */
 
-       while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
-               (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
-               if (spa)
+       for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
+           data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
+               if (data_hdr->b_spa != 0)
                        break;
        }
-       while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
-               (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
-               if (spa)
+
+       for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
+           meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
+               if (meta_hdr->b_spa != 0)
                        break;
        }
-       while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
-               (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
-               if (spa)
-                       break;
+
+       if (data_hdr == NULL && meta_hdr == NULL) {
+               type = ARC_BUFC_DATA;
+       } else if (data_hdr == NULL) {
+               ASSERT3P(meta_hdr, !=, NULL);
+               type = ARC_BUFC_METADATA;
+       } else if (meta_hdr == NULL) {
+               ASSERT3P(data_hdr, !=, NULL);
+               type = ARC_BUFC_DATA;
+       } else {
+               ASSERT3P(data_hdr, !=, NULL);
+               ASSERT3P(meta_hdr, !=, NULL);
+
+               /* The headers can't be on the sublist without an L1 header */
+               ASSERT(HDR_HAS_L1HDR(data_hdr));
+               ASSERT(HDR_HAS_L1HDR(meta_hdr));
+
+               if (data_hdr->b_l1hdr.b_arc_access <
+                   meta_hdr->b_l1hdr.b_arc_access) {
+                       type = ARC_BUFC_DATA;
+               } else {
+                       type = ARC_BUFC_METADATA;
+               }
        }
-       while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
-               (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
-               if (spa)
-                       break;
+
+       multilist_sublist_unlock(meta_mls);
+       multilist_sublist_unlock(data_mls);
+
+       return (type);
+}
+
+/*
+ * Evict buffers from the cache, such that arc_size is capped by arc_c.
+ */
+static uint64_t
+arc_adjust(void)
+{
+       uint64_t total_evicted = 0;
+       uint64_t bytes;
+       int64_t target;
+
+       /*
+        * If we're over arc_meta_limit, we want to correct that before
+        * potentially evicting data buffers below.
+        */
+       total_evicted += arc_adjust_meta();
+
+       /*
+        * Adjust MRU size
+        *
+        * If we're over the target cache size, we want to evict enough
+        * from the list to get back to our target size. We don't want
+        * to evict too much from the MRU, such that it drops below
+        * arc_p. So, if we're over our target cache size more than
+        * the MRU is over arc_p, we'll evict enough to get back to
+        * arc_p here, and then evict more from the MFU below.
+        */
+       target = MIN((int64_t)(arc_size - arc_c),
+           (int64_t)(refcount_count(&arc_anon->arcs_size) +
+           refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
+
+       /*
+        * If we're below arc_meta_min, always prefer to evict data.
+        * Otherwise, try to satisfy the requested number of bytes to
+        * evict from the type which contains older buffers; in an
+        * effort to keep newer buffers in the cache regardless of their
+        * type. If we cannot satisfy the number of bytes from this
+        * type, spill over into the next type.
+        */
+       if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
+           arc_meta_used > arc_meta_min) {
+               bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+               total_evicted += bytes;
+
+               /*
+                * If we couldn't evict our target number of bytes from
+                * metadata, we try to get the rest from data.
+                */
+               target -= bytes;
+
+               total_evicted +=
+                   arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+       } else {
+               bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+               total_evicted += bytes;
+
+               /*
+                * If we couldn't evict our target number of bytes from
+                * data, we try to get the rest from metadata.
+                */
+               target -= bytes;
+
+               total_evicted +=
+                   arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
        }
 
-       arc_evict_ghost(arc_mru_ghost, guid, -1, ARC_BUFC_DATA);
-       arc_evict_ghost(arc_mfu_ghost, guid, -1, ARC_BUFC_DATA);
+       /*
+        * Adjust MFU size
+        *
+        * Now that we've tried to evict enough from the MRU to get its
+        * size back to arc_p, if we're still above the target cache
+        * size, we evict the rest from the MFU.
+        */
+       target = arc_size - arc_c;
+
+       if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
+           arc_meta_used > arc_meta_min) {
+               bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+               total_evicted += bytes;
+
+               /*
+                * If we couldn't evict our target number of bytes from
+                * metadata, we try to get the rest from data.
+                */
+               target -= bytes;
+
+               total_evicted +=
+                   arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+       } else {
+               bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+               total_evicted += bytes;
+
+               /*
+                * If we couldn't evict our target number of bytes from
+                * data, we try to get the rest from metadata.
+                */
+               target -= bytes;
+
+               total_evicted +=
+                   arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+       }
+
+       /*
+        * Adjust ghost lists
+        *
+        * In addition to the above, the ARC also defines target values
+        * for the ghost lists. The sum of the mru list and mru ghost
+        * list should never exceed the target size of the cache, and
+        * the sum of the mru list, mfu list, mru ghost list, and mfu
+        * ghost list should never exceed twice the target size of the
+        * cache. The following logic enforces these limits on the ghost
+        * caches, and evicts from them as needed.
+        */
+       target = refcount_count(&arc_mru->arcs_size) +
+           refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
+
+       bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
+       total_evicted += bytes;
+
+       target -= bytes;
+
+       total_evicted +=
+           arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
+
+       /*
+        * We assume the sum of the mru list and mfu list is less than
+        * or equal to arc_c (we enforced this above), which means we
+        * can use the simpler of the two equations below:
+        *
+        *      mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
+        *                  mru ghost + mfu ghost <= arc_c
+        */
+       target = refcount_count(&arc_mru_ghost->arcs_size) +
+           refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
+
+       bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
+       total_evicted += bytes;
+
+       target -= bytes;
+
+       total_evicted +=
+           arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
+
+       return (total_evicted);
+}
+
+static void
+arc_do_user_evicts(void)
+{
+       mutex_enter(&arc_user_evicts_lock);
+       while (arc_eviction_list != NULL) {
+               arc_buf_t *buf = arc_eviction_list;
+               arc_eviction_list = buf->b_next;
+               mutex_enter(&buf->b_evict_lock);
+               buf->b_hdr = NULL;
+               mutex_exit(&buf->b_evict_lock);
+               mutex_exit(&arc_user_evicts_lock);
+
+               if (buf->b_efunc != NULL)
+                       VERIFY0(buf->b_efunc(buf->b_private));
+
+               buf->b_efunc = NULL;
+               buf->b_private = NULL;
+               kmem_cache_free(buf_cache, buf);
+               mutex_enter(&arc_user_evicts_lock);
+       }
+       mutex_exit(&arc_user_evicts_lock);
+}
+
+void
+arc_flush(spa_t *spa, boolean_t retry)
+{
+       uint64_t guid = 0;
+
+       /*
+        * If retry is TRUE, a spa must not be specified since we have
+        * no good way to determine if all of a spa's buffers have been
+        * evicted from an arc state.
+        */
+       ASSERT(!retry || spa == 0);
+
+       if (spa != NULL)
+               guid = spa_load_guid(spa);
+
+       (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
+       (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
+
+       (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
+       (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
+
+       (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
+       (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
+
+       (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
+       (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
 
-       mutex_enter(&arc_reclaim_thr_lock);
        arc_do_user_evicts();
-       mutex_exit(&arc_reclaim_thr_lock);
        ASSERT(spa || arc_eviction_list == NULL);
 }
 
 void
-arc_shrink(uint64_t bytes)
+arc_shrink(int64_t to_free)
 {
        if (arc_c > arc_c_min) {
-               uint64_t to_free;
-
-               to_free = bytes ? bytes : arc_c >> zfs_arc_shrink_shift;
 
                if (arc_c > arc_c_min + to_free)
                        atomic_add_64(&arc_c, -to_free);
                else
                        arc_c = arc_c_min;
 
-               to_free = bytes ? bytes : arc_p >> zfs_arc_shrink_shift;
-
-               if (arc_p > to_free)
-                       atomic_add_64(&arc_p, -to_free);
-               else
-                       arc_p = 0;
-
+               atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
                if (arc_c > arc_size)
                        arc_c = MAX(arc_size, arc_c_min);
                if (arc_p > arc_c)
@@ -2367,24 +3196,181 @@ arc_shrink(uint64_t bytes)
        }
 
        if (arc_size > arc_c)
-               arc_adjust();
+               (void) arc_adjust();
+}
+
+typedef enum free_memory_reason_t {
+       FMR_UNKNOWN,
+       FMR_NEEDFREE,
+       FMR_LOTSFREE,
+       FMR_SWAPFS_MINFREE,
+       FMR_PAGES_PP_MAXIMUM,
+       FMR_HEAP_ARENA,
+       FMR_ZIO_ARENA,
+} free_memory_reason_t;
+
+int64_t last_free_memory;
+free_memory_reason_t last_free_reason;
+
+#ifdef _KERNEL
+/*
+ * Additional reserve of pages for pp_reserve.
+ */
+int64_t arc_pages_pp_reserve = 64;
+
+/*
+ * Additional reserve of pages for swapfs.
+ */
+int64_t arc_swapfs_reserve = 64;
+#endif /* _KERNEL */
+
+/*
+ * Return the amount of memory that can be consumed before reclaim will be
+ * needed.  Positive if there is sufficient free memory, negative indicates
+ * the amount of memory that needs to be freed up.
+ */
+static int64_t
+arc_available_memory(void)
+{
+       int64_t lowest = INT64_MAX;
+       free_memory_reason_t r = FMR_UNKNOWN;
+#ifdef _KERNEL
+       int64_t n;
+#ifdef __linux__
+       pgcnt_t needfree = btop(arc_need_free);
+       pgcnt_t lotsfree = btop(arc_sys_free);
+       pgcnt_t desfree = 0;
+#endif
+
+       if (needfree > 0) {
+               n = PAGESIZE * (-needfree);
+               if (n < lowest) {
+                       lowest = n;
+                       r = FMR_NEEDFREE;
+               }
+       }
+
+       /*
+        * check that we're out of range of the pageout scanner.  It starts to
+        * schedule paging if freemem is less than lotsfree and needfree.
+        * lotsfree is the high-water mark for pageout, and needfree is the
+        * number of needed free pages.  We add extra pages here to make sure
+        * the scanner doesn't start up while we're freeing memory.
+        */
+       n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
+       if (n < lowest) {
+               lowest = n;
+               r = FMR_LOTSFREE;
+       }
+
+#ifndef __linux__
+       /*
+        * check to make sure that swapfs has enough space so that anon
+        * reservations can still succeed. anon_resvmem() checks that the
+        * availrmem is greater than swapfs_minfree, and the number of reserved
+        * swap pages.  We also add a bit of extra here just to prevent
+        * circumstances from getting really dire.
+        */
+       n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
+           desfree - arc_swapfs_reserve);
+       if (n < lowest) {
+               lowest = n;
+               r = FMR_SWAPFS_MINFREE;
+       }
+
+
+       /*
+        * Check that we have enough availrmem that memory locking (e.g., via
+        * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
+        * stores the number of pages that cannot be locked; when availrmem
+        * drops below pages_pp_maximum, page locking mechanisms such as
+        * page_pp_lock() will fail.)
+        */
+       n = PAGESIZE * (availrmem - pages_pp_maximum -
+           arc_pages_pp_reserve);
+       if (n < lowest) {
+               lowest = n;
+               r = FMR_PAGES_PP_MAXIMUM;
+       }
+#endif
+
+#if defined(__i386)
+       /*
+        * If we're on an i386 platform, it's possible that we'll exhaust the
+        * kernel heap space before we ever run out of available physical
+        * memory.  Most checks of the size of the heap_area compare against
+        * tune.t_minarmem, which is the minimum available real memory that we
+        * can have in the system.  However, this is generally fixed at 25 pages
+        * which is so low that it's useless.  In this comparison, we seek to
+        * calculate the total heap-size, and reclaim if more than 3/4ths of the
+        * heap is allocated.  (Or, in the calculation, if less than 1/4th is
+        * free)
+        */
+       n = vmem_size(heap_arena, VMEM_FREE) -
+           (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
+       if (n < lowest) {
+               lowest = n;
+               r = FMR_HEAP_ARENA;
+       }
+#endif
+
+       /*
+        * If zio data pages are being allocated out of a separate heap segment,
+        * then enforce that the size of available vmem for this arena remains
+        * above about 1/16th free.
+        *
+        * Note: The 1/16th arena free requirement was put in place
+        * to aggressively evict memory from the arc in order to avoid
+        * memory fragmentation issues.
+        */
+       if (zio_arena != NULL) {
+               n = vmem_size(zio_arena, VMEM_FREE) -
+                   (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
+               if (n < lowest) {
+                       lowest = n;
+                       r = FMR_ZIO_ARENA;
+               }
+       }
+#else /* _KERNEL */
+       /* Every 100 calls, free a small amount */
+       if (spa_get_random(100) == 0)
+               lowest = -1024;
+#endif /* _KERNEL */
+
+       last_free_memory = lowest;
+       last_free_reason = r;
+
+       return (lowest);
+}
+
+/*
+ * Determine if the system is under memory pressure and is asking
+ * to reclaim memory. A return value of TRUE indicates that the system
+ * is under memory pressure and that the arc should adjust accordingly.
+ */
+static boolean_t
+arc_reclaim_needed(void)
+{
+       return (arc_available_memory() < 0);
 }
 
 static void
-arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
+arc_kmem_reap_now(void)
 {
        size_t                  i;
        kmem_cache_t            *prev_cache = NULL;
        kmem_cache_t            *prev_data_cache = NULL;
        extern kmem_cache_t     *zio_buf_cache[];
        extern kmem_cache_t     *zio_data_buf_cache[];
+       extern kmem_cache_t     *range_seg_cache;
 
-       /*
-        * An aggressive reclamation will shrink the cache size as well as
-        * reap free buffers from the arc kmem caches.
-        */
-       if (strat == ARC_RECLAIM_AGGR)
-               arc_shrink(bytes);
+       if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) {
+               /*
+                * We are exceeding our meta-data cache limit.
+                * Prune some entries to release holds on meta-data.
+                */
+               arc_prune_async(zfs_arc_meta_prune);
+       }
 
        for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
                if (zio_buf_cache[i] != prev_cache) {
@@ -2396,97 +3382,172 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
                        kmem_cache_reap_now(zio_data_buf_cache[i]);
                }
        }
-
        kmem_cache_reap_now(buf_cache);
-       kmem_cache_reap_now(hdr_cache);
+       kmem_cache_reap_now(hdr_full_cache);
+       kmem_cache_reap_now(hdr_l2only_cache);
+       kmem_cache_reap_now(range_seg_cache);
+
+       if (zio_arena != NULL) {
+               /*
+                * Ask the vmem arena to reclaim unused memory from its
+                * quantum caches.
+                */
+               vmem_qcache_reap(zio_arena);
+       }
 }
 
 /*
- * Unlike other ZFS implementations this thread is only responsible for
- * adapting the target ARC size on Linux.  The responsibility for memory
- * reclamation has been entirely delegated to the arc_shrinker_func()
- * which is registered with the VM.  To reflect this change in behavior
- * the arc_reclaim thread has been renamed to arc_adapt.
+ * Threads can block in arc_get_data_buf() waiting for this thread to evict
+ * enough data and signal them to proceed. When this happens, the threads in
+ * arc_get_data_buf() are sleeping while holding the hash lock for their
+ * particular arc header. Thus, we must be careful to never sleep on a
+ * hash lock in this thread. This is to prevent the following deadlock:
+ *
+ *  - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
+ *    waiting for the reclaim thread to signal it.
+ *
+ *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
+ *    fails, and goes to sleep forever.
+ *
+ * This possible deadlock is avoided by always acquiring a hash lock
+ * using mutex_tryenter() from arc_reclaim_thread().
  */
 static void
-arc_adapt_thread(void)
+arc_reclaim_thread(void)
 {
+       fstrans_cookie_t        cookie = spl_fstrans_mark();
+       clock_t                 growtime = 0;
        callb_cpr_t             cpr;
-       fstrans_cookie_t        cookie;
 
-       CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
+       CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
 
-       cookie = spl_fstrans_mark();
-       mutex_enter(&arc_reclaim_thr_lock);
-       while (arc_thread_exit == 0) {
-#ifndef _KERNEL
-               arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
+       mutex_enter(&arc_reclaim_lock);
+       while (!arc_reclaim_thread_exit) {
+               int64_t to_free;
+               int64_t free_memory = arc_available_memory();
+               uint64_t evicted = 0;
 
-               if (spa_get_random(100) == 0) {
+               arc_tuning_update();
 
-                       if (arc_no_grow) {
-                               if (last_reclaim == ARC_RECLAIM_CONS) {
-                                       last_reclaim = ARC_RECLAIM_AGGR;
-                               } else {
-                                       last_reclaim = ARC_RECLAIM_CONS;
-                               }
-                       } else {
-                               arc_no_grow = TRUE;
-                               last_reclaim = ARC_RECLAIM_AGGR;
-                               membar_producer();
-                       }
+               mutex_exit(&arc_reclaim_lock);
 
-                       /* reset the growth delay for every reclaim */
-                       arc_grow_time = ddi_get_lbolt() +
-                           (zfs_arc_grow_retry * hz);
+               if (free_memory < 0) {
 
-                       arc_kmem_reap_now(last_reclaim, 0);
+                       arc_no_grow = B_TRUE;
                        arc_warm = B_TRUE;
+
+                       /*
+                        * Wait at least arc_grow_retry (default 5) seconds
+                        * before considering growing.
+                        */
+                       growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
+
+                       arc_kmem_reap_now();
+
+                       /*
+                        * If we are still low on memory, shrink the ARC
+                        * so that (arc_c >> arc_shrink_shift) bytes are free.
+                        */
+                       free_memory = arc_available_memory();
+
+                       to_free = (arc_c >> arc_shrink_shift) - free_memory;
+                       if (to_free > 0) {
+#ifdef _KERNEL
+                               to_free = MAX(to_free, arc_need_free);
+#endif
+                               arc_shrink(to_free);
+                       }
+               } else if (free_memory < arc_c >> arc_no_grow_shift) {
+                       arc_no_grow = B_TRUE;
+               } else if (ddi_get_lbolt() >= growtime) {
+                       arc_no_grow = B_FALSE;
                }
-#endif /* !_KERNEL */
 
-               /* No recent memory pressure allow the ARC to grow. */
-               if (arc_no_grow &&
-                   ddi_time_after_eq(ddi_get_lbolt(), arc_grow_time))
-                       arc_no_grow = FALSE;
+               evicted = arc_adjust();
 
-               arc_adjust_meta();
+               mutex_enter(&arc_reclaim_lock);
 
-               arc_adjust();
+               /*
+                * If evicted is zero, we couldn't evict anything via
+                * arc_adjust(). This could be due to hash lock
+                * collisions, but more likely due to the majority of
+                * arc buffers being unevictable. Therefore, even if
+                * arc_size is above arc_c, another pass is unlikely to
+                * be helpful and could potentially cause us to enter an
+                * infinite loop.
+                */
+               if (arc_size <= arc_c || evicted == 0) {
+                       /*
+                        * We're either no longer overflowing, or we
+                        * can't evict anything more, so we should wake
+                        * up any threads before we go to sleep and clear
+                        * arc_need_free since nothing more can be done.
+                        */
+                       cv_broadcast(&arc_reclaim_waiters_cv);
+                       arc_need_free = 0;
 
-               if (arc_eviction_list != NULL)
-                       arc_do_user_evicts();
+                       /*
+                        * Block until signaled, or after one second (we
+                        * might need to perform arc_kmem_reap_now()
+                        * even if we aren't being signalled)
+                        */
+                       CALLB_CPR_SAFE_BEGIN(&cpr);
+                       (void) cv_timedwait_sig(&arc_reclaim_thread_cv,
+                           &arc_reclaim_lock, ddi_get_lbolt() + hz);
+                       CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
+               }
+       }
 
-               /* block until needed, or one second, whichever is shorter */
-               CALLB_CPR_SAFE_BEGIN(&cpr);
-               (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
-                   &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
-               CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+       arc_reclaim_thread_exit = FALSE;
+       cv_broadcast(&arc_reclaim_thread_cv);
+       CALLB_CPR_EXIT(&cpr);           /* drops arc_reclaim_lock */
+       spl_fstrans_unmark(cookie);
+       thread_exit();
+}
 
+static void
+arc_user_evicts_thread(void)
+{
+       fstrans_cookie_t        cookie = spl_fstrans_mark();
+       callb_cpr_t cpr;
 
-               /* Allow the module options to be changed */
-               if (zfs_arc_max > 64 << 20 &&
-                   zfs_arc_max < physmem * PAGESIZE &&
-                   zfs_arc_max != arc_c_max)
-                       arc_c_max = zfs_arc_max;
+       CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG);
 
-               if (zfs_arc_min > 0 &&
-                   zfs_arc_min < arc_c_max &&
-                   zfs_arc_min != arc_c_min)
-                       arc_c_min = zfs_arc_min;
+       mutex_enter(&arc_user_evicts_lock);
+       while (!arc_user_evicts_thread_exit) {
+               mutex_exit(&arc_user_evicts_lock);
 
-               if (zfs_arc_meta_limit > 0 &&
-                   zfs_arc_meta_limit <= arc_c_max &&
-                   zfs_arc_meta_limit != arc_meta_limit)
-                       arc_meta_limit = zfs_arc_meta_limit;
+               arc_do_user_evicts();
 
+               /*
+                * This is necessary in order for the mdb ::arc dcmd to
+                * show up to date information. Since the ::arc command
+                * does not call the kstat's update function, without
+                * this call, the command may show stale stats for the
+                * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
+                * with this change, the data might be up to 1 second
+                * out of date; but that should suffice. The arc_state_t
+                * structures can be queried directly if more accurate
+                * information is needed.
+                */
+               if (arc_ksp != NULL)
+                       arc_ksp->ks_update(arc_ksp, KSTAT_READ);
 
+               mutex_enter(&arc_user_evicts_lock);
 
+               /*
+                * Block until signaled, or after one second (we need to
+                * call the arc's kstat update function regularly).
+                */
+               CALLB_CPR_SAFE_BEGIN(&cpr);
+               (void) cv_timedwait_sig(&arc_user_evicts_cv,
+                   &arc_user_evicts_lock, ddi_get_lbolt() + hz);
+               CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock);
        }
 
-       arc_thread_exit = 0;
-       cv_broadcast(&arc_reclaim_thr_cv);
-       CALLB_CPR_EXIT(&cpr);           /* drops arc_reclaim_thr_lock */
+       arc_user_evicts_thread_exit = FALSE;
+       cv_broadcast(&arc_user_evicts_cv);
+       CALLB_CPR_EXIT(&cpr);           /* drops arc_user_evicts_lock */
        spl_fstrans_unmark(cookie);
        thread_exit();
 }
@@ -2588,27 +3649,34 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
                return (SHRINK_STOP);
 
        /* Reclaim in progress */
-       if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
+       if (mutex_tryenter(&arc_reclaim_lock) == 0)
                return (SHRINK_STOP);
 
+       mutex_exit(&arc_reclaim_lock);
+
        /*
         * Evict the requested number of pages by shrinking arc_c the
         * requested amount.  If there is nothing left to evict just
         * reap whatever we can from the various arc slabs.
         */
        if (pages > 0) {
-               arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
-
+               arc_shrink(ptob(sc->nr_to_scan));
+               arc_kmem_reap_now();
 #ifdef HAVE_SPLIT_SHRINKER_CALLBACK
                pages = MAX(pages - btop(arc_evictable_memory()), 0);
 #else
                pages = btop(arc_evictable_memory());
 #endif
        } else {
-               arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
+               arc_kmem_reap_now();
                pages = SHRINK_STOP;
        }
 
+       /*
+        * We've reaped what we can, wake up threads.
+        */
+       cv_broadcast(&arc_reclaim_waiters_cv);
+
        /*
         * When direct reclaim is observed it usually indicates a rapid
         * increase in memory pressure.  This occurs because the kswapd
@@ -2620,12 +3688,10 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
                ARCSTAT_BUMP(arcstat_memory_indirect_count);
        } else {
                arc_no_grow = B_TRUE;
-               arc_grow_time = ddi_get_lbolt() + (zfs_arc_grow_retry * hz);
+               arc_need_free = ptob(sc->nr_to_scan);
                ARCSTAT_BUMP(arcstat_memory_direct_count);
        }
 
-       mutex_exit(&arc_reclaim_thr_lock);
-
        return (pages);
 }
 SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
@@ -2642,6 +3708,9 @@ static void
 arc_adapt(int bytes, arc_state_t *state)
 {
        int mult;
+       uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
+       int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
+       int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
 
        if (state == arc_l2c_only)
                return;
@@ -2656,27 +3725,28 @@ arc_adapt(int bytes, arc_state_t *state)
         *        target size of the MRU list.
         */
        if (state == arc_mru_ghost) {
-               mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
-                   1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
-
+               mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
                if (!zfs_arc_p_dampener_disable)
                        mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
 
-               arc_p = MIN(arc_c, arc_p + bytes * mult);
+               arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
        } else if (state == arc_mfu_ghost) {
                uint64_t delta;
 
-               mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
-                   1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
-
+               mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
                if (!zfs_arc_p_dampener_disable)
                        mult = MIN(mult, 10);
 
                delta = MIN(bytes * mult, arc_p);
-               arc_p = MAX(0, arc_p - delta);
+               arc_p = MAX(arc_p_min, arc_p - delta);
        }
        ASSERT((int64_t)arc_p >= 0);
 
+       if (arc_reclaim_needed()) {
+               cv_signal(&arc_reclaim_thread_cv);
+               return;
+       }
+
        if (arc_no_grow)
                return;
 
@@ -2687,7 +3757,8 @@ arc_adapt(int bytes, arc_state_t *state)
         * If we're within (2 * maxblocksize) bytes of the target
         * cache size, increment the target cache size
         */
-       if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+       VERIFY3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT);
+       if (arc_size >= arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
                atomic_add_64(&arc_c, (int64_t)bytes);
                if (arc_c > arc_c_max)
                        arc_c = arc_c_max;
@@ -2700,154 +3771,111 @@ arc_adapt(int bytes, arc_state_t *state)
 }
 
 /*
- * Check if the cache has reached its limits and eviction is required
- * prior to insert.
+ * Check if arc_size has grown past our upper threshold, determined by
+ * zfs_arc_overflow_shift.
  */
-static int
-arc_evict_needed(arc_buf_contents_t type)
+static boolean_t
+arc_is_overflowing(void)
 {
-       if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
-               return (1);
-
-       if (arc_no_grow)
-               return (1);
+       /* Always allow at least one block of overflow */
+       uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
+           arc_c >> zfs_arc_overflow_shift);
 
-       return (arc_size > arc_c);
+       return (arc_size >= arc_c + overflow);
 }
 
 /*
- * The buffer, supplied as the first argument, needs a data block.
- * So, if we are at cache max, determine which cache should be victimized.
- * We have the following cases:
- *
- * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
- * In this situation if we're out of space, but the resident size of the MFU is
- * under the limit, victimize the MFU cache to satisfy this insertion request.
- *
- * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
- * Here, we've used up all of the available space for the MRU, so we need to
- * evict from our own cache instead.  Evict from the set of resident MRU
- * entries.
- *
- * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
- * c minus p represents the MFU space in the cache, since p is the size of the
- * cache that is dedicated to the MRU.  In this situation there's still space on
- * the MFU side, so the MRU side needs to be victimized.
- *
- * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
- * MFU's resident set is consuming more space than it has been allotted.  In
- * this situation, we must victimize our own cache, the MFU, for this insertion.
+ * The buffer, supplied as the first argument, needs a data block. If we
+ * are hitting the hard limit for the cache size, we must sleep, waiting
+ * for the eviction thread to catch up. If we're past the target size
+ * but below the hard limit, we'll only signal the reclaim thread and
+ * continue on.
  */
 static void
 arc_get_data_buf(arc_buf_t *buf)
 {
-       arc_state_t             *state = buf->b_hdr->b_state;
+       arc_state_t             *state = buf->b_hdr->b_l1hdr.b_state;
        uint64_t                size = buf->b_hdr->b_size;
-       arc_buf_contents_t      type = buf->b_hdr->b_type;
-       arc_buf_contents_t      evict = ARC_BUFC_DATA;
-       boolean_t               recycle = TRUE;
+       arc_buf_contents_t      type = arc_buf_type(buf->b_hdr);
 
        arc_adapt(size, state);
 
        /*
-        * We have not yet reached cache maximum size,
-        * just allocate a new buffer.
+        * If arc_size is currently overflowing, and has grown past our
+        * upper limit, we must be adding data faster than the evict
+        * thread can evict. Thus, to ensure we don't compound the
+        * problem by adding more data and forcing arc_size to grow even
+        * further past its target size, we halt and wait for the
+        * eviction thread to catch up.
+        *
+        * It's also possible that the reclaim thread is unable to evict
+        * enough buffers to get arc_size below the overflow limit (e.g.
+        * due to buffers being un-evictable, or hash lock collisions).
+        * In this case, we want to proceed regardless if we're
+        * overflowing; thus we don't use a while loop here.
         */
-       if (!arc_evict_needed(type)) {
-               if (type == ARC_BUFC_METADATA) {
-                       buf->b_data = zio_buf_alloc(size);
-                       arc_space_consume(size, ARC_SPACE_META);
-               } else {
-                       ASSERT(type == ARC_BUFC_DATA);
-                       buf->b_data = zio_data_buf_alloc(size);
-                       arc_space_consume(size, ARC_SPACE_DATA);
+       if (arc_is_overflowing()) {
+               mutex_enter(&arc_reclaim_lock);
+
+               /*
+                * Now that we've acquired the lock, we may no longer be
+                * over the overflow limit, let's check.
+                *
+                * We're ignoring the case of spurious wake ups. If that
+                * were to happen, it'd let this thread consume an ARC
+                * buffer before it should have (i.e. before we're under
+                * the overflow limit and were signalled by the reclaim
+                * thread). As long as that is a rare occurrence, it
+                * shouldn't cause any harm.
+                */
+               if (arc_is_overflowing()) {
+                       cv_signal(&arc_reclaim_thread_cv);
+                       cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
                }
-               goto out;
-       }
 
-       /*
-        * If we are prefetching from the mfu ghost list, this buffer
-        * will end up on the mru list; so steal space from there.
-        */
-       if (state == arc_mfu_ghost)
-               state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
-       else if (state == arc_mru_ghost)
-               state = arc_mru;
-
-       if (state == arc_mru || state == arc_anon) {
-               uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
-               state = (arc_mfu->arcs_lsize[type] >= size &&
-                   arc_p > mru_used) ? arc_mfu : arc_mru;
-       } else {
-               /* MFU cases */
-               uint64_t mfu_space = arc_c - arc_p;
-               state =  (arc_mru->arcs_lsize[type] >= size &&
-                   mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
+               mutex_exit(&arc_reclaim_lock);
        }
 
-       /*
-        * Evict data buffers prior to metadata buffers, unless we're
-        * over the metadata limit and adding a metadata buffer.
-        */
        if (type == ARC_BUFC_METADATA) {
-               if (arc_meta_used >= arc_meta_limit)
-                       evict = ARC_BUFC_METADATA;
-               else
-                       /*
-                        * In this case, we're evicting data while
-                        * adding metadata. Thus, to prevent recycling a
-                        * data buffer into a metadata buffer, recycling
-                        * is disabled in the following arc_evict call.
-                        */
-                       recycle = FALSE;
+               buf->b_data = zio_buf_alloc(size);
+               arc_space_consume(size, ARC_SPACE_META);
+       } else {
+               ASSERT(type == ARC_BUFC_DATA);
+               buf->b_data = zio_data_buf_alloc(size);
+               arc_space_consume(size, ARC_SPACE_DATA);
        }
 
-       if ((buf->b_data = arc_evict(state, 0, size, recycle, evict)) == NULL) {
-               if (type == ARC_BUFC_METADATA) {
-                       buf->b_data = zio_buf_alloc(size);
-                       arc_space_consume(size, ARC_SPACE_META);
-
-                       /*
-                        * If we are unable to recycle an existing meta buffer
-                        * signal the reclaim thread.  It will notify users
-                        * via the prune callback to drop references.  The
-                        * prune callback in run in the context of the reclaim
-                        * thread to avoid deadlocking on the hash_lock.
-                        * Of course, only do this when recycle is true.
-                        */
-                       if (recycle)
-                               cv_signal(&arc_reclaim_thr_cv);
-               } else {
-                       ASSERT(type == ARC_BUFC_DATA);
-                       buf->b_data = zio_data_buf_alloc(size);
-                       arc_space_consume(size, ARC_SPACE_DATA);
-               }
-
-               /* Only bump this if we tried to recycle and failed */
-               if (recycle)
-                       ARCSTAT_BUMP(arcstat_recycle_miss);
-       }
-       ASSERT(buf->b_data != NULL);
-out:
        /*
         * Update the state size.  Note that ghost states have a
         * "ghost size" and so don't need to be updated.
         */
-       if (!GHOST_STATE(buf->b_hdr->b_state)) {
+       if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
                arc_buf_hdr_t *hdr = buf->b_hdr;
+               arc_state_t *state = hdr->b_l1hdr.b_state;
 
-               atomic_add_64(&hdr->b_state->arcs_size, size);
-               if (list_link_active(&hdr->b_arc_node)) {
-                       ASSERT(refcount_is_zero(&hdr->b_refcnt));
-                       atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
+               (void) refcount_add_many(&state->arcs_size, size, buf);
+
+               /*
+                * If this is reached via arc_read, the link is
+                * protected by the hash lock. If reached via
+                * arc_buf_alloc, the header should not be accessed by
+                * any other thread. And, if reached via arc_read_done,
+                * the hash lock will protect it if it's found in the
+                * hash table; otherwise no other thread should be
+                * trying to [add|remove]_reference it.
+                */
+               if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+                       ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+                       atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
+                           size);
                }
                /*
                 * If we are growing the cache, and we are adding anonymous
                 * data, and we have outgrown arc_p, update arc_p
                 */
-               if (!zfs_arc_p_aggressive_disable &&
-                   arc_size < arc_c && hdr->b_state == arc_anon &&
-                   arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
+               if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
+                   (refcount_count(&arc_anon->arcs_size) +
+                   refcount_count(&arc_mru->arcs_size) > arc_p))
                        arc_p = MIN(arc_c, arc_p + size);
        }
 }
@@ -2857,25 +3885,26 @@ out:
  * NOTE: the hash lock is dropped in this function.
  */
 static void
-arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
+arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 {
        clock_t now;
 
        ASSERT(MUTEX_HELD(hash_lock));
+       ASSERT(HDR_HAS_L1HDR(hdr));
 
-       if (buf->b_state == arc_anon) {
+       if (hdr->b_l1hdr.b_state == arc_anon) {
                /*
                 * This buffer is not in the cache, and does not
                 * appear in our "ghost" list.  Add the new buffer
                 * to the MRU state.
                 */
 
-               ASSERT(buf->b_arc_access == 0);
-               buf->b_arc_access = ddi_get_lbolt();
-               DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
-               arc_change_state(arc_mru, buf, hash_lock);
+               ASSERT0(hdr->b_l1hdr.b_arc_access);
+               hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+               DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
+               arc_change_state(arc_mru, hdr, hash_lock);
 
-       } else if (buf->b_state == arc_mru) {
+       } else if (hdr->b_l1hdr.b_state == arc_mru) {
                now = ddi_get_lbolt();
 
                /*
@@ -2886,15 +3915,17 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                 * - move the buffer to the head of the list if this is
                 *   another prefetch (to make it less likely to be evicted).
                 */
-               if ((buf->b_flags & ARC_PREFETCH) != 0) {
-                       if (refcount_count(&buf->b_refcnt) == 0) {
-                               ASSERT(list_link_active(&buf->b_arc_node));
+               if (HDR_PREFETCH(hdr)) {
+                       if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+                               /* link protected by hash lock */
+                               ASSERT(multilist_link_active(
+                                   &hdr->b_l1hdr.b_arc_node));
                        } else {
-                               buf->b_flags &= ~ARC_PREFETCH;
-                               atomic_inc_32(&buf->b_mru_hits);
+                               hdr->b_flags &= ~ARC_FLAG_PREFETCH;
+                               atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
                                ARCSTAT_BUMP(arcstat_mru_hits);
                        }
-                       buf->b_arc_access = now;
+                       hdr->b_l1hdr.b_arc_access = now;
                        return;
                }
 
@@ -2903,19 +3934,20 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                 * but it is still in the cache. Move it to the MFU
                 * state.
                 */
-               if (ddi_time_after(now, buf->b_arc_access + ARC_MINTIME)) {
+               if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
+                   ARC_MINTIME)) {
                        /*
                         * More than 125ms have passed since we
                         * instantiated this buffer.  Move it to the
                         * most frequently used state.
                         */
-                       buf->b_arc_access = now;
-                       DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
-                       arc_change_state(arc_mfu, buf, hash_lock);
+                       hdr->b_l1hdr.b_arc_access = now;
+                       DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+                       arc_change_state(arc_mfu, hdr, hash_lock);
                }
-               atomic_inc_32(&buf->b_mru_hits);
+               atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
                ARCSTAT_BUMP(arcstat_mru_hits);
-       } else if (buf->b_state == arc_mru_ghost) {
+       } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
                arc_state_t     *new_state;
                /*
                 * This buffer has been "accessed" recently, but
@@ -2923,22 +3955,22 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                 * MFU state.
                 */
 
-               if (buf->b_flags & ARC_PREFETCH) {
+               if (HDR_PREFETCH(hdr)) {
                        new_state = arc_mru;
-                       if (refcount_count(&buf->b_refcnt) > 0)
-                               buf->b_flags &= ~ARC_PREFETCH;
-                       DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
+                       if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
+                               hdr->b_flags &= ~ARC_FLAG_PREFETCH;
+                       DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
                } else {
                        new_state = arc_mfu;
-                       DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+                       DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
                }
 
-               buf->b_arc_access = ddi_get_lbolt();
-               arc_change_state(new_state, buf, hash_lock);
+               hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+               arc_change_state(new_state, hdr, hash_lock);
 
-               atomic_inc_32(&buf->b_mru_ghost_hits);
+               atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
                ARCSTAT_BUMP(arcstat_mru_ghost_hits);
-       } else if (buf->b_state == arc_mfu) {
+       } else if (hdr->b_l1hdr.b_state == arc_mfu) {
                /*
                 * This buffer has been accessed more than once and is
                 * still in the cache.  Keep it in the MFU state.
@@ -2948,14 +3980,15 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                 * If it was a prefetch, we will explicitly move it to
                 * the head of the list now.
                 */
-               if ((buf->b_flags & ARC_PREFETCH) != 0) {
-                       ASSERT(refcount_count(&buf->b_refcnt) == 0);
-                       ASSERT(list_link_active(&buf->b_arc_node));
+               if ((HDR_PREFETCH(hdr)) != 0) {
+                       ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+                       /* link protected by hash_lock */
+                       ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
                }
-               atomic_inc_32(&buf->b_mfu_hits);
+               atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
                ARCSTAT_BUMP(arcstat_mfu_hits);
-               buf->b_arc_access = ddi_get_lbolt();
-       } else if (buf->b_state == arc_mfu_ghost) {
+               hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+       } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
                arc_state_t     *new_state = arc_mfu;
                /*
                 * This buffer has been accessed more than once but has
@@ -2963,31 +3996,32 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                 * MFU state.
                 */
 
-               if (buf->b_flags & ARC_PREFETCH) {
+               if (HDR_PREFETCH(hdr)) {
                        /*
                         * This is a prefetch access...
                         * move this block back to the MRU state.
                         */
-                       ASSERT0(refcount_count(&buf->b_refcnt));
+                       ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
                        new_state = arc_mru;
                }
 
-               buf->b_arc_access = ddi_get_lbolt();
-               DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
-               arc_change_state(new_state, buf, hash_lock);
+               hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+               DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+               arc_change_state(new_state, hdr, hash_lock);
 
-               atomic_inc_32(&buf->b_mfu_ghost_hits);
+               atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
                ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
-       } else if (buf->b_state == arc_l2c_only) {
+       } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
                /*
                 * This buffer is on the 2nd Level ARC.
                 */
 
-               buf->b_arc_access = ddi_get_lbolt();
-               DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
-               arc_change_state(arc_mfu, buf, hash_lock);
+               hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+               DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+               arc_change_state(arc_mfu, hdr, hash_lock);
        } else {
-               cmn_err(CE_PANIC, "invalid arc state 0x%p", buf->b_state);
+               cmn_err(CE_PANIC, "invalid arc state 0x%p",
+                   hdr->b_l1hdr.b_state);
        }
 }
 
@@ -3055,12 +4089,12 @@ arc_read_done(zio_t *zio)
                    (found == hdr && HDR_L2_READING(hdr)));
        }
 
-       hdr->b_flags &= ~ARC_L2_EVICTED;
-       if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
-               hdr->b_flags &= ~ARC_L2CACHE;
+       hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
+       if (l2arc_noprefetch && HDR_PREFETCH(hdr))
+               hdr->b_flags &= ~ARC_FLAG_L2CACHE;
 
        /* byteswap if necessary */
-       callback_list = hdr->b_acb;
+       callback_list = hdr->b_l1hdr.b_acb;
        ASSERT(callback_list != NULL);
        if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
                dmu_object_byteswap_t bswap =
@@ -3074,7 +4108,8 @@ arc_read_done(zio_t *zio)
        arc_cksum_compute(buf, B_FALSE);
        arc_buf_watch(buf);
 
-       if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
+       if (hash_lock && zio->io_error == 0 &&
+           hdr->b_l1hdr.b_state == arc_anon) {
                /*
                 * Only call arc_access on anonymous buffers.  This is because
                 * if we've issued an I/O for an evicted buffer, we've already
@@ -3096,24 +4131,25 @@ arc_read_done(zio_t *zio)
                        abuf = NULL;
                }
        }
-       hdr->b_acb = NULL;
-       hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+       hdr->b_l1hdr.b_acb = NULL;
+       hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
        ASSERT(!HDR_BUF_AVAILABLE(hdr));
        if (abuf == buf) {
                ASSERT(buf->b_efunc == NULL);
-               ASSERT(hdr->b_datacnt == 1);
-               hdr->b_flags |= ARC_BUF_AVAILABLE;
+               ASSERT(hdr->b_l1hdr.b_datacnt == 1);
+               hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
        }
 
-       ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
+       ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
+           callback_list != NULL);
 
        if (zio->io_error != 0) {
-               hdr->b_flags |= ARC_IO_ERROR;
-               if (hdr->b_state != arc_anon)
+               hdr->b_flags |= ARC_FLAG_IO_ERROR;
+               if (hdr->b_l1hdr.b_state != arc_anon)
                        arc_change_state(arc_anon, hdr, hash_lock);
                if (HDR_IN_HASH_TABLE(hdr))
                        buf_hash_remove(hdr);
-               freeable = refcount_is_zero(&hdr->b_refcnt);
+               freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
        }
 
        /*
@@ -3121,9 +4157,9 @@ arc_read_done(zio_t *zio)
         * that the hdr (and hence the cv) might be freed before we get to
         * the cv_broadcast().
         */
-       cv_broadcast(&hdr->b_cv);
+       cv_broadcast(&hdr->b_l1hdr.b_cv);
 
-       if (hash_lock) {
+       if (hash_lock != NULL) {
                mutex_exit(hash_lock);
        } else {
                /*
@@ -3132,8 +4168,8 @@ arc_read_done(zio_t *zio)
                 * moved to the anonymous state (so that it won't show up
                 * in the cache).
                 */
-               ASSERT3P(hdr->b_state, ==, arc_anon);
-               freeable = refcount_is_zero(&hdr->b_refcnt);
+               ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+               freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
        }
 
        /* execute each callback and free its structure */
@@ -3174,8 +4210,8 @@ arc_read_done(zio_t *zio)
  */
 int
 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
-    void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
-    const zbookmark_phys_t *zb)
+    void *private, zio_priority_t priority, int zio_flags,
+    arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
 {
        arc_buf_hdr_t *hdr = NULL;
        arc_buf_t *buf = NULL;
@@ -3196,18 +4232,18 @@ top:
                hdr = buf_hash_find(guid, bp, &hash_lock);
        }
 
-       if (hdr != NULL && hdr->b_datacnt > 0) {
+       if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
 
-               *arc_flags |= ARC_CACHED;
+               *arc_flags |= ARC_FLAG_CACHED;
 
                if (HDR_IO_IN_PROGRESS(hdr)) {
 
-                       if (*arc_flags & ARC_WAIT) {
-                               cv_wait(&hdr->b_cv, hash_lock);
+                       if (*arc_flags & ARC_FLAG_WAIT) {
+                               cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
                                mutex_exit(hash_lock);
                                goto top;
                        }
-                       ASSERT(*arc_flags & ARC_NOWAIT);
+                       ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
 
                        if (done) {
                                arc_callback_t  *acb = NULL;
@@ -3221,8 +4257,8 @@ top:
                                            spa, NULL, NULL, NULL, zio_flags);
 
                                ASSERT(acb->acb_done != NULL);
-                               acb->acb_next = hdr->b_acb;
-                               hdr->b_acb = acb;
+                               acb->acb_next = hdr->b_l1hdr.b_acb;
+                               hdr->b_l1hdr.b_acb = acb;
                                add_reference(hdr, hash_lock, private);
                                mutex_exit(hash_lock);
                                goto out;
@@ -3231,7 +4267,8 @@ top:
                        goto out;
                }
 
-               ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+               ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+                   hdr->b_l1hdr.b_state == arc_mfu);
 
                if (done) {
                        add_reference(hdr, hash_lock, private);
@@ -3240,30 +4277,30 @@ top:
                         * copy of the data so that we will be guaranteed
                         * that arc_release() will always succeed.
                         */
-                       buf = hdr->b_buf;
+                       buf = hdr->b_l1hdr.b_buf;
                        ASSERT(buf);
                        ASSERT(buf->b_data);
                        if (HDR_BUF_AVAILABLE(hdr)) {
                                ASSERT(buf->b_efunc == NULL);
-                               hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+                               hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
                        } else {
                                buf = arc_buf_clone(buf);
                        }
 
-               } else if (*arc_flags & ARC_PREFETCH &&
-                   refcount_count(&hdr->b_refcnt) == 0) {
-                       hdr->b_flags |= ARC_PREFETCH;
+               } else if (*arc_flags & ARC_FLAG_PREFETCH &&
+                   refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+                       hdr->b_flags |= ARC_FLAG_PREFETCH;
                }
                DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
                arc_access(hdr, hash_lock);
-               if (*arc_flags & ARC_L2CACHE)
-                       hdr->b_flags |= ARC_L2CACHE;
-               if (*arc_flags & ARC_L2COMPRESS)
-                       hdr->b_flags |= ARC_L2COMPRESS;
+               if (*arc_flags & ARC_FLAG_L2CACHE)
+                       hdr->b_flags |= ARC_FLAG_L2CACHE;
+               if (*arc_flags & ARC_FLAG_L2COMPRESS)
+                       hdr->b_flags |= ARC_FLAG_L2COMPRESS;
                mutex_exit(hash_lock);
                ARCSTAT_BUMP(arcstat_hits);
-               ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
-                   demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+               ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+                   demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
                    data, metadata, hits);
 
                if (done)
@@ -3275,13 +4312,13 @@ top:
                uint64_t addr = 0;
                boolean_t devw = B_FALSE;
                enum zio_compress b_compress = ZIO_COMPRESS_OFF;
-               uint64_t b_asize = 0;
+               int32_t b_asize = 0;
 
                /*
                 * Gracefully handle a damaged logical block size as a
                 * checksum error by passing a dummy zio to the done callback.
                 */
-               if (size > SPA_MAXBLOCKSIZE) {
+               if (size > spa_maxblocksize(spa)) {
                        if (done) {
                                rzio = zio_null(pio, spa, NULL,
                                    NULL, NULL, zio_flags);
@@ -3302,7 +4339,6 @@ top:
                        if (!BP_IS_EMBEDDED(bp)) {
                                hdr->b_dva = *BP_IDENTITY(bp);
                                hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
-                               hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
                                exists = buf_hash_insert(hdr, &hash_lock);
                        }
                        if (exists != NULL) {
@@ -3312,63 +4348,73 @@ top:
                                (void) arc_buf_remove_ref(buf, private);
                                goto top; /* restart the IO request */
                        }
+
                        /* if this is a prefetch, we don't have a reference */
-                       if (*arc_flags & ARC_PREFETCH) {
+                       if (*arc_flags & ARC_FLAG_PREFETCH) {
                                (void) remove_reference(hdr, hash_lock,
                                    private);
-                               hdr->b_flags |= ARC_PREFETCH;
+                               hdr->b_flags |= ARC_FLAG_PREFETCH;
                        }
-                       if (*arc_flags & ARC_L2CACHE)
-                               hdr->b_flags |= ARC_L2CACHE;
-                       if (*arc_flags & ARC_L2COMPRESS)
-                               hdr->b_flags |= ARC_L2COMPRESS;
+                       if (*arc_flags & ARC_FLAG_L2CACHE)
+                               hdr->b_flags |= ARC_FLAG_L2CACHE;
+                       if (*arc_flags & ARC_FLAG_L2COMPRESS)
+                               hdr->b_flags |= ARC_FLAG_L2COMPRESS;
                        if (BP_GET_LEVEL(bp) > 0)
-                               hdr->b_flags |= ARC_INDIRECT;
+                               hdr->b_flags |= ARC_FLAG_INDIRECT;
                } else {
-                       /* this block is in the ghost cache */
-                       ASSERT(GHOST_STATE(hdr->b_state));
+                       /*
+                        * This block is in the ghost cache. If it was L2-only
+                        * (and thus didn't have an L1 hdr), we realloc the
+                        * header to add an L1 hdr.
+                        */
+                       if (!HDR_HAS_L1HDR(hdr)) {
+                               hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
+                                   hdr_full_cache);
+                       }
+
+                       ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
                        ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-                       ASSERT0(refcount_count(&hdr->b_refcnt));
-                       ASSERT(hdr->b_buf == NULL);
+                       ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+                       ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 
                        /* if this is a prefetch, we don't have a reference */
-                       if (*arc_flags & ARC_PREFETCH)
-                               hdr->b_flags |= ARC_PREFETCH;
+                       if (*arc_flags & ARC_FLAG_PREFETCH)
+                               hdr->b_flags |= ARC_FLAG_PREFETCH;
                        else
                                add_reference(hdr, hash_lock, private);
-                       if (*arc_flags & ARC_L2CACHE)
-                               hdr->b_flags |= ARC_L2CACHE;
-                       if (*arc_flags & ARC_L2COMPRESS)
-                               hdr->b_flags |= ARC_L2COMPRESS;
+                       if (*arc_flags & ARC_FLAG_L2CACHE)
+                               hdr->b_flags |= ARC_FLAG_L2CACHE;
+                       if (*arc_flags & ARC_FLAG_L2COMPRESS)
+                               hdr->b_flags |= ARC_FLAG_L2COMPRESS;
                        buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
                        buf->b_hdr = hdr;
                        buf->b_data = NULL;
                        buf->b_efunc = NULL;
                        buf->b_private = NULL;
                        buf->b_next = NULL;
-                       hdr->b_buf = buf;
-                       ASSERT(hdr->b_datacnt == 0);
-                       hdr->b_datacnt = 1;
+                       hdr->b_l1hdr.b_buf = buf;
+                       ASSERT0(hdr->b_l1hdr.b_datacnt);
+                       hdr->b_l1hdr.b_datacnt = 1;
                        arc_get_data_buf(buf);
                        arc_access(hdr, hash_lock);
                }
 
-               ASSERT(!GHOST_STATE(hdr->b_state));
+               ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
 
                acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
                acb->acb_done = done;
                acb->acb_private = private;
 
-               ASSERT(hdr->b_acb == NULL);
-               hdr->b_acb = acb;
-               hdr->b_flags |= ARC_IO_IN_PROGRESS;
+               ASSERT(hdr->b_l1hdr.b_acb == NULL);
+               hdr->b_l1hdr.b_acb = acb;
+               hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
 
-               if (hdr->b_l2hdr != NULL &&
-                   (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
-                       devw = hdr->b_l2hdr->b_dev->l2ad_writing;
-                       addr = hdr->b_l2hdr->b_daddr;
-                       b_compress = hdr->b_l2hdr->b_compress;
-                       b_asize = hdr->b_l2hdr->b_asize;
+               if (HDR_HAS_L2HDR(hdr) &&
+                   (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
+                       devw = hdr->b_l2hdr.b_dev->l2ad_writing;
+                       addr = hdr->b_l2hdr.b_daddr;
+                       b_compress = hdr->b_l2hdr.b_compress;
+                       b_asize = hdr->b_l2hdr.b_asize;
                        /*
                         * Lock out device removal.
                         */
@@ -3388,8 +4434,8 @@ top:
                DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
                    uint64_t, size, zbookmark_phys_t *, zb);
                ARCSTAT_BUMP(arcstat_misses);
-               ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
-                   demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+               ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+                   demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
                    data, metadata, misses);
 
                if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
@@ -3402,14 +4448,14 @@ top:
                         *    also have invalidated the vdev.
                         * 5. This isn't prefetch and l2arc_noprefetch is set.
                         */
-                       if (hdr->b_l2hdr != NULL &&
+                       if (HDR_HAS_L2HDR(hdr) &&
                            !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
                            !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
                                l2arc_read_callback_t *cb;
 
                                DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
                                ARCSTAT_BUMP(arcstat_l2_hits);
-                               atomic_inc_32(&hdr->b_l2hdr->b_hits);
+                               atomic_inc_32(&hdr->b_l2hdr.b_hits);
 
                                cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
                                    KM_SLEEP);
@@ -3451,12 +4497,12 @@ top:
                                    zio_t *, rzio);
                                ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
 
-                               if (*arc_flags & ARC_NOWAIT) {
+                               if (*arc_flags & ARC_FLAG_NOWAIT) {
                                        zio_nowait(rzio);
                                        goto out;
                                }
 
-                               ASSERT(*arc_flags & ARC_WAIT);
+                               ASSERT(*arc_flags & ARC_FLAG_WAIT);
                                if (zio_wait(rzio) == 0)
                                        goto out;
 
@@ -3482,12 +4528,12 @@ top:
                rzio = zio_read(pio, spa, bp, buf->b_data, size,
                    arc_read_done, buf, priority, zio_flags, zb);
 
-               if (*arc_flags & ARC_WAIT) {
+               if (*arc_flags & ARC_FLAG_WAIT) {
                        rc = zio_wait(rzio);
                        goto out;
                }
 
-               ASSERT(*arc_flags & ARC_NOWAIT);
+               ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
                zio_nowait(rzio);
        }
 
@@ -3531,8 +4577,9 @@ void
 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
 {
        ASSERT(buf->b_hdr != NULL);
-       ASSERT(buf->b_hdr->b_state != arc_anon);
-       ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+       ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
+       ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
+           func == NULL);
        ASSERT(buf->b_efunc == NULL);
        ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
 
@@ -3556,9 +4603,9 @@ arc_freed(spa_t *spa, const blkptr_t *bp)
        if (hdr == NULL)
                return;
        if (HDR_BUF_AVAILABLE(hdr)) {
-               arc_buf_t *buf = hdr->b_buf;
+               arc_buf_t *buf = hdr->b_l1hdr.b_buf;
                add_reference(hdr, hash_lock, FTAG);
-               hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+               hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
                mutex_exit(hash_lock);
 
                arc_release(buf, FTAG);
@@ -3614,18 +4661,20 @@ arc_clear_callback(arc_buf_t *buf)
        hdr = buf->b_hdr;
        ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
-       ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
-       ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+       ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
+           hdr->b_l1hdr.b_datacnt);
+       ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+           hdr->b_l1hdr.b_state == arc_mfu);
 
        buf->b_efunc = NULL;
        buf->b_private = NULL;
 
-       if (hdr->b_datacnt > 1) {
+       if (hdr->b_l1hdr.b_datacnt > 1) {
                mutex_exit(&buf->b_evict_lock);
-               arc_buf_destroy(buf, FALSE, TRUE);
+               arc_buf_destroy(buf, TRUE);
        } else {
-               ASSERT(buf == hdr->b_buf);
-               hdr->b_flags |= ARC_BUF_AVAILABLE;
+               ASSERT(buf == hdr->b_l1hdr.b_buf);
+               hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
                mutex_exit(&buf->b_evict_lock);
        }
 
@@ -3643,69 +4692,110 @@ arc_clear_callback(arc_buf_t *buf)
 void
 arc_release(arc_buf_t *buf, void *tag)
 {
-       arc_buf_hdr_t *hdr;
-       kmutex_t *hash_lock = NULL;
-       l2arc_buf_hdr_t *l2hdr;
-       uint64_t buf_size = 0;
+       kmutex_t *hash_lock;
+       arc_state_t *state;
+       arc_buf_hdr_t *hdr = buf->b_hdr;
 
        /*
-        * It would be nice to assert that if it's DMU metadata (level >
+        * It would be nice to assert that if its DMU metadata (level >
         * 0 || it's the dnode file), then it must be syncing context.
         * But we don't know that information at this level.
         */
 
        mutex_enter(&buf->b_evict_lock);
-       hdr = buf->b_hdr;
 
-       /* this buffer is not on any list */
-       ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+       ASSERT(HDR_HAS_L1HDR(hdr));
 
-       if (hdr->b_state == arc_anon) {
-               /* this buffer is already released */
-               ASSERT(buf->b_efunc == NULL);
-       } else {
-               hash_lock = HDR_LOCK(hdr);
-               mutex_enter(hash_lock);
-               hdr = buf->b_hdr;
-               ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+       /*
+        * We don't grab the hash lock prior to this check, because if
+        * the buffer's header is in the arc_anon state, it won't be
+        * linked into the hash table.
+        */
+       if (hdr->b_l1hdr.b_state == arc_anon) {
+               mutex_exit(&buf->b_evict_lock);
+               ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+               ASSERT(!HDR_IN_HASH_TABLE(hdr));
+               ASSERT(!HDR_HAS_L2HDR(hdr));
+               ASSERT(BUF_EMPTY(hdr));
+
+               ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
+               ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
+               ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+
+               ASSERT3P(buf->b_efunc, ==, NULL);
+               ASSERT3P(buf->b_private, ==, NULL);
+
+               hdr->b_l1hdr.b_arc_access = 0;
+               arc_buf_thaw(buf);
+
+               return;
        }
 
-       l2hdr = hdr->b_l2hdr;
-       if (l2hdr) {
-               mutex_enter(&l2arc_buflist_mtx);
-               arc_buf_l2_cdata_free(hdr);
-               hdr->b_l2hdr = NULL;
-               list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+       hash_lock = HDR_LOCK(hdr);
+       mutex_enter(hash_lock);
+
+       /*
+        * This assignment is only valid as long as the hash_lock is
+        * held, we must be careful not to reference state or the
+        * b_state field after dropping the lock.
+        */
+       state = hdr->b_l1hdr.b_state;
+       ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+       ASSERT3P(state, !=, arc_anon);
+
+       /* this buffer is not on any list */
+       ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
+
+       if (HDR_HAS_L2HDR(hdr)) {
+               mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
+
+               /*
+                * We have to recheck this conditional again now that
+                * we're holding the l2ad_mtx to prevent a race with
+                * another thread which might be concurrently calling
+                * l2arc_evict(). In that case, l2arc_evict() might have
+                * destroyed the header's L2 portion as we were waiting
+                * to acquire the l2ad_mtx.
+                */
+               if (HDR_HAS_L2HDR(hdr))
+                       arc_hdr_l2hdr_destroy(hdr);
+
+               mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
        }
-       buf_size = hdr->b_size;
 
        /*
         * Do we have more than one buf?
         */
-       if (hdr->b_datacnt > 1) {
+       if (hdr->b_l1hdr.b_datacnt > 1) {
                arc_buf_hdr_t *nhdr;
                arc_buf_t **bufp;
                uint64_t blksz = hdr->b_size;
                uint64_t spa = hdr->b_spa;
-               arc_buf_contents_t type = hdr->b_type;
+               arc_buf_contents_t type = arc_buf_type(hdr);
                uint32_t flags = hdr->b_flags;
 
-               ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
+               ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
                /*
                 * Pull the data off of this hdr and attach it to
                 * a new anonymous hdr.
                 */
                (void) remove_reference(hdr, hash_lock, tag);
-               bufp = &hdr->b_buf;
+               bufp = &hdr->b_l1hdr.b_buf;
                while (*bufp != buf)
                        bufp = &(*bufp)->b_next;
                *bufp = buf->b_next;
                buf->b_next = NULL;
 
-               ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
-               atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
-               if (refcount_is_zero(&hdr->b_refcnt)) {
-                       uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
+               ASSERT3P(state, !=, arc_l2c_only);
+
+               (void) refcount_remove_many(
+                   &state->arcs_size, hdr->b_size, buf);
+
+               if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+                       uint64_t *size;
+
+                       ASSERT3P(state, !=, arc_l2c_only);
+                       size = &state->arcs_lsize[type];
                        ASSERT3U(*size, >=, hdr->b_size);
                        atomic_add_64(size, -hdr->b_size);
                }
@@ -3714,68 +4804,61 @@ arc_release(arc_buf_t *buf, void *tag)
                 * We're releasing a duplicate user data buffer, update
                 * our statistics accordingly.
                 */
-               if (hdr->b_type == ARC_BUFC_DATA) {
+               if (HDR_ISTYPE_DATA(hdr)) {
                        ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
                        ARCSTAT_INCR(arcstat_duplicate_buffers_size,
                            -hdr->b_size);
                }
-               hdr->b_datacnt -= 1;
+               hdr->b_l1hdr.b_datacnt -= 1;
                arc_cksum_verify(buf);
                arc_buf_unwatch(buf);
 
                mutex_exit(hash_lock);
 
-               nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+               nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
                nhdr->b_size = blksz;
                nhdr->b_spa = spa;
-               nhdr->b_type = type;
-               nhdr->b_buf = buf;
-               nhdr->b_state = arc_anon;
-               nhdr->b_arc_access = 0;
-               nhdr->b_mru_hits = 0;
-               nhdr->b_mru_ghost_hits = 0;
-               nhdr->b_mfu_hits = 0;
-               nhdr->b_mfu_ghost_hits = 0;
-               nhdr->b_l2_hits = 0;
-               nhdr->b_flags = flags & ARC_L2_WRITING;
-               nhdr->b_l2hdr = NULL;
-               nhdr->b_datacnt = 1;
+
+               nhdr->b_l1hdr.b_mru_hits = 0;
+               nhdr->b_l1hdr.b_mru_ghost_hits = 0;
+               nhdr->b_l1hdr.b_mfu_hits = 0;
+               nhdr->b_l1hdr.b_mfu_ghost_hits = 0;
+               nhdr->b_l1hdr.b_l2_hits = 0;
+               nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
+               nhdr->b_flags |= arc_bufc_to_flags(type);
+               nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
+
+               nhdr->b_l1hdr.b_buf = buf;
+               nhdr->b_l1hdr.b_datacnt = 1;
+               nhdr->b_l1hdr.b_state = arc_anon;
+               nhdr->b_l1hdr.b_arc_access = 0;
+               nhdr->b_l1hdr.b_tmp_cdata = NULL;
                nhdr->b_freeze_cksum = NULL;
-               (void) refcount_add(&nhdr->b_refcnt, tag);
+
+               (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
                buf->b_hdr = nhdr;
                mutex_exit(&buf->b_evict_lock);
-               atomic_add_64(&arc_anon->arcs_size, blksz);
+               (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf);
        } else {
                mutex_exit(&buf->b_evict_lock);
-               ASSERT(refcount_count(&hdr->b_refcnt) == 1);
-               ASSERT(!list_link_active(&hdr->b_arc_node));
+               ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
+               /* protected by hash lock, or hdr is on arc_anon */
+               ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
                ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-               if (hdr->b_state != arc_anon)
-                       arc_change_state(arc_anon, hdr, hash_lock);
-               hdr->b_arc_access = 0;
-               hdr->b_mru_hits = 0;
-               hdr->b_mru_ghost_hits = 0;
-               hdr->b_mfu_hits = 0;
-               hdr->b_mfu_ghost_hits = 0;
-               hdr->b_l2_hits = 0;
-               if (hash_lock)
-                       mutex_exit(hash_lock);
+               hdr->b_l1hdr.b_mru_hits = 0;
+               hdr->b_l1hdr.b_mru_ghost_hits = 0;
+               hdr->b_l1hdr.b_mfu_hits = 0;
+               hdr->b_l1hdr.b_mfu_ghost_hits = 0;
+               hdr->b_l1hdr.b_l2_hits = 0;
+               arc_change_state(arc_anon, hdr, hash_lock);
+               hdr->b_l1hdr.b_arc_access = 0;
+               mutex_exit(hash_lock);
 
                buf_discard_identity(hdr);
                arc_buf_thaw(buf);
        }
        buf->b_efunc = NULL;
        buf->b_private = NULL;
-
-       if (l2hdr) {
-               ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
-               vdev_space_update(l2hdr->b_dev->l2ad_vdev,
-                   -l2hdr->b_asize, 0, 0);
-               kmem_cache_free(l2arc_hdr_cache, l2hdr);
-               arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
-               ARCSTAT_INCR(arcstat_l2_size, -buf_size);
-               mutex_exit(&l2arc_buflist_mtx);
-       }
 }
 
 int
@@ -3784,7 +4867,8 @@ arc_released(arc_buf_t *buf)
        int released;
 
        mutex_enter(&buf->b_evict_lock);
-       released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
+       released = (buf->b_data != NULL &&
+           buf->b_hdr->b_l1hdr.b_state == arc_anon);
        mutex_exit(&buf->b_evict_lock);
        return (released);
 }
@@ -3796,7 +4880,7 @@ arc_referenced(arc_buf_t *buf)
        int referenced;
 
        mutex_enter(&buf->b_evict_lock);
-       referenced = (refcount_count(&buf->b_hdr->b_refcnt));
+       referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
        mutex_exit(&buf->b_evict_lock);
        return (referenced);
 }
@@ -3809,7 +4893,9 @@ arc_write_ready(zio_t *zio)
        arc_buf_t *buf = callback->awcb_buf;
        arc_buf_hdr_t *hdr = buf->b_hdr;
 
-       ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
+       ASSERT(HDR_HAS_L1HDR(hdr));
+       ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
+       ASSERT(hdr->b_l1hdr.b_datacnt > 0);
        callback->awcb_ready(zio, buf, callback->awcb_private);
 
        /*
@@ -3819,15 +4905,15 @@ arc_write_ready(zio_t *zio)
         * accounting for any re-write attempt.
         */
        if (HDR_IO_IN_PROGRESS(hdr)) {
-               mutex_enter(&hdr->b_freeze_lock);
+               mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
                if (hdr->b_freeze_cksum != NULL) {
                        kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
                        hdr->b_freeze_cksum = NULL;
                }
-               mutex_exit(&hdr->b_freeze_lock);
+               mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
        }
        arc_cksum_compute(buf, B_FALSE);
-       hdr->b_flags |= ARC_IO_IN_PROGRESS;
+       hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
 }
 
 /*
@@ -3849,7 +4935,7 @@ arc_write_done(zio_t *zio)
        arc_buf_t *buf = callback->awcb_buf;
        arc_buf_hdr_t *hdr = buf->b_hdr;
 
-       ASSERT(hdr->b_acb == NULL);
+       ASSERT(hdr->b_l1hdr.b_acb == NULL);
 
        if (zio->io_error == 0) {
                if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
@@ -3857,7 +4943,6 @@ arc_write_done(zio_t *zio)
                } else {
                        hdr->b_dva = *BP_IDENTITY(zio->io_bp);
                        hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
-                       hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
                }
        } else {
                ASSERT(BUF_EMPTY(hdr));
@@ -3878,7 +4963,7 @@ arc_write_done(zio_t *zio)
                arc_cksum_verify(buf);
 
                exists = buf_hash_insert(hdr, &hash_lock);
-               if (exists) {
+               if (exists != NULL) {
                        /*
                         * This can only happen if we overwrite for
                         * sync-to-convergence, because we remove
@@ -3888,7 +4973,8 @@ arc_write_done(zio_t *zio)
                                if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
                                        panic("bad overwrite, hdr=%p exists=%p",
                                            (void *)hdr, (void *)exists);
-                               ASSERT(refcount_is_zero(&exists->b_refcnt));
+                               ASSERT(refcount_is_zero(
+                                   &exists->b_l1hdr.b_refcnt));
                                arc_change_state(arc_anon, exists, hash_lock);
                                mutex_exit(hash_lock);
                                arc_hdr_destroy(exists);
@@ -3902,22 +4988,22 @@ arc_write_done(zio_t *zio)
                                            (void *)hdr, (void *)exists);
                        } else {
                                /* Dedup */
-                               ASSERT(hdr->b_datacnt == 1);
-                               ASSERT(hdr->b_state == arc_anon);
+                               ASSERT(hdr->b_l1hdr.b_datacnt == 1);
+                               ASSERT(hdr->b_l1hdr.b_state == arc_anon);
                                ASSERT(BP_GET_DEDUP(zio->io_bp));
                                ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
                        }
                }
-               hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+               hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
                /* if it's not anon, we are doing a scrub */
-               if (!exists && hdr->b_state == arc_anon)
+               if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
                        arc_access(hdr, hash_lock);
                mutex_exit(hash_lock);
        } else {
-               hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+               hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
        }
 
-       ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+       ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
        callback->awcb_done(zio, buf, callback->awcb_private);
 
        kmem_free(callback, sizeof (arc_write_callback_t));
@@ -3937,12 +5023,13 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
        ASSERT(ready != NULL);
        ASSERT(done != NULL);
        ASSERT(!HDR_IO_ERROR(hdr));
-       ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
-       ASSERT(hdr->b_acb == NULL);
+       ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+       ASSERT(hdr->b_l1hdr.b_acb == NULL);
+       ASSERT(hdr->b_l1hdr.b_datacnt > 0);
        if (l2arc)
-               hdr->b_flags |= ARC_L2CACHE;
+               hdr->b_flags |= ARC_FLAG_L2CACHE;
        if (l2arc_compress)
-               hdr->b_flags |= ARC_L2COMPRESS;
+               hdr->b_flags |= ARC_FLAG_L2COMPRESS;
        callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
        callback->awcb_ready = ready;
        callback->awcb_physdone = physdone;
@@ -3961,14 +5048,41 @@ static int
 arc_memory_throttle(uint64_t reserve, uint64_t txg)
 {
 #ifdef _KERNEL
-       if (zfs_arc_memory_throttle_disable)
+       uint64_t available_memory = ptob(freemem);
+       static uint64_t page_load = 0;
+       static uint64_t last_txg = 0;
+#ifdef __linux__
+       pgcnt_t minfree = btop(arc_sys_free / 4);
+#endif
+
+       if (freemem > physmem * arc_lotsfree_percent / 100)
                return (0);
 
-       if (freemem <= physmem * arc_lotsfree_percent / 100) {
+       if (txg > last_txg) {
+               last_txg = txg;
+               page_load = 0;
+       }
+
+       /*
+        * If we are in pageout, we know that memory is already tight,
+        * the arc is already going to be evicting, so we just want to
+        * continue to let page writes occur as quickly as possible.
+        */
+       if (current_is_kswapd()) {
+               if (page_load > MAX(ptob(minfree), available_memory) / 4) {
+                       DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
+                       return (SET_ERROR(ERESTART));
+               }
+               /* Note: reserve is inflated, so we deflate */
+               page_load += reserve / 8;
+               return (0);
+       } else if (page_load > 0 && arc_reclaim_needed()) {
+               /* memory is low, delay before restarting */
                ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
                DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
                return (SET_ERROR(EAGAIN));
        }
+       page_load = 0;
 #endif
        return (0);
 }
@@ -4003,7 +5117,8 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
         * network delays from blocking transactions that are ready to be
         * assigned to a txg.
         */
-       anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
+       anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
+           arc_loaned_bytes), 0);
 
        /*
         * Writes will, almost always, require additional memory allocations
@@ -4041,7 +5156,7 @@ static void
 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
     kstat_named_t *evict_data, kstat_named_t *evict_metadata)
 {
-       size->value.ui64 = state->arcs_size;
+       size->value.ui64 = refcount_count(&state->arcs_size);
        evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
        evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
 }
@@ -4052,44 +5167,161 @@ arc_kstat_update(kstat_t *ksp, int rw)
        arc_stats_t *as = ksp->ks_data;
 
        if (rw == KSTAT_WRITE) {
-               return (SET_ERROR(EACCES));
+               return (EACCES);
        } else {
                arc_kstat_update_state(arc_anon,
                    &as->arcstat_anon_size,
-                   &as->arcstat_anon_evict_data,
-                   &as->arcstat_anon_evict_metadata);
+                   &as->arcstat_anon_evictable_data,
+                   &as->arcstat_anon_evictable_metadata);
                arc_kstat_update_state(arc_mru,
                    &as->arcstat_mru_size,
-                   &as->arcstat_mru_evict_data,
-                   &as->arcstat_mru_evict_metadata);
+                   &as->arcstat_mru_evictable_data,
+                   &as->arcstat_mru_evictable_metadata);
                arc_kstat_update_state(arc_mru_ghost,
                    &as->arcstat_mru_ghost_size,
-                   &as->arcstat_mru_ghost_evict_data,
-                   &as->arcstat_mru_ghost_evict_metadata);
+                   &as->arcstat_mru_ghost_evictable_data,
+                   &as->arcstat_mru_ghost_evictable_metadata);
                arc_kstat_update_state(arc_mfu,
                    &as->arcstat_mfu_size,
-                   &as->arcstat_mfu_evict_data,
-                   &as->arcstat_mfu_evict_metadata);
+                   &as->arcstat_mfu_evictable_data,
+                   &as->arcstat_mfu_evictable_metadata);
                arc_kstat_update_state(arc_mfu_ghost,
                    &as->arcstat_mfu_ghost_size,
-                   &as->arcstat_mfu_ghost_evict_data,
-                   &as->arcstat_mfu_ghost_evict_metadata);
+                   &as->arcstat_mfu_ghost_evictable_data,
+                   &as->arcstat_mfu_ghost_evictable_metadata);
        }
 
        return (0);
 }
 
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the ARC eviction
+ * code is laid out; arc_evict_state() assumes ARC buffers are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+unsigned int
+arc_state_multilist_index_func(multilist_t *ml, void *obj)
+{
+       arc_buf_hdr_t *hdr = obj;
+
+       /*
+        * We rely on b_dva to generate evenly distributed index
+        * numbers using buf_hash below. So, as an added precaution,
+        * let's make sure we never add empty buffers to the arc lists.
+        */
+       ASSERT(!BUF_EMPTY(hdr));
+
+       /*
+        * The assumption here is that the hash value for a given
+        * arc_buf_hdr_t will remain constant throughout its lifetime
+        * (i.e. its b_spa, b_dva, and b_birth fields don't change).
+        * Thus, we don't need to store the header's sublist index
+        * on insertion, as this index can be recalculated on removal.
+        *
+        * Also, the low order bits of the hash value are thought to be
+        * distributed evenly. Otherwise, in the case that the multilist
+        * has a power of two number of sublists, each sublist's usage
+        * would not be evenly distributed.
+        */
+       return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
+           multilist_get_num_sublists(ml));
+}
+
+/*
+ * Called during module initialization and periodically thereafter to
+ * apply reasonable changes to the exposed performance tunings.  Non-zero
+ * zfs_* values which differ from the currently set values will be applied.
+ */
+static void
+arc_tuning_update(void)
+{
+       /* Valid range: 64M - <all physical memory> */
+       if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
+           (zfs_arc_max > 64 << 20) && (zfs_arc_max < ptob(physmem)) &&
+           (zfs_arc_max > arc_c_min)) {
+               arc_c_max = zfs_arc_max;
+               arc_c = arc_c_max;
+               arc_p = (arc_c >> 1);
+               arc_meta_limit = MIN(arc_meta_limit, arc_c_max);
+       }
+
+       /* Valid range: 32M - <arc_c_max> */
+       if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
+           (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
+           (zfs_arc_min <= arc_c_max)) {
+               arc_c_min = zfs_arc_min;
+               arc_c = MAX(arc_c, arc_c_min);
+       }
+
+       /* Valid range: 16M - <arc_c_max> */
+       if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) &&
+           (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) &&
+           (zfs_arc_meta_min <= arc_c_max)) {
+               arc_meta_min = zfs_arc_meta_min;
+               arc_meta_limit = MAX(arc_meta_limit, arc_meta_min);
+       }
+
+       /* Valid range: <arc_meta_min> - <arc_c_max> */
+       if ((zfs_arc_meta_limit) && (zfs_arc_meta_limit != arc_meta_limit) &&
+           (zfs_arc_meta_limit >= zfs_arc_meta_min) &&
+           (zfs_arc_meta_limit <= arc_c_max))
+               arc_meta_limit = zfs_arc_meta_limit;
+
+       /* Valid range: 1 - N */
+       if (zfs_arc_grow_retry)
+               arc_grow_retry = zfs_arc_grow_retry;
+
+       /* Valid range: 1 - N */
+       if (zfs_arc_shrink_shift) {
+               arc_shrink_shift = zfs_arc_shrink_shift;
+               arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
+       }
+
+       /* Valid range: 1 - N */
+       if (zfs_arc_p_min_shift)
+               arc_p_min_shift = zfs_arc_p_min_shift;
+
+       /* Valid range: 1 - N ticks */
+       if (zfs_arc_min_prefetch_lifespan)
+               arc_min_prefetch_lifespan = zfs_arc_min_prefetch_lifespan;
+
+       /* Valid range: 0 - 100 */
+       if ((zfs_arc_lotsfree_percent >= 0) &&
+           (zfs_arc_lotsfree_percent <= 100))
+               arc_lotsfree_percent = zfs_arc_lotsfree_percent;
+
+       /* Valid range: 0 - <all physical memory> */
+       if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free))
+               arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), ptob(physmem));
+
+}
+
 void
 arc_init(void)
 {
-       mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
-       cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
+       /*
+        * allmem is "all memory that we could possibly use".
+        */
+#ifdef _KERNEL
+       uint64_t allmem = ptob(physmem);
+#else
+       uint64_t allmem = (physmem * PAGESIZE) / 2;
+#endif
+
+       mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
+       cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
+       cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
+
+       mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
+       cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);
 
        /* Convert seconds to clock ticks */
-       zfs_arc_min_prefetch_lifespan = 1 * hz;
+       arc_min_prefetch_lifespan = 1 * hz;
 
        /* Start out with 1/8 of all memory */
-       arc_c = physmem * PAGESIZE / 8;
+       arc_c = allmem / 8;
 
 #ifdef _KERNEL
        /*
@@ -4098,38 +5330,39 @@ arc_init(void)
         * need to limit the cache to 1/8 of VM size.
         */
        arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
+
        /*
         * Register a shrinker to support synchronous (direct) memory
         * reclaim from the arc.  This is done to prevent kswapd from
         * swapping out pages when it is preferable to shrink the arc.
         */
        spl_register_shrinker(&arc_shrinker);
-#endif
 
-       /* set min cache to zero */
-       arc_c_min = 4<<20;
-       /* set max to 1/2 of all memory */
-       arc_c_max = arc_c * 4;
+       /* Set to 1/64 of all memory or a minimum of 512K */
+       arc_sys_free = MAX(ptob(physmem / 64), (512 * 1024));
+       arc_need_free = 0;
+#endif
 
-       /*
-        * Allow the tunables to override our calculations if they are
-        * reasonable (ie. over 64MB)
-        */
-       if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
-               arc_c_max = zfs_arc_max;
-       if (zfs_arc_min > 0 && zfs_arc_min <= arc_c_max)
-               arc_c_min = zfs_arc_min;
+       /* Set min cache to allow safe operation of arc_adapt() */
+       arc_c_min = 2ULL << SPA_MAXBLOCKSHIFT;
+       /* Set max to 1/2 of all memory */
+       arc_c_max = allmem / 2;
 
        arc_c = arc_c_max;
        arc_p = (arc_c >> 1);
 
-       /* limit meta-data to 3/4 of the arc capacity */
-       arc_meta_limit = (3 * arc_c_max) / 4;
+       /* Set min to 1/2 of arc_c_min */
+       arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;
+       /* Initialize maximum observed usage to zero */
        arc_meta_max = 0;
+       /* Set limit to 3/4 of arc_c_max with a floor of arc_meta_min */
+       arc_meta_limit = MAX((3 * arc_c_max) / 4, arc_meta_min);
 
-       /* Allow the tunable to override if it is reasonable */
-       if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
-               arc_meta_limit = zfs_arc_meta_limit;
+       /* Apply user specified tunings */
+       arc_tuning_update();
+
+       if (zfs_arc_num_sublists_per_state < 1)
+               zfs_arc_num_sublists_per_state = MAX(boot_ncpus, 1);
 
        /* if kmem_flags are set, lets try to use less memory */
        if (kmem_debugging())
@@ -4145,33 +5378,46 @@ arc_init(void)
        arc_l2c_only = &ARC_l2c_only;
        arc_size = 0;
 
-       mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-
-       list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+       multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
+           sizeof (arc_buf_hdr_t),
+           offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+           zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+       multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
+           sizeof (arc_buf_hdr_t),
+           offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+           zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+       multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
+           sizeof (arc_buf_hdr_t),
+           offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+           zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+       multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
+           sizeof (arc_buf_hdr_t),
+           offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+           zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+       multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
+           sizeof (arc_buf_hdr_t),
+           offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+           zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+       multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
+           sizeof (arc_buf_hdr_t),
+           offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+           zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+       multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
+           sizeof (arc_buf_hdr_t),
+           offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+           zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+       multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
+           sizeof (arc_buf_hdr_t),
+           offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+           zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+       multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
+           sizeof (arc_buf_hdr_t),
+           offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+           zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+       multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
+           sizeof (arc_buf_hdr_t),
+           offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+           zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
 
        arc_anon->arcs_state = ARC_STATE_ANON;
        arc_mru->arcs_state = ARC_STATE_MRU;
@@ -4180,16 +5426,26 @@ arc_init(void)
        arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
        arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
 
+       refcount_create(&arc_anon->arcs_size);
+       refcount_create(&arc_mru->arcs_size);
+       refcount_create(&arc_mru_ghost->arcs_size);
+       refcount_create(&arc_mfu->arcs_size);
+       refcount_create(&arc_mfu_ghost->arcs_size);
+       refcount_create(&arc_l2c_only->arcs_size);
+
        buf_init();
 
-       arc_thread_exit = 0;
+       arc_reclaim_thread_exit = FALSE;
+       arc_user_evicts_thread_exit = FALSE;
        list_create(&arc_prune_list, sizeof (arc_prune_t),
            offsetof(arc_prune_t, p_node));
        arc_eviction_list = NULL;
        mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
        bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
 
+       arc_prune_taskq = taskq_create("arc_prune", max_ncpus, defclsyspri,
+           max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+
        arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
            sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 
@@ -4199,8 +5455,11 @@ arc_init(void)
                kstat_install(arc_ksp);
        }
 
-       (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0,
-           TS_RUN, minclsyspri);
+       (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
+           TS_RUN, defclsyspri);
+
+       (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0,
+           TS_RUN, defclsyspri);
 
        arc_dead = FALSE;
        arc_warm = B_FALSE;
@@ -4230,17 +5489,36 @@ arc_fini(void)
 {
        arc_prune_t *p;
 
-       mutex_enter(&arc_reclaim_thr_lock);
 #ifdef _KERNEL
        spl_unregister_shrinker(&arc_shrinker);
 #endif /* _KERNEL */
 
-       arc_thread_exit = 1;
-       while (arc_thread_exit != 0)
-               cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
-       mutex_exit(&arc_reclaim_thr_lock);
+       mutex_enter(&arc_reclaim_lock);
+       arc_reclaim_thread_exit = TRUE;
+       /*
+        * The reclaim thread will set arc_reclaim_thread_exit back to
+        * FALSE when it is finished exiting; we're waiting for that.
+        */
+       while (arc_reclaim_thread_exit) {
+               cv_signal(&arc_reclaim_thread_cv);
+               cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
+       }
+       mutex_exit(&arc_reclaim_lock);
+
+       mutex_enter(&arc_user_evicts_lock);
+       arc_user_evicts_thread_exit = TRUE;
+       /*
+        * The user evicts thread will set arc_user_evicts_thread_exit
+        * to FALSE when it is finished exiting; we're waiting for that.
+        */
+       while (arc_user_evicts_thread_exit) {
+               cv_signal(&arc_user_evicts_cv);
+               cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock);
+       }
+       mutex_exit(&arc_user_evicts_lock);
 
-       arc_flush(NULL);
+       /* Use TRUE to ensure *all* buffers are evicted */
+       arc_flush(NULL, TRUE);
 
        arc_dead = TRUE;
 
@@ -4249,6 +5527,9 @@ arc_fini(void)
                arc_ksp = NULL;
        }
 
+       taskq_wait(arc_prune_taskq);
+       taskq_destroy(arc_prune_taskq);
+
        mutex_enter(&arc_prune_mtx);
        while ((p = list_head(&arc_prune_list)) != NULL) {
                list_remove(&arc_prune_list, p);
@@ -4260,29 +5541,34 @@ arc_fini(void)
 
        list_destroy(&arc_prune_list);
        mutex_destroy(&arc_prune_mtx);
-       mutex_destroy(&arc_eviction_mtx);
-       mutex_destroy(&arc_reclaim_thr_lock);
-       cv_destroy(&arc_reclaim_thr_cv);
-
-       list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
-       list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
-       list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
-       list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
-       list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
-       list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
-       list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
-       list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
-
-       mutex_destroy(&arc_anon->arcs_mtx);
-       mutex_destroy(&arc_mru->arcs_mtx);
-       mutex_destroy(&arc_mru_ghost->arcs_mtx);
-       mutex_destroy(&arc_mfu->arcs_mtx);
-       mutex_destroy(&arc_mfu_ghost->arcs_mtx);
-       mutex_destroy(&arc_l2c_only->arcs_mtx);
+       mutex_destroy(&arc_reclaim_lock);
+       cv_destroy(&arc_reclaim_thread_cv);
+       cv_destroy(&arc_reclaim_waiters_cv);
+
+       mutex_destroy(&arc_user_evicts_lock);
+       cv_destroy(&arc_user_evicts_cv);
+
+       refcount_destroy(&arc_anon->arcs_size);
+       refcount_destroy(&arc_mru->arcs_size);
+       refcount_destroy(&arc_mru_ghost->arcs_size);
+       refcount_destroy(&arc_mfu->arcs_size);
+       refcount_destroy(&arc_mfu_ghost->arcs_size);
+       refcount_destroy(&arc_l2c_only->arcs_size);
+
+       multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
+       multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
+       multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
+       multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
+       multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
+       multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
+       multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
+       multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
+       multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
+       multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
 
        buf_fini();
 
-       ASSERT(arc_loaned_bytes == 0);
+       ASSERT0(arc_loaned_bytes);
 }
 
 /*
@@ -4432,7 +5718,7 @@ arc_fini(void)
  */
 
 static boolean_t
-l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
+l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
 {
        /*
         * A buffer is *not* eligible for the L2ARC if it:
@@ -4441,8 +5727,8 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
         * 3. has an I/O in progress (it may be an incomplete read).
         * 4. is flagged not eligible (zfs property).
         */
-       if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
-           HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
+       if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
+           HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
                return (B_FALSE);
 
        return (B_TRUE);
@@ -4494,20 +5780,6 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
        return (next);
 }
 
-static void
-l2arc_hdr_stat_add(void)
-{
-       ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE);
-       ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
-}
-
-static void
-l2arc_hdr_stat_remove(void)
-{
-       ARCSTAT_INCR(arcstat_l2_hdr_size, -HDR_SIZE);
-       ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
-}
-
 /*
  * Cycle through L2ARC devices.  This is how L2ARC load balances.
  * If a device is returned, this also returns holding the spa config lock.
@@ -4603,8 +5875,7 @@ l2arc_write_done(zio_t *zio)
        l2arc_write_callback_t *cb;
        l2arc_dev_t *dev;
        list_t *buflist;
-       arc_buf_hdr_t *head, *ab, *ab_prev;
-       l2arc_buf_hdr_t *abl2;
+       arc_buf_hdr_t *head, *hdr, *hdr_prev;
        kmutex_t *hash_lock;
        int64_t bytes_dropped = 0;
 
@@ -4614,7 +5885,7 @@ l2arc_write_done(zio_t *zio)
        ASSERT(dev != NULL);
        head = cb->l2wcb_head;
        ASSERT(head != NULL);
-       buflist = dev->l2ad_buflist;
+       buflist = &dev->l2ad_buflist;
        ASSERT(buflist != NULL);
        DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
            l2arc_write_callback_t *, cb);
@@ -4622,57 +5893,92 @@ l2arc_write_done(zio_t *zio)
        if (zio->io_error != 0)
                ARCSTAT_BUMP(arcstat_l2_writes_error);
 
-       mutex_enter(&l2arc_buflist_mtx);
-
        /*
         * All writes completed, or an error was hit.
         */
-       for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
-               ab_prev = list_prev(buflist, ab);
-               abl2 = ab->b_l2hdr;
+top:
+       mutex_enter(&dev->l2ad_mtx);
+       for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
+               hdr_prev = list_prev(buflist, hdr);
+
+               hash_lock = HDR_LOCK(hdr);
 
                /*
-                * Release the temporary compressed buffer as soon as possible.
+                * We cannot use mutex_enter or else we can deadlock
+                * with l2arc_write_buffers (due to swapping the order
+                * the hash lock and l2ad_mtx are taken).
                 */
-               if (abl2->b_compress != ZIO_COMPRESS_OFF)
-                       l2arc_release_cdata_buf(ab);
-
-               hash_lock = HDR_LOCK(ab);
                if (!mutex_tryenter(hash_lock)) {
                        /*
-                        * This buffer misses out.  It may be in a stage
-                        * of eviction.  Its ARC_L2_WRITING flag will be
-                        * left set, denying reads to this buffer.
+                        * Missed the hash lock. We must retry so we
+                        * don't leave the ARC_FLAG_L2_WRITING bit set.
                         */
-                       ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
-                       continue;
+                       ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
+
+                       /*
+                        * We don't want to rescan the headers we've
+                        * already marked as having been written out, so
+                        * we reinsert the head node so we can pick up
+                        * where we left off.
+                        */
+                       list_remove(buflist, head);
+                       list_insert_after(buflist, hdr, head);
+
+                       mutex_exit(&dev->l2ad_mtx);
+
+                       /*
+                        * We wait for the hash lock to become available
+                        * to try and prevent busy waiting, and increase
+                        * the chance we'll be able to acquire the lock
+                        * the next time around.
+                        */
+                       mutex_enter(hash_lock);
+                       mutex_exit(hash_lock);
+                       goto top;
                }
 
+               /*
+                * We could not have been moved into the arc_l2c_only
+                * state while in-flight due to our ARC_FLAG_L2_WRITING
+                * bit being set. Let's just ensure that's being enforced.
+                */
+               ASSERT(HDR_HAS_L1HDR(hdr));
+
+               /*
+                * We may have allocated a buffer for L2ARC compression,
+                * we must release it to avoid leaking this data.
+                */
+               l2arc_release_cdata_buf(hdr);
+
                if (zio->io_error != 0) {
                        /*
                         * Error - drop L2ARC entry.
                         */
-                       list_remove(buflist, ab);
-                       ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
-                       bytes_dropped += abl2->b_asize;
-                       ab->b_l2hdr = NULL;
-                       kmem_cache_free(l2arc_hdr_cache, abl2);
-                       arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
-                       ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
+                       list_remove(buflist, hdr);
+                       hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
+
+                       ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
+                       ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+
+                       bytes_dropped += hdr->b_l2hdr.b_asize;
+                       (void) refcount_remove_many(&dev->l2ad_alloc,
+                           hdr->b_l2hdr.b_asize, hdr);
                }
 
                /*
-                * Allow ARC to begin reads to this L2ARC entry.
+                * Allow ARC to begin reads and ghost list evictions to
+                * this L2ARC entry.
                 */
-               ab->b_flags &= ~ARC_L2_WRITING;
+               hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
 
                mutex_exit(hash_lock);
        }
 
        atomic_inc_64(&l2arc_writes_done);
        list_remove(buflist, head);
-       kmem_cache_free(hdr_cache, head);
-       mutex_exit(&l2arc_buflist_mtx);
+       ASSERT(!HDR_HAS_L1HDR(head));
+       kmem_cache_free(hdr_l2only_cache, head);
+       mutex_exit(&dev->l2ad_mtx);
 
        vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
 
@@ -4715,6 +6021,8 @@ l2arc_read_done(zio_t *zio)
        if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
                l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
        ASSERT(zio->io_data != NULL);
+       ASSERT3U(zio->io_size, ==, hdr->b_size);
+       ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size);
 
        /*
         * Check this survived the L2ARC journey.
@@ -4751,7 +6059,7 @@ l2arc_read_done(zio_t *zio)
                        ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
 
                        zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
-                           buf->b_data, zio->io_size, arc_read_done, buf,
+                           buf->b_data, hdr->b_size, arc_read_done, buf,
                            zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
                }
        }
@@ -4769,35 +6077,37 @@ l2arc_read_done(zio_t *zio)
  * the data lists.  This function returns a locked list, and also returns
  * the lock pointer.
  */
-static list_t *
-l2arc_list_locked(int list_num, kmutex_t **lock)
+static multilist_sublist_t *
+l2arc_sublist_lock(int list_num)
 {
-       list_t *list = NULL;
+       multilist_t *ml = NULL;
+       unsigned int idx;
 
        ASSERT(list_num >= 0 && list_num <= 3);
 
        switch (list_num) {
        case 0:
-               list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
-               *lock = &arc_mfu->arcs_mtx;
+               ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
                break;
        case 1:
-               list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
-               *lock = &arc_mru->arcs_mtx;
+               ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
                break;
        case 2:
-               list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
-               *lock = &arc_mfu->arcs_mtx;
+               ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
                break;
        case 3:
-               list = &arc_mru->arcs_list[ARC_BUFC_DATA];
-               *lock = &arc_mru->arcs_mtx;
+               ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
                break;
        }
 
-       ASSERT(!(MUTEX_HELD(*lock)));
-       mutex_enter(*lock);
-       return (list);
+       /*
+        * Return a randomly-selected sublist. This is acceptable
+        * because the caller feeds only a little bit of data for each
+        * call (8MB). Subsequent calls will result in different
+        * sublists being selected.
+        */
+       idx = multilist_get_random_index(ml);
+       return (multilist_sublist_lock(ml, idx));
 }
 
 /*
@@ -4810,16 +6120,11 @@ static void
 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
 {
        list_t *buflist;
-       l2arc_buf_hdr_t *abl2;
-       arc_buf_hdr_t *ab, *ab_prev;
+       arc_buf_hdr_t *hdr, *hdr_prev;
        kmutex_t *hash_lock;
        uint64_t taddr;
-       int64_t bytes_evicted = 0;
 
-       buflist = dev->l2ad_buflist;
-
-       if (buflist == NULL)
-               return;
+       buflist = &dev->l2ad_buflist;
 
        if (!all && dev->l2ad_first) {
                /*
@@ -4842,35 +6147,41 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
            uint64_t, taddr, boolean_t, all);
 
 top:
-       mutex_enter(&l2arc_buflist_mtx);
-       for (ab = list_tail(buflist); ab; ab = ab_prev) {
-               ab_prev = list_prev(buflist, ab);
+       mutex_enter(&dev->l2ad_mtx);
+       for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
+               hdr_prev = list_prev(buflist, hdr);
 
-               hash_lock = HDR_LOCK(ab);
+               hash_lock = HDR_LOCK(hdr);
+
+               /*
+                * We cannot use mutex_enter or else we can deadlock
+                * with l2arc_write_buffers (due to swapping the order
+                * the hash lock and l2ad_mtx are taken).
+                */
                if (!mutex_tryenter(hash_lock)) {
                        /*
                         * Missed the hash lock.  Retry.
                         */
                        ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
-                       mutex_exit(&l2arc_buflist_mtx);
+                       mutex_exit(&dev->l2ad_mtx);
                        mutex_enter(hash_lock);
                        mutex_exit(hash_lock);
                        goto top;
                }
 
-               if (HDR_L2_WRITE_HEAD(ab)) {
+               if (HDR_L2_WRITE_HEAD(hdr)) {
                        /*
                         * We hit a write head node.  Leave it for
                         * l2arc_write_done().
                         */
-                       list_remove(buflist, ab);
+                       list_remove(buflist, hdr);
                        mutex_exit(hash_lock);
                        continue;
                }
 
-               if (!all && ab->b_l2hdr != NULL &&
-                   (ab->b_l2hdr->b_daddr > taddr ||
-                   ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
+               if (!all && HDR_HAS_L2HDR(hdr) &&
+                   (hdr->b_l2hdr.b_daddr > taddr ||
+                   hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
                        /*
                         * We've evicted to the target address,
                         * or the end of the device.
@@ -4879,71 +6190,44 @@ top:
                        break;
                }
 
-               if (HDR_FREE_IN_PROGRESS(ab)) {
-                       /*
-                        * Already on the path to destruction.
-                        */
-                       mutex_exit(hash_lock);
-                       continue;
-               }
-
-               if (ab->b_state == arc_l2c_only) {
-                       ASSERT(!HDR_L2_READING(ab));
+               ASSERT(HDR_HAS_L2HDR(hdr));
+               if (!HDR_HAS_L1HDR(hdr)) {
+                       ASSERT(!HDR_L2_READING(hdr));
                        /*
                         * This doesn't exist in the ARC.  Destroy.
                         * arc_hdr_destroy() will call list_remove()
                         * and decrement arcstat_l2_size.
                         */
-                       arc_change_state(arc_anon, ab, hash_lock);
-                       arc_hdr_destroy(ab);
+                       arc_change_state(arc_anon, hdr, hash_lock);
+                       arc_hdr_destroy(hdr);
                } else {
+                       ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
+                       ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
                        /*
                         * Invalidate issued or about to be issued
                         * reads, since we may be about to write
                         * over this location.
                         */
-                       if (HDR_L2_READING(ab)) {
+                       if (HDR_L2_READING(hdr)) {
                                ARCSTAT_BUMP(arcstat_l2_evict_reading);
-                               ab->b_flags |= ARC_L2_EVICTED;
+                               hdr->b_flags |= ARC_FLAG_L2_EVICTED;
                        }
 
-                       /*
-                        * Tell ARC this no longer exists in L2ARC.
-                        */
-                       if (ab->b_l2hdr != NULL) {
-                               abl2 = ab->b_l2hdr;
-                               ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
-                               bytes_evicted += abl2->b_asize;
-                               ab->b_l2hdr = NULL;
-                               /*
-                                * We are destroying l2hdr, so ensure that
-                                * its compressed buffer, if any, is not leaked.
-                                */
-                               ASSERT(abl2->b_tmp_cdata == NULL);
-                               kmem_cache_free(l2arc_hdr_cache, abl2);
-                               arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
-                               ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
-                       }
-                       list_remove(buflist, ab);
+                       /* Ensure this header has finished being written */
+                       ASSERT(!HDR_L2_WRITING(hdr));
+                       ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
 
-                       /*
-                        * This may have been leftover after a
-                        * failed write.
-                        */
-                       ab->b_flags &= ~ARC_L2_WRITING;
+                       arc_hdr_l2hdr_destroy(hdr);
                }
                mutex_exit(hash_lock);
        }
-       mutex_exit(&l2arc_buflist_mtx);
-
-       vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
-       dev->l2ad_evict = taddr;
+       mutex_exit(&dev->l2ad_mtx);
 }
 
 /*
  * Find and write ARC buffers to the L2ARC device.
  *
- * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
+ * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
  * for reading until they have completed writing.
  * The headroom_boost is an in-out parameter used to maintain headroom boost
  * state between calls to this function.
@@ -4955,12 +6239,10 @@ static uint64_t
 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
     boolean_t *headroom_boost)
 {
-       arc_buf_hdr_t *ab, *ab_prev, *head;
-       list_t *list;
-       uint64_t write_asize, write_psize, write_sz, headroom,
-           buf_compress_minsz;
+       arc_buf_hdr_t *hdr, *hdr_prev, *head;
+       uint64_t write_asize, write_sz, headroom, buf_compress_minsz,
+           stats_size;
        void *buf_data;
-       kmutex_t *list_lock = NULL;
        boolean_t full;
        l2arc_write_callback_t *cb;
        zio_t *pio, *wzio;
@@ -4974,10 +6256,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
        *headroom_boost = B_FALSE;
 
        pio = NULL;
-       write_sz = write_asize = write_psize = 0;
+       write_sz = write_asize = 0;
        full = B_FALSE;
-       head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
-       head->b_flags |= ARC_L2_WRITE_HEAD;
+       head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
+       head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
+       head->b_flags |= ARC_FLAG_HAS_L2HDR;
 
        /*
         * We will want to try to compress buffers that are at least 2x the
@@ -4988,12 +6271,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
        /*
         * Copy buffers for L2ARC writing.
         */
-       mutex_enter(&l2arc_buflist_mtx);
        for (try = 0; try <= 3; try++) {
+               multilist_sublist_t *mls = l2arc_sublist_lock(try);
                uint64_t passed_sz = 0;
 
-               list = l2arc_list_locked(try, &list_lock);
-
                /*
                 * L2ARC fast warmup.
                 *
@@ -5001,25 +6282,25 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
                 * head of the ARC lists rather than the tail.
                 */
                if (arc_warm == B_FALSE)
-                       ab = list_head(list);
+                       hdr = multilist_sublist_head(mls);
                else
-                       ab = list_tail(list);
+                       hdr = multilist_sublist_tail(mls);
 
                headroom = target_sz * l2arc_headroom;
                if (do_headroom_boost)
                        headroom = (headroom * l2arc_headroom_boost) / 100;
 
-               for (; ab; ab = ab_prev) {
-                       l2arc_buf_hdr_t *l2hdr;
+               for (; hdr; hdr = hdr_prev) {
                        kmutex_t *hash_lock;
                        uint64_t buf_sz;
+                       uint64_t buf_a_sz;
 
                        if (arc_warm == B_FALSE)
-                               ab_prev = list_next(list, ab);
+                               hdr_prev = multilist_sublist_next(mls, hdr);
                        else
-                               ab_prev = list_prev(list, ab);
+                               hdr_prev = multilist_sublist_prev(mls, hdr);
 
-                       hash_lock = HDR_LOCK(ab);
+                       hash_lock = HDR_LOCK(hdr);
                        if (!mutex_tryenter(hash_lock)) {
                                /*
                                 * Skip this buffer rather than waiting.
@@ -5027,7 +6308,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
                                continue;
                        }
 
-                       passed_sz += ab->b_size;
+                       passed_sz += hdr->b_size;
                        if (passed_sz > headroom) {
                                /*
                                 * Searched too far.
@@ -5036,12 +6317,20 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
                                break;
                        }
 
-                       if (!l2arc_write_eligible(guid, ab)) {
+                       if (!l2arc_write_eligible(guid, hdr)) {
                                mutex_exit(hash_lock);
                                continue;
                        }
 
-                       if ((write_sz + ab->b_size) > target_sz) {
+                       /*
+                        * Assume that the buffer is not going to be compressed
+                        * and could take more space on disk because of a larger
+                        * disk block size.
+                        */
+                       buf_sz = hdr->b_size;
+                       buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
+
+                       if ((write_asize + buf_a_sz) > target_sz) {
                                full = B_TRUE;
                                mutex_exit(hash_lock);
                                break;
@@ -5053,10 +6342,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
                                 * l2arc_write_done() can find where the
                                 * write buffers begin without searching.
                                 */
-                               list_insert_head(dev->l2ad_buflist, head);
+                               mutex_enter(&dev->l2ad_mtx);
+                               list_insert_head(&dev->l2ad_buflist, head);
+                               mutex_exit(&dev->l2ad_mtx);
 
-                               cb = kmem_alloc(sizeof (l2arc_write_callback_t),
-                                   KM_SLEEP);
+                               cb = kmem_alloc(
+                                   sizeof (l2arc_write_callback_t), KM_SLEEP);
                                cb->l2wcb_dev = dev;
                                cb->l2wcb_head = head;
                                pio = zio_root(spa, l2arc_write_done, cb,
@@ -5066,44 +6357,63 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
                        /*
                         * Create and add a new L2ARC header.
                         */
-                       l2hdr = kmem_cache_alloc(l2arc_hdr_cache, KM_SLEEP);
-                       l2hdr->b_dev = dev;
-                       l2hdr->b_daddr = 0;
-                       arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS);
-
-                       ab->b_flags |= ARC_L2_WRITING;
-
+                       hdr->b_l2hdr.b_dev = dev;
+                       hdr->b_flags |= ARC_FLAG_L2_WRITING;
                        /*
                         * Temporarily stash the data buffer in b_tmp_cdata.
                         * The subsequent write step will pick it up from
-                        * there. This is because can't access ab->b_buf
+                        * there. This is because we can't access b_l1hdr.b_buf
                         * without holding the hash_lock, which we in turn
                         * can't access without holding the ARC list locks
                         * (which we want to avoid during compression/writing)
                         */
-                       l2hdr->b_compress = ZIO_COMPRESS_OFF;
-                       l2hdr->b_asize = ab->b_size;
-                       l2hdr->b_tmp_cdata = ab->b_buf->b_data;
-                       l2hdr->b_hits = 0;
+                       hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF;
+                       hdr->b_l2hdr.b_asize = hdr->b_size;
+                       hdr->b_l2hdr.b_hits = 0;
+                       hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
 
-                       buf_sz = ab->b_size;
-                       ab->b_l2hdr = l2hdr;
+                       /*
+                        * Explicitly set the b_daddr field to a known
+                        * value which means "invalid address". This
+                        * enables us to differentiate which stage of
+                        * l2arc_write_buffers() the particular header
+                        * is in (e.g. this loop, or the one below).
+                        * ARC_FLAG_L2_WRITING is not enough to make
+                        * this distinction, and we need to know in
+                        * order to do proper l2arc vdev accounting in
+                        * arc_release() and arc_hdr_destroy().
+                        *
+                        * Note, we can't use a new flag to distinguish
+                        * the two stages because we don't hold the
+                        * header's hash_lock below, in the second stage
+                        * of this function. Thus, we can't simply
+                        * change the b_flags field to denote that the
+                        * IO has been sent. We can change the b_daddr
+                        * field of the L2 portion, though, since we'll
+                        * be holding the l2ad_mtx; which is why we're
+                        * using it to denote the header's state change.
+                        */
+                       hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
+                       hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
 
-                       list_insert_head(dev->l2ad_buflist, ab);
+                       mutex_enter(&dev->l2ad_mtx);
+                       list_insert_head(&dev->l2ad_buflist, hdr);
+                       mutex_exit(&dev->l2ad_mtx);
 
                        /*
                         * Compute and store the buffer cksum before
                         * writing.  On debug the cksum is verified first.
                         */
-                       arc_cksum_verify(ab->b_buf);
-                       arc_cksum_compute(ab->b_buf, B_TRUE);
+                       arc_cksum_verify(hdr->b_l1hdr.b_buf);
+                       arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
 
                        mutex_exit(hash_lock);
 
                        write_sz += buf_sz;
+                       write_asize += buf_a_sz;
                }
 
-               mutex_exit(list_lock);
+               multilist_sublist_unlock(mls);
 
                if (full == B_TRUE)
                        break;
@@ -5112,33 +6422,55 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
        /* No buffers selected for writing? */
        if (pio == NULL) {
                ASSERT0(write_sz);
-               mutex_exit(&l2arc_buflist_mtx);
-               kmem_cache_free(hdr_cache, head);
+               ASSERT(!HDR_HAS_L1HDR(head));
+               kmem_cache_free(hdr_l2only_cache, head);
                return (0);
        }
 
+       mutex_enter(&dev->l2ad_mtx);
+
+       /*
+        * Note that elsewhere in this file arcstat_l2_asize
+        * and the used space on l2ad_vdev are updated using b_asize,
+        * which is not necessarily rounded up to the device block size.
+        * To keep accounting consistent we do the same here as well:
+        * stats_size accumulates the sum of b_asize of the written buffers,
+        * while write_asize accumulates the sum of b_asize rounded up
+        * to the device block size.
+        * The latter sum is used only to validate the correctness of the code.
+        */
+       stats_size = 0;
+       write_asize = 0;
+
        /*
         * Now start writing the buffers. We're starting at the write head
         * and work backwards, retracing the course of the buffer selector
         * loop above.
         */
-       for (ab = list_prev(dev->l2ad_buflist, head); ab;
-           ab = list_prev(dev->l2ad_buflist, ab)) {
-               l2arc_buf_hdr_t *l2hdr;
+       for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
+           hdr = list_prev(&dev->l2ad_buflist, hdr)) {
                uint64_t buf_sz;
 
+               /*
+                * We rely on the L1 portion of the header below, so
+                * it's invalid for this header to have been evicted out
+                * of the ghost cache, prior to being written out. The
+                * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+                */
+               ASSERT(HDR_HAS_L1HDR(hdr));
+
                /*
                 * We shouldn't need to lock the buffer here, since we flagged
-                * it as ARC_L2_WRITING in the previous step, but we must take
-                * care to only access its L2 cache parameters. In particular,
-                * ab->b_buf may be invalid by now due to ARC eviction.
+                * it as ARC_FLAG_L2_WRITING in the previous step, but we must
+                * take care to only access its L2 cache parameters. In
+                * particular, hdr->b_l1hdr.b_buf may be invalid by now due to
+                * ARC eviction.
                 */
-               l2hdr = ab->b_l2hdr;
-               l2hdr->b_daddr = dev->l2ad_hand;
+               hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
 
-               if (!l2arc_nocompress && (ab->b_flags & ARC_L2COMPRESS) &&
-                   l2hdr->b_asize >= buf_compress_minsz) {
-                       if (l2arc_compress_buf(l2hdr)) {
+               if ((!l2arc_nocompress && HDR_L2COMPRESS(hdr)) &&
+                   hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
+                       if (l2arc_compress_buf(hdr)) {
                                /*
                                 * If compression succeeded, enable headroom
                                 * boost on the next scan cycle.
@@ -5151,20 +6483,19 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
                 * Pick up the buffer data we had previously stashed away
                 * (and now potentially also compressed).
                 */
-               buf_data = l2hdr->b_tmp_cdata;
-               buf_sz = l2hdr->b_asize;
+               buf_data = hdr->b_l1hdr.b_tmp_cdata;
+               buf_sz = hdr->b_l2hdr.b_asize;
 
                /*
-                * If the data has not been compressed, then clear b_tmp_cdata
-                * to make sure that it points only to a temporary compression
-                * buffer.
+                * We need to do this regardless if buf_sz is zero or
+                * not, otherwise, when this l2hdr is evicted we'll
+                * remove a reference that was never added.
                 */
-               if (!L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress))
-                       l2hdr->b_tmp_cdata = NULL;
+               (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
 
                /* Compression may have squashed the buffer to zero length. */
                if (buf_sz != 0) {
-                       uint64_t buf_p_sz;
+                       uint64_t buf_a_sz;
 
                        wzio = zio_write_phys(pio, dev->l2ad_vdev,
                            dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
@@ -5175,24 +6506,25 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
                            zio_t *, wzio);
                        (void) zio_nowait(wzio);
 
-                       write_asize += buf_sz;
+                       stats_size += buf_sz;
+
                        /*
                         * Keep the clock hand suitably device-aligned.
                         */
-                       buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
-                       write_psize += buf_p_sz;
-                       dev->l2ad_hand += buf_p_sz;
+                       buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
+                       write_asize += buf_a_sz;
+                       dev->l2ad_hand += buf_a_sz;
                }
        }
 
-       mutex_exit(&l2arc_buflist_mtx);
+       mutex_exit(&dev->l2ad_mtx);
 
        ASSERT3U(write_asize, <=, target_sz);
        ARCSTAT_BUMP(arcstat_l2_writes_sent);
        ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
        ARCSTAT_INCR(arcstat_l2_size, write_sz);
-       ARCSTAT_INCR(arcstat_l2_asize, write_asize);
-       vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
+       ARCSTAT_INCR(arcstat_l2_asize, stats_size);
+       vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0);
 
        /*
         * Bump device hand to the device start if it is approaching the end.
@@ -5200,7 +6532,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
         */
        if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
                dev->l2ad_hand = dev->l2ad_start;
-               dev->l2ad_evict = dev->l2ad_start;
                dev->l2ad_first = B_FALSE;
        }
 
@@ -5213,7 +6544,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
 
 /*
  * Compresses an L2ARC buffer.
- * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
+ * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
  * size in l2hdr->b_asize. This routine tries to compress the data and
  * depending on the compression result there are three possible outcomes:
  * *) The buffer was incompressible. The original l2hdr contents were left
@@ -5231,17 +6562,24 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
  * buffer was incompressible).
  */
 static boolean_t
-l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
+l2arc_compress_buf(arc_buf_hdr_t *hdr)
 {
        void *cdata;
        size_t csize, len, rounded;
+       l2arc_buf_hdr_t *l2hdr;
+
+       ASSERT(HDR_HAS_L2HDR(hdr));
+
+       l2hdr = &hdr->b_l2hdr;
 
-       ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
-       ASSERT(l2hdr->b_tmp_cdata != NULL);
+       ASSERT(HDR_HAS_L1HDR(hdr));
+       ASSERT3U(l2hdr->b_compress, ==, ZIO_COMPRESS_OFF);
+       ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
 
        len = l2hdr->b_asize;
        cdata = zio_data_buf_alloc(len);
-       csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
+       ASSERT3P(cdata, !=, NULL);
+       csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
            cdata, l2hdr->b_asize);
 
        rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
@@ -5255,7 +6593,7 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
                zio_data_buf_free(cdata, len);
                l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
                l2hdr->b_asize = 0;
-               l2hdr->b_tmp_cdata = NULL;
+               hdr->b_l1hdr.b_tmp_cdata = NULL;
                ARCSTAT_BUMP(arcstat_l2_compress_zeros);
                return (B_TRUE);
        } else if (csize > 0 && csize < len) {
@@ -5265,7 +6603,7 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
                 */
                l2hdr->b_compress = ZIO_COMPRESS_LZ4;
                l2hdr->b_asize = csize;
-               l2hdr->b_tmp_cdata = cdata;
+               hdr->b_l1hdr.b_tmp_cdata = cdata;
                ARCSTAT_BUMP(arcstat_l2_compress_successes);
                return (B_TRUE);
        } else {
@@ -5313,9 +6651,9 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
                 * need to fill its io_data after we're done restoring the
                 * buffer's contents.
                 */
-               ASSERT(hdr->b_buf != NULL);
-               bzero(hdr->b_buf->b_data, hdr->b_size);
-               zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
+               ASSERT(hdr->b_l1hdr.b_buf != NULL);
+               bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
+               zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
        } else {
                ASSERT(zio->io_data != NULL);
                /*
@@ -5333,7 +6671,7 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
                bcopy(zio->io_data, cdata, csize);
                if (zio_decompress_data(c, cdata, zio->io_data, csize,
                    hdr->b_size) != 0)
-                       zio->io_error = SET_ERROR(EIO);
+                       zio->io_error = EIO;
                zio_data_buf_free(cdata, csize);
        }
 
@@ -5348,22 +6686,40 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
  * done, we can dispose of it.
  */
 static void
-l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
+l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
 {
-       l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
+       enum zio_compress comp;
+
+       ASSERT(HDR_HAS_L1HDR(hdr));
+       ASSERT(HDR_HAS_L2HDR(hdr));
+       comp = hdr->b_l2hdr.b_compress;
+       ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));
 
-       ASSERT(L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress));
-       if (l2hdr->b_compress != ZIO_COMPRESS_EMPTY) {
+       if (comp == ZIO_COMPRESS_OFF) {
+               /*
+                * In this case, b_tmp_cdata points to the same buffer
+                * as the arc_buf_t's b_data field. We don't want to
+                * free it, since the arc_buf_t will handle that.
+                */
+               hdr->b_l1hdr.b_tmp_cdata = NULL;
+       } else if (comp == ZIO_COMPRESS_EMPTY) {
+               /*
+                * In this case, b_tmp_cdata was compressed to an empty
+                * buffer, thus there's nothing to free and b_tmp_cdata
+                * should have been set to NULL in l2arc_write_buffers().
+                */
+               ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+       } else {
                /*
                 * If the data was compressed, then we've allocated a
                 * temporary buffer for it, so now we need to release it.
                 */
-               ASSERT(l2hdr->b_tmp_cdata != NULL);
-               zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
-               l2hdr->b_tmp_cdata = NULL;
-       } else {
-               ASSERT(l2hdr->b_tmp_cdata == NULL);
+               ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
+               zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
+                   hdr->b_size);
+               hdr->b_l1hdr.b_tmp_cdata = NULL;
        }
+
 }
 
 /*
@@ -5388,7 +6744,7 @@ l2arc_feed_thread(void)
        cookie = spl_fstrans_mark();
        while (l2arc_thread_exit == 0) {
                CALLB_CPR_SAFE_BEGIN(&cpr);
-               (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv,
+               (void) cv_timedwait_sig(&l2arc_feed_thr_cv,
                    &l2arc_feed_thr_lock, next);
                CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
                next = ddi_get_lbolt() + hz;
@@ -5433,7 +6789,7 @@ l2arc_feed_thread(void)
                /*
                 * Avoid contributing to memory pressure.
                 */
-               if (arc_no_grow) {
+               if (arc_reclaim_needed()) {
                        ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
                        spa_config_exit(spa, SCL_L2ARC, dev);
                        continue;
@@ -5503,20 +6859,20 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
        adddev->l2ad_start = VDEV_LABEL_START_SIZE;
        adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
        adddev->l2ad_hand = adddev->l2ad_start;
-       adddev->l2ad_evict = adddev->l2ad_start;
        adddev->l2ad_first = B_TRUE;
        adddev->l2ad_writing = B_FALSE;
        list_link_init(&adddev->l2ad_node);
 
+       mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
        /*
         * This is a list of all ARC buffers that are still valid on the
         * device.
         */
-       adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
-       list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
-           offsetof(arc_buf_hdr_t, b_l2node));
+       list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
+           offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
 
        vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
+       refcount_create(&adddev->l2ad_alloc);
 
        /*
         * Add device to global list
@@ -5560,8 +6916,9 @@ l2arc_remove_vdev(vdev_t *vd)
         * Clear all buflists and ARC references.  L2ARC device flush.
         */
        l2arc_evict(remdev, 0, B_TRUE);
-       list_destroy(remdev->l2ad_buflist);
-       kmem_free(remdev->l2ad_buflist, sizeof (list_t));
+       list_destroy(&remdev->l2ad_buflist);
+       mutex_destroy(&remdev->l2ad_mtx);
+       refcount_destroy(&remdev->l2ad_alloc);
        kmem_free(remdev, sizeof (l2arc_dev_t));
 }
 
@@ -5576,7 +6933,6 @@ l2arc_init(void)
        mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
        mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
 
        l2arc_dev_list = &L2ARC_dev_list;
@@ -5601,7 +6957,6 @@ l2arc_fini(void)
        mutex_destroy(&l2arc_feed_thr_lock);
        cv_destroy(&l2arc_feed_thr_cv);
        mutex_destroy(&l2arc_dev_mtx);
-       mutex_destroy(&l2arc_buflist_mtx);
        mutex_destroy(&l2arc_free_on_write_mtx);
 
        list_destroy(l2arc_dev_list);
@@ -5615,7 +6970,7 @@ l2arc_start(void)
                return;
 
        (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
-           TS_RUN, minclsyspri);
+           TS_RUN, defclsyspri);
 }
 
 void
@@ -5651,13 +7006,19 @@ MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
 module_param(zfs_arc_meta_limit, ulong, 0644);
 MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
 
+module_param(zfs_arc_meta_min, ulong, 0644);
+MODULE_PARM_DESC(zfs_arc_meta_min, "Min arc metadata");
+
 module_param(zfs_arc_meta_prune, int, 0644);
 MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune");
 
-module_param(zfs_arc_meta_adjust_restarts, ulong, 0644);
+module_param(zfs_arc_meta_adjust_restarts, int, 0644);
 MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
        "Limit number of restarts in arc_adjust_meta");
 
+module_param(zfs_arc_meta_strategy, int, 0644);
+MODULE_PARM_DESC(zfs_arc_meta_strategy, "Meta reclaim strategy");
+
 module_param(zfs_arc_grow_retry, int, 0644);
 MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
 
@@ -5670,18 +7031,22 @@ MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "disable arc_p adapt dampener");
 module_param(zfs_arc_shrink_shift, int, 0644);
 MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
 
+module_param(zfs_arc_p_min_shift, int, 0644);
+MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
+
 module_param(zfs_disable_dup_eviction, int, 0644);
 MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");
 
 module_param(zfs_arc_average_blocksize, int, 0444);
 MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size");
 
-module_param(zfs_arc_memory_throttle_disable, int, 0644);
-MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle");
-
 module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
 MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
 
+module_param(zfs_arc_num_sublists_per_state, int, 0644);
+MODULE_PARM_DESC(zfs_arc_num_sublists_per_state,
+       "Number of sublists used in each of the ARC state lists");
+
 module_param(l2arc_write_max, ulong, 0644);
 MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
 
@@ -5712,4 +7077,11 @@ MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
 module_param(l2arc_norw, int, 0644);
 MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
 
+module_param(zfs_arc_lotsfree_percent, int, 0644);
+MODULE_PARM_DESC(zfs_arc_lotsfree_percent,
+       "System free memory I/O throttle in bytes");
+
+module_param(zfs_arc_sys_free, ulong, 0644);
+MODULE_PARM_DESC(zfs_arc_sys_free, "System free memory target size in bytes");
+
 #endif
index 7c8f932f5cf316941bb27935db9de04274b62951..17d98c36e13479b194834819b0bb5698d8925cc5 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/bpobj.h>
@@ -43,7 +43,7 @@ bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
                if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
                        ASSERT0(dp->dp_empty_bpobj);
                        dp->dp_empty_bpobj =
-                           bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
+                           bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
                        VERIFY(zap_add(os,
                            DMU_POOL_DIRECTORY_OBJECT,
                            DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
@@ -256,9 +256,8 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
                dbuf = NULL;
        }
        if (free) {
-               i++;
                VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
-                   i * sizeof (blkptr_t), -1ULL, tx));
+                   (i + 1) * sizeof (blkptr_t), -1ULL, tx));
        }
        if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
                goto out;
@@ -301,8 +300,10 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
                if (free) {
                        err = bpobj_space(&sublist,
                            &used_before, &comp_before, &uncomp_before);
-                       if (err)
+                       if (err != 0) {
+                               bpobj_close(&sublist);
                                break;
+                       }
                }
                err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
                if (free) {
@@ -397,7 +398,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
        dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
        if (bpo->bpo_phys->bpo_subobjs == 0) {
                bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
-                   DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+                   DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+                   DMU_OT_NONE, 0, tx);
        }
 
        ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi));
index d6ea9d7c645143be5425e50ad81a6ce523621c05..9f62d7b911f361ea2397bd9f96c4a756b883d6c5 100644 (file)
@@ -65,7 +65,7 @@ bptree_alloc(objset_t *os, dmu_tx_t *tx)
        bptree_phys_t *bt;
 
        obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
-           SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+           SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
            sizeof (bptree_phys_t), tx);
 
        /*
index ed6a8fd2a4dc1b59f75321625759cbe9580835dc..d340da821fc50f0e65bf34acea66383b0be42804 100644 (file)
@@ -21,8 +21,9 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -78,10 +79,16 @@ static void dbuf_destroy(dmu_buf_impl_t *db);
 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 
+#ifndef __lint
+extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
+    dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp);
+#endif /* ! __lint */
+
 /*
  * Global data structures and functions for the dbuf cache.
  */
 static kmem_cache_t *dbuf_cache;
+static taskq_t *dbu_evict_taskq;
 
 /* ARGSUSED */
 static int
@@ -93,7 +100,7 @@ dbuf_cons(void *vdb, void *unused, int kmflag)
        mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
        refcount_create(&db->db_holds);
-       list_link_init(&db->db_link);
+
        return (0);
 }
 
@@ -142,16 +149,13 @@ dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
        (dbuf)->db_blkid == (blkid))
 
 dmu_buf_impl_t *
-dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
+dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
 {
        dbuf_hash_table_t *h = &dbuf_hash_table;
-       objset_t *os = dn->dn_objset;
-       uint64_t obj;
        uint64_t hv;
        uint64_t idx;
        dmu_buf_impl_t *db;
 
-       obj = dn->dn_object;
        hv = DBUF_HASH(os, obj, level, blkid);
        idx = hv & h->hash_table_mask;
 
@@ -170,6 +174,24 @@ dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
        return (NULL);
 }
 
+static dmu_buf_impl_t *
+dbuf_find_bonus(objset_t *os, uint64_t object)
+{
+       dnode_t *dn;
+       dmu_buf_impl_t *db = NULL;
+
+       if (dnode_hold(os, object, FTAG, &dn) == 0) {
+               rw_enter(&dn->dn_struct_rwlock, RW_READER);
+               if (dn->dn_bonus != NULL) {
+                       db = dn->dn_bonus;
+                       mutex_enter(&db->db_mtx);
+               }
+               rw_exit(&dn->dn_struct_rwlock);
+               dnode_rele(dn, FTAG);
+       }
+       return (db);
+}
+
 /*
  * Insert an entry into the hash table.  If there is already an element
  * equal to elem in the hash table, then the already existing element
@@ -247,20 +269,72 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
 
 static arc_evict_func_t dbuf_do_evict;
 
+typedef enum {
+       DBVU_EVICTING,
+       DBVU_NOT_EVICTING
+} dbvu_verify_type_t;
+
+static void
+dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
+{
+#ifdef ZFS_DEBUG
+       int64_t holds;
+
+       if (db->db_user == NULL)
+               return;
+
+       /* Only data blocks support the attachment of user data. */
+       ASSERT(db->db_level == 0);
+
+       /* Clients must resolve a dbuf before attaching user data. */
+       ASSERT(db->db.db_data != NULL);
+       ASSERT3U(db->db_state, ==, DB_CACHED);
+
+       holds = refcount_count(&db->db_holds);
+       if (verify_type == DBVU_EVICTING) {
+               /*
+                * Immediate eviction occurs when holds == dirtycnt.
+                * For normal eviction buffers, holds is zero on
+                * eviction, except when dbuf_fix_old_data() calls
+                * dbuf_clear_data().  However, the hold count can grow
+                * during eviction even though db_mtx is held (see
+                * dmu_bonus_hold() for an example), so we can only
+                * test the generic invariant that holds >= dirtycnt.
+                */
+               ASSERT3U(holds, >=, db->db_dirtycnt);
+       } else {
+               if (db->db_user_immediate_evict == TRUE)
+                       ASSERT3U(holds, >=, db->db_dirtycnt);
+               else
+                       ASSERT3U(holds, >, 0);
+       }
+#endif
+}
+
 static void
 dbuf_evict_user(dmu_buf_impl_t *db)
 {
+       dmu_buf_user_t *dbu = db->db_user;
+
        ASSERT(MUTEX_HELD(&db->db_mtx));
 
-       if (db->db_level != 0 || db->db_evict_func == NULL)
+       if (dbu == NULL)
                return;
 
-       if (db->db_user_data_ptr_ptr)
-               *db->db_user_data_ptr_ptr = db->db.db_data;
-       db->db_evict_func(&db->db, db->db_user_ptr);
-       db->db_user_ptr = NULL;
-       db->db_user_data_ptr_ptr = NULL;
-       db->db_evict_func = NULL;
+       dbuf_verify_user(db, DBVU_EVICTING);
+       db->db_user = NULL;
+
+#ifdef ZFS_DEBUG
+       if (dbu->dbu_clear_on_evict_dbufp != NULL)
+               *dbu->dbu_clear_on_evict_dbufp = NULL;
+#endif
+
+       /*
+        * Invoke the callback from a taskq to avoid lock order reversals
+        * and limit stack depth.
+        */
+       taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0,
+           &dbu->dbu_tqent);
 }
 
 boolean_t
@@ -302,10 +376,11 @@ dbuf_init(void)
 
        /*
         * The hash table is big enough to fill all of physical memory
-        * with an average 4K block size.  The table will take up
-        * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
+        * with an average block size of zfs_arc_average_blocksize (default 8K).
+        * By default, the table will take up
+        * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
         */
-       while (hsize * 4096 < physmem * PAGESIZE)
+       while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
                hsize <<= 1;
 
 retry:
@@ -334,6 +409,12 @@ retry:
                mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
 
        dbuf_stats_init(h);
+
+       /*
+        * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
+        * configuration is not required.
+        */
+       dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
 }
 
 void
@@ -356,6 +437,7 @@ dbuf_fini(void)
        kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
 #endif
        kmem_cache_destroy(dbuf_cache);
+       taskq_destroy(dbu_evict_taskq);
 }
 
 /*
@@ -386,7 +468,7 @@ dbuf_verify(dmu_buf_impl_t *db)
                ASSERT3U(db->db_level, <, dn->dn_nlevels);
                ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
                    db->db_blkid == DMU_SPILL_BLKID ||
-                   !list_is_empty(&dn->dn_dbufs));
+                   !avl_is_empty(&dn->dn_dbufs));
        }
        if (db->db_blkid == DMU_BONUS_BLKID) {
                ASSERT(dn != NULL);
@@ -474,32 +556,27 @@ dbuf_verify(dmu_buf_impl_t *db)
 #endif
 
 static void
-dbuf_update_data(dmu_buf_impl_t *db)
+dbuf_clear_data(dmu_buf_impl_t *db)
 {
        ASSERT(MUTEX_HELD(&db->db_mtx));
-       if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
-               ASSERT(!refcount_is_zero(&db->db_holds));
-               *db->db_user_data_ptr_ptr = db->db.db_data;
-       }
+       dbuf_evict_user(db);
+       db->db_buf = NULL;
+       db->db.db_data = NULL;
+       if (db->db_state != DB_NOFILL)
+               db->db_state = DB_UNCACHED;
 }
 
 static void
 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 {
        ASSERT(MUTEX_HELD(&db->db_mtx));
+       ASSERT(buf != NULL);
+
        db->db_buf = buf;
-       if (buf != NULL) {
-               ASSERT(buf->b_data != NULL);
-               db->db.db_data = buf->b_data;
-               if (!arc_released(buf))
-                       arc_set_callback(buf, dbuf_do_evict, db);
-               dbuf_update_data(db);
-       } else {
-               dbuf_evict_user(db);
-               db->db.db_data = NULL;
-               if (db->db_state != DB_NOFILL)
-                       db->db_state = DB_UNCACHED;
-       }
+       ASSERT(buf->b_data != NULL);
+       db->db.db_data = buf->b_data;
+       if (!arc_released(buf))
+               arc_set_callback(buf, dbuf_do_evict, db);
 }
 
 /*
@@ -521,7 +598,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
        } else {
                abuf = db->db_buf;
                arc_loan_inuse_buf(abuf, db);
-               dbuf_set_data(db, NULL);
+               dbuf_clear_data(db);
                mutex_exit(&db->db_mtx);
        }
        return (abuf);
@@ -577,7 +654,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 {
        dnode_t *dn;
        zbookmark_phys_t zb;
-       uint32_t aflags = ARC_NOWAIT;
+       uint32_t aflags = ARC_FLAG_NOWAIT;
        int err;
 
        DB_DNODE_ENTER(db);
@@ -600,7 +677,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
                if (bonuslen)
                        bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
                DB_DNODE_EXIT(db);
-               dbuf_update_data(db);
                db->db_state = DB_CACHED;
                mutex_exit(&db->db_mtx);
                return (0);
@@ -632,9 +708,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
        mutex_exit(&db->db_mtx);
 
        if (DBUF_IS_L2CACHEABLE(db))
-               aflags |= ARC_L2CACHE;
+               aflags |= ARC_FLAG_L2CACHE;
        if (DBUF_IS_L2COMPRESSIBLE(db))
-               aflags |= ARC_L2COMPRESS;
+               aflags |= ARC_FLAG_L2COMPRESS;
 
        SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
            db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
@@ -646,7 +722,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
            dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
            (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
            &aflags, &zb);
-       if (aflags & ARC_CACHED)
+       if (aflags & ARC_FLAG_CACHED)
                *flags |= DB_RF_CACHED;
 
        return (SET_ERROR(err));
@@ -762,7 +838,7 @@ dbuf_noread(dmu_buf_impl_t *db)
                dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
                db->db_state = DB_FILL;
        } else if (db->db_state == DB_NOFILL) {
-               dbuf_set_data(db, NULL);
+               dbuf_clear_data(db);
        } else {
                ASSERT3U(db->db_state, ==, DB_CACHED);
        }
@@ -818,7 +894,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
                dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
                bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
        } else {
-               dbuf_set_data(db, NULL);
+               dbuf_clear_data(db);
        }
 }
 
@@ -866,23 +942,35 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
  * receive; see comment below for details.
  */
 void
-dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
+dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+    dmu_tx_t *tx)
 {
+       dmu_buf_impl_t *db_search;
        dmu_buf_impl_t *db, *db_next;
        uint64_t txg = tx->tx_txg;
+       avl_index_t where;
        boolean_t freespill =
-           (start == DMU_SPILL_BLKID || end == DMU_SPILL_BLKID);
+           (start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID);
+
+       if (end_blkid > dn->dn_maxblkid && !freespill)
+               end_blkid = dn->dn_maxblkid;
+       dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
 
-       if (end > dn->dn_maxblkid && !freespill)
-               end = dn->dn_maxblkid;
-       dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
+       db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
+       db_search->db_level = 0;
+       db_search->db_blkid = start_blkid;
+       db_search->db_state = DB_SEARCH;
 
        mutex_enter(&dn->dn_dbufs_mtx);
-       if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz &&
-           !freespill) {
+       if (start_blkid >= dn->dn_unlisted_l0_blkid && !freespill) {
                /* There can't be any dbufs in this range; no need to search. */
-               mutex_exit(&dn->dn_dbufs_mtx);
-               return;
+#ifdef DEBUG
+               db = avl_find(&dn->dn_dbufs, db_search, &where);
+               ASSERT3P(db, ==, NULL);
+               db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+               ASSERT(db == NULL || db->db_level > 0);
+#endif
+               goto out;
        } else if (dmu_objset_is_receiving(dn->dn_objset)) {
                /*
                 * If we are receiving, we expect there to be no dbufs in
@@ -894,19 +982,18 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
                atomic_inc_64(&zfs_free_range_recv_miss);
        }
 
-       for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
-               db_next = list_next(&dn->dn_dbufs, db);
+       db = avl_find(&dn->dn_dbufs, db_search, &where);
+       ASSERT3P(db, ==, NULL);
+       db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+       for (; db != NULL; db = db_next) {
+               db_next = AVL_NEXT(&dn->dn_dbufs, db);
                ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
-               /* Skip indirect blocks. */
-               if (db->db_level != 0)
-                       continue;
-               /* Skip direct blocks outside the range. */
-               if (!freespill && (db->db_blkid < start || db->db_blkid > end))
-                       continue;
-               /* Skip all direct blocks, only free spill blocks. */
-               if (freespill && (db->db_blkid != DMU_SPILL_BLKID))
-                       continue;
+               if (db->db_level != 0 || db->db_blkid > end_blkid) {
+                       break;
+               }
+               ASSERT3U(db->db_blkid, >=, start_blkid);
 
                /* found a level 0 buffer in the range */
                mutex_enter(&db->db_mtx);
@@ -968,6 +1055,9 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
 
                mutex_exit(&db->db_mtx);
        }
+
+out:
+       kmem_free(db_search, sizeof (dmu_buf_impl_t));
        mutex_exit(&dn->dn_dbufs_mtx);
 }
 
@@ -1366,6 +1456,16 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        dbuf_dirty_record_t *dr, **drp;
 
        ASSERT(txg != 0);
+
+       /*
+        * Due to our use of dn_nlevels below, this can only be called
+        * in open context, unless we are operating on the MOS.
+        * From syncing context, dn_nlevels may be different from the
+        * dn_nlevels used when dbuf was dirtied.
+        */
+       ASSERT(db->db_objset ==
+           dmu_objset_pool(db->db_objset)->dp_meta_objset ||
+           txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
        ASSERT(db->db_blkid != DMU_BONUS_BLKID);
        ASSERT0(db->db_level);
        ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1388,11 +1488,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
        ASSERT(db->db.db_size != 0);
 
-       /*
-        * Any space we accounted for in dp_dirty_* will be cleaned up by
-        * dsl_pool_sync().  This is relatively rare so the discrepancy
-        * is not a big deal.
-        */
+       dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
+           dr->dr_accounted, txg);
 
        *drp = dr->dr_next;
 
@@ -1407,7 +1504,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                list_remove(&dr->dr_parent->dt.di.dr_children, dr);
                mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
        } else if (db->db_blkid == DMU_SPILL_BLKID ||
-           db->db_level+1 == dn->dn_nlevels) {
+           db->db_level + 1 == dn->dn_nlevels) {
                ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
                mutex_enter(&dn->dn_mtx);
                list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
@@ -1423,6 +1520,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                if (dr->dt.dl.dr_data != db->db_buf)
                        VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
        }
+
        kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
        ASSERT(db->db_dirtycnt > 0);
@@ -1432,7 +1530,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                arc_buf_t *buf = db->db_buf;
 
                ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
-               dbuf_set_data(db, NULL);
+               dbuf_clear_data(db);
                VERIFY(arc_buf_remove_ref(buf, db));
                dbuf_evict(db);
                return (B_TRUE);
@@ -1651,7 +1749,7 @@ dbuf_clear(dmu_buf_impl_t *db)
        dn = DB_DNODE(db);
        dndb = dn->dn_dbuf;
        if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
-               list_remove(&dn->dn_dbufs, db);
+               avl_remove(&dn->dn_dbufs, db);
                atomic_dec_32(&dn->dn_dbufs_count);
                membar_producer();
                DB_DNODE_EXIT(db);
@@ -1781,11 +1879,10 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
        db->db_parent = parent;
        db->db_blkptr = blkptr;
 
-       db->db_user_ptr = NULL;
-       db->db_user_data_ptr_ptr = NULL;
-       db->db_evict_func = NULL;
-       db->db_immediate_evict = 0;
-       db->db_freed_in_flight = 0;
+       db->db_user = NULL;
+       db->db_user_immediate_evict = FALSE;
+       db->db_freed_in_flight = FALSE;
+       db->db_pending_evict = FALSE;
 
        if (blkid == DMU_BONUS_BLKID) {
                ASSERT3P(parent, ==, dn->dn_dbuf);
@@ -1823,7 +1920,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
                mutex_exit(&dn->dn_dbufs_mtx);
                return (odb);
        }
-       list_insert_head(&dn->dn_dbufs, db);
+       avl_add(&dn->dn_dbufs, db);
        if (db->db_level == 0 && db->db_blkid >=
            dn->dn_unlisted_l0_blkid)
                dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
@@ -1882,7 +1979,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
                        DB_DNODE_ENTER(db);
                        dn = DB_DNODE(db);
                        mutex_enter(&dn->dn_dbufs_mtx);
-                       list_remove(&dn->dn_dbufs, db);
+                       avl_remove(&dn->dn_dbufs, db);
                        atomic_dec_32(&dn->dn_dbufs_count);
                        mutex_exit(&dn->dn_dbufs_mtx);
                        DB_DNODE_EXIT(db);
@@ -1900,7 +1997,6 @@ dbuf_destroy(dmu_buf_impl_t *db)
        db->db_parent = NULL;
        db->db_buf = NULL;
 
-       ASSERT(!list_link_active(&db->db_link));
        ASSERT(db->db.db_data == NULL);
        ASSERT(db->db_hash_next == NULL);
        ASSERT(db->db_blkptr == NULL);
@@ -1923,7 +2019,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
                return;
 
        /* dbuf_find() returns with db_mtx held */
-       if ((db = dbuf_find(dn, 0, blkid))) {
+       if ((db = dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid))) {
                /*
                 * This dbuf is already in the cache.  We assume that
                 * it is already CACHED, or else about to be either
@@ -1936,7 +2032,8 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
        if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) {
                if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
                        dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
-                       uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
+                       arc_flags_t aflags =
+                           ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
                        zbookmark_phys_t zb;
 
                        SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
@@ -1971,7 +2068,8 @@ __dbuf_hold_impl(struct dbuf_hold_impl_data *dh)
        *(dh->dh_dbp) = NULL;
 top:
        /* dbuf_find() returns with db_mtx held */
-       dh->dh_db = dbuf_find(dh->dh_dn, dh->dh_level, dh->dh_blkid);
+       dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object,
+           dh->dh_level, dh->dh_blkid);
 
        if (dh->dh_db == NULL) {
                dh->dh_bp = NULL;
@@ -2034,7 +2132,6 @@ top:
        }
 
        (void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag);
-       dbuf_update_data(dh->dh_db);
        DBUF_VERIFY(dh->dh_db);
        mutex_exit(&dh->dh_db->db_mtx);
 
@@ -2124,10 +2221,8 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
                return (SET_ERROR(ENOTSUP));
        if (blksz == 0)
                blksz = SPA_MINBLOCKSIZE;
-       if (blksz > SPA_MAXBLOCKSIZE)
-               blksz = SPA_MAXBLOCKSIZE;
-       else
-               blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+       ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
+       blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
 
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
@@ -2152,6 +2247,30 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
        VERIFY(refcount_add(&db->db_holds, tag) > 1);
 }
 
+#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
+boolean_t
+dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
+    void *tag)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       dmu_buf_impl_t *found_db;
+       boolean_t result = B_FALSE;
+
+       if (blkid == DMU_BONUS_BLKID)
+               found_db = dbuf_find_bonus(os, obj);
+       else
+               found_db = dbuf_find(os, obj, 0, blkid);
+
+       if (found_db != NULL) {
+               if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
+                       (void) refcount_add(&db->db_holds, tag);
+                       result = B_TRUE;
+               }
+               mutex_exit(&found_db->db_mtx);
+       }
+       return (result);
+}
+
 /*
  * If you call dbuf_rele() you had better not be referencing the dnode handle
  * unless you have some other direct or indirect hold on the dnode. (An indirect
@@ -2200,12 +2319,13 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
                arc_buf_freeze(db->db_buf);
 
        if (holds == db->db_dirtycnt &&
-           db->db_level == 0 && db->db_immediate_evict)
+           db->db_level == 0 && db->db_user_immediate_evict)
                dbuf_evict_user(db);
 
        if (holds == 0) {
                if (db->db_blkid == DMU_BONUS_BLKID) {
                        dnode_t *dn;
+                       boolean_t evict_dbuf = db->db_pending_evict;
 
                        /*
                         * If the dnode moves here, we cannot cross this
@@ -2220,7 +2340,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
                         * Decrementing the dbuf count means that the bonus
                         * buffer's dnode hold is no longer discounted in
                         * dnode_move(). The dnode cannot move until after
-                        * the dnode_rele_and_unlock() below.
+                        * the dnode_rele() below.
                         */
                        DB_DNODE_EXIT(db);
 
@@ -2230,35 +2350,10 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
                         */
                        mutex_exit(&db->db_mtx);
 
-                       /*
-                        * If the dnode has been freed, evict the bonus
-                        * buffer immediately.  The data in the bonus
-                        * buffer is no longer relevant and this prevents
-                        * a stale bonus buffer from being associated
-                        * with this dnode_t should the dnode_t be reused
-                        * prior to being destroyed.
-                        */
-                       mutex_enter(&dn->dn_mtx);
-                       if (dn->dn_type == DMU_OT_NONE ||
-                           dn->dn_free_txg != 0) {
-                               /*
-                                * Drop dn_mtx.  It is a leaf lock and
-                                * cannot be held when dnode_evict_bonus()
-                                * acquires other locks in order to
-                                * perform the eviction.
-                                *
-                                * Freed dnodes cannot be reused until the
-                                * last hold is released.  Since this bonus
-                                * buffer has a hold, the dnode will remain
-                                * in the free state, even without dn_mtx
-                                * held, until the dnode_rele_and_unlock()
-                                * below.
-                                */
-                               mutex_exit(&dn->dn_mtx);
+                       if (evict_dbuf)
                                dnode_evict_bonus(dn);
-                               mutex_enter(&dn->dn_mtx);
-                       }
-                       dnode_rele_and_unlock(dn, db);
+
+                       dnode_rele(dn, db);
                } else if (db->db_buf == NULL) {
                        /*
                         * This is a special case: we never associated this
@@ -2272,7 +2367,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
                        /*
                         * This dbuf has anonymous data associated with it.
                         */
-                       dbuf_set_data(db, NULL);
+                       dbuf_clear_data(db);
                        VERIFY(arc_buf_remove_ref(buf, db));
                        dbuf_evict(db);
                } else {
@@ -2305,7 +2400,8 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
                                } else {
                                        dbuf_clear(db);
                                }
-                       } else if (arc_buf_eviction_needed(db->db_buf)) {
+                       } else if (db->db_pending_evict ||
+                           arc_buf_eviction_needed(db->db_buf)) {
                                dbuf_clear(db);
                        } else {
                                mutex_exit(&db->db_mtx);
@@ -2324,56 +2420,57 @@ dbuf_refcount(dmu_buf_impl_t *db)
 }
 
 void *
-dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
-    dmu_buf_evict_func_t *evict_func)
+dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
+    dmu_buf_user_t *new_user)
 {
-       return (dmu_buf_update_user(db_fake, NULL, user_ptr,
-           user_data_ptr_ptr, evict_func));
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+       mutex_enter(&db->db_mtx);
+       dbuf_verify_user(db, DBVU_NOT_EVICTING);
+       if (db->db_user == old_user)
+               db->db_user = new_user;
+       else
+               old_user = db->db_user;
+       dbuf_verify_user(db, DBVU_NOT_EVICTING);
+       mutex_exit(&db->db_mtx);
+
+       return (old_user);
 }
 
 void *
-dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
-    dmu_buf_evict_func_t *evict_func)
+dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
-       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
-       db->db_immediate_evict = TRUE;
-       return (dmu_buf_update_user(db_fake, NULL, user_ptr,
-           user_data_ptr_ptr, evict_func));
+       return (dmu_buf_replace_user(db_fake, NULL, user));
 }
 
 void *
-dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
-    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
+dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-       ASSERT(db->db_level == 0);
-
-       ASSERT((user_ptr == NULL) == (evict_func == NULL));
 
-       mutex_enter(&db->db_mtx);
-
-       if (db->db_user_ptr == old_user_ptr) {
-               db->db_user_ptr = user_ptr;
-               db->db_user_data_ptr_ptr = user_data_ptr_ptr;
-               db->db_evict_func = evict_func;
-
-               dbuf_update_data(db);
-       } else {
-               old_user_ptr = db->db_user_ptr;
-       }
+       db->db_user_immediate_evict = TRUE;
+       return (dmu_buf_set_user(db_fake, user));
+}
 
-       mutex_exit(&db->db_mtx);
-       return (old_user_ptr);
+void *
+dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
+{
+       return (dmu_buf_replace_user(db_fake, user, NULL));
 }
 
 void *
 dmu_buf_get_user(dmu_buf_t *db_fake)
 {
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-       ASSERT(!refcount_is_zero(&db->db_holds));
 
-       return (db->db_user_ptr);
+       dbuf_verify_user(db, DBVU_NOT_EVICTING);
+       return (db->db_user);
+}
+
+void
+dmu_buf_user_evict_wait()
+{
+       taskq_wait(dbu_evict_taskq);
 }
 
 boolean_t
@@ -2486,7 +2583,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 
        zio = dr->dr_zio;
        mutex_enter(&dr->dt.di.dr_mtx);
-       dbuf_sync_list(&dr->dt.di.dr_children, tx);
+       dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
        ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
        mutex_exit(&dr->dt.di.dr_mtx);
        zio_nowait(zio);
@@ -2637,7 +2734,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 }
 
 void
-dbuf_sync_list(list_t *list, dmu_tx_t *tx)
+dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
 {
        dbuf_dirty_record_t *dr;
 
@@ -2654,6 +2751,10 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx)
                            DMU_META_DNODE_OBJECT);
                        break;
                }
+               if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+                   dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+                       VERIFY3U(dr->dr_dbuf->db_level, ==, level);
+               }
                list_remove(list, dr);
                if (dr->dr_dbuf->db_level > 0)
                        dbuf_sync_indirect(dr, tx);
@@ -3041,7 +3142,6 @@ EXPORT_SYMBOL(dbuf_refcount);
 EXPORT_SYMBOL(dbuf_sync_list);
 EXPORT_SYMBOL(dmu_buf_set_user);
 EXPORT_SYMBOL(dmu_buf_set_user_ie);
-EXPORT_SYMBOL(dmu_buf_update_user);
 EXPORT_SYMBOL(dmu_buf_get_user);
 EXPORT_SYMBOL(dmu_buf_freeable);
 EXPORT_SYMBOL(dmu_buf_get_blkptr);
index 5e7eaf1acf33083d8c8a66efdc883dd8ec359cd1..afdf828ed54262287939ba02d2d3729f2cb4cbca 100644 (file)
@@ -48,12 +48,12 @@ dbuf_stats_hash_table_headers(char *buf, size_t size)
        (void) snprintf(buf, size,
            "%-88s | %-124s | %s\n"
            "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | "
-           "%-5s %-5s %-6s %-8s %-6s %-8s %-12s "
+           "%-5s %-5s %-8s %-6s %-8s %-12s "
            "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | "
            "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n",
            "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level",
            "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list",
-           "atype", "index", "flags", "count", "asize", "access",
+           "atype", "flags", "count", "asize", "access",
            "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
            "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs",
            "bsize", "lvls", "dholds", "blocks", "dsize");
@@ -77,7 +77,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
 
        nwritten = snprintf(buf, size,
            "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | "
-           "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu "
+           "%-5d %-5d 0x%-6x %-6lu %-8llu %-12llu "
            "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | "
            "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n",
            /* dmu_buf_impl_t */
@@ -94,7 +94,6 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
            /* arc_buf_info_t */
            abi.abi_state_type,
            abi.abi_state_contents,
-           (longlong_t)abi.abi_state_index,
            abi.abi_flags,
            (ulong_t)abi.abi_datacnt,
            (u_longlong_t)abi.abi_size,
index 18557ffb5c1f90fb3118ef7574be2a5c5d225d4f..12c1b7300a2136e088806d8b83a25791578981cd 100644 (file)
@@ -115,13 +115,14 @@ ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 
        error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
            sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
-
        if (error != 0)
                return (error);
 
-       VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+       error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
            sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
-           &ddt->ddt_histogram[type][class]));
+           &ddt->ddt_histogram[type][class]);
+       if (error != 0)
+               return (error);
 
        /*
         * Seed the cached statistics.
index 3b7bbefc2f733dc2926b23386f14086b4f8b3b2e..5e2a1db601b49fc85d59bf064cfcfa8a150f814f 100644 (file)
@@ -23,6 +23,7 @@
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -644,9 +645,13 @@ static int
 dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
     uint64_t length)
 {
-       uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
+       uint64_t object_size;
        int err;
 
+       if (dn == NULL)
+               return (SET_ERROR(EINVAL));
+
+       object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
        if (offset >= object_size)
                return (0);
 
@@ -764,7 +769,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
         * handle that here as well.
         */
        if (dn->dn_maxblkid == 0) {
-               int newsz = offset > dn->dn_datablksz ? 0 :
+               uint64_t newsz = offset > dn->dn_datablksz ? 0 :
                    MIN(size, dn->dn_datablksz - offset);
                bzero((char *)buf + newsz, size - newsz);
                size = newsz;
@@ -784,16 +789,16 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
                        break;
 
                for (i = 0; i < numbufs; i++) {
-                       int tocpy;
-                       int bufoff;
+                       uint64_t tocpy;
+                       int64_t bufoff;
                        dmu_buf_t *db = dbp[i];
 
                        ASSERT(size > 0);
 
                        bufoff = offset - db->db_offset;
-                       tocpy = (int)MIN(db->db_size - bufoff, size);
+                       tocpy = MIN(db->db_size - bufoff, size);
 
-                       bcopy((char *)db->db_data + bufoff, buf, tocpy);
+                       (void) memcpy(buf, (char *)db->db_data + bufoff, tocpy);
 
                        offset += tocpy;
                        size -= tocpy;
@@ -819,14 +824,14 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
            FALSE, FTAG, &numbufs, &dbp));
 
        for (i = 0; i < numbufs; i++) {
-               int tocpy;
-               int bufoff;
+               uint64_t tocpy;
+               int64_t bufoff;
                dmu_buf_t *db = dbp[i];
 
                ASSERT(size > 0);
 
                bufoff = offset - db->db_offset;
-               tocpy = (int)MIN(db->db_size - bufoff, size);
+               tocpy = MIN(db->db_size - bufoff, size);
 
                ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
@@ -929,7 +934,7 @@ dmu_xuio_init(xuio_t *xuio, int nblk)
        priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
        priv->cnt = nblk;
        priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
-       priv->iovp = uio->uio_iov;
+       priv->iovp = (iovec_t *)uio->uio_iov;
        XUIO_XUZC_PRIV(xuio) = priv;
 
        if (XUIO_XUZC_RW(xuio) == UIO_READ)
@@ -970,7 +975,7 @@ dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
 
        ASSERT(i < priv->cnt);
        ASSERT(off + n <= arc_buf_size(abuf));
-       iov = uio->uio_iov + i;
+       iov = (iovec_t *)uio->uio_iov + i;
        iov->iov_base = (char *)abuf->b_data + off;
        iov->iov_len = n;
        priv->bufs[i] = abuf;
@@ -1044,15 +1049,16 @@ xuio_stat_wbuf_nocopy()
  * return value is the number of bytes successfully copied to arg_buf.
  */
 static int
-dmu_req_copy(void *arg_buf, int size, struct request *req, size_t req_offset)
+dmu_bio_copy(void *arg_buf, int size, struct bio *bio, size_t bio_offset)
 {
-       struct bio_vec bv, *bvp;
-       struct req_iterator iter;
+       struct bio_vec bv, *bvp = &bv;
+       bvec_iterator_t iter;
        char *bv_buf;
        int tocpy, bv_len, bv_offset;
        int offset = 0;
 
-       rq_for_each_segment4(bv, bvp, req, iter) {
+       bio_for_each_segment4(bv, bvp, bio, iter) {
+
                /*
                 * Fully consumed the passed arg_buf. We use goto here because
                 * rq_for_each_segment is a double loop
@@ -1061,23 +1067,23 @@ dmu_req_copy(void *arg_buf, int size, struct request *req, size_t req_offset)
                if (size == offset)
                        goto out;
 
-               /* Skip already copied bv */
-               if (req_offset >=  bv.bv_len) {
-                       req_offset -= bv.bv_len;
+               /* Skip already copied bvp */
+               if (bio_offset >= bvp->bv_len) {
+                       bio_offset -= bvp->bv_len;
                        continue;
                }
 
-               bv_len = bv.bv_len - req_offset;
-               bv_offset = bv.bv_offset + req_offset;
-               req_offset = 0;
+               bv_len = bvp->bv_len - bio_offset;
+               bv_offset = bvp->bv_offset + bio_offset;
+               bio_offset = 0;
 
                tocpy = MIN(bv_len, size - offset);
                ASSERT3S(tocpy, >=, 0);
 
-               bv_buf = page_address(bv.bv_page) + bv_offset;
+               bv_buf = page_address(bvp->bv_page) + bv_offset;
                ASSERT3P(bv_buf, !=, NULL);
 
-               if (rq_data_dir(req) == WRITE)
+               if (bio_data_dir(bio) == WRITE)
                        memcpy(arg_buf + offset, bv_buf, tocpy);
                else
                        memcpy(bv_buf, arg_buf + offset, tocpy);
@@ -1089,13 +1095,13 @@ out:
 }
 
 int
-dmu_read_req(objset_t *os, uint64_t object, struct request *req)
+dmu_read_bio(objset_t *os, uint64_t object, struct bio *bio)
 {
-       uint64_t size = blk_rq_bytes(req);
-       uint64_t offset = blk_rq_pos(req) << 9;
+       uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+       uint64_t size = BIO_BI_SIZE(bio);
        dmu_buf_t **dbp;
        int numbufs, i, err;
-       size_t req_offset;
+       size_t bio_offset;
 
        /*
         * NB: we could do this block-at-a-time, but it's nice
@@ -1106,20 +1112,22 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req)
        if (err)
                return (err);
 
-       req_offset = 0;
+       bio_offset = 0;
        for (i = 0; i < numbufs; i++) {
-               int tocpy, didcpy, bufoff;
+               uint64_t tocpy;
+               int64_t bufoff;
+               int didcpy;
                dmu_buf_t *db = dbp[i];
 
                bufoff = offset - db->db_offset;
                ASSERT3S(bufoff, >=, 0);
 
-               tocpy = (int)MIN(db->db_size - bufoff, size);
+               tocpy = MIN(db->db_size - bufoff, size);
                if (tocpy == 0)
                        break;
 
-               didcpy = dmu_req_copy(db->db_data + bufoff, tocpy, req,
-                   req_offset);
+               didcpy = dmu_bio_copy(db->db_data + bufoff, tocpy, bio,
+                   bio_offset);
 
                if (didcpy < tocpy)
                        err = EIO;
@@ -1129,7 +1137,7 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req)
 
                size -= tocpy;
                offset += didcpy;
-               req_offset += didcpy;
+               bio_offset += didcpy;
                err = 0;
        }
        dmu_buf_rele_array(dbp, numbufs, FTAG);
@@ -1138,13 +1146,13 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req)
 }
 
 int
-dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx)
+dmu_write_bio(objset_t *os, uint64_t object, struct bio *bio, dmu_tx_t *tx)
 {
-       uint64_t size = blk_rq_bytes(req);
-       uint64_t offset = blk_rq_pos(req) << 9;
+       uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+       uint64_t size = BIO_BI_SIZE(bio);
        dmu_buf_t **dbp;
        int numbufs, i, err;
-       size_t req_offset;
+       size_t bio_offset;
 
        if (size == 0)
                return (0);
@@ -1154,15 +1162,17 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx)
        if (err)
                return (err);
 
-       req_offset = 0;
+       bio_offset = 0;
        for (i = 0; i < numbufs; i++) {
-               int tocpy, didcpy, bufoff;
+               uint64_t tocpy;
+               int64_t bufoff;
+               int didcpy;
                dmu_buf_t *db = dbp[i];
 
                bufoff = offset - db->db_offset;
                ASSERT3S(bufoff, >=, 0);
 
-               tocpy = (int)MIN(db->db_size - bufoff, size);
+               tocpy = MIN(db->db_size - bufoff, size);
                if (tocpy == 0)
                        break;
 
@@ -1173,8 +1183,8 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx)
                else
                        dmu_buf_will_dirty(db, tx);
 
-               didcpy = dmu_req_copy(db->db_data + bufoff, tocpy, req,
-                   req_offset);
+               didcpy = dmu_bio_copy(db->db_data + bufoff, tocpy, bio,
+                   bio_offset);
 
                if (tocpy == db->db_size)
                        dmu_buf_fill_done(db, tx);
@@ -1187,7 +1197,7 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx)
 
                size -= tocpy;
                offset += didcpy;
-               req_offset += didcpy;
+               bio_offset += didcpy;
                err = 0;
        }
 
@@ -1195,8 +1205,8 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx)
        return (err);
 }
 
-int
-dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
+static int
+dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
 {
        dmu_buf_t **dbp;
        int numbufs, i, err;
@@ -1206,20 +1216,20 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
         * NB: we could do this block-at-a-time, but it's nice
         * to be reading in parallel.
         */
-       err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
-           &numbufs, &dbp);
+       err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
+           TRUE, FTAG, &numbufs, &dbp, 0);
        if (err)
                return (err);
 
        for (i = 0; i < numbufs; i++) {
-               int tocpy;
-               int bufoff;
+               uint64_t tocpy;
+               int64_t bufoff;
                dmu_buf_t *db = dbp[i];
 
                ASSERT(size > 0);
 
                bufoff = uio->uio_loffset - db->db_offset;
-               tocpy = (int)MIN(db->db_size - bufoff, size);
+               tocpy = MIN(db->db_size - bufoff, size);
 
                if (xuio) {
                        dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
@@ -1249,6 +1259,58 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
        return (err);
 }
 
+/*
+ * Read 'size' bytes into the uio buffer.
+ * From object zdb->db_object.
+ * Starting at offset uio->uio_loffset.
+ *
+ * If the caller already has a dbuf in the target object
+ * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
+ * because we don't have to find the dnode_t for the object.
+ */
+int
+dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+       dnode_t *dn;
+       int err;
+
+       if (size == 0)
+               return (0);
+
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+       err = dmu_read_uio_dnode(dn, uio, size);
+       DB_DNODE_EXIT(db);
+
+       return (err);
+}
+
+/*
+ * Read 'size' bytes into the uio buffer.
+ * From the specified object
+ * Starting at offset uio->uio_loffset.
+ */
+int
+dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
+{
+       dnode_t *dn;
+       int err;
+
+       if (size == 0)
+               return (0);
+
+       err = dnode_hold(os, object, FTAG, &dn);
+       if (err)
+               return (err);
+
+       err = dmu_read_uio_dnode(dn, uio, size);
+
+       dnode_rele(dn, FTAG);
+
+       return (err);
+}
+
 static int
 dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
 {
@@ -1263,14 +1325,14 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
                return (err);
 
        for (i = 0; i < numbufs; i++) {
-               int tocpy;
-               int bufoff;
+               uint64_t tocpy;
+               int64_t bufoff;
                dmu_buf_t *db = dbp[i];
 
                ASSERT(size > 0);
 
                bufoff = uio->uio_loffset - db->db_offset;
-               tocpy = (int)MIN(db->db_size - bufoff, size);
+               tocpy = MIN(db->db_size - bufoff, size);
 
                ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
@@ -1301,6 +1363,15 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
        return (err);
 }
 
+/*
+ * Write 'size' bytes from the uio buffer.
+ * To object zdb->db_object.
+ * Starting at offset uio->uio_loffset.
+ *
+ * If the caller already has a dbuf in the target object
+ * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
+ * because we don't have to find the dnode_t for the object.
+ */
 int
 dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
@@ -1320,6 +1391,11 @@ dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
        return (err);
 }
 
+/*
+ * Write 'size' bytes from the uio buffer.
+ * To the specified object.
+ * Starting at offset uio->uio_loffset.
+ */
 int
 dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
@@ -1652,19 +1728,32 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
        ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
 
        /*
-        * Assume the on-disk data is X, the current syncing data is Y,
-        * and the current in-memory data is Z (currently in dmu_sync).
-        * X and Z are identical but Y is has been modified. Normally,
-        * when X and Z are the same we will perform a nopwrite but if Y
-        * is different we must disable nopwrite since the resulting write
-        * of Y to disk can free the block containing X. If we allowed a
-        * nopwrite to occur the block pointing to Z would reference a freed
-        * block. Since this is a rare case we simplify this by disabling
-        * nopwrite if the current dmu_sync-ing dbuf has been modified in
-        * a previous transaction.
+        * Assume the on-disk data is X, the current syncing data (in
+        * txg - 1) is Y, and the current in-memory data is Z (currently
+        * in dmu_sync).
+        *
+        * We usually want to perform a nopwrite if X and Z are the
+        * same.  However, if Y is different (i.e. the BP is going to
+        * change before this write takes effect), then a nopwrite will
+        * be incorrect - we would override with X, which could have
+        * been freed when Y was written.
+        *
+        * (Note that this is not a concern when we are nop-writing from
+        * syncing context, because X and Y must be identical, because
+        * all previous txgs have been synced.)
+        *
+        * Therefore, we disable nopwrite if the current BP could change
+        * before this TXG.  There are two ways it could change: by
+        * being dirty (dr_next is non-NULL), or by being freed
+        * (dnode_block_freed()).  This behavior is verified by
+        * zio_done(), which VERIFYs that the override BP is identical
+        * to the on-disk BP.
         */
-       if (dr->dr_next)
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+       if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
                zp.zp_nopwrite = B_FALSE;
+       DB_DNODE_EXIT(db);
 
        ASSERT(dr->dr_txg == txg);
        if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
@@ -1781,19 +1870,15 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
         *       3. all other level 0 blocks
         */
        if (ismd) {
-               /*
-                * XXX -- we should design a compression algorithm
-                * that specializes in arrays of bps.
-                */
-               boolean_t lz4_ac = spa_feature_is_active(os->os_spa,
-                   SPA_FEATURE_LZ4_COMPRESS);
-
                if (zfs_mdcomp_disable) {
                        compress = ZIO_COMPRESS_EMPTY;
-               } else if (lz4_ac) {
-                       compress = ZIO_COMPRESS_LZ4;
                } else {
-                       compress = ZIO_COMPRESS_LZJB;
+                       /*
+                        * XXX -- we should design a compression algorithm
+                        * that specializes in arrays of bps.
+                        */
+                       compress = zio_compress_select(os->os_spa,
+                           ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
                }
 
                /*
@@ -1826,7 +1911,8 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
                compress = ZIO_COMPRESS_OFF;
                checksum = ZIO_CHECKSUM_OFF;
        } else {
-               compress = zio_compress_select(dn->dn_compress, compress);
+               compress = zio_compress_select(os->os_spa, dn->dn_compress,
+                   compress);
 
                checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
                    zio_checksum_select(dn->dn_checksum, checksum) :
index 30fabbb07957f9566d620e2a1bd8d64430fdb1d2..91415d0d2dcb6aa8bc17f87aaf7e3b1e140016ac 100644 (file)
@@ -129,7 +129,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
        } else if (zb->zb_level == 0) {
                dnode_phys_t *blk;
                arc_buf_t *abuf;
-               uint32_t aflags = ARC_WAIT;
+               arc_flags_t aflags = ARC_FLAG_WAIT;
                int blksz = BP_GET_LSIZE(bp);
                int i;
 
@@ -194,7 +194,7 @@ dmu_diff(const char *tosnap_name, const char *fromsnap_name,
                return (SET_ERROR(EXDEV));
        }
 
-       fromtxg = fromsnap->ds_phys->ds_creation_txg;
+       fromtxg = dsl_dataset_phys(fromsnap)->ds_creation_txg;
        dsl_dataset_rele(fromsnap, FTAG);
 
        dsl_dataset_long_hold(tosnap, FTAG);
index f438ca62a11fe15b0e8bfe47e699f830f2b312b2..779b3bb789aad6a3476279f6778895a01f0e3782 100644 (file)
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -46,6 +50,7 @@
 #include <sys/sa.h>
 #include <sys/zfs_onexit.h>
 #include <sys/dsl_destroy.h>
+#include <sys/vdev.h>
 
 /*
  * Needed to close a window in dnode_move() that allows the objset to be freed
  */
 krwlock_t os_lock;
 
+/*
+ * Tunable to override the maximum number of threads for the parallelization
+ * of dmu_objset_find_dp, needed to speed up the import of pools with many
+ * datasets.
+ * Default is 4 times the number of leaf vdevs.
+ */
+int dmu_find_threads = 0;
+
+static void dmu_objset_find_dp_cb(void *arg);
+
 void
 dmu_objset_init(void)
 {
@@ -149,7 +164,8 @@ compression_changed_cb(void *arg, uint64_t newval)
         */
        ASSERT(newval != ZIO_COMPRESS_INHERIT);
 
-       os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
+       os->os_compress = zio_compress_select(os->os_spa, newval,
+           ZIO_COMPRESS_ON);
 }
 
 static void
@@ -254,6 +270,14 @@ logbias_changed_cb(void *arg, uint64_t newval)
                zil_set_logbias(os->os_zil, newval);
 }
 
+static void
+recordsize_changed_cb(void *arg, uint64_t newval)
+{
+       objset_t *os = arg;
+
+       os->os_recordsize = newval;
+}
+
 void
 dmu_objset_byteswap(void *buf, size_t size)
 {
@@ -284,15 +308,15 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
        os->os_spa = spa;
        os->os_rootbp = bp;
        if (!BP_IS_HOLE(os->os_rootbp)) {
-               uint32_t aflags = ARC_WAIT;
+               arc_flags_t aflags = ARC_FLAG_WAIT;
                zbookmark_phys_t zb;
                SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
                    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
                if (DMU_OS_IS_L2CACHEABLE(os))
-                       aflags |= ARC_L2CACHE;
+                       aflags |= ARC_FLAG_L2CACHE;
                if (DMU_OS_IS_L2COMPRESSIBLE(os))
-                       aflags |= ARC_L2COMPRESS;
+                       aflags |= ARC_FLAG_L2COMPRESS;
 
                dprintf_bp(os->os_rootbp, "reading %s", "");
                err = arc_read(NULL, spa, os->os_rootbp,
@@ -346,7 +370,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
                            zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
                            secondary_cache_changed_cb, os);
                }
-               if (!dsl_dataset_is_snapshot(ds)) {
+               if (!ds->ds_is_snapshot) {
                        if (err == 0) {
                                err = dsl_prop_register(ds,
                                    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
@@ -383,6 +407,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
                                    ZFS_PROP_REDUNDANT_METADATA),
                                    redundant_metadata_changed_cb, os);
                        }
+                       if (err == 0) {
+                               err = dsl_prop_register(ds,
+                                   zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+                                   recordsize_changed_cb, os);
+                       }
                }
                if (err != 0) {
                        VERIFY(arc_buf_remove_ref(os->os_phys_buf,
@@ -393,7 +422,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
        } else {
                /* It's the meta-objset. */
                os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
-               os->os_compress = ZIO_COMPRESS_LZJB;
+               os->os_compress = ZIO_COMPRESS_ON;
                os->os_copies = spa_max_replication(spa);
                os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
                os->os_dedup_verify = B_FALSE;
@@ -403,7 +432,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
                os->os_secondary_cache = ZFS_CACHE_ALL;
        }
 
-       if (ds == NULL || !dsl_dataset_is_snapshot(ds))
+       if (ds == NULL || !ds->ds_is_snapshot)
                os->os_zil_header = os->os_phys->os_zil_header;
        os->os_zil = zil_alloc(os, &os->os_zil_header);
 
@@ -418,20 +447,19 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
        list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
 
+       list_link_init(&os->os_evicting_node);
+
        mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
 
-       DMU_META_DNODE(os) = dnode_special_open(os,
-           &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
-           &os->os_meta_dnode);
+       dnode_special_open(os, &os->os_phys->os_meta_dnode,
+           DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
        if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
-               DMU_USERUSED_DNODE(os) = dnode_special_open(os,
-                   &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
-                   &os->os_userused_dnode);
-               DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
-                   &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
-                   &os->os_groupused_dnode);
+               dnode_special_open(os, &os->os_phys->os_userused_dnode,
+                   DMU_USERUSED_OBJECT, &os->os_userused_dnode);
+               dnode_special_open(os, &os->os_phys->os_groupused_dnode,
+                   DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
        }
 
        *osp = os;
@@ -490,6 +518,25 @@ dmu_objset_hold(const char *name, void *tag, objset_t **osp)
        return (err);
 }
 
+static int
+dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
+    boolean_t readonly, void *tag, objset_t **osp)
+{
+       int err;
+
+       err = dmu_objset_from_ds(ds, osp);
+       if (err != 0) {
+               dsl_dataset_disown(ds, tag);
+       } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
+               dsl_dataset_disown(ds, tag);
+               return (SET_ERROR(EINVAL));
+       } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
+               dsl_dataset_disown(ds, tag);
+               return (SET_ERROR(EROFS));
+       }
+       return (err);
+}
+
 /*
  * dsl_pool must not be held when this is called.
  * Upon successful return, there will be a longhold on the dataset,
@@ -511,21 +558,26 @@ dmu_objset_own(const char *name, dmu_objset_type_t type,
                dsl_pool_rele(dp, FTAG);
                return (err);
        }
-
-       err = dmu_objset_from_ds(ds, osp);
+       err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
        dsl_pool_rele(dp, FTAG);
-       if (err != 0) {
-               dsl_dataset_disown(ds, tag);
-       } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
-               dsl_dataset_disown(ds, tag);
-               return (SET_ERROR(EINVAL));
-       } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
-               dsl_dataset_disown(ds, tag);
-               return (SET_ERROR(EROFS));
-       }
+
        return (err);
 }
 
+int
+dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
+    boolean_t readonly, void *tag, objset_t **osp)
+{
+       dsl_dataset_t *ds;
+       int err;
+
+       err = dsl_dataset_own_obj(dp, obj, tag, &ds);
+       if (err != 0)
+               return (err);
+
+       return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
+}
+
 void
 dmu_objset_rele(objset_t *os, void *tag)
 {
@@ -575,41 +627,57 @@ dmu_objset_disown(objset_t *os, void *tag)
 void
 dmu_objset_evict_dbufs(objset_t *os)
 {
+       dnode_t *dn_marker;
        dnode_t *dn;
 
+       dn_marker = kmem_alloc(sizeof (dnode_t), KM_SLEEP);
+
        mutex_enter(&os->os_lock);
+       dn = list_head(&os->os_dnodes);
+       while (dn != NULL) {
+               /*
+                * Skip dnodes without holds.  We have to do this dance
+                * because dnode_add_ref() only works if there is already a
+                * hold.  If the dnode has no holds, then it has no dbufs.
+                */
+               if (dnode_add_ref(dn, FTAG)) {
+                       list_insert_after(&os->os_dnodes, dn, dn_marker);
+                       mutex_exit(&os->os_lock);
 
-       /* process the mdn last, since the other dnodes have holds on it */
-       list_remove(&os->os_dnodes, DMU_META_DNODE(os));
-       list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));
+                       dnode_evict_dbufs(dn);
+                       dnode_rele(dn, FTAG);
 
-       /*
-        * Find the first dnode with holds.  We have to do this dance
-        * because dnode_add_ref() only works if you already have a
-        * hold.  If there are no holds then it has no dbufs so OK to
-        * skip.
-        */
-       for (dn = list_head(&os->os_dnodes);
-           dn && !dnode_add_ref(dn, FTAG);
-           dn = list_next(&os->os_dnodes, dn))
-               continue;
-
-       while (dn) {
-               dnode_t *next_dn = dn;
+                       mutex_enter(&os->os_lock);
+                       dn = list_next(&os->os_dnodes, dn_marker);
+                       list_remove(&os->os_dnodes, dn_marker);
+               } else {
+                       dn = list_next(&os->os_dnodes, dn);
+               }
+       }
+       mutex_exit(&os->os_lock);
 
-               do {
-                       next_dn = list_next(&os->os_dnodes, next_dn);
-               } while (next_dn && !dnode_add_ref(next_dn, FTAG));
+       kmem_free(dn_marker, sizeof (dnode_t));
 
-               mutex_exit(&os->os_lock);
-               dnode_evict_dbufs(dn);
-               dnode_rele(dn, FTAG);
-               mutex_enter(&os->os_lock);
-               dn = next_dn;
+       if (DMU_USERUSED_DNODE(os) != NULL) {
+               dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
+               dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
        }
-       mutex_exit(&os->os_lock);
+       dnode_evict_dbufs(DMU_META_DNODE(os));
 }
 
+/*
+ * Objset eviction processing is split into two pieces.
+ * The first marks the objset as evicting, evicts any dbufs that
+ * have a refcount of zero, and then queues up the objset for the
+ * second phase of eviction.  Once os->os_dnodes has been cleared by
+ * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
+ * The second phase closes the special dnodes, dequeues the objset from
+ * the list of those undergoing eviction, and finally frees the objset.
+ *
+ * NOTE: Due to asynchronous eviction processing (invocation of
+ *       dnode_buf_pageout()), it is possible for the meta dnode for the
+ *       objset to have no holds even though os->os_dnodes is not empty.
+ */
 void
 dmu_objset_evict(objset_t *os)
 {
@@ -621,7 +689,7 @@ dmu_objset_evict(objset_t *os)
                ASSERT(!dmu_objset_is_dirty(os, t));
 
        if (ds) {
-               if (!dsl_dataset_is_snapshot(ds)) {
+               if (!ds->ds_is_snapshot) {
                        VERIFY0(dsl_prop_unregister(ds,
                            zfs_prop_to_name(ZFS_PROP_CHECKSUM),
                            checksum_changed_cb, os));
@@ -643,6 +711,9 @@ dmu_objset_evict(objset_t *os)
                        VERIFY0(dsl_prop_unregister(ds,
                            zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
                            redundant_metadata_changed_cb, os));
+                       VERIFY0(dsl_prop_unregister(ds,
+                           zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+                           recordsize_changed_cb, os));
                }
                VERIFY0(dsl_prop_unregister(ds,
                    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
@@ -657,6 +728,21 @@ dmu_objset_evict(objset_t *os)
 
        dmu_objset_evict_dbufs(os);
 
+       mutex_enter(&os->os_lock);
+       spa_evicting_os_register(os->os_spa, os);
+       if (list_is_empty(&os->os_dnodes)) {
+               mutex_exit(&os->os_lock);
+               dmu_objset_evict_done(os);
+       } else {
+               mutex_exit(&os->os_lock);
+       }
+}
+
+void
+dmu_objset_evict_done(objset_t *os)
+{
+       ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
+
        dnode_special_close(&os->os_meta_dnode);
        if (DMU_USERUSED_DNODE(os)) {
                dnode_special_close(&os->os_userused_dnode);
@@ -664,8 +750,6 @@ dmu_objset_evict(objset_t *os)
        }
        zil_free(os->os_zil);
 
-       ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
-
        VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
 
        /*
@@ -680,6 +764,7 @@ dmu_objset_evict(objset_t *os)
        mutex_destroy(&os->os_lock);
        mutex_destroy(&os->os_obj_lock);
        mutex_destroy(&os->os_user_ptr_lock);
+       spa_evicting_os_deregister(os->os_spa, os);
        kmem_free(os, sizeof (objset_t));
 }
 
@@ -780,9 +865,11 @@ dmu_objset_create_check(void *arg, dmu_tx_t *tx)
                dsl_dir_rele(pdd, FTAG);
                return (SET_ERROR(EEXIST));
        }
+       error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+           doca->doca_cred);
        dsl_dir_rele(pdd, FTAG);
 
-       return (0);
+       return (error);
 }
 
 static void
@@ -831,7 +918,8 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
        doca.doca_type = type;
 
        return (dsl_sync_task(name,
-           dmu_objset_create_check, dmu_objset_create_sync, &doca, 5));
+           dmu_objset_create_check, dmu_objset_create_sync, &doca,
+           5, ZFS_SPACE_CHECK_NORMAL));
 }
 
 typedef struct dmu_objset_clone_arg {
@@ -861,10 +949,12 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
                dsl_dir_rele(pdd, FTAG);
                return (SET_ERROR(EEXIST));
        }
-       /* You can't clone across pools. */
-       if (pdd->dd_pool != dp) {
+
+       error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+           doca->doca_cred);
+       if (error != 0) {
                dsl_dir_rele(pdd, FTAG);
-               return (SET_ERROR(EXDEV));
+               return (SET_ERROR(EDQUOT));
        }
        dsl_dir_rele(pdd, FTAG);
 
@@ -872,14 +962,8 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
        if (error != 0)
                return (error);
 
-       /* You can't clone across pools. */
-       if (origin->ds_dir->dd_pool != dp) {
-               dsl_dataset_rele(origin, FTAG);
-               return (SET_ERROR(EXDEV));
-       }
-
        /* You can only clone snapshots, not the head datasets. */
-       if (!dsl_dataset_is_snapshot(origin)) {
+       if (!origin->ds_is_snapshot) {
                dsl_dataset_rele(origin, FTAG);
                return (SET_ERROR(EINVAL));
        }
@@ -924,7 +1008,8 @@ dmu_objset_clone(const char *clone, const char *origin)
        doca.doca_cred = CRED();
 
        return (dsl_sync_task(clone,
-           dmu_objset_clone_check, dmu_objset_clone_sync, &doca, 5));
+           dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
+           5, ZFS_SPACE_CHECK_NORMAL));
 }
 
 int
@@ -1444,7 +1529,7 @@ int
 dmu_objset_is_snapshot(objset_t *os)
 {
        if (os->os_dsl_dataset != NULL)
-               return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
+               return (os->os_dsl_dataset->ds_is_snapshot);
        else
                return (B_FALSE);
 }
@@ -1456,12 +1541,12 @@ dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
        dsl_dataset_t *ds = os->os_dsl_dataset;
        uint64_t ignored;
 
-       if (ds->ds_phys->ds_snapnames_zapobj == 0)
+       if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
                return (SET_ERROR(ENOENT));
 
        return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
-           ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST,
-           real, maxlen, conflict));
+           dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
+           MT_FIRST, real, maxlen, conflict));
 }
 
 int
@@ -1474,12 +1559,12 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
 
        ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
 
-       if (ds->ds_phys->ds_snapnames_zapobj == 0)
+       if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
                return (SET_ERROR(ENOENT));
 
        zap_cursor_init_serialized(&cursor,
            ds->ds_dir->dd_pool->dp_meta_objset,
-           ds->ds_phys->ds_snapnames_zapobj, *offp);
+           dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
 
        if (zap_cursor_retrieve(&cursor, &attr) != 0) {
                zap_cursor_fini(&cursor);
@@ -1519,12 +1604,12 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name,
 
        /* there is no next dir on a snapshot! */
        if (os->os_dsl_dataset->ds_object !=
-           dd->dd_phys->dd_head_dataset_obj)
+           dsl_dir_phys(dd)->dd_head_dataset_obj)
                return (SET_ERROR(ENOENT));
 
        zap_cursor_init_serialized(&cursor,
            dd->dd_pool->dp_meta_objset,
-           dd->dd_phys->dd_child_dir_zapobj, *offp);
+           dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
 
        if (zap_cursor_retrieve(&cursor, &attr) != 0) {
                zap_cursor_fini(&cursor);
@@ -1546,70 +1631,81 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name,
        return (0);
 }
 
-/*
- * Find objsets under and including ddobj, call func(ds) on each.
- */
-int
-dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
-    int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
+typedef struct dmu_objset_find_ctx {
+       taskq_t         *dc_tq;
+       dsl_pool_t      *dc_dp;
+       uint64_t        dc_ddobj;
+       int             (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
+       void            *dc_arg;
+       int             dc_flags;
+       kmutex_t        *dc_error_lock;
+       int             *dc_error;
+} dmu_objset_find_ctx_t;
+
+static void
+dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
 {
+       dsl_pool_t *dp = dcp->dc_dp;
+       dmu_objset_find_ctx_t *child_dcp;
        dsl_dir_t *dd;
        dsl_dataset_t *ds;
        zap_cursor_t zc;
        zap_attribute_t *attr;
        uint64_t thisobj;
-       int err;
+       int err = 0;
 
-       ASSERT(dsl_pool_config_held(dp));
+       /* don't process if there already was an error */
+       if (*dcp->dc_error != 0)
+               goto out;
 
-       err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
+       err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd);
        if (err != 0)
-               return (err);
+               goto out;
 
        /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
        if (dd->dd_myname[0] == '$') {
                dsl_dir_rele(dd, FTAG);
-               return (0);
+               goto out;
        }
 
-       thisobj = dd->dd_phys->dd_head_dataset_obj;
+       thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
        attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
        /*
         * Iterate over all children.
         */
-       if (flags & DS_FIND_CHILDREN) {
+       if (dcp->dc_flags & DS_FIND_CHILDREN) {
                for (zap_cursor_init(&zc, dp->dp_meta_objset,
-                   dd->dd_phys->dd_child_dir_zapobj);
+                   dsl_dir_phys(dd)->dd_child_dir_zapobj);
                    zap_cursor_retrieve(&zc, attr) == 0;
                    (void) zap_cursor_advance(&zc)) {
                        ASSERT3U(attr->za_integer_length, ==,
                            sizeof (uint64_t));
                        ASSERT3U(attr->za_num_integers, ==, 1);
 
-                       err = dmu_objset_find_dp(dp, attr->za_first_integer,
-                           func, arg, flags);
-                       if (err != 0)
-                               break;
+                       child_dcp = kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
+                       *child_dcp = *dcp;
+                       child_dcp->dc_ddobj = attr->za_first_integer;
+                       if (dcp->dc_tq != NULL)
+                               (void) taskq_dispatch(dcp->dc_tq,
+                                   dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
+                       else
+                               dmu_objset_find_dp_impl(child_dcp);
                }
                zap_cursor_fini(&zc);
-
-               if (err != 0) {
-                       dsl_dir_rele(dd, FTAG);
-                       kmem_free(attr, sizeof (zap_attribute_t));
-                       return (err);
-               }
        }
 
        /*
         * Iterate over all snapshots.
         */
-       if (flags & DS_FIND_SNAPSHOTS) {
+       if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
                dsl_dataset_t *ds;
                err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 
                if (err == 0) {
-                       uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+                       uint64_t snapobj;
+
+                       snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
                        dsl_dataset_rele(ds, FTAG);
 
                        for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
@@ -1623,7 +1719,7 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
                                    attr->za_first_integer, FTAG, &ds);
                                if (err != 0)
                                        break;
-                               err = func(dp, ds, arg);
+                               err = dcp->dc_func(dp, ds, dcp->dc_arg);
                                dsl_dataset_rele(ds, FTAG);
                                if (err != 0)
                                        break;
@@ -1636,17 +1732,123 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
        kmem_free(attr, sizeof (zap_attribute_t));
 
        if (err != 0)
-               return (err);
+               goto out;
 
        /*
         * Apply to self.
         */
        err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
        if (err != 0)
-               return (err);
-       err = func(dp, ds, arg);
+               goto out;
+       err = dcp->dc_func(dp, ds, dcp->dc_arg);
        dsl_dataset_rele(ds, FTAG);
-       return (err);
+
+out:
+       if (err != 0) {
+               mutex_enter(dcp->dc_error_lock);
+               /* only keep first error */
+               if (*dcp->dc_error == 0)
+                       *dcp->dc_error = err;
+               mutex_exit(dcp->dc_error_lock);
+       }
+
+       kmem_free(dcp, sizeof (*dcp));
+}
+
+static void
+dmu_objset_find_dp_cb(void *arg)
+{
+       dmu_objset_find_ctx_t *dcp = arg;
+       dsl_pool_t *dp = dcp->dc_dp;
+
+       /*
+        * We need to get a pool_config_lock here, as there are several
+        * assert(pool_config_held) checks down the stack. Getting a lock via
+        * dsl_pool_config_enter is risky, as it might be stalled by a
+        * pending writer. This would deadlock, as the write lock can
+        * only be granted when our parent thread gives up the lock.
+        * The _prio interface gives us priority over a pending writer.
+        */
+       dsl_pool_config_enter_prio(dp, FTAG);
+
+       dmu_objset_find_dp_impl(dcp);
+
+       dsl_pool_config_exit(dp, FTAG);
+}
+
+/*
+ * Find objsets under and including ddobj, call func(ds) on each.
+ * The order for the enumeration is completely undefined.
+ * func is called with dsl_pool_config held.
+ */
+int
+dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
+    int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
+{
+       int error = 0;
+       taskq_t *tq = NULL;
+       int ntasks;
+       dmu_objset_find_ctx_t *dcp;
+       kmutex_t err_lock;
+
+       mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
+       dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
+       dcp->dc_tq = NULL;
+       dcp->dc_dp = dp;
+       dcp->dc_ddobj = ddobj;
+       dcp->dc_func = func;
+       dcp->dc_arg = arg;
+       dcp->dc_flags = flags;
+       dcp->dc_error_lock = &err_lock;
+       dcp->dc_error = &error;
+
+       if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
+               /*
+                * In case a write lock is held we can't make use of
+                * parallelism, as down the stack of the worker threads
+                * the lock is asserted via dsl_pool_config_held.
+                * In case of a read lock this is solved by getting a read
+                * lock in each worker thread, which isn't possible in case
+                * of a writer lock. So we fall back to the synchronous path
+                * here.
+                * In the future it might be possible to get some magic into
+                * dsl_pool_config_held in a way that it returns true for
+                * the worker threads so that a single lock held from this
+                * thread suffices. For now, stay single threaded.
+                */
+               dmu_objset_find_dp_impl(dcp);
+
+               return (error);
+       }
+
+       ntasks = dmu_find_threads;
+       if (ntasks == 0)
+               ntasks = vdev_count_leaves(dp->dp_spa) * 4;
+       tq = taskq_create("dmu_objset_find", ntasks, maxclsyspri, ntasks,
+           INT_MAX, 0);
+       if (tq == NULL) {
+               kmem_free(dcp, sizeof (*dcp));
+               return (SET_ERROR(ENOMEM));
+       }
+       dcp->dc_tq = tq;
+
+       /* dcp will be freed by task */
+       (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
+
+       /*
+        * PORTING: this code relies on the property of taskq_wait to wait
+        * until no more tasks are queued and no more tasks are active. As
+        * we always queue new tasks from within other tasks, taskq_wait
+        * reliably waits for the full recursion to finish, even though we
+        * enqueue new tasks after taskq_wait has been called.
+        * On platforms other than illumos, taskq_wait may not have this
+        * property.
+        */
+       taskq_wait(tq);
+       taskq_destroy(tq);
+       mutex_destroy(&err_lock);
+
+       return (error);
 }
 
 /*
@@ -1684,7 +1886,7 @@ dmu_objset_find_impl(spa_t *spa, const char *name,
                return (0);
        }
 
-       thisobj = dd->dd_phys->dd_head_dataset_obj;
+       thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
        attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
        /*
@@ -1692,7 +1894,7 @@ dmu_objset_find_impl(spa_t *spa, const char *name,
         */
        if (flags & DS_FIND_CHILDREN) {
                for (zap_cursor_init(&zc, dp->dp_meta_objset,
-                   dd->dd_phys->dd_child_dir_zapobj);
+                   dsl_dir_phys(dd)->dd_child_dir_zapobj);
                    zap_cursor_retrieve(&zc, attr) == 0;
                    (void) zap_cursor_advance(&zc)) {
                        ASSERT3U(attr->za_integer_length, ==,
@@ -1725,7 +1927,9 @@ dmu_objset_find_impl(spa_t *spa, const char *name,
                err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 
                if (err == 0) {
-                       uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+                       uint64_t snapobj;
+
+                       snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
                        dsl_dataset_rele(ds, FTAG);
 
                        for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
index 1f61368c5d651bfac27efd98c4e7353530478194..b2d844eb42561523037f47451ff2a4a50db20c38 100644 (file)
@@ -22,7 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  */
 
@@ -234,11 +234,12 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
        drrw->drr_offset = offset;
        drrw->drr_length = blksz;
        drrw->drr_toguid = dsp->dsa_toguid;
-       if (BP_IS_EMBEDDED(bp)) {
+       if (bp == NULL || BP_IS_EMBEDDED(bp)) {
                /*
-                * There's no pre-computed checksum of embedded BP's, so
-                * (like fletcher4-checkummed blocks) userland will have
-                * to compute a dedup-capable checksum itself.
+                * There's no pre-computed checksum for partial-block
+                * writes or embedded BP's, so (like
+                * fletcher4-checksummed blocks) userland will have to
+                * compute a dedup-capable checksum itself.
                 */
                drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
        } else {
@@ -400,6 +401,10 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
        drro->drr_compress = dnp->dn_compress;
        drro->drr_toguid = dsp->dsa_toguid;
 
+       if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+           drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
+               drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
+
        if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
                return (SET_ERROR(EINTR));
 
@@ -481,7 +486,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
                dnode_phys_t *blk;
                int i;
                int blksz = BP_GET_LSIZE(bp);
-               uint32_t aflags = ARC_WAIT;
+               arc_flags_t aflags = ARC_FLAG_WAIT;
                arc_buf_t *abuf;
 
                if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
@@ -499,7 +504,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
                }
                (void) arc_buf_remove_ref(abuf, &abuf);
        } else if (type == DMU_OT_SA) {
-               uint32_t aflags = ARC_WAIT;
+               arc_flags_t aflags = ARC_FLAG_WAIT;
                arc_buf_t *abuf;
                int blksz = BP_GET_LSIZE(bp);
 
@@ -516,7 +521,8 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
                err = dump_write_embedded(dsp, zb->zb_object,
                    zb->zb_blkid * blksz, blksz, bp);
        } else { /* it's a level-0 block of a regular object */
-               uint32_t aflags = ARC_WAIT;
+               uint64_t offset;
+               arc_flags_t aflags = ARC_FLAG_WAIT;
                arc_buf_t *abuf;
                int blksz = BP_GET_LSIZE(bp);
 
@@ -539,8 +545,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
                        }
                }
 
-               err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
-                   blksz, bp, abuf->b_data);
+               offset = zb->zb_blkid * blksz;
+
+               if (!(dsp->dsa_featureflags &
+                   DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+                   blksz > SPA_OLD_MAXBLOCKSIZE) {
+                       char *buf = abuf->b_data;
+                       while (blksz > 0 && err == 0) {
+                               int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
+                               err = dump_write(dsp, type, zb->zb_object,
+                                   offset, n, NULL, buf);
+                               offset += n;
+                               buf += n;
+                               blksz -= n;
+                       }
+               } else {
+                       err = dump_write(dsp, type, zb->zb_object,
+                           offset, blksz, bp, abuf->b_data);
+               }
                (void) arc_buf_remove_ref(abuf, &abuf);
        }
 
@@ -554,7 +576,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 static int
 dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
     zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
-    int outfd, vnode_t *vp, offset_t *off)
+    boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
 {
        objset_t *os;
        dmu_replay_record_t *drr;
@@ -589,6 +611,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
        }
 #endif
 
+       if (large_block_ok && ds->ds_large_blocks)
+               featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
        if (embedok &&
            spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
                featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
@@ -602,12 +626,12 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
            featureflags);
 
        drr->drr_u.drr_begin.drr_creation_time =
-           ds->ds_phys->ds_creation_time;
+           dsl_dataset_phys(ds)->ds_creation_time;
        drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
        if (is_clone)
                drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
-       drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
-       if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+       drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid;
+       if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
                drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
 
        if (fromzb != NULL) {
@@ -615,7 +639,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
                fromtxg = fromzb->zbm_creation_txg;
        }
        dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
-       if (!dsl_dataset_is_snapshot(ds)) {
+       if (!ds->ds_is_snapshot) {
                (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
                    sizeof (drr->drr_u.drr_begin.drr_toname));
        }
@@ -628,7 +652,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
        dsp->dsa_proc = curproc;
        dsp->dsa_os = os;
        dsp->dsa_off = off;
-       dsp->dsa_toguid = ds->ds_phys->ds_guid;
+       dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid;
        ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
        dsp->dsa_pending_op = PENDING_NONE;
        dsp->dsa_incremental = (fromzb != NULL);
@@ -684,7 +708,8 @@ out:
 
 int
 dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
-    boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
+    boolean_t embedok, boolean_t large_block_ok,
+    int outfd, vnode_t *vp, offset_t *off)
 {
        dsl_pool_t *dp;
        dsl_dataset_t *ds;
@@ -713,23 +738,25 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
                }
                if (!dsl_dataset_is_before(ds, fromds, 0))
                        err = SET_ERROR(EXDEV);
-               zb.zbm_creation_time = fromds->ds_phys->ds_creation_time;
-               zb.zbm_creation_txg = fromds->ds_phys->ds_creation_txg;
-               zb.zbm_guid = fromds->ds_phys->ds_guid;
+               zb.zbm_creation_time =
+                   dsl_dataset_phys(fromds)->ds_creation_time;
+               zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
+               zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
                is_clone = (fromds->ds_dir != ds->ds_dir);
                dsl_dataset_rele(fromds, FTAG);
-               err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
-                   outfd, vp, off);
+               err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+                   embedok, large_block_ok, outfd, vp, off);
        } else {
-               err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
-                   outfd, vp, off);
+               err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+                   embedok, large_block_ok, outfd, vp, off);
        }
        dsl_dataset_rele(ds, FTAG);
        return (err);
 }
 
 int
-dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+dmu_send(const char *tosnap, const char *fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
     int outfd, vnode_t *vp, offset_t *off)
 {
        dsl_pool_t *dp;
@@ -781,10 +808,10 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
                                if (!dsl_dataset_is_before(ds, fromds, 0))
                                        err = SET_ERROR(EXDEV);
                                zb.zbm_creation_time =
-                                   fromds->ds_phys->ds_creation_time;
+                                   dsl_dataset_phys(fromds)->ds_creation_time;
                                zb.zbm_creation_txg =
-                                   fromds->ds_phys->ds_creation_txg;
-                               zb.zbm_guid = fromds->ds_phys->ds_guid;
+                                   dsl_dataset_phys(fromds)->ds_creation_txg;
+                               zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
                                is_clone = (ds->ds_dir != fromds->ds_dir);
                                dsl_dataset_rele(fromds, FTAG);
                        }
@@ -796,11 +823,11 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
                        dsl_pool_rele(dp, FTAG);
                        return (err);
                }
-               err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
-                   outfd, vp, off);
+               err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+                   embedok, large_block_ok, outfd, vp, off);
        } else {
-               err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
-                   outfd, vp, off);
+               err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+                   embedok, large_block_ok, outfd, vp, off);
        }
        if (owned)
                dsl_dataset_disown(ds, FTAG);
@@ -809,17 +836,54 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
        return (err);
 }
 
+static int
+dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size,
+    uint64_t *sizep)
+{
+       int err;
+       /*
+        * Assume that space (both on-disk and in-stream) is dominated by
+        * data.  We will adjust for indirect blocks and the copies property,
+        * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
+        */
+
+       /*
+        * Subtract out approximate space used by indirect blocks.
+        * Assume most space is used by data blocks (non-indirect, non-dnode).
+        * Assume all blocks are recordsize.  Assume ditto blocks and
+        * internal fragmentation counter out compression.
+        *
+        * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
+        * block, which we observe in practice.
+        */
+       uint64_t recordsize;
+       err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
+       if (err != 0)
+               return (err);
+       size -= size / recordsize * sizeof (blkptr_t);
+
+       /* Add in the space for the record associated with each block. */
+       size += size / recordsize * sizeof (dmu_replay_record_t);
+
+       *sizep = size;
+
+       return (0);
+}
+
 int
 dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
 {
        int err;
-       uint64_t size, recordsize;
-       ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool);
+       uint64_t size;
 
-       ASSERT(dsl_pool_config_held(dp));
+       ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 
        /* tosnap must be a snapshot */
-       if (!dsl_dataset_is_snapshot(ds))
+       if (!ds->ds_is_snapshot)
+               return (SET_ERROR(EINVAL));
+
+       /* fromsnap, if provided, must be a snapshot */
+       if (fromds != NULL && !fromds->ds_is_snapshot)
                return (SET_ERROR(EINVAL));
 
        /*
@@ -831,7 +895,7 @@ dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
 
        /* Get uncompressed size estimate of changed data. */
        if (fromds == NULL) {
-               size = ds->ds_phys->ds_uncompressed_bytes;
+               size = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
        } else {
                uint64_t used, comp;
                err = dsl_dataset_space_written(fromds, ds,
@@ -840,32 +904,59 @@ dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
                        return (err);
        }
 
-       /*
-        * Assume that space (both on-disk and in-stream) is dominated by
-        * data.  We will adjust for indirect blocks and the copies property,
-        * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
-        */
+       err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
+       return (err);
+}
+
+/*
+ * Simple callback used to traverse the blocks of a snapshot and sum their
+ * uncompressed size
+ */
+/* ARGSUSED */
+static int
+dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+       uint64_t *spaceptr = arg;
+       if (bp != NULL && !BP_IS_HOLE(bp)) {
+               *spaceptr += BP_GET_UCSIZE(bp);
+       }
+       return (0);
+}
+
+/*
+ * Given a destination snapshot and a TXG, calculate the approximate size of a
+ * send stream sent from that TXG. from_txg may be zero, indicating that the
+ * whole snapshot will be sent.
+ */
+int
+dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
+    uint64_t *sizep)
+{
+       int err;
+       uint64_t size = 0;
+
+       ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+
+       /* tosnap must be a snapshot */
+       if (!dsl_dataset_is_snapshot(ds))
+               return (SET_ERROR(EINVAL));
 
+       /* verify that from_txg is before the provided snapshot was taken */
+       if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) {
+               return (SET_ERROR(EXDEV));
+       }
        /*
-        * Subtract out approximate space used by indirect blocks.
-        * Assume most space is used by data blocks (non-indirect, non-dnode).
-        * Assume all blocks are recordsize.  Assume ditto blocks and
-        * internal fragmentation counter out compression.
-        *
-        * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
-        * block, which we observe in practice.
+        * traverse the blocks of the snapshot with birth times after
+        * from_txg, summing their uncompressed size
         */
-       err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
-       if (err != 0)
+       err = traverse_dataset(ds, from_txg, TRAVERSE_POST,
+           dmu_calculate_send_traversal, &size);
+       if (err)
                return (err);
-       size -= size / recordsize * sizeof (blkptr_t);
-
-       /* Add in the space for the record associated with each block. */
-       size += size / recordsize * sizeof (dmu_replay_record_t);
 
-       *sizep = size;
-
-       return (0);
+       err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
+       return (err);
 }
 
 typedef struct dmu_recv_begin_arg {
@@ -885,21 +976,35 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
 
        /* temporary clone name must not exist */
        error = zap_lookup(dp->dp_meta_objset,
-           ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name,
+           dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
            8, 1, &val);
        if (error != ENOENT)
                return (error == 0 ? EBUSY : error);
 
        /* new snapshot name must not exist */
        error = zap_lookup(dp->dp_meta_objset,
-           ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap,
-           8, 1, &val);
+           dsl_dataset_phys(ds)->ds_snapnames_zapobj,
+           drba->drba_cookie->drc_tosnap, 8, 1, &val);
        if (error != ENOENT)
                return (error == 0 ? EEXIST : error);
 
+       /*
+        * Check snapshot limit before receiving. We'll recheck again at the
+        * end, but might as well abort before receiving if we're already over
+        * the limit.
+        *
+        * Note that we do not check the file system limit with
+        * dsl_dir_fscount_check because the temporary %clones don't count
+        * against that limit.
+        */
+       error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
+           NULL, drba->drba_cred);
+       if (error != 0)
+               return (error);
+
        if (fromguid != 0) {
                dsl_dataset_t *snap;
-               uint64_t obj = ds->ds_phys->ds_prev_snap_obj;
+               uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 
                /* Find snapshot in this dir that matches fromguid. */
                while (obj != 0) {
@@ -911,9 +1016,9 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
                                dsl_dataset_rele(snap, FTAG);
                                return (SET_ERROR(ENODEV));
                        }
-                       if (snap->ds_phys->ds_guid == fromguid)
+                       if (dsl_dataset_phys(snap)->ds_guid == fromguid)
                                break;
-                       obj = snap->ds_phys->ds_prev_snap_obj;
+                       obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
                        dsl_dataset_rele(snap, FTAG);
                }
                if (obj == 0)
@@ -935,10 +1040,12 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
 
                dsl_dataset_rele(snap, FTAG);
        } else {
-               /* if full, most recent snapshot must be $ORIGIN */
-               if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
-                       return (SET_ERROR(ENODEV));
-               drba->drba_snapobj = ds->ds_phys->ds_prev_snap_obj;
+               /* if full, then must be forced */
+               if (!drba->drba_cookie->drc_force)
+                       return (SET_ERROR(EEXIST));
+               /* start from $ORIGIN@$ORIGIN, if supported */
+               drba->drba_snapobj = dp->dp_origin_snap != NULL ?
+                   dp->dp_origin_snap->ds_object : 0;
        }
 
        return (0);
@@ -985,6 +1092,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
            !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
                return (SET_ERROR(ENOTSUP));
 
+       /*
+        * The receiving code doesn't know how to translate large blocks
+        * to smaller ones, so the pool must have the LARGE_BLOCKS
+        * feature enabled if the stream has LARGE_BLOCKS.
+        */
+       if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+           !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+               return (SET_ERROR(ENOTSUP));
+
        error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
        if (error == 0) {
                /* target fs already exists; recv into temp clone */
@@ -1015,6 +1131,25 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
                if (error != 0)
                        return (error);
 
+               /*
+                * Check filesystem and snapshot limits before receiving. We'll
+                * recheck snapshot limits again at the end (we create the
+                * filesystems and increment those counts during begin_sync).
+                */
+               error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+                   ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
+               if (error != 0) {
+                       dsl_dataset_rele(ds, FTAG);
+                       return (error);
+               }
+
+               error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+                   ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
+               if (error != 0) {
+                       dsl_dataset_rele(ds, FTAG);
+                       return (error);
+               }
+
                if (drba->drba_origin != NULL) {
                        dsl_dataset_t *origin;
                        error = dsl_dataset_hold(dp, drba->drba_origin,
@@ -1023,12 +1158,12 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
                                dsl_dataset_rele(ds, FTAG);
                                return (error);
                        }
-                       if (!dsl_dataset_is_snapshot(origin)) {
+                       if (!origin->ds_is_snapshot) {
                                dsl_dataset_rele(origin, FTAG);
                                dsl_dataset_rele(ds, FTAG);
                                return (SET_ERROR(EINVAL));
                        }
-                       if (origin->ds_phys->ds_guid != fromguid) {
+                       if (dsl_dataset_phys(origin)->ds_guid != fromguid) {
                                dsl_dataset_rele(origin, FTAG);
                                dsl_dataset_rele(ds, FTAG);
                                return (SET_ERROR(ENODEV));
@@ -1091,8 +1226,15 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
        }
        VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
 
+       if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+           DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+           !newds->ds_large_blocks) {
+               dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+               newds->ds_large_blocks = B_TRUE;
+       }
+
        dmu_buf_will_dirty(newds->ds_dbuf, tx);
-       newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+       dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
 
        /*
         * If we actually created a non-clone, we need to create the
@@ -1124,6 +1266,7 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
        drc->drc_tosnap = tosnap;
        drc->drc_tofs = tofs;
        drc->drc_force = force;
+       drc->drc_cred = CRED();
 
        if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
                drc->drc_byteswap = B_TRUE;
@@ -1156,7 +1299,7 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
        drba.drba_cred = CRED();
 
        return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync,
-           &drba, 5));
+           &drba, 5, ZFS_SPACE_CHECK_NORMAL));
 }
 
 struct restorearg {
@@ -1215,6 +1358,7 @@ restore_read(struct restorearg *ra, int len, char *buf)
 
        /* some things will require 8-byte alignment, so everything must */
        ASSERT0(len % 8);
+       ASSERT3U(len, <=, ra->bufsize);
 
        while (done < len) {
                ssize_t resid;
@@ -1356,7 +1500,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
            drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
            P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
            drro->drr_blksz < SPA_MINBLOCKSIZE ||
-           drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+           drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) ||
            drro->drr_bonuslen > DN_MAX_BONUSLEN) {
                return (SET_ERROR(EINVAL));
        }
@@ -1630,7 +1774,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
        int err;
 
        if (drrs->drr_length < SPA_MINBLOCKSIZE ||
-           drrs->drr_length > SPA_MAXBLOCKSIZE)
+           drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os)))
                return (SET_ERROR(EINVAL));
 
        data = restore_read(ra, drrs->drr_length, NULL);
@@ -1717,7 +1861,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
        ra.cksum = drc->drc_cksum;
        ra.vp = vp;
        ra.voff = *voffp;
-       ra.bufsize = 1<<20;
+       ra.bufsize = SPA_MAXBLOCKSIZE;
        ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP);
 
        /* these were verified in dmu_recv_begin */
@@ -1730,7 +1874,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
         */
        VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os));
 
-       ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
+       ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
 
        featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
 
@@ -1893,23 +2037,30 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx)
                         * the snap before drc_ds, because drc_ds can not
                         * have any snaps of its own).
                         */
-                       uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj;
-                       while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) {
+                       uint64_t obj;
+
+                       obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+                       while (obj !=
+                           dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
                                dsl_dataset_t *snap;
                                error = dsl_dataset_hold_obj(dp, obj, FTAG,
                                    &snap);
                                if (error != 0)
-                                       return (error);
+                                       break;
                                if (snap->ds_dir != origin_head->ds_dir)
                                        error = SET_ERROR(EINVAL);
                                if (error == 0)  {
                                        error = dsl_destroy_snapshot_check_impl(
                                            snap, B_FALSE);
                                }
-                               obj = snap->ds_phys->ds_prev_snap_obj;
+                               obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
                                dsl_dataset_rele(snap, FTAG);
                                if (error != 0)
-                                       return (error);
+                                       break;
+                       }
+                       if (error != 0) {
+                               dsl_dataset_rele(origin_head, FTAG);
+                               return (error);
                        }
                }
                error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
@@ -1919,7 +2070,7 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx)
                        return (error);
                }
                error = dsl_dataset_snapshot_check_impl(origin_head,
-                   drc->drc_tosnap, tx, B_TRUE);
+                   drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
                dsl_dataset_rele(origin_head, FTAG);
                if (error != 0)
                        return (error);
@@ -1927,7 +2078,7 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx)
                error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
        } else {
                error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
-                   drc->drc_tosnap, tx, B_TRUE);
+                   drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
        }
        return (error);
 }
@@ -1952,13 +2103,16 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
                         * Destroy any snapshots of drc_tofs (origin_head)
                         * after the origin (the snap before drc_ds).
                         */
-                       uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj;
-                       while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) {
+                       uint64_t obj;
+
+                       obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+                       while (obj !=
+                           dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
                                dsl_dataset_t *snap;
                                VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
                                    &snap));
                                ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
-                               obj = snap->ds_phys->ds_prev_snap_obj;
+                               obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
                                dsl_destroy_snapshot_sync_impl(snap,
                                    B_FALSE, tx);
                                dsl_dataset_rele(snap, FTAG);
@@ -1974,15 +2128,16 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
 
                /* set snapshot's creation time and guid */
                dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
-               origin_head->ds_prev->ds_phys->ds_creation_time =
+               dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
                    drc->drc_drrb->drr_creation_time;
-               origin_head->ds_prev->ds_phys->ds_guid =
+               dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
                    drc->drc_drrb->drr_toguid;
-               origin_head->ds_prev->ds_phys->ds_flags &=
+               dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
                    ~DS_FLAG_INCONSISTENT;
 
                dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
-               origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+               dsl_dataset_phys(origin_head)->ds_flags &=
+                   ~DS_FLAG_INCONSISTENT;
 
                dsl_dataset_rele(origin_head, FTAG);
                dsl_destroy_head_sync_impl(drc->drc_ds, tx);
@@ -1996,15 +2151,17 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
 
                /* set snapshot's creation time and guid */
                dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
-               ds->ds_prev->ds_phys->ds_creation_time =
+               dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
                    drc->drc_drrb->drr_creation_time;
-               ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid;
-               ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+               dsl_dataset_phys(ds->ds_prev)->ds_guid =
+                   drc->drc_drrb->drr_toguid;
+               dsl_dataset_phys(ds->ds_prev)->ds_flags &=
+                   ~DS_FLAG_INCONSISTENT;
 
                dmu_buf_will_dirty(ds->ds_dbuf, tx);
-               ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+               dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
        }
-       drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj;
+       drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
        /*
         * Release the hold from dmu_recv_begin.  This must be done before
         * we return to open context, so that when we free the dataset's dnode,
@@ -2030,7 +2187,7 @@ add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
        gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
        err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
        if (err == 0) {
-               gmep->guid = snapds->ds_phys->ds_guid;
+               gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
                gmep->gme_ds = snapds;
                avl_add(guid_map, gmep);
                dsl_dataset_long_hold(snapds, gmep);
@@ -2064,7 +2221,7 @@ dmu_recv_existing_end(dmu_recv_cookie_t *drc)
 
        error = dsl_sync_task(drc->drc_tofs,
            dmu_recv_end_check, dmu_recv_end_sync, drc,
-           dmu_recv_end_modified_blocks);
+           dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
 
        if (error != 0)
                dmu_recv_cleanup_ds(drc);
@@ -2078,7 +2235,7 @@ dmu_recv_new_end(dmu_recv_cookie_t *drc)
 
        error = dsl_sync_task(drc->drc_tofs,
            dmu_recv_end_check, dmu_recv_end_sync, drc,
-           dmu_recv_end_modified_blocks);
+           dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
 
        if (error != 0) {
                dmu_recv_cleanup_ds(drc);
index 7cabc8a6ef3ca0b90a167e9ec38c77e8c4e004a0..12d099bfd414fb546f4c0364dc637d8724168ffe 100644 (file)
@@ -58,6 +58,7 @@ typedef struct traverse_data {
        int td_flags;
        prefetch_data_t *td_pfd;
        boolean_t td_paused;
+       uint64_t td_hole_birth_enabled_txg;
        blkptr_cb_t *td_func;
        void *td_arg;
 } traverse_data_t;
@@ -176,7 +177,7 @@ static void
 traverse_prefetch_metadata(traverse_data_t *td,
     const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
-       uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
+       arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 
        if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
                return;
@@ -226,25 +227,20 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
        }
 
        if (bp->blk_birth == 0) {
-               if (spa_feature_is_active(td->td_spa, SPA_FEATURE_HOLE_BIRTH)) {
-                       /*
-                        * Since this block has a birth time of 0 it must be a
-                        * hole created before the SPA_FEATURE_HOLE_BIRTH
-                        * feature was enabled.  If SPA_FEATURE_HOLE_BIRTH
-                        * was enabled before the min_txg for this traveral we
-                        * know the hole must have been created before the
-                        * min_txg for this traveral, so we can skip it. If
-                        * SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg
-                        * for this traveral we cannot tell if the hole was
-                        * created before or after the min_txg for this
-                        * traversal, so we cannot skip it.
-                        */
-                       uint64_t hole_birth_enabled_txg;
-                       VERIFY(spa_feature_enabled_txg(td->td_spa,
-                           SPA_FEATURE_HOLE_BIRTH, &hole_birth_enabled_txg));
-                       if (hole_birth_enabled_txg < td->td_min_txg)
-                               return (0);
-               }
+               /*
+                * Since this block has a birth time of 0 it must be a
+                * hole created before the SPA_FEATURE_HOLE_BIRTH
+                * feature was enabled.  If SPA_FEATURE_HOLE_BIRTH
+                * was enabled before the min_txg for this traversal we
+                * know the hole must have been created before the
+                * min_txg for this traversal, so we can skip it. If
+                * SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg
+                * for this traversal we cannot tell if the hole was
+                * created before or after the min_txg for this
+                * traversal, so we cannot skip it.
+                */
+               if (td->td_hole_birth_enabled_txg < td->td_min_txg)
+                       return (0);
        } else if (bp->blk_birth <= td->td_min_txg) {
                return (0);
        }
@@ -254,7 +250,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
                mutex_enter(&pd->pd_mtx);
                ASSERT(pd->pd_bytes_fetched >= 0);
                while (pd->pd_bytes_fetched < size && !pd->pd_exited)
-                       cv_wait_interruptible(&pd->pd_cv, &pd->pd_mtx);
+                       cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
                pd->pd_bytes_fetched -= size;
                cv_broadcast(&pd->pd_cv);
                mutex_exit(&pd->pd_mtx);
@@ -277,7 +273,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
        }
 
        if (BP_GET_LEVEL(bp) > 0) {
-               uint32_t flags = ARC_WAIT;
+               uint32_t flags = ARC_FLAG_WAIT;
                int32_t i;
                int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
                zbookmark_phys_t *czb;
@@ -311,32 +307,33 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
                kmem_free(czb, sizeof (zbookmark_phys_t));
 
        } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
-               uint32_t flags = ARC_WAIT;
+               uint32_t flags = ARC_FLAG_WAIT;
                int32_t i;
                int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+               dnode_phys_t *cdnp;
 
                err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
                if (err != 0)
                        goto post;
-               dnp = buf->b_data;
+               cdnp = buf->b_data;
 
                for (i = 0; i < epb; i++) {
-                       prefetch_dnode_metadata(td, &dnp[i], zb->zb_objset,
+                       prefetch_dnode_metadata(td, &cdnp[i], zb->zb_objset,
                            zb->zb_blkid * epb + i);
                }
 
                /* recursively visitbp() blocks below this */
                for (i = 0; i < epb; i++) {
-                       err = traverse_dnode(td, &dnp[i], zb->zb_objset,
+                       err = traverse_dnode(td, &cdnp[i], zb->zb_objset,
                            zb->zb_blkid * epb + i);
                        if (err != 0)
                                break;
                }
        } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
-               uint32_t flags = ARC_WAIT;
+               arc_flags_t flags = ARC_FLAG_WAIT;
                objset_phys_t *osp;
-               dnode_phys_t *dnp;
+               dnode_phys_t *mdnp, *gdnp, *udnp;
 
                err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
@@ -344,26 +341,27 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
                        goto post;
 
                osp = buf->b_data;
-               dnp = &osp->os_meta_dnode;
-               prefetch_dnode_metadata(td, dnp, zb->zb_objset,
+               mdnp = &osp->os_meta_dnode;
+               gdnp = &osp->os_groupused_dnode;
+               udnp = &osp->os_userused_dnode;
+
+               prefetch_dnode_metadata(td, mdnp, zb->zb_objset,
                    DMU_META_DNODE_OBJECT);
                if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
-                       prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
-                           zb->zb_objset, DMU_GROUPUSED_OBJECT);
-                       prefetch_dnode_metadata(td, &osp->os_userused_dnode,
-                           zb->zb_objset, DMU_USERUSED_OBJECT);
+                       prefetch_dnode_metadata(td, gdnp, zb->zb_objset,
+                           DMU_GROUPUSED_OBJECT);
+                       prefetch_dnode_metadata(td, udnp, zb->zb_objset,
+                           DMU_USERUSED_OBJECT);
                }
 
-               err = traverse_dnode(td, dnp, zb->zb_objset,
+               err = traverse_dnode(td, mdnp, zb->zb_objset,
                    DMU_META_DNODE_OBJECT);
                if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
-                       dnp = &osp->os_groupused_dnode;
-                       err = traverse_dnode(td, dnp, zb->zb_objset,
+                       err = traverse_dnode(td, gdnp, zb->zb_objset,
                            DMU_GROUPUSED_OBJECT);
                }
                if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
-                       dnp = &osp->os_userused_dnode;
-                       err = traverse_dnode(td, dnp, zb->zb_objset,
+                       err = traverse_dnode(td, udnp, zb->zb_objset,
                            DMU_USERUSED_OBJECT);
                }
        }
@@ -450,7 +448,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
        prefetch_data_t *pfd = arg;
-       uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
+       arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 
        ASSERT(pfd->pd_bytes_fetched >= 0);
        if (pfd->pd_cancel)
@@ -461,7 +459,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 
        mutex_enter(&pfd->pd_mtx);
        while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
-               cv_wait_interruptible(&pfd->pd_cv, &pfd->pd_mtx);
+               cv_wait_sig(&pfd->pd_cv, &pfd->pd_mtx);
        pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
        cv_broadcast(&pfd->pd_cv);
        mutex_exit(&pfd->pd_mtx);
@@ -478,6 +476,7 @@ traverse_prefetch_thread(void *arg)
        traverse_data_t *td_main = arg;
        traverse_data_t td = *td_main;
        zbookmark_phys_t czb;
+       fstrans_cookie_t cookie = spl_fstrans_mark();
 
        td.td_func = traverse_prefetcher;
        td.td_arg = td_main->td_pfd;
@@ -491,6 +490,7 @@ traverse_prefetch_thread(void *arg)
        td_main->td_pfd->pd_exited = B_TRUE;
        cv_broadcast(&td_main->td_pfd->pd_cv);
        mutex_exit(&td_main->td_pfd->pd_mtx);
+       spl_fstrans_unmark(cookie);
 }
 
 /*
@@ -531,6 +531,13 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
        td->td_flags = flags;
        td->td_paused = B_FALSE;
 
+       if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
+               VERIFY(spa_feature_enabled_txg(spa,
+                   SPA_FEATURE_HOLE_BIRTH, &td->td_hole_birth_enabled_txg));
+       } else {
+               td->td_hole_birth_enabled_txg = 0;
+       }
+
        pd->pd_flags = flags;
        mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);
@@ -539,8 +546,8 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
            ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
        /* See comment on ZIL traversal in dsl_scan_visitds. */
-       if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) {
-               uint32_t flags = ARC_WAIT;
+       if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
+               uint32_t flags = ARC_FLAG_WAIT;
                objset_phys_t *osp;
                arc_buf_t *buf;
 
@@ -566,7 +573,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
        pd->pd_cancel = B_TRUE;
        cv_broadcast(&pd->pd_cv);
        while (!pd->pd_exited)
-               cv_wait_interruptible(&pd->pd_cv, &pd->pd_mtx);
+               cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
        mutex_exit(&pd->pd_mtx);
 
        mutex_destroy(&pd->pd_mtx);
@@ -588,7 +595,7 @@ traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
     blkptr_cb_t func, void *arg)
 {
        return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
-           &ds->ds_phys->ds_bp, txg_start, NULL, flags, func, arg));
+           &dsl_dataset_phys(ds)->ds_bp, txg_start, NULL, flags, func, arg));
 }
 
 int
@@ -643,8 +650,8 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
                                        continue;
                                break;
                        }
-                       if (ds->ds_phys->ds_prev_snap_txg > txg)
-                               txg = ds->ds_phys->ds_prev_snap_txg;
+                       if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
+                               txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
                        err = traverse_dataset(ds, txg, flags, func, arg);
                        dsl_dataset_rele(ds, FTAG);
                        if (err != 0)
index 3d6dcc70f3050317cec0ceb0123e36d3b56d015d..5ae429f70866264ce059c715d5d83ae2c3ece80b 100644 (file)
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -241,7 +241,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
                return;
 
        min_bs = SPA_MINBLOCKSHIFT;
-       max_bs = SPA_MAXBLOCKSHIFT;
+       max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
        min_ibs = DN_MIN_INDBLKSHIFT;
        max_ibs = DN_MAX_INDBLKSHIFT;
 
@@ -310,6 +310,14 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
                         */
                        ASSERT(dn->dn_datablkshift != 0);
                        min_bs = max_bs = dn->dn_datablkshift;
+               } else {
+                       /*
+                        * The blocksize can increase up to the recordsize,
+                        * or if it is already more than the recordsize,
+                        * up to the next power of 2.
+                        */
+                       min_bs = highbit64(dn->dn_datablksz - 1);
+                       max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
                }
 
                /*
@@ -671,7 +679,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
                        uint64_t ibyte = i << shift;
                        err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
                        i = ibyte >> shift;
-                       if (err == ESRCH)
+                       if (err == ESRCH || i > end)
                                break;
                        if (err) {
                                tx->tx_err = err;
@@ -699,6 +707,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 {
        dmu_tx_hold_t *txh;
        dnode_t *dn;
+       dsl_dataset_phys_t *ds_phys;
        uint64_t nblocks;
        int epbs, err;
 
@@ -744,11 +753,11 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
                bp = &dn->dn_phys->dn_blkptr[0];
                if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
                    bp, bp->blk_birth))
-                       txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_tooverwrite += MZAP_MAX_BLKSZ;
                else
-                       txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_towrite += MZAP_MAX_BLKSZ;
                if (!BP_IS_HOLE(bp))
-                       txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_tounref += MZAP_MAX_BLKSZ;
                return;
        }
 
@@ -773,8 +782,9 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
         * we'll have to modify an indirect twig for each.
         */
        epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+       ds_phys = dsl_dataset_phys(dn->dn_objset->os_dsl_dataset);
        for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
-               if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
+               if (ds_phys->ds_prev_snap_obj)
                        txh->txh_space_towrite += 3 << dn->dn_indblkshift;
                else
                        txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
@@ -1544,18 +1554,18 @@ dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
 
        /* If blkptr doesn't exist then add space to towrite */
        if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
-               txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+               txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
        } else {
                blkptr_t *bp;
 
                bp = &dn->dn_phys->dn_spill;
                if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
                    bp, bp->blk_birth))
-                       txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE;
                else
-                       txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
                if (!BP_IS_HOLE(bp))
-                       txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_tounref += SPA_OLD_MAXBLOCKSIZE;
        }
 }
 
index ef74621a0f6ce60e24d11ab1fec51082933594d4..2858bbfb492ea74f81e241cab6fb84f9f94f931e 100644 (file)
@@ -20,7 +20,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -62,6 +63,43 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
 #endif /* _KERNEL */
 
+static int
+dbuf_compare(const void *x1, const void *x2)
+{
+       const dmu_buf_impl_t *d1 = x1;
+       const dmu_buf_impl_t *d2 = x2;
+       /* AVL order for dn_dbufs: db_level, db_blkid, DB_SEARCH, address. */
+       if (d1->db_level < d2->db_level) { /* 1st key: db_level */
+               return (-1);
+       }
+       if (d1->db_level > d2->db_level) {
+               return (1);
+       }
+
+       if (d1->db_blkid < d2->db_blkid) { /* 2nd key: db_blkid */
+               return (-1);
+       }
+       if (d1->db_blkid > d2->db_blkid) {
+               return (1);
+       }
+
+       if (d1->db_state == DB_SEARCH) { /* DB_SEARCH entry sorts first */
+               ASSERT3S(d2->db_state, !=, DB_SEARCH);
+               return (-1);
+       } else if (d2->db_state == DB_SEARCH) {
+               ASSERT3S(d1->db_state, !=, DB_SEARCH);
+               return (1);
+       }
+
+       if ((uintptr_t)d1 < (uintptr_t)d2) { /* final tie-break: address */
+               return (-1);
+       }
+       if ((uintptr_t)d1 > (uintptr_t)d2) {
+               return (1);
+       }
+       return (0); /* d1 and d2 are the same object */
+
 /* ARGSUSED */
 static int
 dnode_cons(void *arg, void *unused, int kmflag)
@@ -116,7 +154,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
 
        dn->dn_dbufs_count = 0;
        dn->dn_unlisted_l0_blkid = 0;
-       list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+       avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
 
        dn->dn_moved = 0;
@@ -169,7 +207,7 @@ dnode_dest(void *arg, void *unused)
 
        ASSERT0(dn->dn_dbufs_count);
        ASSERT0(dn->dn_unlisted_l0_blkid);
-       list_destroy(&dn->dn_dbufs);
+       avl_destroy(&dn->dn_dbufs);
 }
 
 void
@@ -366,8 +404,9 @@ static dnode_t *
 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
     uint64_t object, dnode_handle_t *dnh)
 {
-       dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+       dnode_t *dn;
 
+       dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
        ASSERT(!POINTER_IS_VALID(dn->dn_objset));
        dn->dn_moved = 0;
 
@@ -404,13 +443,31 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
        ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 
        mutex_enter(&os->os_lock);
-       list_insert_head(&os->os_dnodes, dn);
+       if (dnh->dnh_dnode != NULL) {
+               /* Lost the allocation race. */
+               mutex_exit(&os->os_lock);
+               kmem_cache_free(dnode_cache, dn);
+               return (dnh->dnh_dnode);
+       }
+
+       /*
+        * Exclude special dnodes from os_dnodes so an empty os_dnodes
+        * signifies that the special dnodes have no references from
+        * their children (the entries in os_dnodes).  This allows
+        * dnode_destroy() to easily determine if the last child has
+        * been removed and then complete eviction of the objset.
+        */
+       if (!DMU_OBJECT_IS_SPECIAL(object))
+               list_insert_head(&os->os_dnodes, dn);
        membar_producer();
+
        /*
-        * Everything else must be valid before assigning dn_objset makes the
-        * dnode eligible for dnode_move().
+        * Everything else must be valid before assigning dn_objset
+        * makes the dnode eligible for dnode_move().
         */
        dn->dn_objset = os;
+
+       dnh->dnh_dnode = dn;
        mutex_exit(&os->os_lock);
 
        arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
@@ -424,12 +481,18 @@ static void
 dnode_destroy(dnode_t *dn)
 {
        objset_t *os = dn->dn_objset;
+       boolean_t complete_os_eviction = B_FALSE;
 
        ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 
        mutex_enter(&os->os_lock);
        POINTER_INVALIDATE(&dn->dn_objset);
-       list_remove(&os->os_dnodes, dn);
+       if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+               list_remove(&os->os_dnodes, dn);
+               complete_os_eviction =
+                   list_is_empty(&os->os_dnodes) &&
+                   list_link_active(&os->os_evicting_node);
+       }
        mutex_exit(&os->os_lock);
 
        /* the dnode can no longer move, so we can release the handle */
@@ -464,6 +527,9 @@ dnode_destroy(dnode_t *dn)
        dmu_zfetch_rele(&dn->dn_zfetch);
        kmem_cache_free(dnode_cache, dn);
        arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
+
+       if (complete_os_eviction)
+               dmu_objset_evict_done(os);
 }
 
 void
@@ -472,10 +538,10 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 {
        int i;
 
+       ASSERT3U(blocksize, <=,
+           spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        if (blocksize == 0)
                blocksize = 1 << zfs_default_bs;
-       else if (blocksize > SPA_MAXBLOCKSIZE)
-               blocksize = SPA_MAXBLOCKSIZE;
        else
                blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 
@@ -503,7 +569,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
        ASSERT0(dn->dn_assigned_txg);
        ASSERT(refcount_is_zero(&dn->dn_tx_holds));
        ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
-       ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+       ASSERT(avl_is_empty(&dn->dn_dbufs));
 
        for (i = 0; i < TXG_SIZE; i++) {
                ASSERT0(dn->dn_next_nblkptr[i]);
@@ -556,7 +622,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        int nblkptr;
 
        ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
-       ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
+       ASSERT3U(blocksize, <=,
+           spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        ASSERT0(blocksize % SPA_MINBLOCKSIZE);
        ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
        ASSERT(tx->tx_txg != 0);
@@ -689,8 +756,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
        ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
        ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
        refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
-       ASSERT(list_is_empty(&ndn->dn_dbufs));
-       list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
+       ASSERT(avl_is_empty(&ndn->dn_dbufs));
+       avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
        ndn->dn_dbufs_count = odn->dn_dbufs_count;
        ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
        ndn->dn_bonus = odn->dn_bonus;
@@ -724,7 +791,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
         */
        odn->dn_dbuf = NULL;
        odn->dn_handle = NULL;
-       list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
+       avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
        odn->dn_dbufs_count = 0;
        odn->dn_unlisted_l0_blkid = 0;
@@ -929,33 +996,32 @@ dnode_special_close(dnode_handle_t *dnh)
         */
        while (refcount_count(&dn->dn_holds) > 0)
                delay(1);
+       ASSERT(dn->dn_dbuf == NULL ||
+           dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
        zrl_add(&dnh->dnh_zrlock);
        dnode_destroy(dn); /* implicit zrl_remove() */
        zrl_destroy(&dnh->dnh_zrlock);
        dnh->dnh_dnode = NULL;
 }
 
-dnode_t *
+void
 dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
     dnode_handle_t *dnh)
 {
-       dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
-       dnh->dnh_dnode = dn;
+       dnode_t *dn;
+
+       dn = dnode_create(os, dnp, NULL, object, dnh);
        zrl_init(&dnh->dnh_zrlock);
        DNODE_VERIFY(dn);
-       return (dn);
 }
 
 static void
-dnode_buf_pageout(dmu_buf_t *db, void *arg)
+dnode_buf_pageout(void *dbu)
 {
-       dnode_children_t *children_dnodes = arg;
+       dnode_children_t *children_dnodes = dbu;
        int i;
-       int epb = db->db_size >> DNODE_SHIFT;
-
-       ASSERT(epb == children_dnodes->dnc_count);
 
-       for (i = 0; i < epb; i++) {
+       for (i = 0; i < children_dnodes->dnc_count; i++) {
                dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
                dnode_t *dn;
 
@@ -985,7 +1051,7 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg)
                dnh->dnh_dnode = NULL;
        }
        kmem_free(children_dnodes, sizeof (dnode_children_t) +
-           (epb - 1) * sizeof (dnode_handle_t));
+           children_dnodes->dnc_count * sizeof (dnode_handle_t));
 }
 
 /*
@@ -1069,18 +1135,24 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
        if (children_dnodes == NULL) {
                int i;
                dnode_children_t *winner;
-               children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
-                   (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP);
+               children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
+                   epb * sizeof (dnode_handle_t), KM_SLEEP);
                children_dnodes->dnc_count = epb;
                dnh = &children_dnodes->dnc_children[0];
                for (i = 0; i < epb; i++) {
                        zrl_init(&dnh[i].dnh_zrlock);
-                       dnh[i].dnh_dnode = NULL;
                }
-               if ((winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
-                   dnode_buf_pageout))) {
+               dmu_buf_init_user(&children_dnodes->dnc_dbu,
+                   dnode_buf_pageout, NULL);
+               winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
+               if (winner != NULL) {
+
+                       for (i = 0; i < epb; i++) {
+                               zrl_destroy(&dnh[i].dnh_zrlock);
+                       }
+
                        kmem_free(children_dnodes, sizeof (dnode_children_t) +
-                           (epb - 1) * sizeof (dnode_handle_t));
+                           epb * sizeof (dnode_handle_t));
                        children_dnodes = winner;
                }
        }
@@ -1088,17 +1160,11 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
 
        dnh = &children_dnodes->dnc_children[idx];
        zrl_add(&dnh->dnh_zrlock);
-       if ((dn = dnh->dnh_dnode) == NULL) {
+       dn = dnh->dnh_dnode;
+       if (dn == NULL) {
                dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
-               dnode_t *winner;
 
                dn = dnode_create(os, phys, db, object, dnh);
-               winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
-               if (winner != NULL) {
-                       zrl_add(&dnh->dnh_zrlock);
-                       dnode_destroy(dn); /* implicit zrl_remove() */
-                       dn = winner;
-               }
        }
 
        mutex_enter(&dn->dn_mtx);
@@ -1112,10 +1178,10 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
                dbuf_rele(db, FTAG);
                return (type == DMU_OT_NONE ? ENOENT : EEXIST);
        }
-       mutex_exit(&dn->dn_mtx);
-
        if (refcount_add(&dn->dn_holds, tag) == 1)
                dbuf_add_ref(db, dnh);
+       mutex_exit(&dn->dn_mtx);
+
        /* Now we can rely on the hold to prevent the dnode from moving. */
        zrl_remove(&dnh->dnh_zrlock);
 
@@ -1233,7 +1299,8 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
                return;
        }
 
-       ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+       ASSERT(!refcount_is_zero(&dn->dn_holds) ||
+           !avl_is_empty(&dn->dn_dbufs));
        ASSERT(dn->dn_datablksz != 0);
        ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
        ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
@@ -1306,13 +1373,12 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx)
 int
 dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 {
-       dmu_buf_impl_t *db, *db_next;
+       dmu_buf_impl_t *db;
        int err;
 
+       ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        if (size == 0)
                size = SPA_MINBLOCKSIZE;
-       if (size > SPA_MAXBLOCKSIZE)
-               size = SPA_MAXBLOCKSIZE;
        else
                size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
 
@@ -1329,9 +1395,8 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
                goto fail;
 
        mutex_enter(&dn->dn_dbufs_mtx);
-       for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
-               db_next = list_next(&dn->dn_dbufs, db);
-
+       for (db = avl_first(&dn->dn_dbufs); db != NULL;
+           db = AVL_NEXT(&dn->dn_dbufs, db)) {
                if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
                    db->db_blkid != DMU_SPILL_BLKID) {
                        mutex_exit(&dn->dn_dbufs_mtx);
@@ -1452,6 +1517,16 @@ out:
                rw_downgrade(&dn->dn_struct_rwlock);
 }
 
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
+{
+       dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+       if (db != NULL) { /* dirty only if the level-1 dbuf could be held */
+               dmu_buf_will_dirty(&db->db, tx);
+               dbuf_rele(db, FTAG); /* drop the FTAG hold taken above */
+       }
+}
+
 void
 dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 {
@@ -1572,27 +1647,68 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
                nblks += 1;
 
        /*
-        * Dirty the first and last indirect blocks, as they (and/or their
-        * parents) will need to be written out if they were only
-        * partially freed.  Interior indirect blocks will be themselves freed,
-        * by free_children(), so they need not be dirtied.  Note that these
-        * interior blocks have already been prefetched by dmu_tx_hold_free().
+        * Dirty all the indirect blocks in this range.  Note that only
+        * the first and last indirect blocks can actually be written
+        * (if they were partially freed) -- they must be dirtied, even if
+        * they do not exist on disk yet.  The interior blocks will
+        * be freed by free_children(), so they will not actually be written.
+        * Even though these interior blocks will not be written, we
+        * dirty them for two reasons:
+        *
+        *  - It ensures that the indirect blocks remain in memory until
+        *    syncing context.  (They have already been prefetched by
+        *    dmu_tx_hold_free(), so we don't have to worry about reading
+        *    them serially here.)
+        *
+        *  - The dirty space accounting will put pressure on the txg sync
+        *    mechanism to begin syncing, and to delay transactions if there
+        *    is a large amount of freeing.  Even though these indirect
+        *    blocks will not be written, we could need to write the same
+        *    amount of space if we copy the freed BPs into deadlists.
         */
        if (dn->dn_nlevels > 1) {
-               uint64_t first, last;
+               uint64_t first, last, i, ibyte;
+               int shift, err;
 
                first = blkid >> epbs;
-               if ((db = dbuf_hold_level(dn, 1, first, FTAG))) {
-                       dmu_buf_will_dirty(&db->db, tx);
-                       dbuf_rele(db, FTAG);
-               }
+               dnode_dirty_l1(dn, first, tx);
                if (trunc)
                        last = dn->dn_maxblkid >> epbs;
                else
                        last = (blkid + nblks - 1) >> epbs;
-               if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
-                       dmu_buf_will_dirty(&db->db, tx);
-                       dbuf_rele(db, FTAG);
+               if (last != first)
+                       dnode_dirty_l1(dn, last, tx);
+
+               shift = dn->dn_datablkshift + dn->dn_indblkshift -
+                   SPA_BLKPTRSHIFT;
+               for (i = first + 1; i < last; i++) {
+                       /*
+                        * Set i to the blockid of the next non-hole
+                        * level-1 indirect block at or after i.  Note
+                        * that dnode_next_offset() operates in terms of
+                        * level-0-equivalent bytes.
+                        */
+                       ibyte = i << shift;
+                       err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+                           &ibyte, 2, 1, 0);
+                       i = ibyte >> shift;
+                       if (i >= last)
+                               break;
+
+                       /*
+                        * Normally we should not see an error, either
+                        * from dnode_next_offset() or dbuf_hold_level()
+                        * (except for ESRCH from dnode_next_offset).
+                        * If there is an i/o error, then when we read
+                        * this block in syncing context, it will use
+                        * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+                        * to the "failmode" property.  dnode_next_offset()
+                        * doesn't have a flag to indicate MUSTSUCCEED.
+                        */
+                       if (err != 0)
+                               break;
+
+                       dnode_dirty_l1(dn, i, tx);
                }
        }
 
index 1825e983551c7c2a0a0680b8af7971ecdd3bb610..df5c8e4ee6c43421ce04d4f2ec83d75059a4fda1 100644 (file)
@@ -21,7 +21,8 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -76,7 +77,8 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
 
        /* set dbuf's parent pointers to new indirect buf */
        for (i = 0; i < nblkptr; i++) {
-               dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i);
+               dmu_buf_impl_t *child =
+                   dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
 
                if (child == NULL)
                        continue;
@@ -402,57 +404,42 @@ dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
 void
 dnode_evict_dbufs(dnode_t *dn)
 {
-       int progress;
-       int pass = 0;
-
-       do {
-               dmu_buf_impl_t *db, marker;
-               int evicting = FALSE;
-
-               progress = FALSE;
-               mutex_enter(&dn->dn_dbufs_mtx);
-               list_insert_tail(&dn->dn_dbufs, &marker);
-               db = list_head(&dn->dn_dbufs);
-               for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
-                       list_remove(&dn->dn_dbufs, db);
-                       list_insert_tail(&dn->dn_dbufs, db);
+       dmu_buf_impl_t *db_marker;
+       dmu_buf_impl_t *db, *db_next;
+
+       db_marker = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
+
+       mutex_enter(&dn->dn_dbufs_mtx);
+       for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
+
 #ifdef DEBUG
-                       DB_DNODE_ENTER(db);
-                       ASSERT3P(DB_DNODE(db), ==, dn);
-                       DB_DNODE_EXIT(db);
+               DB_DNODE_ENTER(db);
+               ASSERT3P(DB_DNODE(db), ==, dn);
+               DB_DNODE_EXIT(db);
 #endif /* DEBUG */
 
-                       mutex_enter(&db->db_mtx);
-                       if (db->db_state == DB_EVICTING) {
-                               progress = TRUE;
-                               evicting = TRUE;
-                               mutex_exit(&db->db_mtx);
-                       } else if (refcount_is_zero(&db->db_holds)) {
-                               progress = TRUE;
-                               dbuf_clear(db); /* exits db_mtx for us */
-                       } else {
-                               mutex_exit(&db->db_mtx);
-                       }
-
+               mutex_enter(&db->db_mtx);
+               if (db->db_state != DB_EVICTING &&
+                   refcount_is_zero(&db->db_holds)) {
+                       db_marker->db_level = db->db_level;
+                       db_marker->db_blkid = db->db_blkid;
+                       db_marker->db_state = DB_SEARCH;
+                       avl_insert_here(&dn->dn_dbufs, db_marker, db,
+                           AVL_BEFORE);
+
+                       dbuf_clear(db);
+
+                       db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
+                       avl_remove(&dn->dn_dbufs, db_marker);
+               } else {
+                       db->db_pending_evict = TRUE;
+                       mutex_exit(&db->db_mtx);
+                       db_next = AVL_NEXT(&dn->dn_dbufs, db);
                }
-               list_remove(&dn->dn_dbufs, &marker);
-               /*
-                * NB: we need to drop dn_dbufs_mtx between passes so
-                * that any DB_EVICTING dbufs can make progress.
-                * Ideally, we would have some cv we could wait on, but
-                * since we don't, just wait a bit to give the other
-                * thread a chance to run.
-                */
-               mutex_exit(&dn->dn_dbufs_mtx);
-               if (evicting)
-                       delay(1);
-               pass++;
-               if ((pass % 100) == 0)
-                       dprintf("Exceeded %d passes evicting dbufs\n", pass);
-       } while (progress);
+       }
+       mutex_exit(&dn->dn_dbufs_mtx);
 
-       if (pass >= 100)
-               dprintf("Required %d passes to evict dbufs\n", pass);
+       kmem_free(db_marker, sizeof (dmu_buf_impl_t));
 
        dnode_evict_bonus(dn);
 }
@@ -461,10 +448,14 @@ void
 dnode_evict_bonus(dnode_t *dn)
 {
        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
-       if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
-               mutex_enter(&dn->dn_bonus->db_mtx);
-               dbuf_evict(dn->dn_bonus);
-               dn->dn_bonus = NULL;
+       if (dn->dn_bonus != NULL) {
+               if (refcount_is_zero(&dn->dn_bonus->db_holds)) {
+                       mutex_enter(&dn->dn_bonus->db_mtx);
+                       dbuf_evict(dn->dn_bonus);
+                       dn->dn_bonus = NULL;
+               } else {
+                       dn->dn_bonus->db_pending_evict = TRUE;
+               }
        }
        rw_exit(&dn->dn_struct_rwlock);
 }
@@ -491,6 +482,9 @@ dnode_undirty_dbufs(list_t *list)
                        ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
                            dr->dt.dl.dr_data == db->db_buf);
                        dbuf_unoverride(dr);
+               } else {
+                       mutex_destroy(&dr->dt.di.dr_mtx);
+                       list_destroy(&dr->dt.di.dr_children);
                }
                kmem_free(dr, sizeof (dbuf_dirty_record_t));
                dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
@@ -513,8 +507,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
 
        dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
        dnode_evict_dbufs(dn);
-       ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
-       ASSERT3P(dn->dn_bonus, ==, NULL);
 
        /*
         * XXX - It would be nice to assert this, but we may still
@@ -650,12 +642,11 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
        freeing_dnode = dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg;
 
        /*
-        * We will either remove a spill block when a file is being removed
-        * or we have been asked to remove it.
+        * Remove the spill block if we have been explicitly asked to
+        * remove it, or if the object is being removed.
         */
-       if (dn->dn_rm_spillblk[txgoff] ||
-           ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && freeing_dnode)) {
-               if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+       if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) {
+               if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
                        kill_spill = B_TRUE;
                dn->dn_rm_spillblk[txgoff] = 0;
        }
@@ -731,7 +722,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
                mutex_exit(&dn->dn_mtx);
        }
 
-       dbuf_sync_list(list, tx);
+       dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);
 
        if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
                ASSERT3P(list_head(list), ==, NULL);
index 2cae5cd4d188c3caf997821678ea321a70fccd13..447a3a2dc3a2de6df39c4061b78772cf11ab0b4a 100644 (file)
@@ -13,7 +13,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -65,7 +65,7 @@ dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname,
        if (bmark_zapobj == 0)
                return (SET_ERROR(ESRCH));
 
-       if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+       if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
                mt = MT_FIRST;
        else
                mt = MT_EXACT;
@@ -120,7 +120,7 @@ dsl_bookmark_create_check_impl(dsl_dataset_t *snapds, const char *bookmark_name,
        int error;
        zfs_bookmark_phys_t bmark_phys;
 
-       if (!dsl_dataset_is_snapshot(snapds))
+       if (!snapds->ds_is_snapshot)
                return (SET_ERROR(EINVAL));
 
        error = dsl_bookmark_hold_ds(dp, bookmark_name,
@@ -210,10 +210,11 @@ dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx)
                            &bmark_fs->ds_bookmarks, tx));
                }
 
-               bmark_phys.zbm_guid = snapds->ds_phys->ds_guid;
-               bmark_phys.zbm_creation_txg = snapds->ds_phys->ds_creation_txg;
+               bmark_phys.zbm_guid = dsl_dataset_phys(snapds)->ds_guid;
+               bmark_phys.zbm_creation_txg =
+                   dsl_dataset_phys(snapds)->ds_creation_txg;
                bmark_phys.zbm_creation_time =
-                   snapds->ds_phys->ds_creation_time;
+                   dsl_dataset_phys(snapds)->ds_creation_time;
 
                VERIFY0(zap_add(mos, bmark_fs->ds_bookmarks,
                    shortname, sizeof (uint64_t),
@@ -248,7 +249,8 @@ dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors)
        dbca.dbca_errors = errors;
 
        return (dsl_sync_task(nvpair_name(pair), dsl_bookmark_create_check,
-           dsl_bookmark_create_sync, &dbca, fnvlist_num_pairs(bmarks)));
+           dsl_bookmark_create_sync, &dbca,
+           fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL));
 }
 
 int
@@ -342,7 +344,7 @@ dsl_dataset_bookmark_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx)
        uint64_t bmark_zapobj = ds->ds_bookmarks;
        matchtype_t mt;
 
-       if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+       if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
                mt = MT_FIRST;
        else
                mt = MT_EXACT;
@@ -453,7 +455,8 @@ dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors)
        dbda.dbda_success = fnvlist_alloc();
 
        rv = dsl_sync_task(nvpair_name(pair), dsl_bookmark_destroy_check,
-           dsl_bookmark_destroy_sync, &dbda, fnvlist_num_pairs(bmarks));
+           dsl_bookmark_destroy_sync, &dbda, fnvlist_num_pairs(bmarks),
+           ZFS_SPACE_CHECK_RESERVED);
        fnvlist_free(dbda.dbda_success);
        return (rv);
 }
index 79cb6a3a25e591b97bf5171352e969b1fbeae975..2168f28941ed826bb10ce29eadac63b3d6d5578c 100644 (file)
@@ -21,8 +21,9 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 RackTop Systems.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/dmu_objset.h>
 #include <sys/dsl_userhold.h>
 #include <sys/dsl_bookmark.h>
 
+/*
+ * The SPA supports block sizes up to 16MB.  However, very large blocks
+ * can have an impact on i/o latency (e.g. tying up a spinning disk for
+ * ~300ms), and also potentially on the memory allocator.  Therefore,
+ * we do not allow the recordsize to be set larger than zfs_max_recordsize
+ * (default 1MB).  Larger blocks can be created by changing this tunable,
+ * and pools with larger blocks can always be imported and used, regardless
+ * of this setting.
+ */
+int zfs_max_recordsize = 1 * 1024 * 1024;
+
 #define        SWITCH64(x, y) \
        { \
                uint64_t __tmp = (x); \
@@ -59,7 +71,7 @@
 
 #define        DS_REF_MAX      (1ULL << 62)
 
-#define        DSL_DEADLIST_BLOCKSIZE  SPA_MAXBLOCKSIZE
+extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
 
 /*
  * Figure out how much of this delta should be propogated to the dsl_dir
 static int64_t
 parent_delta(dsl_dataset_t *ds, int64_t delta)
 {
+       dsl_dataset_phys_t *ds_phys;
        uint64_t old_bytes, new_bytes;
 
        if (ds->ds_reserved == 0)
                return (delta);
 
-       old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
-       new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
+       ds_phys = dsl_dataset_phys(ds);
+       old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
+       new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
 
        ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
        return (new_bytes - old_bytes);
@@ -108,10 +122,12 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
        mutex_enter(&ds->ds_lock);
        delta = parent_delta(ds, used);
-       ds->ds_phys->ds_referenced_bytes += used;
-       ds->ds_phys->ds_compressed_bytes += compressed;
-       ds->ds_phys->ds_uncompressed_bytes += uncompressed;
-       ds->ds_phys->ds_unique_bytes += used;
+       dsl_dataset_phys(ds)->ds_referenced_bytes += used;
+       dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
+       dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
+       dsl_dataset_phys(ds)->ds_unique_bytes += used;
+       if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
+               ds->ds_need_large_blocks = B_TRUE;
        mutex_exit(&ds->ds_lock);
        dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
            compressed, uncompressed, tx);
@@ -141,20 +157,20 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
        }
        ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
 
-       ASSERT(!dsl_dataset_is_snapshot(ds));
+       ASSERT(!ds->ds_is_snapshot);
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
-       if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
+       if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
                int64_t delta;
 
                dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
                dsl_free(tx->tx_pool, tx->tx_txg, bp);
 
                mutex_enter(&ds->ds_lock);
-               ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
+               ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
                    !DS_UNIQUE_IS_ACCURATE(ds));
                delta = parent_delta(ds, -used);
-               ds->ds_phys->ds_unique_bytes -= used;
+               dsl_dataset_phys(ds)->ds_unique_bytes -= used;
                mutex_exit(&ds->ds_lock);
                dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
                    delta, -compressed, -uncompressed, tx);
@@ -175,15 +191,15 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
                        dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
                }
                ASSERT3U(ds->ds_prev->ds_object, ==,
-                   ds->ds_phys->ds_prev_snap_obj);
-               ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
+                   dsl_dataset_phys(ds)->ds_prev_snap_obj);
+               ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
                /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
-               if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
+               if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
                    ds->ds_object && bp->blk_birth >
-                   ds->ds_prev->ds_phys->ds_prev_snap_txg) {
+                   dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
                        dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
                        mutex_enter(&ds->ds_prev->ds_lock);
-                       ds->ds_prev->ds_phys->ds_unique_bytes += used;
+                       dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
                        mutex_exit(&ds->ds_prev->ds_lock);
                }
                if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
@@ -192,12 +208,12 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
                }
        }
        mutex_enter(&ds->ds_lock);
-       ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
-       ds->ds_phys->ds_referenced_bytes -= used;
-       ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
-       ds->ds_phys->ds_compressed_bytes -= compressed;
-       ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
-       ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
+       ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
+       dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
+       ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
+       dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
+       ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
+       dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
        mutex_exit(&ds->ds_lock);
 
        return (used);
@@ -223,7 +239,7 @@ dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
        if (ds->ds_trysnap_txg >
            spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
                trysnap = ds->ds_trysnap_txg;
-       return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
+       return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap));
 }
 
 boolean_t
@@ -239,14 +255,15 @@ dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
        return (B_TRUE);
 }
 
-/* ARGSUSED */
 static void
-dsl_dataset_evict(dmu_buf_t *db, void *dsv)
+dsl_dataset_evict(void *dbu)
 {
-       dsl_dataset_t *ds = dsv;
+       dsl_dataset_t *ds = dbu;
 
        ASSERT(ds->ds_owner == NULL);
 
+       ds->ds_dbuf = NULL;
+
        unique_remove(ds->ds_fsid_guid);
 
        if (ds->ds_objset != NULL)
@@ -258,15 +275,16 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv)
        }
 
        bplist_destroy(&ds->ds_pending_deadlist);
-       if (ds->ds_phys->ds_deadlist_obj != 0)
+       if (ds->ds_deadlist.dl_os != NULL)
                dsl_deadlist_close(&ds->ds_deadlist);
        if (ds->ds_dir)
-               dsl_dir_rele(ds->ds_dir, ds);
+               dsl_dir_async_rele(ds->ds_dir, ds);
 
        ASSERT(!list_link_active(&ds->ds_synced_link));
 
        mutex_destroy(&ds->ds_lock);
        mutex_destroy(&ds->ds_opening_lock);
+       mutex_destroy(&ds->ds_sendstream_lock);
        refcount_destroy(&ds->ds_longholds);
 
        kmem_free(ds, sizeof (dsl_dataset_t));
@@ -283,16 +301,22 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds)
 
        if (ds->ds_snapname[0])
                return (0);
-       if (ds->ds_phys->ds_next_snap_obj == 0)
+       if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
                return (0);
 
-       err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
+       err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
            FTAG, &headdbuf);
        if (err != 0)
                return (err);
        headphys = headdbuf->db_data;
        err = zap_value_search(dp->dp_meta_objset,
            headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
+       if (err != 0 && zfs_recover == B_TRUE) {
+               err = 0;
+               (void) snprintf(ds->ds_snapname, sizeof (ds->ds_snapname),
+                   "SNAPOBJ=%llu-ERR=%d",
+                   (unsigned long long)ds->ds_object, err);
+       }
        dmu_buf_rele(headdbuf, FTAG);
        return (err);
 }
@@ -301,11 +325,11 @@ int
 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
 {
        objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-       uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+       uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
        matchtype_t mt;
        int err;
 
-       if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+       if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
                mt = MT_FIRST;
        else
                mt = MT_EXACT;
@@ -318,16 +342,17 @@ dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
 }
 
 int
-dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx)
+dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
+    boolean_t adj_cnt)
 {
        objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-       uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+       uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
        matchtype_t mt;
        int err;
 
        dsl_dir_snap_cmtime_update(ds->ds_dir);
 
-       if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+       if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
                mt = MT_FIRST;
        else
                mt = MT_EXACT;
@@ -335,9 +360,32 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx)
        err = zap_remove_norm(mos, snapobj, name, mt, tx);
        if (err == ENOTSUP && mt == MT_FIRST)
                err = zap_remove(mos, snapobj, name, tx);
+
+       if (err == 0 && adj_cnt)
+               dsl_fs_ss_count_adjust(ds->ds_dir, -1,
+                   DD_FIELD_SNAPSHOT_COUNT, tx);
+
        return (err);
 }
 
+boolean_t
+dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
+{
+       dmu_buf_t *dbuf = ds->ds_dbuf;
+       boolean_t result = B_FALSE;
+
+       if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,
+           ds->ds_object, DMU_BONUS_BLKID, tag)) {
+
+               if (ds == dmu_buf_get_user(dbuf))
+                       result = B_TRUE;
+               else
+                       dmu_buf_rele(dbuf, tag);
+       }
+
+       return (result);
+}
+
 int
 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
     dsl_dataset_t **dsp)
@@ -368,7 +416,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
                ds->ds_dbuf = dbuf;
                ds->ds_object = dsobj;
-               ds->ds_phys = dbuf->db_data;
+               ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
                list_link_init(&ds->ds_synced_link);
 
                mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -378,18 +426,29 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
 
                bplist_create(&ds->ds_pending_deadlist);
                dsl_deadlist_open(&ds->ds_deadlist,
-                   mos, ds->ds_phys->ds_deadlist_obj);
+                   mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
 
                list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
                    offsetof(dmu_sendarg_t, dsa_link));
 
+               if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+                       int zaperr = zap_contains(mos, dsobj,
+                           DS_FIELD_LARGE_BLOCKS);
+                       if (zaperr != ENOENT) {
+                               VERIFY0(zaperr);
+                               ds->ds_large_blocks = B_TRUE;
+                       }
+               }
+
                if (err == 0) {
                        err = dsl_dir_hold_obj(dp,
-                           ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
+                           dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds,
+                           &ds->ds_dir);
                }
                if (err != 0) {
                        mutex_destroy(&ds->ds_lock);
                        mutex_destroy(&ds->ds_opening_lock);
+                       mutex_destroy(&ds->ds_sendstream_lock);
                        refcount_destroy(&ds->ds_longholds);
                        bplist_destroy(&ds->ds_pending_deadlist);
                        dsl_deadlist_close(&ds->ds_deadlist);
@@ -398,11 +457,11 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                        return (err);
                }
 
-               if (!dsl_dataset_is_snapshot(ds)) {
+               if (!ds->ds_is_snapshot) {
                        ds->ds_snapname[0] = '\0';
-                       if (ds->ds_phys->ds_prev_snap_obj != 0) {
+                       if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
                                err = dsl_dataset_hold_obj(dp,
-                                   ds->ds_phys->ds_prev_snap_obj,
+                                   dsl_dataset_phys(ds)->ds_prev_snap_obj,
                                    ds, &ds->ds_prev);
                        }
                        if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
@@ -416,15 +475,16 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                } else {
                        if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
                                err = dsl_dataset_get_snapname(ds);
-                       if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
+                       if (err == 0 &&
+                           dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
                                err = zap_count(
                                    ds->ds_dir->dd_pool->dp_meta_objset,
-                                   ds->ds_phys->ds_userrefs_obj,
+                                   dsl_dataset_phys(ds)->ds_userrefs_obj,
                                    &ds->ds_userrefs);
                        }
                }
 
-               if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
+               if (err == 0 && !ds->ds_is_snapshot) {
                        err = dsl_prop_get_int_ds(ds,
                            zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
                            &ds->ds_reserved);
@@ -437,8 +497,11 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                        ds->ds_reserved = ds->ds_quota = 0;
                }
 
-               if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds,
-                   &ds->ds_phys, dsl_dataset_evict)) != NULL) {
+               dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict, &ds->ds_dbuf);
+               if (err == 0)
+                       winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);
+
+               if (err != 0 || winner != NULL) {
                        bplist_destroy(&ds->ds_pending_deadlist);
                        dsl_deadlist_close(&ds->ds_deadlist);
                        if (ds->ds_prev)
@@ -446,6 +509,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                        dsl_dir_rele(ds->ds_dir, ds);
                        mutex_destroy(&ds->ds_lock);
                        mutex_destroy(&ds->ds_opening_lock);
+                       mutex_destroy(&ds->ds_sendstream_lock);
                        refcount_destroy(&ds->ds_longholds);
                        kmem_free(ds, sizeof (dsl_dataset_t));
                        if (err != 0) {
@@ -455,12 +519,12 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                        ds = winner;
                } else {
                        ds->ds_fsid_guid =
-                           unique_insert(ds->ds_phys->ds_fsid_guid);
+                           unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
                }
        }
        ASSERT3P(ds->ds_dbuf, ==, dbuf);
-       ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
-       ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
+       ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
+       ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
            spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
            dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
        *dsp = ds;
@@ -481,7 +545,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
                return (err);
 
        ASSERT(dsl_pool_config_held(dp));
-       obj = dd->dd_phys->dd_head_dataset_obj;
+       obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
        if (obj != 0)
                err = dsl_dataset_hold_obj(dp, obj, tag, dsp);
        else
@@ -609,16 +673,14 @@ dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
 void
 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
 {
-       ASSERT(ds->ds_owner == tag && ds->ds_dbuf != NULL);
+       ASSERT3P(ds->ds_owner, ==, tag);
+       ASSERT(ds->ds_dbuf != NULL);
 
        mutex_enter(&ds->ds_lock);
        ds->ds_owner = NULL;
        mutex_exit(&ds->ds_lock);
        dsl_dataset_long_rele(ds, tag);
-       if (ds->ds_dbuf != NULL)
-               dsl_dataset_rele(ds, tag);
-       else
-               dsl_dataset_evict(NULL, ds);
+       dsl_dataset_rele(ds, tag);
 }
 
 boolean_t
@@ -650,9 +712,9 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
                origin = dp->dp_origin_snap;
 
        ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
-       ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
+       ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
        ASSERT(dmu_tx_is_syncing(tx));
-       ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
+       ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
 
        dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
            DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
@@ -678,52 +740,58 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
 
                dsphys->ds_prev_snap_obj = origin->ds_object;
                dsphys->ds_prev_snap_txg =
-                   origin->ds_phys->ds_creation_txg;
+                   dsl_dataset_phys(origin)->ds_creation_txg;
                dsphys->ds_referenced_bytes =
-                   origin->ds_phys->ds_referenced_bytes;
+                   dsl_dataset_phys(origin)->ds_referenced_bytes;
                dsphys->ds_compressed_bytes =
-                   origin->ds_phys->ds_compressed_bytes;
+                   dsl_dataset_phys(origin)->ds_compressed_bytes;
                dsphys->ds_uncompressed_bytes =
-                   origin->ds_phys->ds_uncompressed_bytes;
-               dsphys->ds_bp = origin->ds_phys->ds_bp;
+                   dsl_dataset_phys(origin)->ds_uncompressed_bytes;
+               dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
 
                /*
                 * Inherit flags that describe the dataset's contents
                 * (INCONSISTENT) or properties (Case Insensitive).
                 */
-               dsphys->ds_flags |= origin->ds_phys->ds_flags &
+               dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
                    (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
 
+               if (origin->ds_large_blocks)
+                       dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+
                dmu_buf_will_dirty(origin->ds_dbuf, tx);
-               origin->ds_phys->ds_num_children++;
+               dsl_dataset_phys(origin)->ds_num_children++;
 
                VERIFY0(dsl_dataset_hold_obj(dp,
-                   origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
+                   dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
+                   FTAG, &ohds));
                dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
                    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
                dsl_dataset_rele(ohds, FTAG);
 
                if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
-                       if (origin->ds_phys->ds_next_clones_obj == 0) {
-                               origin->ds_phys->ds_next_clones_obj =
+                       if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
+                               dsl_dataset_phys(origin)->ds_next_clones_obj =
                                    zap_create(mos,
                                    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
                        }
                        VERIFY0(zap_add_int(mos,
-                           origin->ds_phys->ds_next_clones_obj, dsobj, tx));
+                           dsl_dataset_phys(origin)->ds_next_clones_obj,
+                           dsobj, tx));
                }
 
                dmu_buf_will_dirty(dd->dd_dbuf, tx);
-               dd->dd_phys->dd_origin_obj = origin->ds_object;
+               dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
                if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
-                       if (origin->ds_dir->dd_phys->dd_clones == 0) {
+                       if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
                                dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
-                               origin->ds_dir->dd_phys->dd_clones =
+                               dsl_dir_phys(origin->ds_dir)->dd_clones =
                                    zap_create(mos,
                                    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
                        }
                        VERIFY0(zap_add_int(mos,
-                           origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
+                           dsl_dir_phys(origin->ds_dir)->dd_clones,
+                           dsobj, tx));
                }
        }
 
@@ -733,7 +801,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
        dmu_buf_rele(dbuf, FTAG);
 
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
-       dd->dd_phys->dd_head_dataset_obj = dsobj;
+       dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
 
        return (dsobj);
 }
@@ -767,6 +835,21 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
 
        dsl_deleg_set_create_perms(dd, tx, cr);
 
+       /*
+        * Since we're creating a new node we know it's a leaf, so we can
+        * initialize the counts if the limit feature is active.
+        */
+       if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+               uint64_t cnt = 0;
+               objset_t *os = dd->dd_pool->dp_meta_objset;
+
+               dsl_dir_zapify(dd, tx);
+               VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+                   sizeof (cnt), 1, &cnt, tx));
+               VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+                   sizeof (cnt), 1, &cnt, tx));
+       }
+
        dsl_dir_rele(dd, FTAG);
 
        /*
@@ -798,22 +881,22 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
        uint64_t mrs_used;
        uint64_t dlused, dlcomp, dluncomp;
 
-       ASSERT(!dsl_dataset_is_snapshot(ds));
+       ASSERT(!ds->ds_is_snapshot);
 
-       if (ds->ds_phys->ds_prev_snap_obj != 0)
-               mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
+       if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
+               mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
        else
                mrs_used = 0;
 
        dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
 
        ASSERT3U(dlused, <=, mrs_used);
-       ds->ds_phys->ds_unique_bytes =
-           ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
+       dsl_dataset_phys(ds)->ds_unique_bytes =
+           dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
 
        if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
            SPA_VERSION_UNIQUE_ACCURATE)
-               ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+               dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 }
 
 void
@@ -824,8 +907,9 @@ dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
        int err;
        ASSERTV(uint64_t count);
 
-       ASSERT(ds->ds_phys->ds_num_children >= 2);
-       err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
+       ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
+       err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+           obj, tx);
        /*
         * The err should not be ENOENT, but a bug in a previous version
         * of the code could cause upgrade_clones_cb() to not set
@@ -838,16 +922,16 @@ dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
         */
        if (err != ENOENT)
                VERIFY0(err);
-       ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj,
+       ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
            &count));
-       ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
+       ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
 }
 
 
 blkptr_t *
 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
 {
-       return (&ds->ds_phys->ds_bp);
+       return (&dsl_dataset_phys(ds)->ds_bp);
 }
 
 void
@@ -859,7 +943,7 @@ dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
                tx->tx_pool->dp_meta_rootbp = *bp;
        } else {
                dmu_buf_will_dirty(ds->ds_dbuf, tx);
-               ds->ds_phys->ds_bp = *bp;
+               dsl_dataset_phys(ds)->ds_bp = *bp;
        }
 }
 
@@ -879,7 +963,7 @@ dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
 
        ASSERT(ds->ds_objset != NULL);
 
-       if (ds->ds_phys->ds_next_snap_obj != 0)
+       if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
                panic("dirtying snapshot!");
 
        dp = ds->ds_dir->dd_pool;
@@ -917,7 +1001,7 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
         * outside of the reservation.
         */
        ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
-       asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
+       asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
        if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
                return (SET_ERROR(ENOSPC));
 
@@ -935,11 +1019,12 @@ typedef struct dsl_dataset_snapshot_arg {
        nvlist_t *ddsa_snaps;
        nvlist_t *ddsa_props;
        nvlist_t *ddsa_errors;
+       cred_t *ddsa_cr;
 } dsl_dataset_snapshot_arg_t;
 
 int
 dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
-    dmu_tx_t *tx, boolean_t recv)
+    dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
 {
        int error;
        uint64_t value;
@@ -953,7 +1038,7 @@ dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
         * We don't allow multiple snapshots of the same txg.  If there
         * is already one, try again.
         */
-       if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
+       if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
                return (SET_ERROR(EAGAIN));
 
        /*
@@ -977,6 +1062,18 @@ dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
        if (!recv && DS_IS_INCONSISTENT(ds))
                return (SET_ERROR(EBUSY));
 
+       /*
+        * Skip the check for temporary snapshots or if we have already checked
+        * the counts in dsl_dataset_snapshot_check. This means we really only
+        * check the count here when we're receiving a stream.
+        */
+       if (cnt != 0 && cr != NULL) {
+               error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+                   ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
+               if (error != 0)
+                       return (error);
+       }
+
        error = dsl_dataset_snapshot_reserve_space(ds, tx);
        if (error != 0)
                return (error);
@@ -992,6 +1089,103 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
        nvpair_t *pair;
        int rv = 0;
 
+       /*
+        * Pre-compute how many total new snapshots will be created for each
+        * level in the tree and below. This is needed for validating the
+        * snapshot limit when either taking a recursive snapshot or when
+        * taking multiple snapshots.
+        *
+        * The problem is that the counts are not actually adjusted when
+        * we are checking, only when we finally sync. For a single snapshot,
+        * this is easy, the count will increase by 1 at each node up the tree,
+        * but it's more complicated for the recursive/multiple snapshot case.
+        *
+        * The dsl_fs_ss_limit_check function does recursively check the count
+        * at each level up the tree but since it is validating each snapshot
+        * independently we need to be sure that we are validating the complete
+        * count for the entire set of snapshots. We do this by rolling up the
+        * counts for each component of the name into an nvlist and then
+        * checking each of those cases with the aggregated count.
+        *
+        * This approach properly handles not only the recursive snapshot
+        * case (where we get all of those on the ddsa_snaps list) but also
+        * the sibling case (e.g. snapshot a/b and a/c so that we will also
+        * validate the limit on 'a' using a count of 2).
+        *
+        * We validate the snapshot names in the third loop and only report
+        * name errors once.
+        */
+       if (!dmu_tx_is_syncing(tx)) {
+               /* Run the limit pre-check only when tx is not syncing. */
+               char *nm;
+               nvlist_t *cnt_track = fnvlist_alloc();
+
+               nm = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+               /* Rollup aggregated counts into the cnt_track list */
+               for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+                   pair != NULL;
+                   pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+                       char *pdelim;
+                       uint64_t val;
+
+                       (void) strlcpy(nm, nvpair_name(pair), MAXPATHLEN);
+                       pdelim = strchr(nm, '@');
+                       if (pdelim == NULL)
+                               continue;
+                       *pdelim = '\0';
+
+                       do {
+                               if (nvlist_lookup_uint64(cnt_track, nm,
+                                   &val) == 0) {
+                                       /* update existing entry */
+                                       fnvlist_add_uint64(cnt_track, nm,
+                                           val + 1);
+                               } else {
+                                       /* add to list */
+                                       fnvlist_add_uint64(cnt_track, nm, 1);
+                               }
+
+                               pdelim = strrchr(nm, '/');
+                               if (pdelim != NULL)
+                                       *pdelim = '\0';
+                       } while (pdelim != NULL);
+               }
+
+               kmem_free(nm, MAXPATHLEN);
+
+               /* Check aggregated counts at each level */
+               for (pair = nvlist_next_nvpair(cnt_track, NULL);
+                   pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
+                       int error = 0;
+                       char *name;
+                       uint64_t cnt = 0;
+                       dsl_dataset_t *ds;
+
+                       name = nvpair_name(pair);
+                       cnt = fnvpair_value_uint64(pair);
+                       ASSERT(cnt > 0);
+
+                       error = dsl_dataset_hold(dp, name, FTAG, &ds);
+                       if (error == 0) {
+                               error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+                                   ZFS_PROP_SNAPSHOT_LIMIT, NULL,
+                                   ddsa->ddsa_cr);
+                               dsl_dataset_rele(ds, FTAG);
+                       }
+
+                       if (error != 0) {
+                               if (ddsa->ddsa_errors != NULL)
+                                       fnvlist_add_int32(ddsa->ddsa_errors,
+                                           name, error);
+                               rv = error;
+                               /* only report one error for this check */
+                               break;
+                       }
+               }
+               nvlist_free(cnt_track);
+       }
+
        for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
            pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
                int error = 0;
@@ -1012,8 +1206,9 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
                if (error == 0)
                        error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
                if (error == 0) {
+                       /* passing 0/NULL skips dsl_fs_ss_limit_check */
                        error = dsl_dataset_snapshot_check_impl(ds,
-                           atp + 1, tx, B_FALSE);
+                           atp + 1, tx, B_FALSE, 0, NULL);
                        dsl_dataset_rele(ds, FTAG);
                }
 
@@ -1025,6 +1220,7 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
                        rv = error;
                }
        }
+
        return (rv);
 }
 
@@ -1051,6 +1247,7 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
            bcmp(&os->os_phys->os_zil_header, &zero_zil,
            sizeof (zero_zil)) == 0);
 
+       dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
 
        /*
         * The origin's ds_creation_txg has to be < TXG_INITIAL
@@ -1070,32 +1267,38 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
        dsphys->ds_fsid_guid = unique_create();
        (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
            sizeof (dsphys->ds_guid));
-       dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
-       dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
+       dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+       dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
        dsphys->ds_next_snap_obj = ds->ds_object;
        dsphys->ds_num_children = 1;
        dsphys->ds_creation_time = gethrestime_sec();
        dsphys->ds_creation_txg = crtxg;
-       dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
-       dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
-       dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
-       dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
-       dsphys->ds_flags = ds->ds_phys->ds_flags;
-       dsphys->ds_bp = ds->ds_phys->ds_bp;
+       dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+       dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
+       dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
+       dsphys->ds_uncompressed_bytes =
+           dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+       dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
+       dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
        dmu_buf_rele(dbuf, FTAG);
 
-       ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
+       if (ds->ds_large_blocks)
+               dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+
+       ASSERT3U(ds->ds_prev != 0, ==,
+           dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
        if (ds->ds_prev) {
                uint64_t next_clones_obj =
-                   ds->ds_prev->ds_phys->ds_next_clones_obj;
-               ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
+                   dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
+               ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
                    ds->ds_object ||
-                   ds->ds_prev->ds_phys->ds_num_children > 1);
-               if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
+                   dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
+               if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+                   ds->ds_object) {
                        dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
-                       ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
-                           ds->ds_prev->ds_phys->ds_creation_txg);
-                       ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
+                       ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+                           dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
+                       dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
                } else if (next_clones_obj != 0) {
                        dsl_dataset_remove_from_next_clones(ds->ds_prev,
                            dsphys->ds_next_snap_obj, tx);
@@ -1112,33 +1315,36 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
        if (ds->ds_reserved) {
                int64_t delta;
                ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
-               delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
+               delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
+                   ds->ds_reserved);
                dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
                    delta, 0, 0, tx);
        }
 
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
-       ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
-           UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
+       dsl_dataset_phys(ds)->ds_deadlist_obj =
+           dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
+           dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
        dsl_deadlist_close(&ds->ds_deadlist);
-       dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+       dsl_deadlist_open(&ds->ds_deadlist, mos,
+           dsl_dataset_phys(ds)->ds_deadlist_obj);
        dsl_deadlist_add_key(&ds->ds_deadlist,
-           ds->ds_phys->ds_prev_snap_txg, tx);
+           dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
 
-       ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
-       ds->ds_phys->ds_prev_snap_obj = dsobj;
-       ds->ds_phys->ds_prev_snap_txg = crtxg;
-       ds->ds_phys->ds_unique_bytes = 0;
+       ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
+       dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
+       dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
+       dsl_dataset_phys(ds)->ds_unique_bytes = 0;
        if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
-               ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+               dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 
-       VERIFY0(zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
+       VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
            snapname, 8, 1, &dsobj, tx));
 
        if (ds->ds_prev)
                dsl_dataset_rele(ds->ds_prev, ds);
        VERIFY0(dsl_dataset_hold_obj(dp,
-           ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
+           dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
 
        dsl_scan_ds_snapshotted(ds, tx);
 
@@ -1227,11 +1433,12 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
        ddsa.ddsa_snaps = snaps;
        ddsa.ddsa_props = props;
        ddsa.ddsa_errors = errors;
+       ddsa.ddsa_cr = CRED();
 
        if (error == 0) {
                error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
                    dsl_dataset_snapshot_sync, &ddsa,
-                   fnvlist_num_pairs(snaps) * 3);
+                   fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
        }
 
        if (suspended != NULL) {
@@ -1275,8 +1482,9 @@ dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
        if (error != 0)
                return (error);
 
+       /* NULL cred means no limit check for tmp snapshot */
        error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
-           tx, B_FALSE);
+           tx, B_FALSE, 0, NULL);
        if (error != 0) {
                dsl_dataset_rele(ds, FTAG);
                return (error);
@@ -1342,7 +1550,7 @@ dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
        }
 
        error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
-           dsl_dataset_snapshot_tmp_sync, &ddsta, 3);
+           dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
 
        if (needsuspend)
                zil_resume(cookie);
@@ -1355,16 +1563,21 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
 {
        ASSERT(dmu_tx_is_syncing(tx));
        ASSERT(ds->ds_objset != NULL);
-       ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
+       ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
 
        /*
         * in case we had to change ds_fsid_guid when we opened it,
         * sync it out now.
         */
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
-       ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
+       dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
 
        dmu_objset_sync(ds->ds_objset, zio, tx);
+
+       if (ds->ds_need_large_blocks && !ds->ds_large_blocks) {
+               dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
+               ds->ds_large_blocks = B_TRUE;
+       }
 }
 
 static void
@@ -1384,13 +1597,14 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
         * due to a bug in a previous version of the code.
         * Only trust it if it has the right number of entries.
         */
-       if (ds->ds_phys->ds_next_clones_obj != 0) {
-               VERIFY0(zap_count(mos, ds->ds_phys->ds_next_clones_obj,
+       if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+               VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
                    &count));
        }
-       if (count != ds->ds_phys->ds_num_children - 1)
+       if (count != dsl_dataset_phys(ds)->ds_num_children - 1)
                goto fail;
-       for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
+       for (zap_cursor_init(&zc, mos,
+           dsl_dataset_phys(ds)->ds_next_clones_obj);
            zap_cursor_retrieve(&zc, &za) == 0;
            zap_cursor_advance(&zc)) {
                dsl_dataset_t *clone;
@@ -1417,18 +1631,18 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
 
        ASSERT(dsl_pool_config_held(dp));
 
-       ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
-           (ds->ds_phys->ds_uncompressed_bytes * 100 /
-           ds->ds_phys->ds_compressed_bytes);
+       ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
+           (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
+           dsl_dataset_phys(ds)->ds_compressed_bytes);
 
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
-           ds->ds_phys->ds_uncompressed_bytes);
+           dsl_dataset_phys(ds)->ds_uncompressed_bytes);
 
-       if (dsl_dataset_is_snapshot(ds)) {
+       if (ds->ds_is_snapshot) {
                dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
                dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
-                   ds->ds_phys->ds_unique_bytes);
+                   dsl_dataset_phys(ds)->ds_unique_bytes);
                get_clones_stat(ds, nv);
        } else {
                dsl_dir_stats(ds->ds_dir, nv);
@@ -1439,17 +1653,17 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
 
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
-           ds->ds_phys->ds_creation_time);
+           dsl_dataset_phys(ds)->ds_creation_time);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
-           ds->ds_phys->ds_creation_txg);
+           dsl_dataset_phys(ds)->ds_creation_txg);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
            ds->ds_quota);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
            ds->ds_reserved);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
-           ds->ds_phys->ds_guid);
+           dsl_dataset_phys(ds)->ds_guid);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
-           ds->ds_phys->ds_unique_bytes);
+           dsl_dataset_phys(ds)->ds_unique_bytes);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
            ds->ds_object);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
@@ -1457,14 +1671,14 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
            DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
 
-       if (ds->ds_phys->ds_prev_snap_obj != 0) {
+       if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
                uint64_t written, comp, uncomp;
                dsl_pool_t *dp = ds->ds_dir->dd_pool;
                dsl_dataset_t *prev;
                int err;
 
                err = dsl_dataset_hold_obj(dp,
-                   ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+                   dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
                if (err == 0) {
                        err = dsl_dataset_space_written(prev, ds, &written,
                            &comp, &uncomp);
@@ -1484,13 +1698,15 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
        dsl_pool_t *dp = ds->ds_dir->dd_pool;
        ASSERT(dsl_pool_config_held(dp));
 
-       stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
-       stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
-       stat->dds_guid = ds->ds_phys->ds_guid;
+       stat->dds_creation_txg = dsl_dataset_phys(ds)->ds_creation_txg;
+       stat->dds_inconsistent =
+           dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT;
+       stat->dds_guid = dsl_dataset_phys(ds)->ds_guid;
        stat->dds_origin[0] = '\0';
-       if (dsl_dataset_is_snapshot(ds)) {
+       if (ds->ds_is_snapshot) {
                stat->dds_is_snapshot = B_TRUE;
-               stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
+               stat->dds_num_clones =
+                   dsl_dataset_phys(ds)->ds_num_children - 1;
        } else {
                stat->dds_is_snapshot = B_FALSE;
                stat->dds_num_clones = 0;
@@ -1499,7 +1715,8 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
                        dsl_dataset_t *ods;
 
                        VERIFY0(dsl_dataset_hold_obj(dp,
-                           ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
+                           dsl_dir_phys(ds->ds_dir)->dd_origin_obj,
+                           FTAG, &ods));
                        dsl_dataset_name(ods, stat->dds_origin);
                        dsl_dataset_rele(ods, FTAG);
                }
@@ -1517,10 +1734,11 @@ dsl_dataset_space(dsl_dataset_t *ds,
     uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
-       *refdbytesp = ds->ds_phys->ds_referenced_bytes;
+       *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
        *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
-       if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
-               *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
+       if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
+               *availbytesp +=
+                   ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
        if (ds->ds_quota != 0) {
                /*
                 * Adjust available bytes according to refquota
@@ -1531,7 +1749,7 @@ dsl_dataset_space(dsl_dataset_t *ds,
                else
                        *availbytesp = 0;
        }
-       *usedobjsp = BP_GET_FILL(&ds->ds_phys->ds_bp);
+       *usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
        *availobjsp = DN_MAX_OBJECT - *usedobjsp;
 }
 
@@ -1541,8 +1759,8 @@ dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
        ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
        if (snap == NULL)
                return (B_FALSE);
-       if (ds->ds_phys->ds_bp.blk_birth >
-           snap->ds_phys->ds_creation_txg) {
+       if (dsl_dataset_phys(ds)->ds_bp.blk_birth >
+           dsl_dataset_phys(snap)->ds_creation_txg) {
                objset_t *os, *os_snap;
                /*
                 * It may be that only the ZIL differs, because it was
@@ -1644,11 +1862,13 @@ dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
        spa_history_log_internal_ds(ds, "rename", tx,
            "-> @%s", ddrsa->ddrsa_newsnapname);
 
-       VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx));
+       VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
+           B_FALSE));
        mutex_enter(&ds->ds_lock);
        (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
        mutex_exit(&ds->ds_lock);
-       VERIFY0(zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj,
+       VERIFY0(zap_add(dp->dp_meta_objset,
+           dsl_dataset_phys(hds)->ds_snapnames_zapobj,
            ds->ds_snapname, 8, 1, &ds->ds_object, tx));
 
        dsl_dataset_rele(ds, FTAG);
@@ -1691,7 +1911,8 @@ dsl_dataset_rename_snapshot(const char *fsname,
        ddrsa.ddrsa_recursive = recursive;
 
        error = dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
-           dsl_dataset_rename_snapshot_sync, &ddrsa, 1);
+           dsl_dataset_rename_snapshot_sync, &ddrsa,
+           1, ZFS_SPACE_CHECK_RESERVED);
 
        if (error)
            return (SET_ERROR(error));
@@ -1760,13 +1981,13 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
                return (error);
 
        /* must not be a snapshot */
-       if (dsl_dataset_is_snapshot(ds)) {
+       if (ds->ds_is_snapshot) {
                dsl_dataset_rele(ds, FTAG);
                return (SET_ERROR(EINVAL));
        }
 
        /* must have a most recent snapshot */
-       if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) {
+       if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
                dsl_dataset_rele(ds, FTAG);
                return (SET_ERROR(EINVAL));
        }
@@ -1785,7 +2006,7 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
                    fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
                    zfs_prop_to_name(ZFS_PROP_CREATETXG));
                uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
-               if (createtxg > ds->ds_phys->ds_prev_snap_txg) {
+               if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
                        fnvlist_free(bookmarks);
                        dsl_dataset_rele(ds, FTAG);
                        return (SET_ERROR(EEXIST));
@@ -1804,7 +2025,7 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
         * the refquota.
         */
        if (ds->ds_quota != 0 &&
-           ds->ds_prev->ds_phys->ds_referenced_bytes > ds->ds_quota) {
+           dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
                dsl_dataset_rele(ds, FTAG);
                return (SET_ERROR(EDQUOT));
        }
@@ -1817,7 +2038,7 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
         * this space, but the freeing happens over many txg's.
         */
        unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
-           ds->ds_phys->ds_unique_bytes);
+           dsl_dataset_phys(ds)->ds_unique_bytes);
 
        if (unused_refres_delta > 0 &&
            unused_refres_delta >
@@ -1881,7 +2102,8 @@ dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result)
        ddra.ddra_result = result;
 
        return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
-           dsl_dataset_rollback_sync, &ddra, 1));
+           dsl_dataset_rollback_sync, &ddra,
+           1, ZFS_SPACE_CHECK_RESERVED));
 }
 
 struct promotenode {
@@ -1896,6 +2118,7 @@ typedef struct dsl_dataset_promote_arg {
        dsl_dataset_t *origin_origin; /* origin of the origin */
        uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
        char *err_ds;
+       cred_t *cr;
 } dsl_dataset_promote_arg_t;
 
 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
@@ -1913,14 +2136,17 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
        dsl_dataset_t *origin_ds;
        int err;
        uint64_t unused;
+       uint64_t ss_mv_cnt;
+       size_t max_snap_len;
 
        err = promote_hold(ddpa, dp, FTAG);
        if (err != 0)
                return (err);
 
        hds = ddpa->ddpa_clone;
+       max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;
 
-       if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
+       if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
                promote_rele(ddpa, FTAG);
                return (SET_ERROR(EXDEV));
        }
@@ -1939,9 +2165,10 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
 
        /* compute origin's new unique space */
        snap = list_tail(&ddpa->clone_snaps);
-       ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
+       ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+           origin_ds->ds_object);
        dsl_deadlist_space_range(&snap->ds->ds_deadlist,
-           origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+           dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
            &ddpa->unique, &unused, &unused);
 
        /*
@@ -1959,14 +2186,17 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
         * Note however, if we stop before we reach the ORIGIN we get:
         * uN + kN + kN-1 + ... + kM - uM-1
         */
-       ddpa->used = origin_ds->ds_phys->ds_referenced_bytes;
-       ddpa->comp = origin_ds->ds_phys->ds_compressed_bytes;
-       ddpa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
+       ss_mv_cnt = 0;
+       ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
+       ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
+       ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
        for (snap = list_head(&ddpa->shared_snaps); snap;
            snap = list_next(&ddpa->shared_snaps, snap)) {
                uint64_t val, dlused, dlcomp, dluncomp;
                dsl_dataset_t *ds = snap->ds;
 
+               ss_mv_cnt++;
+
                /*
                 * If there are long holds, we won't be able to evict
                 * the objset.
@@ -1978,6 +2208,10 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
 
                /* Check that the snapshot name does not conflict */
                VERIFY0(dsl_dataset_get_snapname(ds));
+               if (strlen(ds->ds_snapname) >= max_snap_len) {
+                       err = SET_ERROR(ENAMETOOLONG);
+                       goto out;
+               }
                err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
                if (err == 0) {
                        (void) strcpy(ddpa->err_ds, snap->ds->ds_snapname);
@@ -1988,7 +2222,7 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
                        goto out;
 
                /* The very first snapshot does not have a deadlist */
-               if (ds->ds_phys->ds_prev_snap_obj == 0)
+               if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
                        continue;
 
                dsl_deadlist_space(&ds->ds_deadlist,
@@ -2003,15 +2237,18 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
         * so we need to subtract out the clone origin's used space.
         */
        if (ddpa->origin_origin) {
-               ddpa->used -= ddpa->origin_origin->ds_phys->ds_referenced_bytes;
-               ddpa->comp -= ddpa->origin_origin->ds_phys->ds_compressed_bytes;
+               ddpa->used -=
+                   dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
+               ddpa->comp -=
+                   dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
                ddpa->uncomp -=
-                   ddpa->origin_origin->ds_phys->ds_uncompressed_bytes;
+                   dsl_dataset_phys(ddpa->origin_origin)->
+                   ds_uncompressed_bytes;
        }
 
-       /* Check that there is enough space here */
+       /* Check that there is enough space and limit headroom here */
        err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
-           ddpa->used);
+           0, ss_mv_cnt, ddpa->used, ddpa->cr);
        if (err != 0)
                goto out;
 
@@ -2021,7 +2258,7 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
         * it is the amount of space that will be on all of their
         * deadlists (that was not born before their new origin).
         */
-       if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+       if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
                uint64_t space;
 
                /*
@@ -2043,9 +2280,11 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
                        goto out;
                ddpa->cloneusedsnap += space;
        }
-       if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+       if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
+           DD_FLAG_USED_BREAKDOWN) {
                err = snaplist_space(&ddpa->origin_snaps,
-                   origin_ds->ds_phys->ds_creation_txg, &ddpa->originusedsnap);
+                   dsl_dataset_phys(origin_ds)->ds_creation_txg,
+                   &ddpa->originusedsnap);
                if (err != 0)
                        goto out;
        }
@@ -2072,7 +2311,7 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
        VERIFY0(promote_hold(ddpa, dp, FTAG));
        hds = ddpa->ddpa_clone;
 
-       ASSERT0(hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE);
+       ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
 
        snap = list_head(&ddpa->shared_snaps);
        origin_ds = snap->ds;
@@ -2090,47 +2329,49 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
 
        /* change origin's next snap */
        dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
-       oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
+       oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
        snap = list_tail(&ddpa->clone_snaps);
-       ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
-       origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;
+       ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+           origin_ds->ds_object);
+       dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
 
        /* change the origin's next clone */
-       if (origin_ds->ds_phys->ds_next_clones_obj) {
+       if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
                dsl_dataset_remove_from_next_clones(origin_ds,
                    snap->ds->ds_object, tx);
                VERIFY0(zap_add_int(dp->dp_meta_objset,
-                   origin_ds->ds_phys->ds_next_clones_obj,
+                   dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
                    oldnext_obj, tx));
        }
 
        /* change origin */
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
-       ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
-       dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
+       ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
+       dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
        dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
        dmu_buf_will_dirty(odd->dd_dbuf, tx);
-       odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
+       dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
        origin_head->ds_dir->dd_origin_txg =
-           origin_ds->ds_phys->ds_creation_txg;
+           dsl_dataset_phys(origin_ds)->ds_creation_txg;
 
        /* change dd_clone entries */
        if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
                VERIFY0(zap_remove_int(dp->dp_meta_objset,
-                   odd->dd_phys->dd_clones, hds->ds_object, tx));
+                   dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
                VERIFY0(zap_add_int(dp->dp_meta_objset,
-                   ddpa->origin_origin->ds_dir->dd_phys->dd_clones,
+                   dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
                    hds->ds_object, tx));
 
                VERIFY0(zap_remove_int(dp->dp_meta_objset,
-                   ddpa->origin_origin->ds_dir->dd_phys->dd_clones,
+                   dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
                    origin_head->ds_object, tx));
-               if (dd->dd_phys->dd_clones == 0) {
-                       dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
-                           DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+               if (dsl_dir_phys(dd)->dd_clones == 0) {
+                       dsl_dir_phys(dd)->dd_clones =
+                           zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
+                           DMU_OT_NONE, 0, tx);
                }
                VERIFY0(zap_add_int(dp->dp_meta_objset,
-                   dd->dd_phys->dd_clones, origin_head->ds_object, tx));
+                   dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
        }
 
        /* move snapshots to this dir */
@@ -2151,28 +2392,30 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
                /* move snap name entry */
                VERIFY0(dsl_dataset_get_snapname(ds));
                VERIFY0(dsl_dataset_snap_remove(origin_head,
-                   ds->ds_snapname, tx));
+                   ds->ds_snapname, tx, B_TRUE));
                VERIFY0(zap_add(dp->dp_meta_objset,
-                   hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
+                   dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
                    8, 1, &ds->ds_object, tx));
+               dsl_fs_ss_count_adjust(hds->ds_dir, 1,
+                   DD_FIELD_SNAPSHOT_COUNT, tx);
 
                /* change containing dsl_dir */
                dmu_buf_will_dirty(ds->ds_dbuf, tx);
-               ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
-               ds->ds_phys->ds_dir_obj = dd->dd_object;
+               ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
+               dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
                ASSERT3P(ds->ds_dir, ==, odd);
                dsl_dir_rele(ds->ds_dir, ds);
                VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
                    NULL, ds, &ds->ds_dir));
 
                /* move any clone references */
-               if (ds->ds_phys->ds_next_clones_obj &&
+               if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
                    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
                        zap_cursor_t zc;
                        zap_attribute_t za;
 
                        for (zap_cursor_init(&zc, dp->dp_meta_objset,
-                           ds->ds_phys->ds_next_clones_obj);
+                           dsl_dataset_phys(ds)->ds_next_clones_obj);
                            zap_cursor_retrieve(&zc, &za) == 0;
                            zap_cursor_advance(&zc)) {
                                dsl_dataset_t *cnds;
@@ -2188,12 +2431,13 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
 
                                VERIFY0(dsl_dataset_hold_obj(dp,
                                    za.za_first_integer, FTAG, &cnds));
-                               o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
+                               o = dsl_dir_phys(cnds->ds_dir)->
+                                   dd_head_dataset_obj;
 
                                VERIFY0(zap_remove_int(dp->dp_meta_objset,
-                                   odd->dd_phys->dd_clones, o, tx));
+                                   dsl_dir_phys(odd)->dd_clones, o, tx));
                                VERIFY0(zap_add_int(dp->dp_meta_objset,
-                                   dd->dd_phys->dd_clones, o, tx));
+                                   dsl_dir_phys(dd)->dd_clones, o, tx));
                                dsl_dataset_rele(cnds, FTAG);
                        }
                        zap_cursor_fini(&zc);
@@ -2210,7 +2454,7 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
         */
 
        delta = ddpa->cloneusedsnap -
-           dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
+           dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
        ASSERT3S(delta, >=, 0);
        ASSERT3U(ddpa->used, >=, delta);
        dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
@@ -2218,14 +2462,14 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
            ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
 
        delta = ddpa->originusedsnap -
-           odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
+           dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
        ASSERT3S(delta, <=, 0);
        ASSERT3U(ddpa->used, >=, -delta);
        dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
        dsl_dir_diduse_space(odd, DD_USED_HEAD,
            -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
 
-       origin_ds->ds_phys->ds_unique_bytes = ddpa->unique;
+       dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
 
        /* log history record */
        spa_history_log_internal_ds(hds, "promote", tx, "");
@@ -2260,12 +2504,12 @@ snaplist_make(dsl_pool_t *dp,
                        return (err);
 
                if (first_obj == 0)
-                       first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
+                       first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
 
                snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
                snap->ds = ds;
                list_insert_tail(l, snap);
-               obj = ds->ds_phys->ds_prev_snap_obj;
+               obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
        }
 
        return (0);
@@ -2315,13 +2559,13 @@ promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
                return (error);
        dd = ddpa->ddpa_clone->ds_dir;
 
-       if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) ||
+       if (ddpa->ddpa_clone->ds_is_snapshot ||
            !dsl_dir_is_clone(dd)) {
                dsl_dataset_rele(ddpa->ddpa_clone, tag);
                return (SET_ERROR(EINVAL));
        }
 
-       error = snaplist_make(dp, 0, dd->dd_phys->dd_origin_obj,
+       error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
            &ddpa->shared_snaps, tag);
        if (error != 0)
                goto out;
@@ -2332,16 +2576,16 @@ promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
                goto out;
 
        snap = list_head(&ddpa->shared_snaps);
-       ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
-       error = snaplist_make(dp, dd->dd_phys->dd_origin_obj,
-           snap->ds->ds_dir->dd_phys->dd_head_dataset_obj,
+       ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
+       error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
+           dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
            &ddpa->origin_snaps, tag);
        if (error != 0)
                goto out;
 
-       if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
+       if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
                error = dsl_dataset_hold_obj(dp,
-                   snap->ds->ds_dir->dd_phys->dd_origin_obj,
+                   dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
                    tag, &ddpa->origin_origin);
                if (error != 0)
                        goto out;
@@ -2385,16 +2629,19 @@ dsl_dataset_promote(const char *name, char *conflsnap)
        if (error != 0)
                return (error);
        error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
-           dmu_objset_ds(os)->ds_phys->ds_snapnames_zapobj, &numsnaps);
+           dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
+           &numsnaps);
        dmu_objset_rele(os, FTAG);
        if (error != 0)
                return (error);
 
        ddpa.ddpa_clonename = name;
        ddpa.err_ds = conflsnap;
+       ddpa.cr = CRED();
 
        return (dsl_sync_task(name, dsl_dataset_promote_check,
-           dsl_dataset_promote_sync, &ddpa, 2 + numsnaps));
+           dsl_dataset_promote_sync, &ddpa,
+           2 + numsnaps, ZFS_SPACE_CHECK_RESERVED));
 }
 
 int
@@ -2404,8 +2651,8 @@ dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
        int64_t unused_refres_delta;
 
        /* they should both be heads */
-       if (dsl_dataset_is_snapshot(clone) ||
-           dsl_dataset_is_snapshot(origin_head))
+       if (clone->ds_is_snapshot ||
+           origin_head->ds_is_snapshot)
                return (SET_ERROR(EINVAL));
 
        /* if we are not forcing, the branch point should be just before them */
@@ -2434,9 +2681,9 @@ dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
        /* check amount of any unconsumed refreservation */
        unused_refres_delta =
            (int64_t)MIN(origin_head->ds_reserved,
-           origin_head->ds_phys->ds_unique_bytes) -
+           dsl_dataset_phys(origin_head)->ds_unique_bytes) -
            (int64_t)MIN(origin_head->ds_reserved,
-           clone->ds_phys->ds_unique_bytes);
+           dsl_dataset_phys(clone)->ds_unique_bytes);
 
        if (unused_refres_delta > 0 &&
            unused_refres_delta >
@@ -2445,7 +2692,8 @@ dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
 
        /* clone can't be over the head's refquota */
        if (origin_head->ds_quota != 0 &&
-           clone->ds_phys->ds_referenced_bytes > origin_head->ds_quota)
+           dsl_dataset_phys(clone)->ds_referenced_bytes >
+           origin_head->ds_quota)
                return (SET_ERROR(EDQUOT));
 
        return (0);
@@ -2460,7 +2708,7 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
 
        ASSERT(clone->ds_reserved == 0);
        ASSERT(origin_head->ds_quota == 0 ||
-           clone->ds_phys->ds_unique_bytes <= origin_head->ds_quota);
+           dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota);
        ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
 
        dmu_buf_will_dirty(clone->ds_dbuf, tx);
@@ -2478,9 +2726,9 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
 
        unused_refres_delta =
            (int64_t)MIN(origin_head->ds_reserved,
-           origin_head->ds_phys->ds_unique_bytes) -
+           dsl_dataset_phys(origin_head)->ds_unique_bytes) -
            (int64_t)MIN(origin_head->ds_reserved,
-           clone->ds_phys->ds_unique_bytes);
+           dsl_dataset_phys(clone)->ds_unique_bytes);
 
        /*
         * Reset origin's unique bytes, if it exists.
@@ -2491,16 +2739,17 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
 
                dmu_buf_will_dirty(origin->ds_dbuf, tx);
                dsl_deadlist_space_range(&clone->ds_deadlist,
-                   origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
-                   &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
+                   dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
+                   &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
        }
 
        /* swap blkptrs */
        {
                blkptr_t tmp;
-               tmp = origin_head->ds_phys->ds_bp;
-               origin_head->ds_phys->ds_bp = clone->ds_phys->ds_bp;
-               clone->ds_phys->ds_bp = tmp;
+               tmp = dsl_dataset_phys(origin_head)->ds_bp;
+               dsl_dataset_phys(origin_head)->ds_bp =
+                   dsl_dataset_phys(clone)->ds_bp;
+               dsl_dataset_phys(clone)->ds_bp = tmp;
        }
 
        /* set dd_*_bytes */
@@ -2509,7 +2758,7 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
                uint64_t cdl_used, cdl_comp, cdl_uncomp;
                uint64_t odl_used, odl_comp, odl_uncomp;
 
-               ASSERT3U(clone->ds_dir->dd_phys->
+               ASSERT3U(dsl_dir_phys(clone->ds_dir)->
                    dd_used_breakdown[DD_USED_SNAP], ==, 0);
 
                dsl_deadlist_space(&clone->ds_deadlist,
@@ -2517,13 +2766,18 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
                dsl_deadlist_space(&origin_head->ds_deadlist,
                    &odl_used, &odl_comp, &odl_uncomp);
 
-               dused = clone->ds_phys->ds_referenced_bytes + cdl_used -
-                   (origin_head->ds_phys->ds_referenced_bytes + odl_used);
-               dcomp = clone->ds_phys->ds_compressed_bytes + cdl_comp -
-                   (origin_head->ds_phys->ds_compressed_bytes + odl_comp);
-               duncomp = clone->ds_phys->ds_uncompressed_bytes +
+               dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
+                   cdl_used -
+                   (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
+                   odl_used);
+               dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
+                   cdl_comp -
+                   (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
+                   odl_comp);
+               duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
                    cdl_uncomp -
-                   (origin_head->ds_phys->ds_uncompressed_bytes + odl_uncomp);
+                   (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
+                   odl_uncomp);
 
                dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
                    dused, dcomp, duncomp, tx);
@@ -2547,14 +2801,14 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
        }
 
        /* swap ds_*_bytes */
-       SWITCH64(origin_head->ds_phys->ds_referenced_bytes,
-           clone->ds_phys->ds_referenced_bytes);
-       SWITCH64(origin_head->ds_phys->ds_compressed_bytes,
-           clone->ds_phys->ds_compressed_bytes);
-       SWITCH64(origin_head->ds_phys->ds_uncompressed_bytes,
-           clone->ds_phys->ds_uncompressed_bytes);
-       SWITCH64(origin_head->ds_phys->ds_unique_bytes,
-           clone->ds_phys->ds_unique_bytes);
+       SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
+           dsl_dataset_phys(clone)->ds_referenced_bytes);
+       SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
+           dsl_dataset_phys(clone)->ds_compressed_bytes);
+       SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
+           dsl_dataset_phys(clone)->ds_uncompressed_bytes);
+       SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
+           dsl_dataset_phys(clone)->ds_unique_bytes);
 
        /* apply any parent delta for change in unconsumed refreservation */
        dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
@@ -2565,12 +2819,12 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
         */
        dsl_deadlist_close(&clone->ds_deadlist);
        dsl_deadlist_close(&origin_head->ds_deadlist);
-       SWITCH64(origin_head->ds_phys->ds_deadlist_obj,
-           clone->ds_phys->ds_deadlist_obj);
+       SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
+           dsl_dataset_phys(clone)->ds_deadlist_obj);
        dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
-           clone->ds_phys->ds_deadlist_obj);
+           dsl_dataset_phys(clone)->ds_deadlist_obj);
        dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
-           origin_head->ds_phys->ds_deadlist_obj);
+           dsl_dataset_phys(origin_head)->ds_deadlist_obj);
 
        dsl_scan_ds_clone_swapped(origin_head, clone, tx);
 
@@ -2621,10 +2875,11 @@ dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
        /*
         * Make a space adjustment for reserved bytes.
         */
-       if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
+       if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
                ASSERT3U(*used, >=,
-                   ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
-               *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
+                   ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
+               *used -=
+                   (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
                *ref_rsrv =
                    asize - MIN(asize, parent_delta(ds, asize + inflight));
        }
@@ -2639,9 +2894,10 @@ dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
         * on-disk is over quota and there are no pending changes (which
         * may free up space for us).
         */
-       if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
+       if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
+           ds->ds_quota) {
                if (inflight > 0 ||
-                   ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
+                   dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
                        error = SET_ERROR(ERESTART);
                else
                        error = SET_ERROR(EDQUOT);
@@ -2675,7 +2931,7 @@ dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
        if (error != 0)
                return (error);
 
-       if (dsl_dataset_is_snapshot(ds)) {
+       if (ds->ds_is_snapshot) {
                dsl_dataset_rele(ds, FTAG);
                return (SET_ERROR(EINVAL));
        }
@@ -2693,7 +2949,7 @@ dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
                return (0);
        }
 
-       if (newval < ds->ds_phys->ds_referenced_bytes ||
+       if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
            newval < ds->ds_reserved) {
                dsl_dataset_rele(ds, FTAG);
                return (SET_ERROR(ENOSPC));
@@ -2739,7 +2995,7 @@ dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
        ddsqra.ddsqra_value = refquota;
 
        return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
-           dsl_dataset_set_refquota_sync, &ddsqra, 0));
+           dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 static int
@@ -2758,7 +3014,7 @@ dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
        if (error != 0)
                return (error);
 
-       if (dsl_dataset_is_snapshot(ds)) {
+       if (ds->ds_is_snapshot) {
                dsl_dataset_rele(ds, FTAG);
                return (SET_ERROR(EINVAL));
        }
@@ -2783,7 +3039,7 @@ dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
        mutex_enter(&ds->ds_lock);
        if (!DS_UNIQUE_IS_ACCURATE(ds))
                dsl_dataset_recalc_head_uniq(ds);
-       unique = ds->ds_phys->ds_unique_bytes;
+       unique = dsl_dataset_phys(ds)->ds_unique_bytes;
        mutex_exit(&ds->ds_lock);
 
        if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
@@ -2820,7 +3076,7 @@ dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
        mutex_enter(&ds->ds_dir->dd_lock);
        mutex_enter(&ds->ds_lock);
        ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
-       unique = ds->ds_phys->ds_unique_bytes;
+       unique = dsl_dataset_phys(ds)->ds_unique_bytes;
        delta = MAX(0, (int64_t)(newval - unique)) -
            MAX(0, (int64_t)(ds->ds_reserved - unique));
        ds->ds_reserved = newval;
@@ -2854,7 +3110,8 @@ dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
        ddsqra.ddsqra_value = refreservation;
 
        return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
-           dsl_dataset_set_refreservation_sync, &ddsqra, 0));
+           dsl_dataset_set_refreservation_sync, &ddsqra,
+           0, ZFS_SPACE_CHECK_NONE));
 }
 
 /*
@@ -2885,16 +3142,16 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
        ASSERT(dsl_pool_config_held(dp));
 
        *usedp = 0;
-       *usedp += new->ds_phys->ds_referenced_bytes;
-       *usedp -= oldsnap->ds_phys->ds_referenced_bytes;
+       *usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
+       *usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;
 
        *compp = 0;
-       *compp += new->ds_phys->ds_compressed_bytes;
-       *compp -= oldsnap->ds_phys->ds_compressed_bytes;
+       *compp += dsl_dataset_phys(new)->ds_compressed_bytes;
+       *compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;
 
        *uncompp = 0;
-       *uncompp += new->ds_phys->ds_uncompressed_bytes;
-       *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
+       *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
+       *uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;
 
        snapobj = new->ds_object;
        while (snapobj != oldsnap->ds_object) {
@@ -2909,8 +3166,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
                                break;
                }
 
-               if (snap->ds_phys->ds_prev_snap_txg ==
-                   oldsnap->ds_phys->ds_creation_txg) {
+               if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
+                   dsl_dataset_phys(oldsnap)->ds_creation_txg) {
                        /*
                         * The blocks in the deadlist can not be born after
                         * ds_prev_snap_txg, so get the whole deadlist space,
@@ -2923,7 +3180,7 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
                            &used, &comp, &uncomp);
                } else {
                        dsl_deadlist_space_range(&snap->ds_deadlist,
-                           0, oldsnap->ds_phys->ds_creation_txg,
+                           0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
                            &used, &comp, &uncomp);
                }
                *usedp += used;
@@ -2935,7 +3192,7 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
                 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
                 * was not a snapshot of/before new.
                 */
-               snapobj = snap->ds_phys->ds_prev_snap_obj;
+               snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
                if (snap != new)
                        dsl_dataset_rele(snap, FTAG);
                if (snapobj == 0) {
@@ -2971,21 +3228,21 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
        uint64_t snapobj;
        dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
 
-       ASSERT(dsl_dataset_is_snapshot(firstsnap));
-       ASSERT(dsl_dataset_is_snapshot(lastsnap));
+       ASSERT(firstsnap->ds_is_snapshot);
+       ASSERT(lastsnap->ds_is_snapshot);
 
        /*
         * Check that the snapshots are in the same dsl_dir, and firstsnap
         * is before lastsnap.
         */
        if (firstsnap->ds_dir != lastsnap->ds_dir ||
-           firstsnap->ds_phys->ds_creation_txg >
-           lastsnap->ds_phys->ds_creation_txg)
+           dsl_dataset_phys(firstsnap)->ds_creation_txg >
+           dsl_dataset_phys(lastsnap)->ds_creation_txg)
                return (SET_ERROR(EINVAL));
 
        *usedp = *compp = *uncompp = 0;
 
-       snapobj = lastsnap->ds_phys->ds_next_snap_obj;
+       snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
        while (snapobj != firstsnap->ds_object) {
                dsl_dataset_t *ds;
                uint64_t used, comp, uncomp;
@@ -2995,19 +3252,90 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
                        break;
 
                dsl_deadlist_space_range(&ds->ds_deadlist,
-                   firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+                   dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
                    &used, &comp, &uncomp);
                *usedp += used;
                *compp += comp;
                *uncompp += uncomp;
 
-               snapobj = ds->ds_phys->ds_prev_snap_obj;
+               snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
                ASSERT3U(snapobj, !=, 0);
                dsl_dataset_rele(ds, FTAG);
        }
        return (err);
 }
 
+static int     /* sync-task check callback; arg is the dataset name */
+dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx)
+{
+       const char *dsname = arg;
+       dsl_dataset_t *ds;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       int error = 0;
+       /* ENOTSUP unless SPA_FEATURE_LARGE_BLOCKS is enabled on the pool. */
+       if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+               return (SET_ERROR(ENOTSUP));
+       /* NOTE(review): presumably large blocks depend on this -- confirm. */
+       ASSERT(spa_feature_is_enabled(dp->dp_spa,
+           SPA_FEATURE_EXTENSIBLE_DATASET));
+       /* Hold the dataset only long enough to read ds_large_blocks. */
+       error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+       if (error != 0)
+               return (error);
+       /* EALREADY (not wrapped in SET_ERROR) if the flag is already set. */
+       if (ds->ds_large_blocks)
+               error = EALREADY;
+       dsl_dataset_rele(ds, FTAG);
+
+       return (error);
+}
+
+void   /* record DS_FIELD_LARGE_BLOCKS on dataset object dsobj, within tx */
+dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx)
+{
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+       uint64_t zero = 0;      /* value stored under DS_FIELD_LARGE_BLOCKS */
+       /* NOTE(review): presumably bumps the feature refcount -- confirm. */
+       spa_feature_incr(spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+       dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+       /* Add one integer of sizeof (zero) bytes; VERIFY0 demands success. */
+       VERIFY0(zap_add(mos, dsobj, DS_FIELD_LARGE_BLOCKS,
+           sizeof (zero), 1, &zero, tx));
+}
+
+static void    /* sync-task sync callback; arg is the dataset name */
+dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx)
+{
+       const char *dsname = arg;
+       dsl_dataset_t *ds;
+       /* The hold is wrapped in VERIFY0: it must return 0 here. */
+       VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), dsname, FTAG, &ds));
+       /* Update the dataset object first, then the flag on the held ds. */
+       dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
+       ASSERT(!ds->ds_large_blocks);
+       ds->ds_large_blocks = B_TRUE;
+       dsl_dataset_rele(ds, FTAG);
+}
+
+int    /* 0 on success or if already active, else the dsl_sync_task error */
+dsl_dataset_activate_large_blocks(const char *dsname)
+{
+       int error;
+       /* Run the _check/_sync callbacks as a sync task; dsname is their arg. */
+       error = dsl_sync_task(dsname,
+           dsl_dataset_activate_large_blocks_check,
+           dsl_dataset_activate_large_blocks_sync, (void *)dsname,
+           1, ZFS_SPACE_CHECK_RESERVED);
+
+       /*
+        * EALREADY indicates that this dataset already supports large blocks.
+        */
+       if (error == EALREADY)
+               error = 0;
+       return (error);
+}
+
 /*
  * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
  * For example, they could both be snapshots of the same filesystem, and
@@ -3027,13 +3355,13 @@ dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
        dsl_dataset_t *origin;
 
        ASSERT(dsl_pool_config_held(dp));
-       ASSERT(dsl_dataset_is_snapshot(earlier) || earlier_txg != 0);
+       ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);
 
        if (earlier_txg == 0)
-               earlier_txg = earlier->ds_phys->ds_creation_txg;
+               earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
 
-       if (dsl_dataset_is_snapshot(later) &&
-           earlier_txg >= later->ds_phys->ds_creation_txg)
+       if (later->ds_is_snapshot &&
+           earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
                return (B_FALSE);
 
        if (later->ds_dir == earlier->ds_dir)
@@ -3041,10 +3369,10 @@ dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
        if (!dsl_dir_is_clone(later->ds_dir))
                return (B_FALSE);
 
-       if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object)
+       if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
                return (B_TRUE);
        error = dsl_dataset_hold_obj(dp,
-           later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin);
+           dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
        if (error != 0)
                return (B_FALSE);
        ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
@@ -3061,6 +3389,15 @@ dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
+#if defined(_LP64)
+module_param(zfs_max_recordsize, int, 0644);
+MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size");
+#else
+/* Limited to 1M on 32-bit platforms due to lack of virtual address space */
+module_param(zfs_max_recordsize, int, 0444);
+MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size");
+#endif
+
 EXPORT_SYMBOL(dsl_dataset_hold);
 EXPORT_SYMBOL(dsl_dataset_hold_obj);
 EXPORT_SYMBOL(dsl_dataset_own);
index 8a4362ff9c1ae25cc82fd2a338aee9e9ccb9542f..8da77ebd7b6e12208e1db7a3ecbf378315efc298 100644 (file)
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/dsl_dataset.h>
@@ -121,6 +122,8 @@ dsl_deadlist_close(dsl_deadlist_t *dl)
        void *cookie = NULL;
        dsl_deadlist_entry_t *dle;
 
+       dl->dl_os = NULL;
+
        if (dl->dl_oldfmt) {
                dl->dl_oldfmt = B_FALSE;
                bpobj_close(&dl->dl_bpobj);
@@ -145,7 +148,7 @@ uint64_t
 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
 {
        if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
-               return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx));
+               return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
        return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
            sizeof (dsl_deadlist_phys_t), tx));
 }
@@ -182,7 +185,7 @@ dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
 {
        if (dle->dle_bpobj.bpo_object ==
            dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
-               uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+               uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
                bpobj_close(&dle->dle_bpobj);
                bpobj_decr_empty(dl->dl_os, tx);
                VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
@@ -256,7 +259,7 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 
        dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
        dle->dle_mintxg = mintxg;
-       obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+       obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
        VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
        avl_add(&dl->dl_tree, dle);
 
@@ -310,8 +313,9 @@ dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
        while (mrs_obj != 0) {
                dsl_dataset_t *ds;
                VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
-               dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx);
-               mrs_obj = ds->ds_phys->ds_prev_snap_obj;
+               dsl_deadlist_add_key(&dl,
+                   dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+               mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
                dsl_dataset_rele(ds, FTAG);
        }
        dsl_deadlist_close(&dl);
@@ -340,7 +344,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
                if (dle->dle_mintxg >= maxtxg)
                        break;
 
-               obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+               obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
                VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
                    dle->dle_mintxg, obj, tx));
        }
index 99670dfe072475f423ab5aaf819b1f58e62d87c4..952422be23813914dbb8efcf654d54c71e997aad 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  */
 
 /*
@@ -164,10 +164,10 @@ dsl_deleg_set_sync(void *arg, dmu_tx_t *tx)
 
        VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
 
-       zapobj = dd->dd_phys->dd_deleg_zapobj;
+       zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
        if (zapobj == 0) {
                dmu_buf_will_dirty(dd->dd_dbuf, tx);
-               zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos,
+               zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
                    DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
        }
 
@@ -208,7 +208,7 @@ dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx)
        uint64_t zapobj;
 
        VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
-       zapobj = dd->dd_phys->dd_deleg_zapobj;
+       zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
        if (zapobj == 0) {
                dsl_dir_rele(dd, FTAG);
                return;
@@ -282,7 +282,7 @@ dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset)
 
        return (dsl_sync_task(ddname, dsl_deleg_check,
            unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync,
-           &dda, fnvlist_num_pairs(nvp)));
+           &dda, fnvlist_num_pairs(nvp), ZFS_SPACE_CHECK_RESERVED));
 }
 
 /*
@@ -337,14 +337,14 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp)
                nvlist_t *sp_nvp;
                uint64_t n;
 
-               if (dd->dd_phys->dd_deleg_zapobj == 0 ||
-                   zap_count(mos, dd->dd_phys->dd_deleg_zapobj, &n) != 0 ||
-                   n == 0)
+               if (dsl_dir_phys(dd)->dd_deleg_zapobj == 0 ||
+                   zap_count(mos,
+                   dsl_dir_phys(dd)->dd_deleg_zapobj, &n) != 0 || n == 0)
                        continue;
 
                sp_nvp = fnvlist_alloc();
                for (zap_cursor_init(basezc, mos,
-                   dd->dd_phys->dd_deleg_zapobj);
+                   dsl_dir_phys(dd)->dd_deleg_zapobj);
                    zap_cursor_retrieve(basezc, baseza) == 0;
                    zap_cursor_advance(basezc)) {
                        nvlist_t *perms_nvp;
@@ -570,7 +570,7 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr)
            SPA_VERSION_DELEGATED_PERMS)
                return (SET_ERROR(EPERM));
 
-       if (dsl_dataset_is_snapshot(ds)) {
+       if (ds->ds_is_snapshot) {
                /*
                 * Snapshots are treated as descendents only,
                 * local permissions do not apply.
@@ -603,7 +603,7 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr)
                        if (!zoned)
                                break;
                }
-               zapobj = dd->dd_phys->dd_deleg_zapobj;
+               zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
 
                if (zapobj == 0)
                        continue;
@@ -682,7 +682,7 @@ copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj,
 {
        objset_t *mos = dd->dd_pool->dp_meta_objset;
        uint64_t jumpobj, pjumpobj;
-       uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj;
+       uint64_t zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
        zap_cursor_t zc;
        zap_attribute_t za;
        char whokey[ZFS_MAX_DELEG_NAME];
@@ -695,7 +695,7 @@ copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj,
 
        if (zapobj == 0) {
                dmu_buf_will_dirty(dd->dd_dbuf, tx);
-               zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos,
+               zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
                    DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
        }
 
@@ -733,7 +733,7 @@ dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr)
                return;
 
        for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) {
-               uint64_t pzapobj = dd->dd_phys->dd_deleg_zapobj;
+               uint64_t pzapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
 
                if (pzapobj == 0)
                        continue;
index 9765ba1553a891bc02e2209cc2963b392a46b695..0e2238f99e5176862b74e0b5ff5347f7c0cd7c06 100644 (file)
@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -50,7 +51,7 @@ typedef struct dmu_snapshots_destroy_arg {
 int
 dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
 {
-       if (!dsl_dataset_is_snapshot(ds))
+       if (!ds->ds_is_snapshot)
                return (SET_ERROR(EINVAL));
 
        if (dsl_dataset_long_held(ds))
@@ -77,7 +78,7 @@ dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
        /*
         * Can't delete a branch point.
         */
-       if (ds->ds_phys->ds_num_children > 1)
+       if (dsl_dataset_phys(ds)->ds_num_children > 1)
                return (SET_ERROR(EEXIST));
 
        return (0);
@@ -146,12 +147,12 @@ process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 
        ASSERT(!BP_IS_HOLE(bp));
 
-       if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
+       if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
                dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
                if (poa->ds_prev && !poa->after_branch_point &&
                    bp->blk_birth >
-                   poa->ds_prev->ds_phys->ds_prev_snap_txg) {
-                       poa->ds_prev->ds_phys->ds_unique_bytes +=
+                   dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
+                       dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
                            bp_get_dsize_sync(dp->dp_spa, bp);
                }
        } else {
@@ -182,7 +183,7 @@ process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
        VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
            process_old_cb, &poa, tx));
        VERIFY0(zio_wait(poa.pio));
-       ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
+       ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);
 
        /* change snapused */
        dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
@@ -191,12 +192,14 @@ process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
        /* swap next's deadlist to our deadlist */
        dsl_deadlist_close(&ds->ds_deadlist);
        dsl_deadlist_close(&ds_next->ds_deadlist);
-       deadlist_obj = ds->ds_phys->ds_deadlist_obj;
-       ds->ds_phys->ds_deadlist_obj = ds_next->ds_phys->ds_deadlist_obj;
-       ds_next->ds_phys->ds_deadlist_obj = deadlist_obj;
-       dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+       deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+       dsl_dataset_phys(ds)->ds_deadlist_obj =
+           dsl_dataset_phys(ds_next)->ds_deadlist_obj;
+       dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj;
+       dsl_deadlist_open(&ds->ds_deadlist, mos,
+           dsl_dataset_phys(ds)->ds_deadlist_obj);
        dsl_deadlist_open(&ds_next->ds_deadlist, mos,
-           ds_next->ds_phys->ds_deadlist_obj);
+           dsl_dataset_phys(ds_next)->ds_deadlist_obj);
 }
 
 static void
@@ -211,13 +214,13 @@ dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
         * find the clones, but dsl_deadlist_remove_key() is a no-op so it
         * doesn't matter.
         */
-       if (ds->ds_dir->dd_phys->dd_clones == 0)
+       if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0)
                return;
 
        zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
        za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
-       for (zap_cursor_init(zc, mos, ds->ds_dir->dd_phys->dd_clones);
+       for (zap_cursor_init(zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones);
            zap_cursor_retrieve(zc, za) == 0;
            zap_cursor_advance(zc)) {
                dsl_dataset_t *clone;
@@ -252,19 +255,20 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
 
 
        ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
-       ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+       ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
        ASSERT(refcount_is_zero(&ds->ds_longholds));
 
        if (defer &&
-           (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)) {
+           (ds->ds_userrefs > 0 ||
+           dsl_dataset_phys(ds)->ds_num_children > 1)) {
                ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
                dmu_buf_will_dirty(ds->ds_dbuf, tx);
-               ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
+               dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
                spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
                return;
        }
 
-       ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+       ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
 
        /* We need to log before removing it from the namespace. */
        spa_history_log_internal_ds(ds, "destroy", tx, "");
@@ -273,42 +277,48 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
 
        obj = ds->ds_object;
 
-       if (ds->ds_phys->ds_prev_snap_obj != 0) {
+       if (ds->ds_large_blocks) {
+               ASSERT0(zap_contains(mos, obj, DS_FIELD_LARGE_BLOCKS));
+               spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+       }
+       if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
                ASSERT3P(ds->ds_prev, ==, NULL);
                VERIFY0(dsl_dataset_hold_obj(dp,
-                   ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
+                   dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev));
                after_branch_point =
-                   (ds_prev->ds_phys->ds_next_snap_obj != obj);
+                   (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj);
 
                dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
                if (after_branch_point &&
-                   ds_prev->ds_phys->ds_next_clones_obj != 0) {
+                   dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) {
                        dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
-                       if (ds->ds_phys->ds_next_snap_obj != 0) {
+                       if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
                                VERIFY0(zap_add_int(mos,
-                                   ds_prev->ds_phys->ds_next_clones_obj,
-                                   ds->ds_phys->ds_next_snap_obj, tx));
+                                   dsl_dataset_phys(ds_prev)->
+                                   ds_next_clones_obj,
+                                   dsl_dataset_phys(ds)->ds_next_snap_obj,
+                                   tx));
                        }
                }
                if (!after_branch_point) {
-                       ds_prev->ds_phys->ds_next_snap_obj =
-                           ds->ds_phys->ds_next_snap_obj;
+                       dsl_dataset_phys(ds_prev)->ds_next_snap_obj =
+                           dsl_dataset_phys(ds)->ds_next_snap_obj;
                }
        }
 
        VERIFY0(dsl_dataset_hold_obj(dp,
-           ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
-       ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
+           dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next));
+       ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj);
 
-       old_unique = ds_next->ds_phys->ds_unique_bytes;
+       old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes;
 
        dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
-       ds_next->ds_phys->ds_prev_snap_obj =
-           ds->ds_phys->ds_prev_snap_obj;
-       ds_next->ds_phys->ds_prev_snap_txg =
-           ds->ds_phys->ds_prev_snap_txg;
-       ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
-           ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
+       dsl_dataset_phys(ds_next)->ds_prev_snap_obj =
+           dsl_dataset_phys(ds)->ds_prev_snap_obj;
+       dsl_dataset_phys(ds_next)->ds_prev_snap_txg =
+           dsl_dataset_phys(ds)->ds_prev_snap_txg;
+       ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+           ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0);
 
        if (ds_next->ds_deadlist.dl_oldfmt) {
                process_old_deadlist(ds, ds_prev, ds_next,
@@ -317,40 +327,40 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
                /* Adjust prev's unique space. */
                if (ds_prev && !after_branch_point) {
                        dsl_deadlist_space_range(&ds_next->ds_deadlist,
-                           ds_prev->ds_phys->ds_prev_snap_txg,
-                           ds->ds_phys->ds_prev_snap_txg,
+                           dsl_dataset_phys(ds_prev)->ds_prev_snap_txg,
+                           dsl_dataset_phys(ds)->ds_prev_snap_txg,
                            &used, &comp, &uncomp);
-                       ds_prev->ds_phys->ds_unique_bytes += used;
+                       dsl_dataset_phys(ds_prev)->ds_unique_bytes += used;
                }
 
                /* Adjust snapused. */
                dsl_deadlist_space_range(&ds_next->ds_deadlist,
-                   ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+                   dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX,
                    &used, &comp, &uncomp);
                dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
                    -used, -comp, -uncomp, tx);
 
                /* Move blocks to be freed to pool's free list. */
                dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
-                   &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
+                   &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg,
                    tx);
                dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
                    DD_USED_HEAD, used, comp, uncomp, tx);
 
                /* Merge our deadlist into next's and free it. */
                dsl_deadlist_merge(&ds_next->ds_deadlist,
-                   ds->ds_phys->ds_deadlist_obj, tx);
+                   dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
        }
        dsl_deadlist_close(&ds->ds_deadlist);
-       dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
+       dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
-       ds->ds_phys->ds_deadlist_obj = 0;
+       dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
 
        /* Collapse range in clone heads */
        dsl_dataset_remove_clones_key(ds,
-           ds->ds_phys->ds_creation_txg, tx);
+           dsl_dataset_phys(ds)->ds_creation_txg, tx);
 
-       if (dsl_dataset_is_snapshot(ds_next)) {
+       if (ds_next->ds_is_snapshot) {
                dsl_dataset_t *ds_nextnext;
 
                /*
@@ -363,20 +373,21 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
                 * deadlist).
                 */
                VERIFY0(dsl_dataset_hold_obj(dp,
-                   ds_next->ds_phys->ds_next_snap_obj, FTAG, &ds_nextnext));
+                   dsl_dataset_phys(ds_next)->ds_next_snap_obj,
+                   FTAG, &ds_nextnext));
                dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
-                   ds->ds_phys->ds_prev_snap_txg,
-                   ds->ds_phys->ds_creation_txg,
+                   dsl_dataset_phys(ds)->ds_prev_snap_txg,
+                   dsl_dataset_phys(ds)->ds_creation_txg,
                    &used, &comp, &uncomp);
-               ds_next->ds_phys->ds_unique_bytes += used;
+               dsl_dataset_phys(ds_next)->ds_unique_bytes += used;
                dsl_dataset_rele(ds_nextnext, FTAG);
                ASSERT3P(ds_next->ds_prev, ==, NULL);
 
                /* Collapse range in this head. */
                VERIFY0(dsl_dataset_hold_obj(dp,
-                   ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &hds));
+                   dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds));
                dsl_deadlist_remove_key(&hds->ds_deadlist,
-                   ds->ds_phys->ds_creation_txg, tx);
+                   dsl_dataset_phys(ds)->ds_creation_txg, tx);
                dsl_dataset_rele(hds, FTAG);
 
        } else {
@@ -385,7 +396,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
                ds_next->ds_prev = NULL;
                if (ds_prev) {
                        VERIFY0(dsl_dataset_hold_obj(dp,
-                           ds->ds_phys->ds_prev_snap_obj,
+                           dsl_dataset_phys(ds)->ds_prev_snap_obj,
                            ds_next, &ds_next->ds_prev));
                }
 
@@ -399,7 +410,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
                if (old_unique < ds_next->ds_reserved) {
                        int64_t mrsdelta;
                        uint64_t new_unique =
-                           ds_next->ds_phys->ds_unique_bytes;
+                           dsl_dataset_phys(ds_next)->ds_unique_bytes;
 
                        ASSERT(old_unique <= new_unique);
                        mrsdelta = MIN(new_unique - old_unique,
@@ -420,9 +431,9 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
        }
 
        /* remove from snapshot namespace */
-       ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
+       ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0);
        VERIFY0(dsl_dataset_hold_obj(dp,
-           ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
+           dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head));
        VERIFY0(dsl_dataset_get_snapname(ds));
 #ifdef ZFS_DEBUG
        {
@@ -434,7 +445,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
                ASSERT3U(val, ==, obj);
        }
 #endif
-       VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx));
+       VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
        dsl_dataset_rele(ds_head, FTAG);
 
        if (ds_prev != NULL)
@@ -442,17 +453,20 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
 
        spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
 
-       if (ds->ds_phys->ds_next_clones_obj != 0) {
+       if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
                ASSERTV(uint64_t count);
                ASSERT0(zap_count(mos,
-                   ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
+                   dsl_dataset_phys(ds)->ds_next_clones_obj, &count) &&
+                   count == 0);
                VERIFY0(dmu_object_free(mos,
-                   ds->ds_phys->ds_next_clones_obj, tx));
+                   dsl_dataset_phys(ds)->ds_next_clones_obj, tx));
        }
-       if (ds->ds_phys->ds_props_obj != 0)
-               VERIFY0(zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
-       if (ds->ds_phys->ds_userrefs_obj != 0)
-               VERIFY0(zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
+       if (dsl_dataset_phys(ds)->ds_props_obj != 0)
+               VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj,
+                   tx));
+       if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
+               VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+                   tx));
        dsl_dir_rele(ds->ds_dir, ds);
        ds->ds_dir = NULL;
        dmu_object_free_zapified(mos, obj, tx);
@@ -510,7 +524,7 @@ dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
 
        error = dsl_sync_task(nvpair_name(pair),
            dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync,
-           &dsda, 0);
+           &dsda, 0, ZFS_SPACE_CHECK_NONE);
        fnvlist_free(dsda.dsda_successful_snaps);
 
        return (error);
@@ -555,7 +569,8 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
                dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
        } else {
                ASSERT(zilog == NULL);
-               ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
+               ASSERT3U(bp->blk_birth, >,
+                   dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
                (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
        }
 
@@ -577,9 +592,10 @@ old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
        ka.ds = ds;
        ka.tx = tx;
        VERIFY0(traverse_dataset(ds,
-           ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
+           dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST,
            kill_blkptr, &ka));
-       ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
+       ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+           dsl_dataset_phys(ds)->ds_unique_bytes == 0);
 }
 
 typedef struct dsl_destroy_head_arg {
@@ -593,8 +609,8 @@ dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
        uint64_t count;
        objset_t *mos;
 
-       ASSERT(!dsl_dataset_is_snapshot(ds));
-       if (dsl_dataset_is_snapshot(ds))
+       ASSERT(!ds->ds_is_snapshot);
+       if (ds->ds_is_snapshot)
                return (SET_ERROR(EINVAL));
 
        if (refcount_count(&ds->ds_longholds) != expected_holds)
@@ -608,21 +624,21 @@ dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
         * from.)
         */
        if (ds->ds_prev != NULL &&
-           ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
+           dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object)
                return (SET_ERROR(EBUSY));
 
        /*
         * Can't delete if there are children of this fs.
         */
        error = zap_count(mos,
-           ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
+           dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count);
        if (error != 0)
                return (error);
        if (count != 0)
                return (SET_ERROR(EEXIST));
 
        if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) &&
-           ds->ds_prev->ds_phys->ds_num_children == 2 &&
+           dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
            ds->ds_prev->ds_userrefs == 0) {
                /* We need to remove the origin snapshot as well. */
                if (!refcount_is_zero(&ds->ds_prev->ds_longholds))
@@ -660,7 +676,18 @@ dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
 
        VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
 
-       ASSERT0(dd->dd_phys->dd_head_dataset_obj);
+       ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj);
+
+       /*
+        * Decrement the filesystem count for all parent filesystems.
+        *
+        * When we receive an incremental stream into a filesystem that already
+        * exists, a temporary clone is created.  We never count this temporary
+        * clone, whose name begins with a '%'.
+        */
+       if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL)
+               dsl_fs_ss_count_adjust(dd->dd_parent, -1,
+                   DD_FIELD_FILESYSTEM_COUNT, tx);
 
        /*
         * Remove our reservation. The impl() routine avoids setting the
@@ -668,16 +695,17 @@ dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
         */
        dsl_dir_set_reservation_sync_impl(dd, 0, tx);
 
-       ASSERT0(dd->dd_phys->dd_used_bytes);
-       ASSERT0(dd->dd_phys->dd_reserved);
+       ASSERT0(dsl_dir_phys(dd)->dd_used_bytes);
+       ASSERT0(dsl_dir_phys(dd)->dd_reserved);
        for (t = 0; t < DD_USED_NUM; t++)
-               ASSERT0(dd->dd_phys->dd_used_breakdown[t]);
+               ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);
 
-       VERIFY0(zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
-       VERIFY0(zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
-       VERIFY0(dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
+       VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
+       VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
+       VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
        VERIFY0(zap_remove(mos,
-           dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
+           dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
+           dd->dd_myname, tx));
 
        dsl_dir_rele(dd, FTAG);
        dmu_object_free_zapified(mos, ddobj, tx);
@@ -692,10 +720,10 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
        boolean_t rmorigin;
        objset_t *os;
 
-       ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+       ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
        ASSERT(ds->ds_prev == NULL ||
-           ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
-       ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+           dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
+       ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
        ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 
        /* We need to log before removing it from the namespace. */
@@ -703,7 +731,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
 
        rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
            DS_IS_DEFER_DESTROY(ds->ds_prev) &&
-           ds->ds_prev->ds_phys->ds_num_children == 2 &&
+           dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
            ds->ds_prev->ds_userrefs == 0);
 
        /* Remove our reservation. */
@@ -714,24 +742,28 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
                ASSERT0(ds->ds_reserved);
        }
 
+       if (ds->ds_large_blocks)
+               spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+
        dsl_scan_ds_destroyed(ds, tx);
 
        obj = ds->ds_object;
 
-       if (ds->ds_phys->ds_prev_snap_obj != 0) {
+       if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
                /* This is a clone */
                ASSERT(ds->ds_prev != NULL);
-               ASSERT3U(ds->ds_prev->ds_phys->ds_next_snap_obj, !=, obj);
-               ASSERT0(ds->ds_phys->ds_next_snap_obj);
+               ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=,
+                   obj);
+               ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);
 
                dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
-               if (ds->ds_prev->ds_phys->ds_next_clones_obj != 0) {
+               if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) {
                        dsl_dataset_remove_from_next_clones(ds->ds_prev,
                            obj, tx);
                }
 
-               ASSERT3U(ds->ds_prev->ds_phys->ds_num_children, >, 1);
-               ds->ds_prev->ds_phys->ds_num_children--;
+               ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1);
+               dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
        }
 
        /*
@@ -740,9 +772,9 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
         * safe to ignore the deadlist contents.)
         */
        dsl_deadlist_close(&ds->ds_deadlist);
-       dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
+       dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
-       ds->ds_phys->ds_deadlist_obj = 0;
+       dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
 
        VERIFY0(dmu_objset_from_ds(ds, &os));
 
@@ -771,15 +803,16 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
                        scn->scn_async_destroying = B_TRUE;
                }
 
-               used = ds->ds_dir->dd_phys->dd_used_bytes;
-               comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
-               uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
+               used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
+               comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
+               uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
 
                ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
-                   ds->ds_phys->ds_unique_bytes == used);
+                   dsl_dataset_phys(ds)->ds_unique_bytes == used);
 
                bptree_add(mos, dp->dp_bptree_obj,
-                   &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
+                   &dsl_dataset_phys(ds)->ds_bp,
+                   dsl_dataset_phys(ds)->ds_prev_snap_txg,
                    used, comp, uncomp, tx);
                dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
                    -used, -comp, -uncomp, tx);
@@ -790,7 +823,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
        if (ds->ds_prev != NULL) {
                if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
                        VERIFY0(zap_remove_int(mos,
-                           ds->ds_prev->ds_dir->dd_phys->dd_clones,
+                           dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones,
                            ds->ds_object, tx));
                }
                prevobj = ds->ds_prev->ds_object;
@@ -809,22 +842,22 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
 
        /* Erase the link in the dir */
        dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
-       ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
+       dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0;
        ddobj = ds->ds_dir->dd_object;
-       ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
-       VERIFY0(zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx));
+       ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0);
+       VERIFY0(zap_destroy(mos,
+           dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx));
 
        if (ds->ds_bookmarks != 0) {
-               VERIFY0(zap_destroy(mos,
-                   ds->ds_bookmarks, tx));
+               VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx));
                spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
        }
 
        spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
 
-       ASSERT0(ds->ds_phys->ds_next_clones_obj);
-       ASSERT0(ds->ds_phys->ds_props_obj);
-       ASSERT0(ds->ds_phys->ds_userrefs_obj);
+       ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj);
+       ASSERT0(dsl_dataset_phys(ds)->ds_props_obj);
+       ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj);
        dsl_dir_rele(ds->ds_dir, ds);
        ds->ds_dir = NULL;
        dmu_object_free_zapified(mos, obj, tx);
@@ -862,7 +895,7 @@ dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
 
        /* Mark it as inconsistent on-disk, in case we crash */
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
-       ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+       dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
 
        spa_history_log_internal_ds(ds, "destroy begin", tx, "");
        dsl_dataset_rele(ds, FTAG);
@@ -892,7 +925,8 @@ dsl_destroy_head(const char *name)
                objset_t *os;
 
                error = dsl_sync_task(name, dsl_destroy_head_check,
-                   dsl_destroy_head_begin_sync, &ddha, 0);
+                   dsl_destroy_head_begin_sync, &ddha,
+                   0, ZFS_SPACE_CHECK_NONE);
                if (error != 0)
                        return (error);
 
@@ -905,7 +939,8 @@ dsl_destroy_head(const char *name)
                if (error == 0) {
                        uint64_t obj;
                        uint64_t prev_snap_txg =
-                           dmu_objset_ds(os)->ds_phys->ds_prev_snap_txg;
+                           dsl_dataset_phys(dmu_objset_ds(os))->
+                           ds_prev_snap_txg;
                        for (obj = 0; error == 0;
                            error = dmu_object_next(os, &obj, FALSE,
                            prev_snap_txg))
@@ -917,7 +952,7 @@ dsl_destroy_head(const char *name)
        }
 
        return (dsl_sync_task(name, dsl_destroy_head_check,
-           dsl_destroy_head_sync, &ddha, 0));
+           dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 /*
index b94b68e15774c51ca2caa9146f66e0904a9e1a13..ba6c24486463cb3819b1678d5b33738de3375875 100644 (file)
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 Martin Matuska. All rights reserved.
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/dmu.h>
 #include <sys/zio.h>
 #include <sys/arc.h>
 #include <sys/sunddi.h>
+#include <sys/zfeature.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
 #include <sys/zvol.h>
 #include "zfs_namecheck.h"
+#include "zfs_prop.h"
+
+/*
+ * Filesystem and Snapshot Limits
+ * ------------------------------
+ *
+ * These limits are used to restrict the number of filesystems and/or snapshots
+ * that can be created at a given level in the tree or below. A typical
+ * use-case is with a delegated dataset where the administrator wants to ensure
+ * that a user within the zone is not creating too many additional filesystems
+ * or snapshots, even though they're not exceeding their space quota.
+ *
+ * The filesystem and snapshot counts are stored as extensible properties. This
+ * capability is controlled by a feature flag and must be enabled to be used.
+ * Once enabled, the feature is not active until the first limit is set. At
+ * that point, future operations to create/destroy filesystems or snapshots
+ * will validate and update the counts.
+ *
+ * Because the count properties will not exist before the feature is active,
+ * the counts are updated when a limit is first set on an uninitialized
+ * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
+ * all of the nested filesystems/snapshots. Thus, a new leaf node has a
+ * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
+ * snapshot count properties on a node indicate uninitialized counts on that
+ * node.) When first setting a limit on an uninitialized node, the code starts
+ * at the filesystem with the new limit and descends into all sub-filesystems
+ * to add the count properties.
+ *
+ * In practice this is lightweight since a limit is typically set when the
+ * filesystem is created and thus has no children. Once valid, changing the
+ * limit value won't require a re-traversal since the counts are already valid.
+ * When recursively fixing the counts, if a node with a limit is encountered
+ * during the descent, the counts are known to be valid and there is no need to
+ * descend into that filesystem's children. The counts on filesystems above the
+ * one with the new limit will still be uninitialized, unless a limit is
+ * eventually set on one of those filesystems. The counts are always recursively
+ * updated when a limit is set on a dataset, unless there is already a limit.
+ * When a new limit value is set on a filesystem with an existing limit, it is
+ * possible for the new limit to be less than the current count at that level
+ * since a user who can change the limit is also allowed to exceed the limit.
+ *
+ * Once the feature is active, then whenever a filesystem or snapshot is
+ * created, the code recurses up the tree, validating the new count against the
+ * limit at each initialized level. In practice, most levels will not have a
+ * limit set. If there is a limit at any initialized level up the tree, the
+ * check must pass or the creation will fail. Likewise, when a filesystem or
+ * snapshot is destroyed, the counts are recursively adjusted all the way up
+ * the initialized nodes in the tree. Renaming a filesystem to a different point
+ * in the tree will first validate, then update the counts on each branch up to
+ * the common ancestor. A receive will also validate the counts and then update
+ * them.
+ *
+ * An exception to the above behavior is that the limit is not enforced if the
+ * user has permission to modify the limit. This is primarily so that
+ * recursive snapshots in the global zone always work. We want to prevent a
+ * denial-of-service in which a lower level delegated dataset could max out its
+ * limit and thus block recursive snapshots from being taken in the global zone.
+ * Because of this, it is possible for the snapshot count to be over the limit
+ * and snapshots taken in the global zone could cause a lower level dataset to
+ * hit or exceed its limit. The administrator taking the global zone recursive
+ * snapshot should be aware of this side-effect and behave accordingly.
+ * For consistency, the filesystem limit is also not enforced if the user can
+ * modify the limit.
+ *
+ * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
+ * and updated by dsl_fs_ss_count_adjust(). A new limit value is set up in
+ * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
+ * dsl_dir_init_fs_ss_count().
+ *
+ * There is a special case when we receive a filesystem that already exists. In
+ * this case a temporary clone name of %X is created (see dmu_recv_begin). We
+ * never update the filesystem counts for temporary clones.
+ *
+ * Likewise, we do not update the snapshot counts for temporary snapshots,
+ * such as those created by zfs diff.
+ */
+
+extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd);
 
 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
 
-/* ARGSUSED */
 static void
-dsl_dir_evict(dmu_buf_t *db, void *arg)
+dsl_dir_evict(void *dbu)
 {
-       dsl_dir_t *dd = arg;
+       dsl_dir_t *dd = dbu;
        int t;
        ASSERTV(dsl_pool_t *dp = dd->dd_pool);
 
+       dd->dd_dbuf = NULL;
+
        for (t = 0; t < TXG_SIZE; t++) {
                ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
                ASSERT(dd->dd_tempreserved[t] == 0);
@@ -59,9 +143,9 @@ dsl_dir_evict(dmu_buf_t *db, void *arg)
        }
 
        if (dd->dd_parent)
-               dsl_dir_rele(dd->dd_parent, dd);
+               dsl_dir_async_rele(dd->dd_parent, dd);
 
-       spa_close(dd->dd_pool->dp_spa, dd);
+       spa_async_close(dd->dd_pool->dp_spa, dd);
 
        /*
         * The props callback list should have been cleaned up by
@@ -101,7 +185,6 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
                dd->dd_object = ddobj;
                dd->dd_dbuf = dbuf;
                dd->dd_pool = dp;
-               dd->dd_phys = dbuf->db_data;
                mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
 
                list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
@@ -109,9 +192,10 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
 
                dsl_dir_snap_cmtime_update(dd);
 
-               if (dd->dd_phys->dd_parent_obj) {
-                       err = dsl_dir_hold_obj(dp, dd->dd_phys->dd_parent_obj,
-                           NULL, dd, &dd->dd_parent);
+               if (dsl_dir_phys(dd)->dd_parent_obj) {
+                       err = dsl_dir_hold_obj(dp,
+                           dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
+                           &dd->dd_parent);
                        if (err != 0)
                                goto errout;
                        if (tail) {
@@ -119,14 +203,16 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
                                uint64_t foundobj;
 
                                err = zap_lookup(dp->dp_meta_objset,
-                                   dd->dd_parent->dd_phys->dd_child_dir_zapobj,
-                                   tail, sizeof (foundobj), 1, &foundobj);
+                                   dsl_dir_phys(dd->dd_parent)->
+                                   dd_child_dir_zapobj, tail,
+                                   sizeof (foundobj), 1, &foundobj);
                                ASSERT(err || foundobj == ddobj);
 #endif
                                (void) strcpy(dd->dd_myname, tail);
                        } else {
                                err = zap_value_search(dp->dp_meta_objset,
-                                   dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+                                   dsl_dir_phys(dd->dd_parent)->
+                                   dd_child_dir_zapobj,
                                    ddobj, 0, dd->dd_myname);
                        }
                        if (err != 0)
@@ -145,7 +231,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
                         * Just look at its phys directly instead.
                         */
                        err = dmu_bonus_hold(dp->dp_meta_objset,
-                           dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
+                           dsl_dir_phys(dd)->dd_origin_obj, FTAG,
+                           &origin_bonus);
                        if (err != 0)
                                goto errout;
                        origin_phys = origin_bonus->db_data;
@@ -154,9 +241,9 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
                        dmu_buf_rele(origin_bonus, FTAG);
                }
 
-               winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
-                   dsl_dir_evict);
-               if (winner) {
+               dmu_buf_init_user(&dd->dd_dbu, dsl_dir_evict, &dd->dd_dbuf);
+               winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
+               if (winner != NULL) {
                        if (dd->dd_parent)
                                dsl_dir_rele(dd->dd_parent, dd);
                        mutex_destroy(&dd->dd_lock);
@@ -200,6 +287,21 @@ dsl_dir_rele(dsl_dir_t *dd, void *tag)
        dmu_buf_rele(dd->dd_dbuf, tag);
 }
 
+/*
+ * Remove a reference to the given dsl dir that is being asynchronously
+ * released.  Async releases occur from a taskq performing eviction of
+ * dsl datasets and dirs.  This process is identical to a normal release
+ * with the exception of using the async API for releasing the reference on
+ * the spa.
+ *
+ * The same tag is handed to both spa_async_close() and dmu_buf_rele(), so
+ * it presumably has to be the tag the hold was originally taken with
+ * (TODO confirm against the hold path).
+ */
+void
+dsl_dir_async_rele(dsl_dir_t *dd, void *tag)
+{
+       dprintf_dd(dd, "%s\n", "");
+       spa_async_close(dd->dd_pool->dp_spa, tag);      /* async spa release */
+       dmu_buf_rele(dd->dd_dbuf, tag);         /* drop the hold on the dbuf */
+}
+
 /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
 void
 dsl_dir_name(dsl_dir_t *dd, char *buf)
@@ -333,7 +435,7 @@ dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
        }
 
        while (next != NULL) {
-               dsl_dir_t *child_ds;
+               dsl_dir_t *child_dd;
                err = getcomponent(next, buf, &nextnext);
                if (err != 0)
                        break;
@@ -341,10 +443,10 @@ dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
                if (next[0] == '@')
                        break;
                dprintf("looking up %s in obj%lld\n",
-                   buf, dd->dd_phys->dd_child_dir_zapobj);
+                   buf, dsl_dir_phys(dd)->dd_child_dir_zapobj);
 
                err = zap_lookup(dp->dp_meta_objset,
-                   dd->dd_phys->dd_child_dir_zapobj,
+                   dsl_dir_phys(dd)->dd_child_dir_zapobj,
                    buf, sizeof (ddobj), 1, &ddobj);
                if (err != 0) {
                        if (err == ENOENT)
@@ -352,11 +454,11 @@ dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
                        break;
                }
 
-               err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds);
+               err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
                if (err != 0)
                        break;
                dsl_dir_rele(dd, tag);
-               dd = child_ds;
+               dd = child_dd;
                next = nextnext;
        }
 
@@ -384,6 +486,400 @@ error:
        return (err);
 }
 
+/*
+ * If the counts are already initialized for this filesystem and its
+ * descendants then do nothing, otherwise initialize the counts.
+ *
+ * The counts on this filesystem, and those below, may be uninitialized due to
+ * either the use of a pre-existing pool which did not support the
+ * filesystem/snapshot limit feature, or one in which the feature had not yet
+ * been enabled.
+ *
+ * Recursively descend the filesystem tree and update the filesystem/snapshot
+ * counts on each filesystem below, then update the cumulative count on the
+ * current filesystem. If the filesystem already has a count set on it,
+ * then we know that its counts, and the counts on the filesystems below it,
+ * are already correct, so we don't have to update this filesystem.
+ *
+ * Must be called in syncing context with the pool config held and with
+ * SPA_FEATURE_FS_SS_LIMIT already active (all three are asserted below).
+ * The dir is zapified as a side effect even when the counts already exist,
+ * because dsl_dir_zapify() runs before the "already initialized" test.
+ */
+static void
+dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+       uint64_t my_fs_cnt = 0;         /* filesystems below dd (dd excluded) */
+       uint64_t my_ss_cnt = 0;         /* snapshots of dd and of all below */
+       dsl_pool_t *dp = dd->dd_pool;
+       objset_t *os = dp->dp_meta_objset;
+       zap_cursor_t *zc;
+       zap_attribute_t *za;
+       dsl_dataset_t *ds;
+
+       ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
+       ASSERT(dsl_pool_config_held(dp));
+       ASSERT(dmu_tx_is_syncing(tx));
+
+       dsl_dir_zapify(dd, tx);
+
+       /*
+        * If the filesystem count has already been initialized then we
+        * don't need to recurse down any further. This return happens
+        * before zc/za are allocated, so there is nothing to free here.
+        */
+       if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
+               return;
+
+       zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+       za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+       /* Iterate my child dirs */
+       for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
+           zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
+               dsl_dir_t *chld_dd;
+               uint64_t count;
+
+               VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
+                   &chld_dd));
+
+               /*
+                * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and
+                * temporary datasets. The hold taken above is dropped
+                * before moving on to the next child.
+                */
+               if (chld_dd->dd_myname[0] == '$' ||
+                   chld_dd->dd_myname[0] == '%') {
+                       dsl_dir_rele(chld_dd, FTAG);
+                       continue;
+               }
+
+               my_fs_cnt++;    /* count this child */
+
+               dsl_dir_init_fs_ss_count(chld_dd, tx);  /* init child first */
+
+               /* Fold the child's cumulative counts into ours. */
+               VERIFY0(zap_lookup(os, chld_dd->dd_object,
+                   DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
+               my_fs_cnt += count;
+               VERIFY0(zap_lookup(os, chld_dd->dd_object,
+                   DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
+               my_ss_cnt += count;
+
+               dsl_dir_rele(chld_dd, FTAG);
+       }
+       zap_cursor_fini(zc);    /* zc storage is reused for the walk below */
+       /* Count my snapshots (we counted children's snapshots above) */
+       VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
+           dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));
+
+       for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
+           zap_cursor_retrieve(zc, za) == 0;
+           zap_cursor_advance(zc)) {
+               /* Don't count temporary snapshots */
+               if (za->za_name[0] != '%')
+                       my_ss_cnt++;
+       }
+       zap_cursor_fini(zc);
+
+       dsl_dataset_rele(ds, FTAG);
+
+       kmem_free(zc, sizeof (zap_cursor_t));
+       kmem_free(za, sizeof (zap_attribute_t));
+
+       /* we're in a sync task, update counts */
+       dmu_buf_will_dirty(dd->dd_dbuf, tx);
+       VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+           sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
+       VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+           sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
+}
+
+static int
+dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
+{      /* check callback for dsl_dir_activate_fs_ss_limit()'s sync task */
+       char *ddname = (char *)arg;     /* sync-task arg is the dataset name */
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *ds;
+       dsl_dir_t *dd;
+       int error;
+
+       error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
+       if (error != 0)
+               return (error);         /* nothing held, nothing to release */
+
+       if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+               dsl_dataset_rele(ds, FTAG);
+               return (SET_ERROR(ENOTSUP));    /* feature not enabled */
+       }
+
+       dd = ds->ds_dir;
+       if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
+           dsl_dir_is_zapified(dd) &&
+           zap_contains(dp->dp_meta_objset, dd->dd_object,
+           DD_FIELD_FILESYSTEM_COUNT) == 0) {
+               dsl_dataset_rele(ds, FTAG);
+               return (SET_ERROR(EALREADY));   /* count already present */
+       }
+
+       dsl_dataset_rele(ds, FTAG);
+       return (0);     /* feature and/or counts still need initializing */
+}
+
+static void
+dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
+{      /* sync callback for dsl_dir_activate_fs_ss_limit()'s sync task */
+       char *ddname = (char *)arg;     /* sync-task arg is the dataset name */
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *ds;
+       spa_t *spa;
+
+       VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));
+
+       spa = dsl_dataset_get_spa(ds);
+
+       if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
+               /*
+                * Since the feature was not active and we're now setting a
+                * limit, increment the feature-active counter so that the
+                * feature becomes active for the first time.
+                *
+                * We are already in a sync task so we can update the MOS.
+                */
+               spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
+       }
+
+       /*
+        * Since we are now setting a non-UINT64_MAX limit on the filesystem,
+        * we need to ensure the counts are correct. Descend down the tree from
+        * this point and update all of the counts to be accurate.
+        *
+        * This must follow the activation above: dsl_dir_init_fs_ss_count()
+        * asserts that the feature is active.
+        */
+       dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
+
+       dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Make sure the feature is enabled and activate it if necessary.
+ * Since we're setting a limit, ensure the on-disk counts are valid.
+ * This is only called by the ioctl path when setting a limit value.
+ *
+ * We do not need to validate the new limit, since users who can change the
+ * limit are also allowed to exceed the limit.
+ *
+ * ddname is the name of the dataset the limit is being set on; it is used
+ * both as the dsl_sync_task() pool name argument and as the callback arg.
+ *
+ * Returns 0 on success, including the case where the check callback
+ * reports EALREADY (counts already initialized); otherwise returns the
+ * error from dsl_sync_task().
+ */
+int
+dsl_dir_activate_fs_ss_limit(const char *ddname)
+{
+       int error;
+
+       error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
+           dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
+           ZFS_SPACE_CHECK_RESERVED);
+
+       if (error == EALREADY)  /* nothing to do is not a failure */
+               error = 0;
+
+       return (error);
+}
+
+/*
+ * Used to determine if the filesystem_limit or snapshot_limit should be
+ * enforced. We allow the limit to be exceeded if the user has permission to
+ * write the property value. We pass in the creds that we got in the open
+ * context since we will always be the GZ root in syncing context. We also have
+ * to handle the case where we are allowed to change the limit on the current
+ * dataset, but there may be another limit in the tree above.
+ *
+ * We can never modify these two properties within a non-global zone. In
+ * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
+ * can't use that function since we are already holding the dp_config_rwlock.
+ * In addition, we already have the dd and dealing with snapshots is simplified
+ * in this code.
+ */
+
+typedef enum {
+       ENFORCE_ALWAYS,         /* compare the count with this dir's limit */
+       ENFORCE_NEVER,          /* caller skips limit checking altogether */
+       ENFORCE_ABOVE           /* skip this dir's limit; parents still run */
+} enforce_res_t;
+
+static enforce_res_t
+dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr)
+{      /* decides how prop's limit on dd applies to the holder of cr */
+       enforce_res_t enforce = ENFORCE_ALWAYS; /* default: enforce */
+       uint64_t obj;
+       dsl_dataset_t *ds;
+       uint64_t zoned;
+
+       ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+           prop == ZFS_PROP_SNAPSHOT_LIMIT);
+
+       /*
+        * NOTE(review): cr is handed to the credential helpers below and
+        * to dsl_deleg_access_impl() without a NULL test in this
+        * function; presumably callers must pass a valid cred - confirm.
+        */
+#ifdef _KERNEL
+       if (crgetzoneid(cr) != GLOBAL_ZONEID)
+               return (ENFORCE_ALWAYS);        /* non-global zone */
+
+       if (secpolicy_zfs(cr) == 0)
+               return (ENFORCE_NEVER);         /* policy check passed */
+#endif
+
+       if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
+               return (ENFORCE_ALWAYS);        /* dir has no head dataset */
+
+       ASSERT(dsl_pool_config_held(dd->dd_pool));
+
+       if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
+               return (ENFORCE_ALWAYS);        /* can't inspect: enforce */
+
+       if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) {
+               /* Only root can access zoned fs's from the GZ */
+               enforce = ENFORCE_ALWAYS;
+       } else {
+               if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
+                       enforce = ENFORCE_ABOVE;        /* access granted */
+       }
+
+       dsl_dataset_rele(ds, FTAG);
+       return (enforce);
+}
+
+/*
+ * Check if adding additional child filesystem(s) would exceed any filesystem
+ * limits or adding additional snapshot(s) would exceed any snapshot limits.
+ * The prop argument indicates which limit to check.
+ *
+ * Note that all filesystem limits up to the root (or the highest
+ * initialized) filesystem or the given ancestor must be satisfied.
+ *
+ * dd       - dir at which the check starts; the walk continues via dd_parent.
+ * delta    - number of filesystems/snapshots about to be added.
+ * prop     - ZFS_PROP_FILESYSTEM_LIMIT or ZFS_PROP_SNAPSHOT_LIMIT.
+ * ancestor - optional dir at which the walk stops (it is not itself checked).
+ * cr       - credentials from open context; NULL denotes a temporary
+ *            snapshot, for which the snapshot limit is not enforced.
+ *
+ * Returns 0 if the addition is permitted, EDQUOT if an enforced limit would
+ * be exceeded, or the error from reading the count or the limit property.
+ */
+int
+dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
+    dsl_dir_t *ancestor, cred_t *cr)
+{
+       objset_t *os = dd->dd_pool->dp_meta_objset;
+       uint64_t limit, count;
+       char *count_prop;
+       enforce_res_t enforce;
+       int err = 0;
+
+       ASSERT(dsl_pool_config_held(dd->dd_pool));
+       ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+           prop == ZFS_PROP_SNAPSHOT_LIMIT);
+
+       /*
+        * We don't enforce the limit for temporary snapshots. This is
+        * indicated by a NULL cred_t argument. Test for it up front so that
+        * a NULL cred is never handed to dsl_enforce_ds_ss_limits(); every
+        * later path for this case returned 0 anyway.
+        */
+       if (prop == ZFS_PROP_SNAPSHOT_LIMIT && cr == NULL)
+               return (0);
+
+       /*
+        * If we're allowed to change the limit, don't enforce the limit
+        * e.g. this can happen if a snapshot is taken by an administrative
+        * user in the global zone (i.e. a recursive snapshot by root).
+        * However, we must handle the case of delegated permissions where we
+        * are allowed to change the limit on the current dataset, but there
+        * is another limit in the tree above.
+        */
+       enforce = dsl_enforce_ds_ss_limits(dd, prop, cr);
+       if (enforce == ENFORCE_NEVER)
+               return (0);
+
+       /*
+        * e.g. if renaming a dataset with no snapshots, count adjustment
+        * is 0.
+        */
+       if (delta == 0)
+               return (0);
+
+       if (prop == ZFS_PROP_SNAPSHOT_LIMIT)
+               count_prop = DD_FIELD_SNAPSHOT_COUNT;
+       else
+               count_prop = DD_FIELD_FILESYSTEM_COUNT;
+
+       /*
+        * If an ancestor has been provided, stop checking the limit once we
+        * hit that dir. We need this during rename so that we don't overcount
+        * the check once we recurse up to the common ancestor.
+        */
+       if (ancestor == dd)
+               return (0);
+
+       /*
+        * If we hit an uninitialized node while recursing up the tree, we can
+        * stop since we know there is no limit here (or above). The counts are
+        * not valid on this node and we know we won't touch this node's counts.
+        */
+       if (!dsl_dir_is_zapified(dd))
+               return (0);
+       err = zap_lookup(os, dd->dd_object, count_prop, sizeof (count), 1,
+           &count);
+       if (err == ENOENT)
+               return (0);
+
+       /*
+        * Any other lookup failure leaves count unset; report it rather
+        * than comparing an uninitialized value against the limit.
+        */
+       if (err != 0)
+               return (err);
+
+       err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
+           B_FALSE);
+       if (err != 0)
+               return (err);
+
+       /* Is there a limit which we've hit? */
+       if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
+               return (SET_ERROR(EDQUOT));
+
+       /* Every initialized level above must pass as well. */
+       if (dd->dd_parent != NULL)
+               err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
+                   ancestor, cr);
+
+       return (err);
+}
+
+/*
+ * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
+ * parents. When a new filesystem/snapshot is created, increment the count on
+ * all parents, and when a filesystem/snapshot is destroyed, decrement the
+ * count.
+ *
+ * dd    - dir whose count is adjusted first; the walk continues via dd_parent.
+ * delta - signed change to apply (positive on create, negative on destroy).
+ * prop  - DD_FIELD_FILESYSTEM_COUNT or DD_FIELD_SNAPSHOT_COUNT (asserted).
+ * tx    - must be a syncing-context transaction (asserted).
+ *
+ * The walk stops silently at the first dir that is not zapified or has no
+ * entry for prop; any other lookup or update failure trips a VERIFY.
+ */
+void
+dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
+    dmu_tx_t *tx)
+{
+       int err;
+       objset_t *os = dd->dd_pool->dp_meta_objset;
+       uint64_t count;
+
+       ASSERT(dsl_pool_config_held(dd->dd_pool));
+       ASSERT(dmu_tx_is_syncing(tx));
+       ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
+           strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
+
+       /*
+        * When we receive an incremental stream into a filesystem that already
+        * exists, a temporary clone is created.  We don't count this temporary
+        * clone, whose name begins with a '%'. We also ignore hidden ($FREE,
+        * $MOS & $ORIGIN) objsets. Note that this exclusion only applies to
+        * the filesystem count, not to the snapshot count.
+        */
+       if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') &&
+           strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0)
+               return;
+
+       /*
+        * e.g. if renaming a dataset with no snapshots, count adjustment is 0
+        */
+       if (delta == 0)
+               return;
+
+       /*
+        * If we hit an uninitialized node while recursing up the tree, we can
+        * stop since we know the counts are not valid on this node and we
+        * know we shouldn't touch this node's counts. An uninitialized count
+        * on the node indicates that either the feature has not yet been
+        * activated or there are no limits on this part of the tree.
+        *
+        * err is only assigned when the dir is zapified; in the other case
+        * we return before it is read.
+        */
+       if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
+           prop, sizeof (count), 1, &count)) == ENOENT)
+               return;
+       VERIFY0(err);
+
+       count += delta;         /* unsigned add of a possibly negative delta */
+       /* Use a signed verify to make sure we're not neg. */
+       VERIFY3S(count, >=, 0);
+
+       VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
+           tx));
+
+       /* Roll up this additional count into our ancestors */
+       if (dd->dd_parent != NULL)
+               dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
+}
+
 uint64_t
 dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
     dmu_tx_t *tx)
@@ -396,7 +892,7 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
        ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
            DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
        if (pds) {
-               VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
+               VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
                    name, sizeof (uint64_t), 1, &ddobj, tx));
        } else {
                /* it's the root dir */
@@ -408,8 +904,12 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
        ddphys = dbuf->db_data;
 
        ddphys->dd_creation_time = gethrestime_sec();
-       if (pds)
+       if (pds) {
                ddphys->dd_parent_obj = pds->dd_object;
+
+               /* update the filesystem counts */
+               dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
+       }
        ddphys->dd_props_zapobj = zap_create(mos,
            DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
        ddphys->dd_child_dir_zapobj = zap_create(mos,
@@ -424,9 +924,9 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
 boolean_t
 dsl_dir_is_clone(dsl_dir_t *dd)
 {
-       return (dd->dd_phys->dd_origin_obj &&
+       return (dsl_dir_phys(dd)->dd_origin_obj &&
            (dd->dd_pool->dp_origin_snap == NULL ||
-           dd->dd_phys->dd_origin_obj !=
+           dsl_dir_phys(dd)->dd_origin_obj !=
            dd->dd_pool->dp_origin_snap->ds_object));
 }
 
@@ -435,35 +935,52 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
 {
        mutex_enter(&dd->dd_lock);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
-           dd->dd_phys->dd_used_bytes);
-       dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota);
+           dsl_dir_phys(dd)->dd_used_bytes);
+       dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
+           dsl_dir_phys(dd)->dd_quota);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
-           dd->dd_phys->dd_reserved);
+           dsl_dir_phys(dd)->dd_reserved);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
-           dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
-           (dd->dd_phys->dd_uncompressed_bytes * 100 /
-           dd->dd_phys->dd_compressed_bytes));
+           dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
+           (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
+           dsl_dir_phys(dd)->dd_compressed_bytes));
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
-           dd->dd_phys->dd_uncompressed_bytes);
-       if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+           dsl_dir_phys(dd)->dd_uncompressed_bytes);
+       if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
                dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
-                   dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]);
+                   dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
                dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
-                   dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]);
+                   dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
                dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
-                   dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]);
+                   dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
                dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
-                   dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] +
-                   dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]);
+                   dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
+                   dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
        }
        mutex_exit(&dd->dd_lock);
 
+       if (dsl_dir_is_zapified(dd)) {
+               uint64_t count;
+               objset_t *os = dd->dd_pool->dp_meta_objset;
+
+               if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+                   sizeof (count), 1, &count) == 0) {
+                       dsl_prop_nvlist_add_uint64(nv,
+                           ZFS_PROP_FILESYSTEM_COUNT, count);
+               }
+               if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+                   sizeof (count), 1, &count) == 0) {
+                       dsl_prop_nvlist_add_uint64(nv,
+                           ZFS_PROP_SNAPSHOT_COUNT, count);
+               }
+       }
+
        if (dsl_dir_is_clone(dd)) {
                dsl_dataset_t *ds;
                char buf[MAXNAMELEN];
 
                VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
-                   dd->dd_phys->dd_origin_obj, FTAG, &ds));
+                   dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));
                dsl_dataset_name(ds, buf);
                dsl_dataset_rele(ds, FTAG);
                dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
@@ -475,7 +992,7 @@ dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
 {
        dsl_pool_t *dp = dd->dd_pool;
 
-       ASSERT(dd->dd_phys);
+       ASSERT(dsl_dir_phys(dd));
 
        if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
                /* up the hold count until we can be written out */
@@ -486,8 +1003,9 @@ dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
 static int64_t
 parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
 {
-       uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
-       uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
+       uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
+       uint64_t new_accounted =
+           MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
        return (new_accounted - old_accounted);
 }
 
@@ -546,9 +1064,9 @@ dsl_dir_space_available(dsl_dir_t *dd,
        }
 
        mutex_enter(&dd->dd_lock);
-       if (dd->dd_phys->dd_quota != 0)
-               quota = dd->dd_phys->dd_quota;
-       used = dd->dd_phys->dd_used_bytes;
+       if (dsl_dir_phys(dd)->dd_quota != 0)
+               quota = dsl_dir_phys(dd)->dd_quota;
+       used = dsl_dir_phys(dd)->dd_used_bytes;
        if (!ondiskonly)
                used += dsl_dir_space_towrite(dd);
 
@@ -557,12 +1075,12 @@ dsl_dir_space_available(dsl_dir_t *dd,
                quota = MIN(quota, poolsize);
        }
 
-       if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
+       if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
                /*
                 * We have some space reserved, in addition to what our
                 * parent gave us.
                 */
-               parentspace += dd->dd_phys->dd_reserved - used;
+               parentspace += dsl_dir_phys(dd)->dd_reserved - used;
        }
 
        if (dd == ancestor) {
@@ -621,7 +1139,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
        est_inflight = dsl_dir_space_towrite(dd);
        for (i = 0; i < TXG_SIZE; i++)
                est_inflight += dd->dd_tempreserved[i];
-       used_on_disk = dd->dd_phys->dd_used_bytes;
+       used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;
 
        /*
         * On the first iteration, fetch the dataset's used-on-disk and
@@ -645,10 +1163,10 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
         * If this transaction will result in a net free of space,
         * we want to let it through.
         */
-       if (ignorequota || netfree || dd->dd_phys->dd_quota == 0)
+       if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0)
                quota = UINT64_MAX;
        else
-               quota = dd->dd_phys->dd_quota;
+               quota = dsl_dir_phys(dd)->dd_quota;
 
        /*
         * Adjust the quota against the actual pool size at the root
@@ -703,7 +1221,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
 
        /* see if it's OK with our parent */
        if (dd->dd_parent && parent_rsrv) {
-               boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
+               boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
 
                return (dsl_dir_tempreserve_impl(dd->dd_parent,
                    parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
@@ -828,7 +1346,7 @@ dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
                        dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
 
                est_used = dsl_dir_space_towrite(dd) +
-                   dd->dd_phys->dd_used_bytes;
+                   dsl_dir_phys(dd)->dd_used_bytes;
                parent_space = parent_delta(dd, est_used, space);
                mutex_exit(&dd->dd_lock);
 
@@ -863,27 +1381,28 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
 
        if (needlock)
                mutex_enter(&dd->dd_lock);
-       accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
-       ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used);
+       accounted_delta =
+           parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used);
+       ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used);
        ASSERT(compressed >= 0 ||
-           dd->dd_phys->dd_compressed_bytes >= -compressed);
+           dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed);
        ASSERT(uncompressed >= 0 ||
-           dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
-       dd->dd_phys->dd_used_bytes += used;
-       dd->dd_phys->dd_uncompressed_bytes += uncompressed;
-       dd->dd_phys->dd_compressed_bytes += compressed;
+           dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed);
+       dsl_dir_phys(dd)->dd_used_bytes += used;
+       dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed;
+       dsl_dir_phys(dd)->dd_compressed_bytes += compressed;
 
-       if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+       if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
                ASSERT(used > 0 ||
-                   dd->dd_phys->dd_used_breakdown[type] >= -used);
-               dd->dd_phys->dd_used_breakdown[type] += used;
+                   dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used);
+               dsl_dir_phys(dd)->dd_used_breakdown[type] += used;
 #ifdef DEBUG
                {
                        dd_used_t t;
                        uint64_t u = 0;
                        for (t = 0; t < DD_USED_NUM; t++)
-                               u += dd->dd_phys->dd_used_breakdown[t];
-                       ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes);
+                               u += dsl_dir_phys(dd)->dd_used_breakdown[t];
+                       ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes);
                }
 #endif
        }
@@ -907,17 +1426,18 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
        ASSERT(oldtype < DD_USED_NUM);
        ASSERT(newtype < DD_USED_NUM);
 
-       if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
+       if (delta == 0 ||
+           !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN))
                return;
 
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
        mutex_enter(&dd->dd_lock);
        ASSERT(delta > 0 ?
-           dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
-           dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
-       ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
-       dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
-       dd->dd_phys->dd_used_breakdown[newtype] += delta;
+           dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta :
+           dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta);
+       ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta));
+       dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta;
+       dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta;
        mutex_exit(&dd->dd_lock);
 }
 
@@ -961,8 +1481,8 @@ dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
         */
        towrite = dsl_dir_space_towrite(ds->ds_dir);
        if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
-           (newval < ds->ds_dir->dd_phys->dd_reserved ||
-           newval < ds->ds_dir->dd_phys->dd_used_bytes + towrite)) {
+           (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
+           newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
                error = SET_ERROR(ENOSPC);
        }
        mutex_exit(&ds->ds_dir->dd_lock);
@@ -995,7 +1515,7 @@ dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
 
        dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
        mutex_enter(&ds->ds_dir->dd_lock);
-       ds->ds_dir->dd_phys->dd_quota = newval;
+       dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
        mutex_exit(&ds->ds_dir->dd_lock);
        dsl_dataset_rele(ds, FTAG);
 }
@@ -1010,7 +1530,7 @@ dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
        ddsqra.ddsqra_value = quota;
 
        return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
-           dsl_dir_set_quota_sync, &ddsqra, 0));
+           dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 int
@@ -1046,7 +1566,7 @@ dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
        }
 
        mutex_enter(&dd->dd_lock);
-       used = dd->dd_phys->dd_used_bytes;
+       used = dsl_dir_phys(dd)->dd_used_bytes;
        mutex_exit(&dd->dd_lock);
 
        if (dd->dd_parent) {
@@ -1056,13 +1576,13 @@ dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
                avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
        }
 
-       if (MAX(used, newval) > MAX(used, dd->dd_phys->dd_reserved)) {
+       if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
                uint64_t delta = MAX(used, newval) -
-                   MAX(used, dd->dd_phys->dd_reserved);
+                   MAX(used, dsl_dir_phys(dd)->dd_reserved);
 
                if (delta > avail ||
-                   (dd->dd_phys->dd_quota > 0 &&
-                   newval > dd->dd_phys->dd_quota))
+                   (dsl_dir_phys(dd)->dd_quota > 0 &&
+                   newval > dsl_dir_phys(dd)->dd_quota))
                        error = SET_ERROR(ENOSPC);
        }
 
@@ -1079,9 +1599,9 @@ dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
        mutex_enter(&dd->dd_lock);
-       used = dd->dd_phys->dd_used_bytes;
-       delta = MAX(used, value) - MAX(used, dd->dd_phys->dd_reserved);
-       dd->dd_phys->dd_reserved = value;
+       used = dsl_dir_phys(dd)->dd_used_bytes;
+       delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
+       dsl_dir_phys(dd)->dd_reserved = value;
 
        if (dd->dd_parent != NULL) {
                /* Roll up this additional usage into our ancestors */
@@ -1131,7 +1651,7 @@ dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
        ddsqra.ddsqra_value = reservation;
 
        return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
-           dsl_dir_set_reservation_sync, &ddsqra, 0));
+           dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 static dsl_dir_t *
@@ -1158,7 +1678,7 @@ would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
                return (delta);
 
        mutex_enter(&dd->dd_lock);
-       delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta);
+       delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
        mutex_exit(&dd->dd_lock);
        return (would_change(dd->dd_parent, delta, ancestor));
 }
@@ -1166,6 +1686,7 @@ would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
 typedef struct dsl_dir_rename_arg {
        const char *ddra_oldname;
        const char *ddra_newname;
+       cred_t *ddra_cred;
 } dsl_dir_rename_arg_t;
 
 /* ARGSUSED */
@@ -1230,10 +1751,57 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
                }
        }
 
+       if (dmu_tx_is_syncing(tx)) {
+               if (spa_feature_is_active(dp->dp_spa,
+                   SPA_FEATURE_FS_SS_LIMIT)) {
+                       /*
+                        * Although this is the check function and we don't
+                        * normally make on-disk changes in check functions,
+                        * we need to do that here.
+                        *
+                        * Ensure this portion of the tree's counts have been
+                        * initialized in case the new parent has limits set.
+                        */
+                       dsl_dir_init_fs_ss_count(dd, tx);
+               }
+       }
+
        if (newparent != dd->dd_parent) {
                /* is there enough space? */
                uint64_t myspace =
-                   MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
+                   MAX(dsl_dir_phys(dd)->dd_used_bytes,
+                   dsl_dir_phys(dd)->dd_reserved);
+               objset_t *os = dd->dd_pool->dp_meta_objset;
+               uint64_t fs_cnt = 0;
+               uint64_t ss_cnt = 0;
+
+               if (dsl_dir_is_zapified(dd)) {
+                       int err;
+
+                       err = zap_lookup(os, dd->dd_object,
+                           DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
+                           &fs_cnt);
+                       if (err != ENOENT && err != 0) {
+                               dsl_dir_rele(newparent, FTAG);
+                               dsl_dir_rele(dd, FTAG);
+                               return (err);
+                       }
+
+                       /*
+                        * have to add 1 for the filesystem itself that we're
+                        * moving
+                        */
+                       fs_cnt++;
+
+                       err = zap_lookup(os, dd->dd_object,
+                           DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
+                           &ss_cnt);
+                       if (err != ENOENT && err != 0) {
+                               dsl_dir_rele(newparent, FTAG);
+                               dsl_dir_rele(dd, FTAG);
+                               return (err);
+                       }
+               }
 
                /* no rename into our descendant */
                if (closest_common_ancestor(dd, newparent) == dd) {
@@ -1243,7 +1811,7 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
                }
 
                error = dsl_dir_transfer_possible(dd->dd_parent,
-                   newparent, myspace);
+                   newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred);
                if (error != 0) {
                        dsl_dir_rele(newparent, FTAG);
                        dsl_dir_rele(dd, FTAG);
@@ -1275,18 +1843,50 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
            "-> %s", ddra->ddra_newname);
 
        if (newparent != dd->dd_parent) {
+               objset_t *os = dd->dd_pool->dp_meta_objset;
+               uint64_t fs_cnt = 0;
+               uint64_t ss_cnt = 0;
+
+               /*
+                * We already made sure the dd counts were initialized in the
+                * check function.
+                */
+               if (spa_feature_is_active(dp->dp_spa,
+                   SPA_FEATURE_FS_SS_LIMIT)) {
+                       VERIFY0(zap_lookup(os, dd->dd_object,
+                           DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
+                           &fs_cnt));
+                       /* add 1 for the filesystem itself that we're moving */
+                       fs_cnt++;
+
+                       VERIFY0(zap_lookup(os, dd->dd_object,
+                           DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
+                           &ss_cnt));
+               }
+
+               dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
+                   DD_FIELD_FILESYSTEM_COUNT, tx);
+               dsl_fs_ss_count_adjust(newparent, fs_cnt,
+                   DD_FIELD_FILESYSTEM_COUNT, tx);
+
+               dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
+                   DD_FIELD_SNAPSHOT_COUNT, tx);
+               dsl_fs_ss_count_adjust(newparent, ss_cnt,
+                   DD_FIELD_SNAPSHOT_COUNT, tx);
+
                dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
-                   -dd->dd_phys->dd_used_bytes,
-                   -dd->dd_phys->dd_compressed_bytes,
-                   -dd->dd_phys->dd_uncompressed_bytes, tx);
+                   -dsl_dir_phys(dd)->dd_used_bytes,
+                   -dsl_dir_phys(dd)->dd_compressed_bytes,
+                   -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
                dsl_dir_diduse_space(newparent, DD_USED_CHILD,
-                   dd->dd_phys->dd_used_bytes,
-                   dd->dd_phys->dd_compressed_bytes,
-                   dd->dd_phys->dd_uncompressed_bytes, tx);
+                   dsl_dir_phys(dd)->dd_used_bytes,
+                   dsl_dir_phys(dd)->dd_compressed_bytes,
+                   dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
 
-               if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
-                       uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
-                           dd->dd_phys->dd_used_bytes;
+               if (dsl_dir_phys(dd)->dd_reserved >
+                   dsl_dir_phys(dd)->dd_used_bytes) {
+                       uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
+                           dsl_dir_phys(dd)->dd_used_bytes;
 
                        dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
                            -unused_rsrv, 0, 0, tx);
@@ -1298,18 +1898,19 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
        /* remove from old parent zapobj */
-       error = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+       error = zap_remove(mos,
+           dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
            dd->dd_myname, tx);
        ASSERT0(error);
 
        (void) strcpy(dd->dd_myname, mynewname);
        dsl_dir_rele(dd->dd_parent, dd);
-       dd->dd_phys->dd_parent_obj = newparent->dd_object;
+       dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
        VERIFY0(dsl_dir_hold_obj(dp,
            newparent->dd_object, NULL, dd, &dd->dd_parent));
 
        /* add to new parent zapobj */
-       VERIFY0(zap_add(mos, newparent->dd_phys->dd_child_dir_zapobj,
+       VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
            dd->dd_myname, 8, 1, &dd->dd_object, tx));
 
 #ifdef _KERNEL
@@ -1329,17 +1930,21 @@ dsl_dir_rename(const char *oldname, const char *newname)
 
        ddra.ddra_oldname = oldname;
        ddra.ddra_newname = newname;
+       ddra.ddra_cred = CRED();
 
        return (dsl_sync_task(oldname,
-           dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3));
+           dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
+           3, ZFS_SPACE_CHECK_RESERVED));
 }
 
 int
-dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
+dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
+    uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr)
 {
        dsl_dir_t *ancestor;
        int64_t adelta;
        uint64_t avail;
+       int err;
 
        ancestor = closest_common_ancestor(sdd, tdd);
        adelta = would_change(sdd, -space, ancestor);
@@ -1347,6 +1952,15 @@ dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
        if (avail < space)
                return (SET_ERROR(ENOSPC));
 
+       err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
+           ancestor, cr);
+       if (err != 0)
+               return (err);
+       err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
+           ancestor, cr);
+       if (err != 0)
+               return (err);
+
        return (0);
 }
 
@@ -1380,6 +1994,15 @@ dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
        dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
 }
 
+boolean_t
+dsl_dir_is_zapified(dsl_dir_t *dd)
+{
+       dmu_object_info_t doi;
+
+       dmu_object_info_from_db(dd->dd_dbuf, &doi);
+       return (doi.doi_type == DMU_OTN_ZAP_METADATA);
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(dsl_dir_set_quota);
 EXPORT_SYMBOL(dsl_dir_set_reservation);
index b54c03bc33fe8b834977c07cf7905a01ebe8f391..ada0eac63eeaaea4ed1de7a74a98821982d47a02 100644 (file)
@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/dsl_pool.h>
@@ -137,7 +138,7 @@ dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
        int err;
 
        err = zap_lookup(dp->dp_meta_objset,
-           dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
+           dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
            name, sizeof (obj), 1, &obj);
        if (err)
                return (err);
@@ -169,8 +170,8 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
        mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 
-       dp->dp_iput_taskq = taskq_create("zfs_iput_taskq", 1, minclsyspri,
-           1, 4, 0);
+       dp->dp_iput_taskq = taskq_create("z_iput", max_ncpus, defclsyspri,
+           max_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 
        return (dp);
 }
@@ -219,11 +220,11 @@ dsl_pool_open(dsl_pool_t *dp)
                err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
                if (err)
                        goto out;
-               err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
-                   FTAG, &ds);
+               err = dsl_dataset_hold_obj(dp,
+                   dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
                if (err == 0) {
                        err = dsl_dataset_hold_obj(dp,
-                           ds->ds_phys->ds_prev_snap_obj, dp,
+                           dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
                            &dp->dp_origin_snap);
                        dsl_dataset_rele(ds, FTAG);
                }
@@ -316,9 +317,18 @@ dsl_pool_close(dsl_pool_t *dp)
        txg_list_destroy(&dp->dp_sync_tasks);
        txg_list_destroy(&dp->dp_dirty_dirs);
 
-       arc_flush(dp->dp_spa);
+       /*
+        * We can't set retry to TRUE since we're explicitly specifying
+        * a spa to flush. This is good enough; any missed buffers for
+        * this spa won't cause trouble, and they'll eventually fall
+        * out of the ARC just like any other unused buffer.
+        */
+       arc_flush(dp->dp_spa, FALSE);
+
        txg_fini(dp);
        dsl_scan_fini(dp);
+       dmu_buf_user_evict_wait();
+
        rrw_destroy(&dp->dp_config_rwlock);
        mutex_destroy(&dp->dp_lock);
        taskq_destroy(dp->dp_iput_taskq);
@@ -369,7 +379,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
                    FREE_DIR_NAME, &dp->dp_free_dir));
 
                /* create and open the free_bplist */
-               obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
+               obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
                VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
                    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
                VERIFY0(bpobj_open(&dp->dp_free_bpobj,
@@ -609,17 +619,12 @@ dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
        uint64_t space, resv;
 
        /*
-        * Reserve about 1.6% (1/64), or at least 32MB, for allocation
-        * efficiency.
-        * XXX The intent log is not accounted for, so it must fit
-        * within this slop.
-        *
         * If we're trying to assess whether it's OK to do a free,
         * cut the reservation in half to allow forward progress
         * (e.g. make it possible to rm(1) files from a full pool).
         */
        space = spa_get_dspace(dp->dp_spa);
-       resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
+       resv = spa_get_slop_space(dp->dp_spa);
        if (netfree)
                resv >>= 1;
 
@@ -683,15 +688,15 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
        if (err)
                return (err);
 
-       while (ds->ds_phys->ds_prev_snap_obj != 0) {
-               err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
-                   FTAG, &prev);
+       while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+               err = dsl_dataset_hold_obj(dp,
+                   dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
                if (err) {
                        dsl_dataset_rele(ds, FTAG);
                        return (err);
                }
 
-               if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
+               if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
                        break;
                dsl_dataset_rele(ds, FTAG);
                ds = prev;
@@ -705,7 +710,7 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
                 * The $ORIGIN can't have any data, or the accounting
                 * will be wrong.
                 */
-               ASSERT0(prev->ds_phys->ds_bp.blk_birth);
+               ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
 
                /* The origin doesn't get attached to itself */
                if (ds->ds_object == prev->ds_object) {
@@ -714,33 +719,35 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
                }
 
                dmu_buf_will_dirty(ds->ds_dbuf, tx);
-               ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
-               ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;
+               dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
+               dsl_dataset_phys(ds)->ds_prev_snap_txg =
+                   dsl_dataset_phys(prev)->ds_creation_txg;
 
                dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
-               ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;
+               dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;
 
                dmu_buf_will_dirty(prev->ds_dbuf, tx);
-               prev->ds_phys->ds_num_children++;
+               dsl_dataset_phys(prev)->ds_num_children++;
 
-               if (ds->ds_phys->ds_next_snap_obj == 0) {
+               if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
                        ASSERT(ds->ds_prev == NULL);
                        VERIFY0(dsl_dataset_hold_obj(dp,
-                           ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
+                           dsl_dataset_phys(ds)->ds_prev_snap_obj,
+                           ds, &ds->ds_prev));
                }
        }
 
-       ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object);
-       ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object);
+       ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
+       ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);
 
-       if (prev->ds_phys->ds_next_clones_obj == 0) {
+       if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
                dmu_buf_will_dirty(prev->ds_dbuf, tx);
-               prev->ds_phys->ds_next_clones_obj =
+               dsl_dataset_phys(prev)->ds_next_clones_obj =
                    zap_create(dp->dp_meta_objset,
                    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
        }
        VERIFY0(zap_add_int(dp->dp_meta_objset,
-           prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));
+           dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));
 
        dsl_dataset_rele(ds, FTAG);
        if (prev != dp->dp_origin_snap)
@@ -755,7 +762,7 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
        ASSERT(dp->dp_origin_snap != NULL);
 
        VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
-           tx, DS_FIND_CHILDREN));
+           tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
 }
 
 /* ARGSUSED */
@@ -765,20 +772,22 @@ upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
        dmu_tx_t *tx = arg;
        objset_t *mos = dp->dp_meta_objset;
 
-       if (ds->ds_dir->dd_phys->dd_origin_obj != 0) {
+       if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
                dsl_dataset_t *origin;
 
                VERIFY0(dsl_dataset_hold_obj(dp,
-                   ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
+                   dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));
 
-               if (origin->ds_dir->dd_phys->dd_clones == 0) {
+               if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
                        dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
-                       origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
-                           DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+                       dsl_dir_phys(origin->ds_dir)->dd_clones =
+                           zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
+                           0, tx);
                }
 
                VERIFY0(zap_add_int(dp->dp_meta_objset,
-                   origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx));
+                   dsl_dir_phys(origin->ds_dir)->dd_clones,
+                   ds->ds_object, tx));
 
                dsl_dataset_rele(origin, FTAG);
        }
@@ -802,13 +811,13 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
         * subobj support.  So call dmu_object_alloc() directly.
         */
        obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
-           SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+           SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
        VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
        VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
 
        VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
-           upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
+           upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
 }
 
 void
@@ -826,7 +835,7 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
            NULL, 0, kcred, tx);
        VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
        dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
-       VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+       VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
            dp, &dp->dp_origin_snap));
        dsl_dataset_rele(ds, FTAG);
 }
@@ -1041,6 +1050,13 @@ dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
        rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
 }
 
+void
+dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
+{
+       ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
+       rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
+}
+
 void
 dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
 {
@@ -1053,6 +1069,12 @@ dsl_pool_config_held(dsl_pool_t *dp)
        return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
 }
 
+boolean_t
+dsl_pool_config_held_writer(dsl_pool_t *dp)
+{
+       return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(dsl_pool_config_enter);
 EXPORT_SYMBOL(dsl_pool_config_exit);
index d712473269262b1ee084c5a986d6b8abbb72b990..28b101eee5475b11563266f3d6342cacda82023a 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 Martin Matuska. All rights reserved.
  */
 
@@ -105,8 +105,8 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
                }
 
                /* Check for a local value. */
-               err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
-                   intsz, numints, buf);
+               err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+                   propname, intsz, numints, buf);
                if (err != ENOENT) {
                        if (setpoint != NULL && err == 0)
                                dsl_dir_name(dd, setpoint);
@@ -117,14 +117,14 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
                 * Skip the check for a received value if there is an explicit
                 * inheritance entry.
                 */
-               err = zap_contains(mos, dd->dd_phys->dd_props_zapobj,
+               err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
                    inheritstr);
                if (err != 0 && err != ENOENT)
                        break;
 
                if (err == ENOENT) {
                        /* Check for a received value. */
-                       err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
+                       err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
                            recvdstr, intsz, numints, buf);
                        if (err != ENOENT) {
                                if (setpoint != NULL && err == 0) {
@@ -163,19 +163,17 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname,
 {
        zfs_prop_t prop = zfs_name_to_prop(propname);
        boolean_t inheritable;
-       boolean_t snapshot;
        uint64_t zapobj;
 
        ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
        inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
-       snapshot = (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds));
-       zapobj = (ds->ds_phys == NULL ? 0 : ds->ds_phys->ds_props_obj);
+       zapobj = dsl_dataset_phys(ds)->ds_props_obj;
 
        if (zapobj != 0) {
                objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
                int err;
 
-               ASSERT(snapshot);
+               ASSERT(ds->ds_is_snapshot);
 
                /* Check for a local value. */
                err = zap_lookup(mos, zapobj, propname, intsz, numints, buf);
@@ -215,7 +213,7 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname,
        }
 
        return (dsl_prop_get_dd(ds->ds_dir, propname,
-           intsz, numints, buf, setpoint, snapshot));
+           intsz, numints, buf, setpoint, ds->ds_is_snapshot));
 }
 
 /*
@@ -327,7 +325,7 @@ dsl_prop_predict(dsl_dir_t *dd, const char *propname,
        }
 
        mos = dd->dd_pool->dp_meta_objset;
-       zapobj = dd->dd_phys->dd_props_zapobj;
+       zapobj = dsl_dir_phys(dd)->dd_props_zapobj;
        recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
 
        version = spa_version(dd->dd_pool->dp_spa);
@@ -443,9 +441,31 @@ dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
            cbr = list_next(&dd->dd_prop_cbs, cbr)) {
                uint64_t value;
 
+               /*
+                * Callback entries do not have holds on their datasets
+                * so that datasets with registered callbacks are still
+                * eligible for eviction.  Unlike operations on callbacks
+                * for a single dataset, we are performing a recursive
+                * descent of related datasets and the calling context
+                * for this iteration only has a dataset hold on the root.
+                * Without a hold, the callback's pointer to the dataset
+                * could be invalidated by eviction at any time.
+                *
+                * Use dsl_dataset_try_add_ref() to verify that the
+                * dataset has not begun eviction processing and to
+                * prevent eviction from occurring for the duration
+                * of the callback.  If the hold attempt fails, this
+                * object is already being evicted and the callback can
+                * be safely ignored.
+                */
+               if (!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
+                       continue;
+
                if (dsl_prop_get_ds(cbr->cbr_ds, cbr->cbr_propname,
                    sizeof (value), 1, &value, NULL) == 0)
                        cbr->cbr_func(cbr->cbr_arg, value);
+
+               dsl_dataset_rele(cbr->cbr_ds, FTAG);
        }
        mutex_exit(&dd->dd_lock);
 
@@ -486,7 +506,8 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
                 * If the prop is set here, then this change is not
                 * being inherited here or below; stop the recursion.
                 */
-               err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, propname);
+               err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+                   propname);
                if (err == 0) {
                        dsl_dir_rele(dd, FTAG);
                        return;
@@ -497,25 +518,34 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
        mutex_enter(&dd->dd_lock);
        for (cbr = list_head(&dd->dd_prop_cbs); cbr;
            cbr = list_next(&dd->dd_prop_cbs, cbr)) {
-               uint64_t propobj = cbr->cbr_ds->ds_phys->ds_props_obj;
+               uint64_t propobj;
 
-               if (strcmp(cbr->cbr_propname, propname) != 0)
+               /*
+                * cbr->cbr_ds may be invalidated due to eviction,
+                * requiring the use of dsl_dataset_try_add_ref().
+                * See comment block in dsl_prop_notify_all_cb()
+                * for details.
+                */
+               if (strcmp(cbr->cbr_propname, propname) != 0 ||
+                   !dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
                        continue;
 
+               propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj;
+
                /*
-                * If the property is set on this ds, then it is not
-                * inherited here; don't call the callback.
+                * If the property is not set on this ds, then it is
+                * inherited here; call the callback.
                 */
-               if (propobj && 0 == zap_contains(mos, propobj, propname))
-                       continue;
+               if (propobj == 0 || zap_contains(mos, propobj, propname) != 0)
+                       cbr->cbr_func(cbr->cbr_arg, value);
 
-               cbr->cbr_func(cbr->cbr_arg, value);
+               dsl_dataset_rele(cbr->cbr_ds, FTAG);
        }
        mutex_exit(&dd->dd_lock);
 
        za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
        for (zap_cursor_init(&zc, mos,
-           dd->dd_phys->dd_child_dir_zapobj);
+           dsl_dir_phys(dd)->dd_child_dir_zapobj);
            zap_cursor_retrieve(&zc, za) == 0;
            zap_cursor_advance(&zc)) {
                dsl_prop_changed_notify(dp, za->za_first_integer,
@@ -544,17 +574,17 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname,
 
        isint = (dodefault(propname, 8, 1, &intval) == 0);
 
-       if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) {
+       if (ds->ds_is_snapshot) {
                ASSERT(version >= SPA_VERSION_SNAP_PROPS);
-               if (ds->ds_phys->ds_props_obj == 0) {
+               if (dsl_dataset_phys(ds)->ds_props_obj == 0) {
                        dmu_buf_will_dirty(ds->ds_dbuf, tx);
-                       ds->ds_phys->ds_props_obj =
+                       dsl_dataset_phys(ds)->ds_props_obj =
                            zap_create(mos,
                            DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
                }
-               zapobj = ds->ds_phys->ds_props_obj;
+               zapobj = dsl_dataset_phys(ds)->ds_props_obj;
        } else {
-               zapobj = ds->ds_dir->dd_phys->dd_props_zapobj;
+               zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj;
        }
 
        if (version < SPA_VERSION_RECVD_PROPS) {
@@ -641,7 +671,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname,
        if (isint) {
                VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval));
 
-               if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) {
+               if (ds->ds_is_snapshot) {
                        dsl_prop_cb_record_t *cbr;
                        /*
                         * It's a snapshot; nothing can inherit this
@@ -759,7 +789,7 @@ dsl_props_set_check(void *arg, dmu_tx_t *tx)
                }
        }
 
-       if (dsl_dataset_is_snapshot(ds) && version < SPA_VERSION_SNAP_PROPS) {
+       if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) {
                dsl_dataset_rele(ds, FTAG);
                return (SET_ERROR(ENOTSUP));
        }
@@ -835,7 +865,7 @@ dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props)
                nblks = 2 * fnvlist_num_pairs(props);
 
        return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync,
-           &dpsa, nblks));
+           &dpsa, nblks, ZFS_SPACE_CHECK_RESERVED));
 }
 
 typedef enum dsl_prop_getflags {
@@ -982,16 +1012,16 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp,
 
        VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
-       if (dsl_dataset_is_snapshot(ds))
+       if (ds->ds_is_snapshot)
                flags |= DSL_PROP_GET_SNAPSHOT;
 
        ASSERT(dsl_pool_config_held(dp));
 
-       if (ds->ds_phys->ds_props_obj != 0) {
+       if (dsl_dataset_phys(ds)->ds_props_obj != 0) {
                ASSERT(flags & DSL_PROP_GET_SNAPSHOT);
                dsl_dataset_name(ds, setpoint);
-               err = dsl_prop_get_all_impl(mos, ds->ds_phys->ds_props_obj,
-                   setpoint, flags, *nvp);
+               err = dsl_prop_get_all_impl(mos,
+                   dsl_dataset_phys(ds)->ds_props_obj, setpoint, flags, *nvp);
                if (err)
                        goto out;
        }
@@ -1004,8 +1034,8 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp,
                        flags |= DSL_PROP_GET_INHERITING;
                }
                dsl_dir_name(dd, setpoint);
-               err = dsl_prop_get_all_impl(mos, dd->dd_phys->dd_props_zapobj,
-                   setpoint, flags, *nvp);
+               err = dsl_prop_get_all_impl(mos,
+                   dsl_dir_phys(dd)->dd_props_zapobj, setpoint, flags, *nvp);
                if (err)
                        break;
        }
index 8b166bcc68eb07fef8fa57aabd1c9a1157ffcbc2..b989e763386b2759420841a4eb637340307e812e 100644 (file)
@@ -388,7 +388,7 @@ int
 dsl_scan_cancel(dsl_pool_t *dp)
 {
        return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
-           dsl_scan_cancel_sync, NULL, 3));
+           dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
 }
 
 static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
@@ -415,8 +415,8 @@ static uint64_t
 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 {
        uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
-       if (dsl_dataset_is_snapshot(ds))
-               return (MIN(smt, ds->ds_phys->ds_creation_txg));
+       if (ds->ds_is_snapshot)
+               return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
        return (smt);
 }
 
@@ -429,11 +429,14 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
            &scn->scn_phys, tx));
 }
 
+extern int zfs_vdev_async_write_active_min_dirty_percent;
+
 static boolean_t
 dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb)
 {
        uint64_t elapsed_nanosecs;
        int mintime;
+       int dirty_pct;
 
        /* we never skip user/group accounting objects */
        if (zb && (int64_t)zb->zb_object < 0)
@@ -449,12 +452,28 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb)
        if (zb && zb->zb_level != 0)
                return (B_FALSE);
 
+       /*
+        * We pause if:
+        *  - we have scanned for the maximum time: an entire txg
+        *    timeout (default 5 sec)
+        *  or
+        *  - we have scanned for at least the minimum time (default 1 sec
+        *    for scrub, 3 sec for resilver), and either we have sufficient
+        *    dirty data that we are starting to write more quickly
+        *    (default 30%), or someone is explicitly waiting for this txg
+        *    to complete.
+        *  or
+        *  - the spa is shutting down because this pool is being exported
+        *    or the machine is rebooting.
+        */
        mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
            zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
        elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
-       if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+       dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
+       if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout ||
            (NSEC2MSEC(elapsed_nanosecs) > mintime &&
-           txg_sync_waiting(scn->scn_dp)) ||
+           (txg_sync_waiting(scn->scn_dp) ||
+           dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) ||
            spa_shutting_down(scn->scn_dp->dp_spa)) {
                if (zb) {
                        dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
@@ -571,7 +590,7 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
     uint64_t objset, uint64_t object, uint64_t blkid)
 {
        zbookmark_phys_t czb;
-       uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
+       arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 
        if (zfs_no_scrub_prefetch)
                return;
@@ -636,7 +655,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
        int err;
 
        if (BP_GET_LEVEL(bp) > 0) {
-               uint32_t flags = ARC_WAIT;
+               arc_flags_t flags = ARC_FLAG_WAIT;
                int i;
                blkptr_t *cbp;
                int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
@@ -663,7 +682,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
                }
                (void) arc_buf_remove_ref(buf, &buf);
        } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
-               uint32_t flags = ARC_WAIT;
+               arc_flags_t flags = ARC_FLAG_WAIT;
                dnode_phys_t *cdnp;
                int i, j;
                int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
@@ -689,7 +708,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 
                (void) arc_buf_remove_ref(buf, &buf);
        } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
-               uint32_t flags = ARC_WAIT;
+               arc_flags_t flags = ARC_FLAG_WAIT;
                objset_phys_t *osp;
                arc_buf_t *buf;
 
@@ -846,14 +865,15 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
                return;
 
        if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
-               if (dsl_dataset_is_snapshot(ds)) {
+               if (ds->ds_is_snapshot) {
                        /* Note, scn_cur_{min,max}_txg stays the same. */
                        scn->scn_phys.scn_bookmark.zb_objset =
-                           ds->ds_phys->ds_next_snap_obj;
+                           dsl_dataset_phys(ds)->ds_next_snap_obj;
                        zfs_dbgmsg("destroying ds %llu; currently traversing; "
                            "reset zb_objset to %llu",
                            (u_longlong_t)ds->ds_object,
-                           (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+                           (u_longlong_t)dsl_dataset_phys(ds)->
+                           ds_next_snap_obj);
                        scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
                } else {
                        SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
@@ -864,10 +884,10 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
                }
        } else if (zap_lookup_int_key(dp->dp_meta_objset,
            scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
-               ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+               ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
                VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
                    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
-               if (dsl_dataset_is_snapshot(ds)) {
+               if (ds->ds_is_snapshot) {
                        /*
                         * We keep the same mintxg; it could be >
                         * ds_creation_txg if the previous snapshot was
@@ -875,11 +895,13 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
                         */
                        VERIFY(zap_add_int_key(dp->dp_meta_objset,
                            scn->scn_phys.scn_queue_obj,
-                           ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
+                           dsl_dataset_phys(ds)->ds_next_snap_obj,
+                           mintxg, tx) == 0);
                        zfs_dbgmsg("destroying ds %llu; in queue; "
                            "replacing with %llu",
                            (u_longlong_t)ds->ds_object,
-                           (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+                           (u_longlong_t)dsl_dataset_phys(ds)->
+                           ds_next_snap_obj);
                } else {
                        zfs_dbgmsg("destroying ds %llu; in queue; removing",
                            (u_longlong_t)ds->ds_object);
@@ -906,26 +928,26 @@ dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
        if (scn->scn_phys.scn_state != DSS_SCANNING)
                return;
 
-       ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
+       ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
 
        if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
                scn->scn_phys.scn_bookmark.zb_objset =
-                   ds->ds_phys->ds_prev_snap_obj;
+                   dsl_dataset_phys(ds)->ds_prev_snap_obj;
                zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
                    "reset zb_objset to %llu",
                    (u_longlong_t)ds->ds_object,
-                   (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+                   (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
        } else if (zap_lookup_int_key(dp->dp_meta_objset,
            scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
                VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
                    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
                VERIFY(zap_add_int_key(dp->dp_meta_objset,
                    scn->scn_phys.scn_queue_obj,
-                   ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
+                   dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
                zfs_dbgmsg("snapshotting ds %llu; in queue; "
                    "replacing with %llu",
                    (u_longlong_t)ds->ds_object,
-                   (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+                   (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
        }
        dsl_scan_sync_state(scn, tx);
 }
@@ -958,8 +980,8 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
            ds1->ds_object, &mintxg) == 0) {
                int err;
 
-               ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
-               ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+               ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+               ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
                VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
                    scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
                err = zap_add_int_key(dp->dp_meta_objset,
@@ -977,8 +999,8 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
                    (u_longlong_t)ds2->ds_object);
        } else if (zap_lookup_int_key(dp->dp_meta_objset,
            scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
-               ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
-               ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+               ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+               ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
                VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
                    scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
                VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
@@ -1006,17 +1028,17 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
        int err;
        dsl_scan_t *scn = dp->dp_scan;
 
-       if (hds->ds_dir->dd_phys->dd_origin_obj != eca->originobj)
+       if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj)
                return (0);
 
        err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
        if (err)
                return (err);
 
-       while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
+       while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) {
                dsl_dataset_t *prev;
                err = dsl_dataset_hold_obj(dp,
-                   ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+                   dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 
                dsl_dataset_rele(ds, FTAG);
                if (err)
@@ -1025,7 +1047,7 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
        }
        VERIFY(zap_add_int_key(dp->dp_meta_objset,
            scn->scn_phys.scn_queue_obj, ds->ds_object,
-           ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
+           dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0);
        dsl_dataset_rele(ds, FTAG);
        return (0);
 }
@@ -1050,14 +1072,14 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
         * ZIL here, rather than in scan_recurse(), because the regular
         * snapshot block-sharing rules don't apply to it.
         */
-       if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds))
+       if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot)
                dsl_scan_zil(dp, &os->os_zil_header);
 
        /*
         * Iterate over the bps in this ds.
         */
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
-       dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
+       dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
 
        dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
        dsl_dataset_name(ds, dsname);
@@ -1091,14 +1113,15 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
        /*
         * Add descendent datasets to work queue.
         */
-       if (ds->ds_phys->ds_next_snap_obj != 0) {
+       if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
                VERIFY(zap_add_int_key(dp->dp_meta_objset,
-                   scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
-                   ds->ds_phys->ds_creation_txg, tx) == 0);
+                   scn->scn_phys.scn_queue_obj,
+                   dsl_dataset_phys(ds)->ds_next_snap_obj,
+                   dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0);
        }
-       if (ds->ds_phys->ds_num_children > 1) {
+       if (dsl_dataset_phys(ds)->ds_num_children > 1) {
                boolean_t usenext = B_FALSE;
-               if (ds->ds_phys->ds_next_clones_obj != 0) {
+               if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
                        uint64_t count;
                        /*
                         * A bug in a previous version of the code could
@@ -1108,17 +1131,17 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
                         * next_clones_obj when its count is correct.
                         */
                        int err = zap_count(dp->dp_meta_objset,
-                           ds->ds_phys->ds_next_clones_obj, &count);
+                           dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
                        if (err == 0 &&
-                           count == ds->ds_phys->ds_num_children - 1)
+                           count == dsl_dataset_phys(ds)->ds_num_children - 1)
                                usenext = B_TRUE;
                }
 
                if (usenext) {
                        VERIFY0(zap_join_key(dp->dp_meta_objset,
-                           ds->ds_phys->ds_next_clones_obj,
+                           dsl_dataset_phys(ds)->ds_next_clones_obj,
                            scn->scn_phys.scn_queue_obj,
-                           ds->ds_phys->ds_creation_txg, tx));
+                           dsl_dataset_phys(ds)->ds_creation_txg, tx));
                } else {
                        struct enqueue_clones_arg eca;
                        eca.tx = tx;
@@ -1146,10 +1169,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
        if (err)
                return (err);
 
-       while (ds->ds_phys->ds_prev_snap_obj != 0) {
+       while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
                dsl_dataset_t *prev;
-               err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
-                   FTAG, &prev);
+               err = dsl_dataset_hold_obj(dp,
+                   dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
                if (err) {
                        dsl_dataset_rele(ds, FTAG);
                        return (err);
@@ -1158,7 +1181,7 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
                /*
                 * If this is a clone, we don't need to worry about it for now.
                 */
-               if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+               if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
                        dsl_dataset_rele(ds, FTAG);
                        dsl_dataset_rele(prev, FTAG);
                        return (0);
@@ -1168,7 +1191,7 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
        }
 
        VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
-           ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
+           ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0);
        dsl_dataset_rele(ds, FTAG);
        return (0);
 }
@@ -1348,7 +1371,7 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
                } else {
                        scn->scn_phys.scn_cur_min_txg =
                            MAX(scn->scn_phys.scn_min_txg,
-                           ds->ds_phys->ds_prev_snap_txg);
+                           dsl_dataset_phys(ds)->ds_prev_snap_txg);
                }
                scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
                dsl_dataset_rele(ds, FTAG);
@@ -1505,11 +1528,15 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                            dp->dp_bptree_obj, tx));
                        dp->dp_bptree_obj = 0;
                        scn->scn_async_destroying = B_FALSE;
+                       scn->scn_async_stalled = B_FALSE;
                } else {
                        /*
-                        * If we didn't make progress, mark the async destroy as
-                        * stalled, so that we will not initiate a spa_sync() on
-                        * its behalf.
+                        * If we didn't make progress, mark the async
+                        * destroy as stalled, so that we will not initiate
+                        * a spa_sync() on its behalf.  Note that we only
+                        * check this if we are not finished, because if the
+                        * bptree had no blocks for us to visit, we can
+                        * finish without "making progress".
                         */
                        scn->scn_async_stalled =
                            (scn->scn_visited_this_txg == 0);
@@ -1534,9 +1561,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
        if (err != 0)
                return;
        if (!scn->scn_async_destroying && zfs_free_leak_on_eio &&
-           (dp->dp_free_dir->dd_phys->dd_used_bytes != 0 ||
-           dp->dp_free_dir->dd_phys->dd_compressed_bytes != 0 ||
-           dp->dp_free_dir->dd_phys->dd_uncompressed_bytes != 0)) {
+           (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
+           dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
+           dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
                /*
                 * We have finished background destroying, but there is still
                 * some space left in the dp_free_dir. Transfer this leaked
@@ -1551,19 +1578,19 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                        rrw_exit(&dp->dp_config_rwlock, FTAG);
                }
                dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
-                   dp->dp_free_dir->dd_phys->dd_used_bytes,
-                   dp->dp_free_dir->dd_phys->dd_compressed_bytes,
-                   dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
+                   dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+                   dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+                   dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
                dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
-                   -dp->dp_free_dir->dd_phys->dd_used_bytes,
-                   -dp->dp_free_dir->dd_phys->dd_compressed_bytes,
-                   -dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
+                   -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+                   -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+                   -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
        }
        if (!scn->scn_async_destroying) {
                /* finished; verify that space accounting went to zero */
-               ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes);
-               ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes);
-               ASSERT0(dp->dp_free_dir->dd_phys->dd_uncompressed_bytes);
+               ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
+               ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
+               ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
        }
 
        if (scn->scn_phys.scn_state != DSS_SCANNING)
@@ -1844,7 +1871,7 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
        (void) spa_vdev_state_exit(spa, NULL, 0);
 
        return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
-           dsl_scan_setup_sync, &func, 0));
+           dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
index 5f345f498a8445dafd52fe82409510f4ecafeaaf..28130d25711a8dfca04059963a247a107d02d139 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -64,7 +64,8 @@ dsl_null_checkfunc(void *arg, dmu_tx_t *tx)
  */
 int
 dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
-    dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified)
+    dsl_syncfunc_t *syncfunc, void *arg,
+    int blocks_modified, zfs_space_check_t space_check)
 {
        spa_t *spa;
        dmu_tx_t *tx;
@@ -84,6 +85,7 @@ top:
        dst.dst_pool = dp;
        dst.dst_txg = dmu_tx_get_txg(tx);
        dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT;
+       dst.dst_space_check = space_check;
        dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc;
        dst.dst_syncfunc = syncfunc;
        dst.dst_arg = arg;
@@ -117,13 +119,14 @@ top:
 
 void
 dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
-    int blocks_modified, dmu_tx_t *tx)
+    int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
 {
        dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP);
 
        dst->dst_pool = dp;
        dst->dst_txg = dmu_tx_get_txg(tx);
        dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT;
+       dst->dst_space_check = space_check;
        dst->dst_checkfunc = dsl_null_checkfunc;
        dst->dst_syncfunc = syncfunc;
        dst->dst_arg = arg;
@@ -140,25 +143,34 @@ void
 dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx)
 {
        dsl_pool_t *dp = dst->dst_pool;
-       uint64_t quota, used;
 
        ASSERT0(dst->dst_error);
 
        /*
-        * Check for sufficient space.  We just check against what's
-        * on-disk; we don't want any in-flight accounting to get in our
-        * way, because open context may have already used up various
-        * in-core limits (arc_tempreserve, dsl_pool_tempreserve).
+        * Check for sufficient space.
+        *
+        * When the sync task was created, the caller specified the
+        * type of space checking required.  See the comment in
+        * zfs_space_check_t for details on the semantics of each
+        * type of space checking.
+        *
+        * We just check against what's on-disk; we don't want any
+        * in-flight accounting to get in our way, because open context
+        * may have already used up various in-core limits
+        * (arc_tempreserve, dsl_pool_tempreserve).
         */
-       quota = dsl_pool_adjustedsize(dp, B_FALSE) -
-           metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
-       used = dp->dp_root_dir->dd_phys->dd_used_bytes;
-       /* MOS space is triple-dittoed, so we multiply by 3. */
-       if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) {
-               dst->dst_error = SET_ERROR(ENOSPC);
-               if (dst->dst_nowaiter)
-                       kmem_free(dst, sizeof (*dst));
-               return;
+       if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) {
+               uint64_t quota = dsl_pool_adjustedsize(dp,
+                   dst->dst_space_check == ZFS_SPACE_CHECK_RESERVED) -
+                   metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+               uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
+               /* MOS space is triple-dittoed, so we multiply by 3. */
+               if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) {
+                       dst->dst_error = SET_ERROR(ENOSPC);
+                       if (dst->dst_nowaiter)
+                               kmem_free(dst, sizeof (*dst));
+                       return;
+               }
        }
 
        /*
index 1d6c9df8979b0f3b6320201192f3c076bba36efd..1b234ed480f92bd4f2bf765acb6e06444a833cd8 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  */
 
@@ -64,10 +64,10 @@ dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag,
                return (SET_ERROR(E2BIG));
 
        /* tags must be unique (if ds already exists) */
-       if (ds != NULL && ds->ds_phys->ds_userrefs_obj != 0) {
+       if (ds != NULL && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
                uint64_t value;
 
-               error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj,
+               error = zap_lookup(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
                    htag, 8, 1, &value);
                if (error == 0)
                        error = SET_ERROR(EEXIST);
@@ -141,16 +141,16 @@ dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds,
 
        ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 
-       if (ds->ds_phys->ds_userrefs_obj == 0) {
+       if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) {
                /*
                 * This is the first user hold for this dataset.  Create
                 * the userrefs zap object.
                 */
                dmu_buf_will_dirty(ds->ds_dbuf, tx);
-               zapobj = ds->ds_phys->ds_userrefs_obj =
+               zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj =
                    zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
        } else {
-               zapobj = ds->ds_phys->ds_userrefs_obj;
+               zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
        }
        ds->ds_userrefs++;
 
@@ -319,7 +319,8 @@ dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist)
        dduha.dduha_minor = cleanup_minor;
 
        ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check,
-           dsl_dataset_user_hold_sync, &dduha, fnvlist_num_pairs(holds));
+           dsl_dataset_user_hold_sync, &dduha,
+           fnvlist_num_pairs(holds), ZFS_SPACE_CHECK_RESERVED);
        fnvlist_free(dduha.dduha_chkholds);
 
        return (ret);
@@ -354,7 +355,7 @@ dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura,
        objset_t *mos;
        int numholds;
 
-       if (!dsl_dataset_is_snapshot(ds))
+       if (!ds->ds_is_snapshot)
                return (SET_ERROR(EINVAL));
 
        if (nvlist_empty(holds))
@@ -362,7 +363,7 @@ dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura,
 
        numholds = 0;
        mos = ds->ds_dir->dd_pool->dp_meta_objset;
-       zapobj = ds->ds_phys->ds_userrefs_obj;
+       zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
        VERIFY0(nvlist_alloc(&holds_found, NV_UNIQUE_NAME, KM_SLEEP));
 
        for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
@@ -400,7 +401,8 @@ dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura,
                numholds++;
        }
 
-       if (DS_IS_DEFER_DESTROY(ds) && ds->ds_phys->ds_num_children == 1 &&
+       if (DS_IS_DEFER_DESTROY(ds) &&
+           dsl_dataset_phys(ds)->ds_num_children == 1 &&
            ds->ds_userrefs == numholds) {
                /* we need to destroy the snapshot as well */
                if (dsl_dataset_long_held(ds)) {
@@ -488,8 +490,8 @@ dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds,
                error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx);
                VERIFY(error == 0 || error == ENOENT);
 
-               VERIFY0(zap_remove(mos, ds->ds_phys->ds_userrefs_obj, holdname,
-                   tx));
+               VERIFY0(zap_remove(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+                   holdname, tx));
                ds->ds_userrefs--;
 
                spa_history_log_internal_ds(ds, "release", tx,
@@ -519,7 +521,7 @@ dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx)
                    fnvpair_value_nvlist(pair), tx);
                if (nvlist_exists(ddura->ddura_todelete, name)) {
                        ASSERT(ds->ds_userrefs == 0 &&
-                           ds->ds_phys->ds_num_children == 1 &&
+                           dsl_dataset_phys(ds)->ds_num_children == 1 &&
                            DS_IS_DEFER_DESTROY(ds));
                        dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx);
                }
@@ -608,7 +610,7 @@ dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist,
            KM_SLEEP));
 
        error = dsl_sync_task(pool, dsl_dataset_user_release_check,
-           dsl_dataset_user_release_sync, &ddura, 0);
+           dsl_dataset_user_release_sync, &ddura, 0, ZFS_SPACE_CHECK_NONE);
        fnvlist_free(ddura.ddura_todelete);
        fnvlist_free(ddura.ddura_chkholds);
 
@@ -651,13 +653,13 @@ dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl)
                return (err);
        }
 
-       if (ds->ds_phys->ds_userrefs_obj != 0) {
+       if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
                zap_attribute_t *za;
                zap_cursor_t zc;
 
                za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
                for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
-                   ds->ds_phys->ds_userrefs_obj);
+                   dsl_dataset_phys(ds)->ds_userrefs_obj);
                    zap_cursor_retrieve(&zc, za) == 0;
                    zap_cursor_advance(&zc)) {
                        fnvlist_add_uint64(nvl, za->za_name,
index 56787137b3a2bb6d8ff56a79544e30141c522f30..999bd8adc5187efff9a2747129296dffe3e2aaf6 100644 (file)
@@ -593,8 +593,9 @@ zfs_zevent_fd_hold(int fd, minor_t *minorp, zfs_zevent_t **ze)
        if (fp == NULL)
                return (EBADF);
 
-       *minorp = zfsdev_getminor(fp->f_file);
-       error = zfs_zevent_minor_to_state(*minorp, ze);
+       error = zfsdev_getminor(fp->f_file, minorp);
+       if (error == 0)
+               error = zfs_zevent_minor_to_state(*minorp, ze);
 
        if (error)
                zfs_zevent_fd_rele(fd);
@@ -676,7 +677,7 @@ zfs_zevent_wait(zfs_zevent_t *ze)
        }
 
        zevent_waiters++;
-       cv_wait_interruptible(&zevent_cv, &zevent_lock);
+       cv_wait_sig(&zevent_cv, &zevent_lock);
        if (issig(JUSTLOOKING))
                error = EINTR;
 
index 311465ebc5834e5cf041c6a74f182540dce0e1cf..59bcefd346c0e41a0cb68cfad9f8879416fa7dae 100644 (file)
 #define        METASLAB_ACTIVE_MASK            \
        (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
 
-uint64_t metaslab_aliquot = 512ULL << 10;
+/*
+ * Metaslab granularity, in bytes. This is roughly similar to what would be
+ * referred to as the "stripe size" in traditional RAID arrays. In normal
+ * operation, we will try to write this amount of data to a top-level vdev
+ * before moving on to the next one.
+ */
+unsigned long metaslab_aliquot = 512 << 10;
+
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;    /* force gang blocks */
 
 /*
@@ -137,12 +144,6 @@ uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
  */
 int metaslab_df_free_pct = 4;
 
-/*
- * A metaslab is considered "free" if it contains a contiguous
- * segment which is greater than metaslab_min_alloc_size.
- */
-uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
-
 /*
  * Percentage of all cpus that can be used by the metaslab taskq.
  */
@@ -491,7 +492,7 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
        mg->mg_activation_count = 0;
 
        mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
-           minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
+           maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
 
        return (mg);
 }
@@ -562,7 +563,7 @@ metaslab_group_passivate(metaslab_group_t *mg)
                return;
        }
 
-       taskq_wait(mg->mg_taskq);
+       taskq_wait_outstanding(mg->mg_taskq, 0);
        metaslab_group_alloc_update(mg);
 
        mgprev = mg->mg_prev;
@@ -1517,7 +1518,7 @@ metaslab_weight(metaslab_t *msp)
         * In effect, this means that we'll select the metaslab with the most
         * free bandwidth rather than simply the one with the most free space.
         */
-       if (metaslab_lba_weighting_enabled) {
+       if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
                weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
                ASSERT(weight >= space && weight <= 2 * space);
        }
@@ -1578,6 +1579,7 @@ metaslab_preload(void *arg)
 {
        metaslab_t *msp = arg;
        spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+       fstrans_cookie_t cookie = spl_fstrans_mark();
 
        ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
 
@@ -1591,6 +1593,7 @@ metaslab_preload(void *arg)
         */
        msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
        mutex_exit(&msp->ms_lock);
+       spl_fstrans_unmark(cookie);
 }
 
 static void
@@ -1602,7 +1605,7 @@ metaslab_group_preload(metaslab_group_t *mg)
        int m = 0;
 
        if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
-               taskq_wait(mg->mg_taskq);
+               taskq_wait_outstanding(mg->mg_taskq, 0);
                return;
        }
 
@@ -2341,28 +2344,42 @@ top:
                         * figure out whether the corresponding vdev is
                         * over- or under-used relative to the pool,
                         * and set an allocation bias to even it out.
+                        *
+                        * Bias is also used to compensate for unequally
+                        * sized vdevs so that space is allocated fairly.
                         */
                        if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
                                vdev_stat_t *vs = &vd->vdev_stat;
-                               int64_t vu, cu;
-
-                               vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
-                               cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
+                               int64_t vs_free = vs->vs_space - vs->vs_alloc;
+                               int64_t mc_free = mc->mc_space - mc->mc_alloc;
+                               int64_t ratio;
 
                                /*
                                 * Calculate how much more or less we should
                                 * try to allocate from this device during
                                 * this iteration around the rotor.
-                                * For example, if a device is 80% full
-                                * and the pool is 20% full then we should
-                                * reduce allocations by 60% on this device.
                                 *
-                                * mg_bias = (20 - 80) * 512K / 100 = -307K
+                                * This basically introduces a zero-centered
+                                * bias towards the devices with the most
+                                * free space, while compensating for vdev
+                                * size differences.
+                                *
+                                * Examples:
+                                *  vdev V1 = 16M/128M
+                                *  vdev V2 = 16M/128M
+                                *  ratio(V1) = 100% ratio(V2) = 100%
+                                *
+                                *  vdev V1 = 16M/128M
+                                *  vdev V2 = 64M/128M
+                                *  ratio(V1) = 127% ratio(V2) =  72%
                                 *
-                                * This reduces allocations by 307K for this
-                                * iteration.
+                                *  vdev V1 = 16M/128M
+                                *  vdev V2 = 64M/512M
+                                *  ratio(V1) =  40% ratio(V2) = 160%
                                 */
-                               mg->mg_bias = ((cu - vu) *
+                               ratio = (vs_free * mc->mc_alloc_groups * 100) /
+                                   (mc_free + 1);
+                               mg->mg_bias = ((ratio - 100) *
                                    (int64_t)mg->mg_aliquot) / 100;
                        } else if (!metaslab_bias_enabled) {
                                mg->mg_bias = 0;
@@ -2692,6 +2709,7 @@ metaslab_check_free(spa_t *spa, const blkptr_t *bp)
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
+module_param(metaslab_aliquot, ulong, 0644);
 module_param(metaslab_debug_load, int, 0644);
 module_param(metaslab_debug_unload, int, 0644);
 module_param(metaslab_preload_enabled, int, 0644);
@@ -2702,6 +2720,8 @@ module_param(metaslab_fragmentation_factor_enabled, int, 0644);
 module_param(metaslab_lba_weighting_enabled, int, 0644);
 module_param(metaslab_bias_enabled, int, 0644);
 
+MODULE_PARM_DESC(metaslab_aliquot,
+       "allocation granularity (a.k.a. stripe size)");
 MODULE_PARM_DESC(metaslab_debug_load,
        "load all metaslabs when pool is first opened");
 MODULE_PARM_DESC(metaslab_debug_unload,
diff --git a/zfs/module/zfs/multilist.c b/zfs/module/zfs/multilist.c
new file mode 100644 (file)
index 0000000..e4446de
--- /dev/null
@@ -0,0 +1,375 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/multilist.h>
+#include <sys/trace_multilist.h>
+
+/* needed for spa_get_random() */
+#include <sys/spa.h>
+
+/*
+ * Given the object contained on the list, return a pointer to the
+ * object's multilist_node_t structure it contains.
+ */
+#ifdef DEBUG
+static multilist_node_t *
+multilist_d2l(multilist_t *ml, void *obj)
+{
+       return ((multilist_node_t *)((char *)obj + ml->ml_offset));
+}
+#endif
+
+/*
+ * Initialize a new multilist using the parameters specified.
+ *
+ *  - 'size' denotes the size of the structure containing the
+ *     multilist_node_t.
+ *  - 'offset' denotes the byte offset of the multilist_node_t within
+ *     the structure that contains it.
+ *  - 'num' specifies the number of internal sublists to create.
+ *  - 'index_func' is used to determine which sublist to insert into
+ *     when the multilist_insert() function is called; as well as which
+ *     sublist to remove from when multilist_remove() is called. The
+ *     requirements this function must meet, are the following:
+ *
+ *      - It must always return the same value when called on the same
+ *        object (to ensure the object is removed from the list it was
+ *        inserted into).
+ *
+ *      - It must return a value in the range [0, number of sublists).
+ *        The multilist_get_num_sublists() function may be used to
+ *        determine the number of sublists in the multilist.
+ *
+ *     Also, in order to reduce internal contention between the sublists
+ *     during insertion and removal, this function should choose evenly
+ *     between all available sublists when inserting. This isn't a hard
+ *     requirement, but a general rule of thumb in order to garner the
+ *     best multi-threaded performance out of the data structure.
+ */
+void
+multilist_create(multilist_t *ml, size_t size, size_t offset, unsigned int num,
+    multilist_sublist_index_func_t *index_func)
+{
+       int i;
+
+       ASSERT3P(ml, !=, NULL);
+       ASSERT3U(size, >, 0);
+       ASSERT3U(size, >=, offset + sizeof (multilist_node_t));
+       ASSERT3U(num, >, 0);
+       ASSERT3P(index_func, !=, NULL);
+
+       ml->ml_offset = offset;
+       ml->ml_num_sublists = num;
+       ml->ml_index_func = index_func;
+
+       ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
+           ml->ml_num_sublists, KM_SLEEP);
+
+       ASSERT3P(ml->ml_sublists, !=, NULL);
+
+       for (i = 0; i < ml->ml_num_sublists; i++) {
+               multilist_sublist_t *mls = &ml->ml_sublists[i];
+               mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL);
+               list_create(&mls->mls_list, size, offset);
+       }
+}
+
+/*
+ * Destroy the given multilist object, and free up any memory it holds.
+ */
+void
+multilist_destroy(multilist_t *ml)
+{
+       int i;
+
+       ASSERT(multilist_is_empty(ml));
+
+       for (i = 0; i < ml->ml_num_sublists; i++) {
+               multilist_sublist_t *mls = &ml->ml_sublists[i];
+
+               ASSERT(list_is_empty(&mls->mls_list));
+
+               list_destroy(&mls->mls_list);
+               mutex_destroy(&mls->mls_lock);
+       }
+
+       ASSERT3P(ml->ml_sublists, !=, NULL);
+       kmem_free(ml->ml_sublists,
+           sizeof (multilist_sublist_t) * ml->ml_num_sublists);
+
+       ml->ml_num_sublists = 0;
+       ml->ml_offset = 0;
+}
+
+/*
+ * Insert the given object into the multilist.
+ *
+ * This function will insert the object specified into the sublist
+ * determined using the function given at multilist creation time.
+ *
+ * The sublist locks are automatically acquired if not already held, to
+ * ensure consistency when inserting and removing from multiple threads.
+ */
+void
+multilist_insert(multilist_t *ml, void *obj)
+{
+       unsigned int sublist_idx = ml->ml_index_func(ml, obj);
+       multilist_sublist_t *mls;
+       boolean_t need_lock;
+
+       DTRACE_PROBE3(multilist__insert, multilist_t *, ml,
+           unsigned int, sublist_idx, void *, obj);
+
+       ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+
+       mls = &ml->ml_sublists[sublist_idx];
+
+       /*
+        * Note: Callers may already hold the sublist lock by calling
+        * multilist_sublist_lock().  Here we rely on MUTEX_HELD()
+        * returning TRUE if and only if the current thread holds the
+        * lock.  While it's a little ugly to make the lock recursive in
+        * this way, it works and allows the calling code to be much
+        * simpler -- otherwise it would have to pass around a flag
+        * indicating that it already has the lock.
+        */
+       need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+       if (need_lock)
+               mutex_enter(&mls->mls_lock);
+
+       ASSERT(!multilist_link_active(multilist_d2l(ml, obj)));
+
+       multilist_sublist_insert_head(mls, obj);
+
+       if (need_lock)
+               mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * Remove the given object from the multilist.
+ *
+ * This function will remove the object specified from the sublist
+ * determined using the function given at multilist creation time.
+ *
+ * The necessary sublist locks are automatically acquired, to ensure
+ * consistency when inserting and removing from multiple threads.
+ */
+void
+multilist_remove(multilist_t *ml, void *obj)
+{
+       unsigned int sublist_idx = ml->ml_index_func(ml, obj);
+       multilist_sublist_t *mls;
+       boolean_t need_lock;
+
+       DTRACE_PROBE3(multilist__remove, multilist_t *, ml,
+           unsigned int, sublist_idx, void *, obj);
+
+       ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+
+       mls = &ml->ml_sublists[sublist_idx];
+       /* See comment in multilist_insert(). */
+       need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+       if (need_lock)
+               mutex_enter(&mls->mls_lock);
+
+       ASSERT(multilist_link_active(multilist_d2l(ml, obj)));
+
+       multilist_sublist_remove(mls, obj);
+
+       if (need_lock)
+               mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * Check to see if this multilist object is empty.
+ *
+ * This will return TRUE if it finds all of the sublists of this
+ * multilist to be empty, and FALSE otherwise. Each sublist lock will be
+ * automatically acquired as necessary.
+ *
+ * If concurrent insertions and removals are occurring, the semantics
+ * of this function become a little fuzzy. Instead of locking all
+ * sublists for the entire call time of the function, each sublist is
+ * only locked as it is individually checked for emptiness. Thus, it's
+ * possible for this function to return TRUE with non-empty sublists at
+ * the time the function returns. This would be due to another thread
+ * inserting into a given sublist, after that specific sublist was checked
+ * and deemed empty, but before all sublists have been checked.
+ */
+int
+multilist_is_empty(multilist_t *ml)
+{
+       int i;
+
+       for (i = 0; i < ml->ml_num_sublists; i++) {
+               multilist_sublist_t *mls = &ml->ml_sublists[i];
+               /* See comment in multilist_insert(). */
+               boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+               if (need_lock)
+                       mutex_enter(&mls->mls_lock);
+
+               if (!list_is_empty(&mls->mls_list)) {
+                       if (need_lock)
+                               mutex_exit(&mls->mls_lock);
+
+                       return (FALSE);
+               }
+
+               if (need_lock)
+                       mutex_exit(&mls->mls_lock);
+       }
+
+       return (TRUE);
+}
+
+/* Return the number of sublists composing this multilist */
+unsigned int
+multilist_get_num_sublists(multilist_t *ml)
+{
+       return (ml->ml_num_sublists);
+}
+
+/* Return a randomly selected, valid sublist index for this multilist */
+unsigned int
+multilist_get_random_index(multilist_t *ml)
+{
+       return (spa_get_random(ml->ml_num_sublists));
+}
+
+/* Lock and return the sublist specified at the given index */
+multilist_sublist_t *
+multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
+{
+       multilist_sublist_t *mls;
+
+       ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+       mls = &ml->ml_sublists[sublist_idx];
+       mutex_enter(&mls->mls_lock);
+
+       return (mls);
+}
+
+void
+multilist_sublist_unlock(multilist_sublist_t *mls)
+{
+       mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * We're allowing any object to be inserted into this specific sublist,
+ * but this can lead to trouble if multilist_remove() is called to
+ * remove this object. Specifically, if calling ml_index_func on this
+ * object returns an index for sublist different than what is passed as
+ * a parameter here, any call to multilist_remove() with this newly
+ * inserted object is undefined! (the call to multilist_remove() will
+ * remove the object from a list that it isn't contained in)
+ */
+void
+multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj)
+{
+       ASSERT(MUTEX_HELD(&mls->mls_lock));
+       list_insert_head(&mls->mls_list, obj);
+}
+
+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
+{
+       ASSERT(MUTEX_HELD(&mls->mls_lock));
+       list_insert_tail(&mls->mls_list, obj);
+}
+
+/*
+ * Move the object one element forward in the list.
+ *
+ * This function will move the given object forward in the list (towards
+ * the head) by one object. So, in essence, it will swap its position in
+ * the list with its "prev" pointer. If the given object is already at the
+ * head of the list, it cannot be moved forward any more than it already
+ * is, so no action is taken.
+ *
+ * NOTE: This function **must not** remove any object from the list other
+ *       than the object given as the parameter. This is relied upon in
+ *       arc_evict_state_impl().
+ */
+void
+multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj)
+{
+       void *prev = list_prev(&mls->mls_list, obj);
+
+       ASSERT(MUTEX_HELD(&mls->mls_lock));
+       ASSERT(!list_is_empty(&mls->mls_list));
+
+       /* 'obj' must be at the head of the list, nothing to do */
+       if (prev == NULL)
+               return;
+
+       list_remove(&mls->mls_list, obj);
+       list_insert_before(&mls->mls_list, prev, obj);
+}
+
+void
+multilist_sublist_remove(multilist_sublist_t *mls, void *obj)
+{
+       ASSERT(MUTEX_HELD(&mls->mls_lock));
+       list_remove(&mls->mls_list, obj);
+}
+
+void *
+multilist_sublist_head(multilist_sublist_t *mls)
+{
+       ASSERT(MUTEX_HELD(&mls->mls_lock));
+       return (list_head(&mls->mls_list));
+}
+
+void *
+multilist_sublist_tail(multilist_sublist_t *mls)
+{
+       ASSERT(MUTEX_HELD(&mls->mls_lock));
+       return (list_tail(&mls->mls_list));
+}
+
+void *
+multilist_sublist_next(multilist_sublist_t *mls, void *obj)
+{
+       ASSERT(MUTEX_HELD(&mls->mls_lock));
+       return (list_next(&mls->mls_list, obj));
+}
+
+void *
+multilist_sublist_prev(multilist_sublist_t *mls, void *obj)
+{
+       ASSERT(MUTEX_HELD(&mls->mls_lock));
+       return (list_prev(&mls->mls_list, obj));
+}
+
+void
+multilist_link_init(multilist_node_t *link)
+{
+       list_link_init(link);
+}
+
+int
+multilist_link_active(multilist_node_t *link)
+{
+       return (list_link_active(link));
+}
index 22175e06ab307f5177fa934ed5e3e586cb525ac1..6422fd1c1fa6ecf2a9283fefeabca772f6b0a76a 100644 (file)
@@ -33,7 +33,7 @@
 #include <sys/zio.h>
 #include <sys/range_tree.h>
 
-static kmem_cache_t *range_seg_cache;
+kmem_cache_t *range_seg_cache;
 
 void
 range_tree_init(void)
index 8e80166c7d14daae2f9e16528cd6f2f948022133..51394c01c4310cfccb68ba9f931299b83f37ef6e 100644 (file)
@@ -159,8 +159,8 @@ rrw_destroy(rrwlock_t *rrl)
        refcount_destroy(&rrl->rr_linked_rcount);
 }
 
-void
-rrw_enter_read(rrwlock_t *rrl, void *tag)
+static void
+rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag)
 {
        mutex_enter(&rrl->rr_lock);
 #if !defined(DEBUG) && defined(_KERNEL)
@@ -176,7 +176,7 @@ rrw_enter_read(rrwlock_t *rrl, void *tag)
        ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
 
        while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted &&
-           refcount_is_zero(&rrl->rr_anon_rcount) &&
+           refcount_is_zero(&rrl->rr_anon_rcount) && !prio &&
            rrn_find(rrl) == NULL))
                cv_wait(&rrl->rr_cv, &rrl->rr_lock);
 
@@ -191,6 +191,25 @@ rrw_enter_read(rrwlock_t *rrl, void *tag)
        mutex_exit(&rrl->rr_lock);
 }
 
+void
+rrw_enter_read(rrwlock_t *rrl, void *tag)
+{
+       rrw_enter_read_impl(rrl, B_FALSE, tag);
+}
+
+/*
+ * Take a read lock even if there are pending write lock requests. If we want
+ * to take a lock reentrantly, but from different threads (that have a
+ * relationship to each other), the normal detection mechanism to overrule
+ * the pending writer does not work, so we have to give an explicit hint here.
+ */
+void
+rrw_enter_read_prio(rrwlock_t *rrl, void *tag)
+{
+       rrw_enter_read_impl(rrl, B_TRUE, tag);
+}
+
+
 void
 rrw_enter_write(rrwlock_t *rrl)
 {
@@ -286,3 +305,91 @@ rrw_tsd_destroy(void *arg)
                    (void *)curthread, (void *)rn->rn_rrl);
        }
 }
+
+/*
+ * A reader-mostly lock implementation, tuning above reader-writer locks
+ * for highly parallel read acquisitions, while pessimizing writes.
+ *
+ * The idea is to split single busy lock into array of locks, so that
+ * each reader can lock only one of them for read, depending on result
+ * of simple hash function.  That proportionally reduces lock congestion.
+ * The writer, in turn, has to sequentially acquire write on all the locks.
+ * That makes write acquisition proportionally slower, but in places where
+ * it is used (filesystem unmount) performance is not critical.
+ *
+ * All the functions below are direct wrappers around functions above.
+ */
+void
+rrm_init(rrmlock_t *rrl, boolean_t track_all)
+{
+       int i;
+
+       for (i = 0; i < RRM_NUM_LOCKS; i++)
+               rrw_init(&rrl->locks[i], track_all);
+}
+
+void
+rrm_destroy(rrmlock_t *rrl)
+{
+       int i;
+
+       for (i = 0; i < RRM_NUM_LOCKS; i++)
+               rrw_destroy(&rrl->locks[i]);
+}
+
+void
+rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
+{
+       if (rw == RW_READER)
+               rrm_enter_read(rrl, tag);
+       else
+               rrm_enter_write(rrl);
+}
+
+/*
+ * This maps the current thread to a specific lock.  Note that the lock
+ * must be released by the same thread that acquired it.  We do this
+ * mapping by taking the thread pointer mod a prime number.  We examine
+ * only the low 32 bits of the thread pointer, because 32-bit division
+ * is faster than 64-bit division, and the high 32 bits have little
+ * entropy anyway.
+ */
+#define        RRM_TD_LOCK()   (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
+
+void
+rrm_enter_read(rrmlock_t *rrl, void *tag)
+{
+       rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
+}
+
+void
+rrm_enter_write(rrmlock_t *rrl)
+{
+       int i;
+
+       for (i = 0; i < RRM_NUM_LOCKS; i++)
+               rrw_enter_write(&rrl->locks[i]);
+}
+
+void
+rrm_exit(rrmlock_t *rrl, void *tag)
+{
+       int i;
+
+       if (rrl->locks[0].rr_writer == curthread) {
+               for (i = 0; i < RRM_NUM_LOCKS; i++)
+                       rrw_exit(&rrl->locks[i], tag);
+       } else {
+               rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
+       }
+}
+
+boolean_t
+rrm_held(rrmlock_t *rrl, krw_t rw)
+{
+       if (rw == RW_WRITER) {
+               return (rrw_held(&rrl->locks[0], rw));
+       } else {
+               return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
+       }
+}
index 9063d1dae44955ee05e1564fe8a4610a067c73d0..2383252e2447b1dc22cd65b7f3dcf480ed5a31b8 100644 (file)
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -209,12 +210,6 @@ sa_cache_constructor(void *buf, void *unused, int kmflag)
 {
        sa_handle_t *hdl = buf;
 
-       hdl->sa_bonus_tab = NULL;
-       hdl->sa_spill_tab = NULL;
-       hdl->sa_os = NULL;
-       hdl->sa_userp = NULL;
-       hdl->sa_bonus = NULL;
-       hdl->sa_spill = NULL;
        mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
        return (0);
 }
@@ -501,7 +496,7 @@ sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
 
        if (size == 0) {
                blocksize = SPA_MINBLOCKSIZE;
-       } else if (size > SPA_MAXBLOCKSIZE) {
+       } else if (size > SPA_OLD_MAXBLOCKSIZE) {
                ASSERT(0);
                return (SET_ERROR(EFBIG));
        } else {
@@ -690,7 +685,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
        hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
            SA_BONUS, &spill_idx, &used, &spilling);
 
-       if (used > SPA_MAXBLOCKSIZE)
+       if (used > SPA_OLD_MAXBLOCKSIZE)
                return (SET_ERROR(EFBIG));
 
        VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
@@ -714,7 +709,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
                    attr_count - spill_idx, hdl->sa_spill, SA_SPILL, &i,
                    &spill_used, &dummy);
 
-               if (spill_used > SPA_MAXBLOCKSIZE)
+               if (spill_used > SPA_OLD_MAXBLOCKSIZE)
                        return (SET_ERROR(EFBIG));
 
                if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
@@ -1112,6 +1107,9 @@ fail:
        if (sa->sa_user_table)
                kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
        mutex_exit(&sa->sa_lock);
+       avl_destroy(&sa->sa_layout_hash_tree);
+       avl_destroy(&sa->sa_layout_num_tree);
+       mutex_destroy(&sa->sa_lock);
        kmem_free(sa, sizeof (sa_os_t));
        return ((error == ECKSUM) ? EIO : error);
 }
@@ -1148,6 +1146,7 @@ sa_tear_down(objset_t *os)
 
        avl_destroy(&sa->sa_layout_hash_tree);
        avl_destroy(&sa->sa_layout_num_tree);
+       mutex_destroy(&sa->sa_lock);
 
        kmem_free(sa, sizeof (sa_os_t));
        os->os_sa = NULL;
@@ -1302,10 +1301,10 @@ sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
 }
 
 /*ARGSUSED*/
-void
-sa_evict(dmu_buf_t *db, void *sap)
+static void
+sa_evict(void *dbu)
 {
-       panic("evicting sa dbuf %p\n", (void *)db);
+       panic("evicting sa dbuf\n");
 }
 
 static void
@@ -1357,18 +1356,16 @@ sa_spill_rele(sa_handle_t *hdl)
 void
 sa_handle_destroy(sa_handle_t *hdl)
 {
+       dmu_buf_t *db = hdl->sa_bonus;
+
        mutex_enter(&hdl->sa_lock);
-       (void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl,
-           NULL, NULL, NULL);
+       (void) dmu_buf_remove_user(db, &hdl->sa_dbu);
 
-       if (hdl->sa_bonus_tab) {
+       if (hdl->sa_bonus_tab)
                sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
-               hdl->sa_bonus_tab = NULL;
-       }
-       if (hdl->sa_spill_tab) {
+
+       if (hdl->sa_spill_tab)
                sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
-               hdl->sa_spill_tab = NULL;
-       }
 
        dmu_buf_rele(hdl->sa_bonus, NULL);
 
@@ -1384,7 +1381,7 @@ sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp,
     sa_handle_type_t hdl_type, sa_handle_t **handlepp)
 {
        int error = 0;
-       sa_handle_t *handle;
+       sa_handle_t *handle = NULL;
 #ifdef ZFS_DEBUG
        dmu_object_info_t doi;
 
@@ -1395,23 +1392,31 @@ sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp,
        /* find handle, if it exists */
        /* if one doesn't exist then create a new one, and initialize it */
 
-       handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL;
+       if (hdl_type == SA_HDL_SHARED)
+               handle = dmu_buf_get_user(db);
+
        if (handle == NULL) {
-               sa_handle_t *newhandle;
+               sa_handle_t *winner = NULL;
+
                handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
+               handle->sa_dbu.dbu_evict_func = NULL;
                handle->sa_userp = userp;
                handle->sa_bonus = db;
                handle->sa_os = os;
                handle->sa_spill = NULL;
+               handle->sa_bonus_tab = NULL;
+               handle->sa_spill_tab = NULL;
 
                error = sa_build_index(handle, SA_BONUS);
-               newhandle = (hdl_type == SA_HDL_SHARED) ?
-                   dmu_buf_set_user_ie(db, handle,
-                   NULL, sa_evict) : NULL;
 
-               if (newhandle != NULL) {
+               if (hdl_type == SA_HDL_SHARED) {
+                       dmu_buf_init_user(&handle->sa_dbu, sa_evict, NULL);
+                       winner = dmu_buf_set_user_ie(db, &handle->sa_dbu);
+               }
+
+               if (winner != NULL) {
                        kmem_cache_free(sa_cache, handle);
-                       handle = newhandle;
+                       handle = winner;
                }
        }
        *handlepp = handle;
@@ -1943,14 +1948,6 @@ sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks)
            blksize, nblocks);
 }
 
-void
-sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl)
-{
-       (void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus,
-           oldhdl, newhdl, NULL, sa_evict);
-       oldhdl->sa_bonus = NULL;
-}
-
 void
 sa_set_userp(sa_handle_t *hdl, void *ptr)
 {
@@ -2049,7 +2046,6 @@ EXPORT_SYMBOL(sa_size);
 EXPORT_SYMBOL(sa_update_from_cb);
 EXPORT_SYMBOL(sa_object_info);
 EXPORT_SYMBOL(sa_object_size);
-EXPORT_SYMBOL(sa_update_user);
 EXPORT_SYMBOL(sa_get_userdata);
 EXPORT_SYMBOL(sa_set_userp);
 EXPORT_SYMBOL(sa_get_db);
index 998ec3e543f9e0931be5adf4048c88459b079fa7..2e23a341fb13119ecd98902715543e53c038834f 100644 (file)
@@ -23,6 +23,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2013, 2014, Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 /*
@@ -126,9 +127,9 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
        /* ISSUE        ISSUE_HIGH      INTR            INTR_HIGH */
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* NULL */
-       { ZTI_N(8),     ZTI_NULL,       ZTI_BATCH,      ZTI_NULL }, /* READ */
-       { ZTI_BATCH,    ZTI_N(5),       ZTI_N(16),      ZTI_N(5) }, /* WRITE */
-       { ZTI_P(4, 8),  ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* FREE */
+       { ZTI_N(8),     ZTI_NULL,       ZTI_P(12, 8),   ZTI_NULL }, /* READ */
+       { ZTI_BATCH,    ZTI_N(5),       ZTI_P(12, 8),   ZTI_N(5) }, /* WRITE */
+       { ZTI_P(12, 8), ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* FREE */
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* CLAIM */
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* IOCTL */
 };
@@ -237,7 +238,8 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
                 */
                if (pool->dp_free_dir != NULL) {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
-                           pool->dp_free_dir->dd_phys->dd_used_bytes, src);
+                           dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
+                           src);
                } else {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
                            NULL, 0, src);
@@ -245,7 +247,8 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 
                if (pool->dp_leak_dir != NULL) {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
-                           pool->dp_leak_dir->dd_phys->dd_used_bytes, src);
+                           dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
+                           src);
                } else {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
                            NULL, 0, src);
@@ -263,6 +266,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
                spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
                    0, ZPROP_SRC_LOCAL);
 
+       if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
+               spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+                   MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
+       } else {
+               spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+                   SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
+       }
+
        if ((dp = list_head(&spa->spa_config_list)) != NULL) {
                if (dp->scd_path == NULL) {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
@@ -479,7 +490,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
 
                        if (!error) {
                                objset_t *os;
-                               uint64_t compress;
+                               uint64_t propval;
 
                                if (strval == NULL || strval[0] == '\0') {
                                        objnum = zpool_prop_default_numeric(
@@ -491,15 +502,25 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
                                if (error)
                                        break;
 
-                               /* Must be ZPL and not gzip compressed. */
+                               /*
+                                * Must be ZPL, and its property settings
+                                * must be supported by GRUB (compression
+                                * is not gzip, and large blocks are not used).
+                                */
 
                                if (dmu_objset_type(os) != DMU_OST_ZFS) {
                                        error = SET_ERROR(ENOTSUP);
                                } else if ((error =
                                    dsl_prop_get_int_ds(dmu_objset_ds(os),
                                    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
-                                   &compress)) == 0 &&
-                                   !BOOTFS_COMPRESS_VALID(compress)) {
+                                   &propval)) == 0 &&
+                                   !BOOTFS_COMPRESS_VALID(propval)) {
+                                       error = SET_ERROR(ENOTSUP);
+                               } else if ((error =
+                                   dsl_prop_get_int_ds(dmu_objset_ds(os),
+                                   zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+                                   &propval)) == 0 &&
+                                   propval > SPA_OLD_MAXBLOCKSIZE) {
                                        error = SET_ERROR(ENOTSUP);
                                } else {
                                        objnum = dmu_objset_id(os);
@@ -663,7 +684,8 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp)
                         * feature descriptions object.
                         */
                        error = dsl_sync_task(spa->spa_name, NULL,
-                           spa_sync_version, &ver, 6);
+                           spa_sync_version, &ver,
+                           6, ZFS_SPACE_CHECK_RESERVED);
                        if (error)
                                return (error);
                        continue;
@@ -675,7 +697,7 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp)
 
        if (need_sync) {
                return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
-                   nvp, 6));
+                   nvp, 6, ZFS_SPACE_CHECK_RESERVED));
        }
 
        return (0);
@@ -756,7 +778,7 @@ spa_change_guid(spa_t *spa)
        guid = spa_generate_guid(NULL);
 
        error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
-           spa_change_guid_sync, &guid, 5);
+           spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
 
        if (error == 0) {
                spa_config_sync(spa, B_FALSE, B_TRUE);
@@ -822,7 +844,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
        uint_t count = ztip->zti_count;
        spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
        char name[32];
-       uint_t i, flags = 0;
+       uint_t i, flags = TASKQ_DYNAMIC;
        boolean_t batch = B_FALSE;
 
        if (mode == ZTI_MODE_NULL) {
@@ -876,11 +898,13 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
                        pri_t pri = maxclsyspri;
                        /*
                         * The write issue taskq can be extremely CPU
-                        * intensive.  Run it at slightly lower priority
-                        * than the other taskqs.
+                        * intensive.  Run it at slightly less important
+                        * priority than the other taskqs.  Under Linux this
+                        * means incrementing the priority value; on platforms
+                        * like illumos it should be decremented.
                         */
                        if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
-                               pri--;
+                               pri++;
 
                        tq = taskq_create_proc(name, value, pri, 50,
                            INT_MAX, spa->spa_proc, flags);
@@ -1093,6 +1117,8 @@ spa_activate(spa_t *spa, int mode)
 
        list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_config_dirty_node));
+       list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
+           offsetof(objset_t, os_evicting_node));
        list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_state_dirty_node));
 
@@ -1121,9 +1147,12 @@ spa_deactivate(spa_t *spa)
        ASSERT(spa->spa_async_zio_root == NULL);
        ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
 
+       spa_evicting_os_wait(spa);
+
        txg_list_destroy(&spa->spa_vdev_txg_list);
 
        list_destroy(&spa->spa_config_dirty_list);
+       list_destroy(&spa->spa_evicting_os_list);
        list_destroy(&spa->spa_state_dirty_list);
 
        taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
@@ -1746,6 +1775,7 @@ static boolean_t
 spa_check_logs(spa_t *spa)
 {
        boolean_t rv = B_FALSE;
+       dsl_pool_t *dp = spa_get_dsl(spa);
 
        switch (spa->spa_log_state) {
        default:
@@ -1753,8 +1783,8 @@ spa_check_logs(spa_t *spa)
        case SPA_LOG_MISSING:
                /* need to recheck in case slog has been restored */
        case SPA_LOG_UNKNOWN:
-               rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
-                   NULL, DS_FIND_CHILDREN) != 0);
+               rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+                   zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
                if (rv)
                        spa_set_log_state(spa, SPA_LOG_MISSING);
                break;
@@ -2134,6 +2164,11 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
                    mosconfig, &ereport);
        }
 
+       /*
+        * Don't count references from objsets that are already closed
+        * and are making their way through the eviction process.
+        */
+       spa_evicting_os_wait(spa);
        spa->spa_minref = refcount_count(&spa->spa_refcount);
        if (error) {
                if (error != EEXIST) {
@@ -2212,6 +2247,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                return (error);
 
        ASSERT(spa->spa_root_vdev == rvd);
+       ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
+       ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
 
        if (type != SPA_IMPORT_ASSEMBLE) {
                ASSERT(spa_guid(spa) == pool_guid);
@@ -2700,7 +2737,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
                        return (SET_ERROR(ENXIO));
 
-               if (spa_check_logs(spa)) {
+               if (spa_writeable(spa) && spa_check_logs(spa)) {
                        *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
                        return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
                }
@@ -2731,6 +2768,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
            spa->spa_load_max_txg == UINT64_MAX)) {
                dmu_tx_t *tx;
                int need_update = B_FALSE;
+               dsl_pool_t *dp = spa_get_dsl(spa);
                int c;
 
                ASSERT(state != SPA_LOAD_TRYIMPORT);
@@ -2744,9 +2782,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                 */
                spa->spa_claiming = B_TRUE;
 
-               tx = dmu_tx_create_assigned(spa_get_dsl(spa),
-                   spa_first_txg(spa));
-               (void) dmu_objset_find(spa_name(spa),
+               tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
+               (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
                    zil_claim, tx, DS_FIND_CHILDREN);
                dmu_tx_commit(tx);
 
@@ -3773,6 +3810,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 
        spa_history_log_version(spa, "create");
 
+       /*
+        * Don't count references from objsets that are already closed
+        * and are making their way through the eviction process.
+        */
+       spa_evicting_os_wait(spa);
        spa->spa_minref = refcount_count(&spa->spa_refcount);
 
        mutex_exit(&spa_namespace_lock);
@@ -4312,8 +4354,10 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
         * modify its state.  Objsets may be open only because they're dirty,
         * so we have to force it to sync before checking spa_refcnt.
         */
-       if (spa->spa_sync_on)
+       if (spa->spa_sync_on) {
                txg_wait_synced(spa->spa_dsl_pool, 0);
+               spa_evicting_os_wait(spa);
+       }
 
        /*
         * A pool cannot be exported or destroyed if there are active
@@ -6351,21 +6395,6 @@ spa_sync(spa_t *spa, uint64_t txg)
                }
        }
 
-       /*
-        * If anything has changed in this txg, or if someone is waiting
-        * for this txg to sync (eg, spa_vdev_remove()), push the
-        * deferred frees from the previous txg.  If not, leave them
-        * alone so that we don't generate work on an otherwise idle
-        * system.
-        */
-       if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
-           !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
-           !txg_list_empty(&dp->dp_sync_tasks, txg) ||
-           ((dsl_scan_active(dp->dp_scan) ||
-           txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
-               spa_sync_deferred_frees(spa, tx);
-       }
-
        /*
         * Iterate to convergence.
         */
@@ -6383,6 +6412,11 @@ spa_sync(spa_t *spa, uint64_t txg)
                if (pass < zfs_sync_pass_deferred_free) {
                        spa_sync_frees(spa, free_bpl, tx);
                } else {
+                       /*
+                        * We can not defer frees in pass 1, because
+                        * we sync the deferred frees later in pass 1.
+                        */
+                       ASSERT3U(pass, >, 1);
                        bplist_iterate(free_bpl, bpobj_enqueue_cb,
                            &spa->spa_deferred_bpobj, tx);
                }
@@ -6393,8 +6427,37 @@ spa_sync(spa_t *spa, uint64_t txg)
                while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)))
                        vdev_sync(vd, txg);
 
-               if (pass == 1)
+               if (pass == 1) {
                        spa_sync_upgrades(spa, tx);
+                       ASSERT3U(txg, >=,
+                           spa->spa_uberblock.ub_rootbp.blk_birth);
+                       /*
+                        * Note: We need to check if the MOS is dirty
+                        * because we could have marked the MOS dirty
+                        * without updating the uberblock (e.g. if we
+                        * have sync tasks but no dirty user data).  We
+                        * need to check the uberblock's rootbp because
+                        * it is updated if we have synced out dirty
+                        * data (though in this case the MOS will most
+                        * likely also be dirty due to second order
+                        * effects, we don't want to rely on that here).
+                        */
+                       if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+                           !dmu_objset_is_dirty(mos, txg)) {
+                               /*
+                                * Nothing changed on the first pass,
+                                * therefore this TXG is a no-op.  Avoid
+                                * syncing deferred frees, so that we
+                                * can keep this TXG as a no-op.
+                                */
+                               ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
+                                   txg));
+                               ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+                               ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
+                               break;
+                       }
+                       spa_sync_deferred_frees(spa, tx);
+               }
 
        } while (dmu_objset_is_dirty(mos, txg));
 
index e846ec9adc434f5a61f6f88135930ebbff3881a4..929f18165934e22cd01a8745d2640fe42bfaee97 100644 (file)
@@ -152,6 +152,7 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
        char *buf;
        vnode_t *vp;
        int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
+       int error;
        char *temp;
 
        /*
@@ -173,6 +174,26 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
        VERIFY(nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_XDR,
            KM_SLEEP) == 0);
 
+#ifdef __linux__
+       /*
+        * Write the configuration to disk.  Due to the complexity involved
+        * in performing a rename from within the kernel the file is truncated
+        * and overwritten in place.  In the event of an error the file is
+        * unlinked to make sure we always have a consistent view of the data.
+        */
+       error = vn_open(dp->scd_path, UIO_SYSSPACE, oflags, 0644, &vp, 0, 0);
+       if (error == 0) {
+               error = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0,
+                   UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, NULL);
+               if (error == 0)
+                       error = VOP_FSYNC(vp, FSYNC, kcred, NULL);
+
+               (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL);
+
+               if (error)
+                       (void) vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE);
+       }
+#else
        /*
         * Write the configuration to disk.  We need to do the traditional
         * 'write to temporary file, sync, move over original' to make sure we
@@ -190,6 +211,7 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
        }
 
        (void) vn_remove(temp, UIO_SYSSPACE, RMFILE);
+#endif
 
        vmem_free(buf, buflen);
        kmem_free(temp, MAXPATHLEN);
index 14e681e77d8bbda0074e7c2c9d728f893d374c89..01aa4641e63fe50483fc949973e5ccbf3d2d785f 100644 (file)
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/spa.h>
@@ -89,7 +89,7 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
 
        ASSERT(spa->spa_history == 0);
        spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
-           SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
+           SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
            sizeof (spa_history_phys_t), tx);
 
        VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
@@ -323,7 +323,7 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
 
        /* Kick this off asynchronously; errors are ignored. */
        dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync,
-           nvarg, 0, tx);
+           nvarg, 0, ZFS_SPACE_CHECK_NONE, tx);
        dmu_tx_commit(tx);
 
        /* spa_history_log_sync will free nvl */
@@ -458,7 +458,7 @@ log_internal(nvlist_t *nvl, const char *operation, spa_t *spa,
                spa_history_log_sync(nvl, tx);
        } else {
                dsl_sync_task_nowait(spa_get_dsl(spa),
-                   spa_history_log_sync, nvl, 0, tx);
+                   spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx);
        }
        /* spa_history_log_sync() will free nvl */
 }
@@ -520,7 +520,7 @@ spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
        dsl_dir_name(dd, namebuf);
        fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
        fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID,
-           dd->dd_phys->dd_head_dataset_obj);
+           dsl_dir_phys(dd)->dd_head_dataset_obj);
 
        va_start(adx, fmt);
        log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx);
index 7a96ea18bfc580b8be8336649385f546a33d8ea7..409dce12121276ce97e08d0a3477112e4957c1a5 100644 (file)
@@ -20,8 +20,9 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 static avl_tree_t spa_namespace_avl;
 kmutex_t spa_namespace_lock;
 static kcondvar_t spa_namespace_cv;
-static int spa_active_count;
 int spa_max_replication_override = SPA_DVAS_PER_BP;
 
 static kmutex_t spa_spare_lock;
@@ -311,6 +311,32 @@ int zfs_deadman_enabled = 1;
  */
 int spa_asize_inflation = 24;
 
+/*
+ * Normally, we don't allow the last 3.125% (1/(2^spa_slop_shift)) of space in
+ * the pool to be consumed.  This ensures that we don't run the pool
+ * completely out of space, due to unaccounted changes (e.g. to the MOS).
+ * It also limits the worst-case time to allocate space.  If we have
+ * less than this amount of free space, most ZPL operations (e.g. write,
+ * create) will return ENOSPC.
+ *
+ * Certain operations (e.g. file removal, most administrative actions) can
+ * use half the slop space.  They will only return ENOSPC if less than half
+ * the slop space is free.  Typically, once the pool has less than the slop
+ * space free, the user will use these operations to free up space in the pool.
+ * These are the operations that call dsl_pool_adjustedsize() with the netfree
+ * argument set to TRUE.
+ *
+ * A very restricted set of operations are always permitted, regardless of
+ * the amount of free space.  These are the operations that call
+ * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy".  If these
+ * operations result in a net increase in the amount of space used,
+ * it is possible to run the pool completely out of space, causing it to
+ * be permanently read-only.
+ *
+ * See also the comments in zfs_space_check_t.
+ */
+int spa_slop_shift = 5;
+
 /*
  * ==========================================================================
  * SPA config locking
@@ -525,6 +551,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
        mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -534,6 +561,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
        mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL);
 
        cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
+       cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
@@ -560,10 +588,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
        /*
         * Set the alternate root, if there is one.
         */
-       if (altroot) {
+       if (altroot)
                spa->spa_root = spa_strdup(altroot);
-               spa_active_count++;
-       }
 
        /*
         * Every pool starts with the default cachefile
@@ -597,6 +623,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 
        spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);
 
+       spa->spa_min_ashift = INT_MAX;
+       spa->spa_max_ashift = 0;
+
        /*
         * As a pool is being created, treat all features as disabled by
         * setting SPA_FEATURE_DISABLED for all entries in the feature
@@ -622,16 +651,15 @@ spa_remove(spa_t *spa)
 
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
        ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+       ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0);
 
        nvlist_free(spa->spa_config_splitting);
 
        avl_remove(&spa_namespace_avl, spa);
        cv_broadcast(&spa_namespace_cv);
 
-       if (spa->spa_root) {
+       if (spa->spa_root)
                spa_strfree(spa->spa_root);
-               spa_active_count--;
-       }
 
        while ((dp = list_head(&spa->spa_config_list)) != NULL) {
                list_remove(&spa->spa_config_list, dp);
@@ -656,6 +684,7 @@ spa_remove(spa_t *spa)
                bplist_destroy(&spa->spa_free_bplist[t]);
 
        cv_destroy(&spa->spa_async_cv);
+       cv_destroy(&spa->spa_evicting_os_cv);
        cv_destroy(&spa->spa_proc_cv);
        cv_destroy(&spa->spa_scrub_io_cv);
        cv_destroy(&spa->spa_suspend_cv);
@@ -663,6 +692,7 @@ spa_remove(spa_t *spa)
        mutex_destroy(&spa->spa_async_lock);
        mutex_destroy(&spa->spa_errlist_lock);
        mutex_destroy(&spa->spa_errlog_lock);
+       mutex_destroy(&spa->spa_evicting_os_lock);
        mutex_destroy(&spa->spa_history_lock);
        mutex_destroy(&spa->spa_proc_lock);
        mutex_destroy(&spa->spa_props_lock);
@@ -719,6 +749,20 @@ spa_close(spa_t *spa, void *tag)
        (void) refcount_remove(&spa->spa_refcount, tag);
 }
 
+/*
+ * Remove a reference to the given spa_t held by a dsl dir that is
+ * being asynchronously released.  Async releases occur from a taskq
+ * performing eviction of dsl datasets and dirs.  The namespace lock
+ * isn't held and the hold by the object being evicted may contribute to
+ * spa_minref (e.g. dataset or directory released during pool export),
+ * so the asserts in spa_close() do not apply.
+ */
+void
+spa_async_close(spa_t *spa, void *tag)
+{
+       (void) refcount_remove(&spa->spa_refcount, tag);
+}
+
 /*
  * Check to see if the spa refcount is zero.  Must be called with
  * spa_namespace_lock held.  We really compare against spa_minref, which is the
@@ -1549,6 +1593,18 @@ spa_get_asize(spa_t *spa, uint64_t lsize)
        return (lsize * spa_asize_inflation);
 }
 
+/*
+ * Return the amount of slop space in bytes.  It is 1/(2^spa_slop_shift) of
+ * the pool (1/32, or 3.125%, by default), and never less than 32MB.
+ *
+ * See the comment above spa_slop_shift for details.
+ */
+uint64_t
+spa_get_slop_space(spa_t *spa) {
+       uint64_t space = spa_get_dspace(spa);
+       return (MAX(space >> spa_slop_shift, SPA_MINDEVSIZE >> 1));
+}
+
 uint64_t
 spa_get_dspace(spa_t *spa)
 {
@@ -1602,6 +1658,34 @@ spa_log_class(spa_t *spa)
        return (spa->spa_log_class);
 }
 
+void
+spa_evicting_os_register(spa_t *spa, objset_t *os)
+{
+       mutex_enter(&spa->spa_evicting_os_lock);
+       list_insert_head(&spa->spa_evicting_os_list, os);
+       mutex_exit(&spa->spa_evicting_os_lock);
+}
+
+void
+spa_evicting_os_deregister(spa_t *spa, objset_t *os)
+{
+       mutex_enter(&spa->spa_evicting_os_lock);
+       list_remove(&spa->spa_evicting_os_list, os);
+       cv_broadcast(&spa->spa_evicting_os_cv);
+       mutex_exit(&spa->spa_evicting_os_lock);
+}
+
+void
+spa_evicting_os_wait(spa_t *spa)
+{
+       mutex_enter(&spa->spa_evicting_os_lock);
+       while (!list_is_empty(&spa->spa_evicting_os_list))
+               cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
+       mutex_exit(&spa->spa_evicting_os_lock);
+
+       dmu_buf_user_evict_wait();
+}
+
 int
 spa_max_replication(spa_t *spa)
 {
@@ -1745,7 +1829,6 @@ spa_init(int mode)
        dmu_init();
        zil_init();
        vdev_cache_stat_init();
-       vdev_file_init();
        zfs_prop_init();
        zpool_prop_init();
        zpool_feature_init();
@@ -1760,7 +1843,6 @@ spa_fini(void)
 
        spa_evict_all();
 
-       vdev_file_fini();
        vdev_cache_stat_fini();
        zil_fini();
        dmu_fini();
@@ -1904,6 +1986,15 @@ spa_debug_enabled(spa_t *spa)
        return (spa->spa_debug);
 }
 
+int
+spa_maxblocksize(spa_t *spa)
+{
+       if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
+               return (SPA_MAXBLOCKSIZE);
+       else
+               return (SPA_OLD_MAXBLOCKSIZE);
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 /* Namespace manipulation */
 EXPORT_SYMBOL(spa_lookup);
@@ -1959,6 +2050,7 @@ EXPORT_SYMBOL(spa_suspended);
 EXPORT_SYMBOL(spa_bootfs);
 EXPORT_SYMBOL(spa_delegation);
 EXPORT_SYMBOL(spa_meta_objset);
+EXPORT_SYMBOL(spa_maxblocksize);
 
 /* Miscellaneous support routines */
 EXPORT_SYMBOL(spa_rename);
@@ -2002,4 +2094,7 @@ MODULE_PARM_DESC(zfs_deadman_enabled, "Enable deadman timer");
 module_param(spa_asize_inflation, int, 0644);
 MODULE_PARM_DESC(spa_asize_inflation,
        "SPA size estimate multiplication factor");
+
+module_param(spa_slop_shift, int, 0644);
+MODULE_PARM_DESC(spa_slop_shift, "Reserved free space in pool");
 #endif
index 3e39dba2c2e672f27d33ce7c2d7a9a37f08e22fa..2b8559b5d276b0c53f1ef61271bfceb66afeb8d2 100644 (file)
@@ -200,7 +200,7 @@ spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
        if (zfs_read_history == 0 && ssh->size == 0)
                return;
 
-       if (zfs_read_history_hits == 0 && (aflags & ARC_CACHED))
+       if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED))
                return;
 
        srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP);
index 470cf18bff3011cf78917df6447a729b61fa0ed6..0c9990e8547bd39de16304eb19d4bf101efef0ad 100644 (file)
@@ -23,6 +23,7 @@
  * (and only one) C file, so this dummy file exists for that purpose.
  */
 
+#include <sys/multilist.h>
 #include <sys/arc_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
@@ -31,6 +32,7 @@
 #include <sys/dsl_dataset.h>
 #include <sys/dmu_tx.h>
 #include <sys/dnode.h>
+#include <sys/multilist.h>
 #include <sys/zfs_znode.h>
 #include <sys/zil_impl.h>
 #include <sys/zrlock.h>
@@ -42,6 +44,7 @@
 #include <sys/trace_dbuf.h>
 #include <sys/trace_dmu.h>
 #include <sys/trace_dnode.h>
+#include <sys/trace_multilist.h>
 #include <sys/trace_txg.h>
 #include <sys/trace_zil.h>
 #include <sys/trace_zrlock.h>
index 2977bf9f34045f989f13f8952afda3f3fd8eaacb..1d5ee97b1368d9c5fe56274abed4cae4b2c74002 100644 (file)
@@ -205,7 +205,7 @@ txg_sync_start(dsl_pool_t *dp)
        tx->tx_threads = 2;
 
        tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
-           dp, 0, &p0, TS_RUN, minclsyspri);
+           dp, 0, &p0, TS_RUN, defclsyspri);
 
        /*
         * The sync thread can need a larger-than-default stack size on
@@ -213,7 +213,7 @@ txg_sync_start(dsl_pool_t *dp)
         * scrub_visitbp() recursion.
         */
        tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread,
-           dp, 0, &p0, TS_RUN, minclsyspri);
+           dp, 0, &p0, TS_RUN, defclsyspri);
 
        mutex_exit(&tx->tx_sync_lock);
 }
@@ -242,10 +242,10 @@ txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
        CALLB_CPR_SAFE_BEGIN(cpr);
 
        if (time)
-               (void) cv_timedwait_interruptible(cv, &tx->tx_sync_lock,
+               (void) cv_timedwait_sig(cv, &tx->tx_sync_lock,
                    ddi_get_lbolt() + time);
        else
-               cv_wait_interruptible(cv, &tx->tx_sync_lock);
+               cv_wait_sig(cv, &tx->tx_sync_lock);
 
        CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
 }
@@ -445,8 +445,8 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
                         * Commit callback taskq hasn't been created yet.
                         */
                        tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
-                           100, minclsyspri, max_ncpus, INT_MAX,
-                           TASKQ_THREADS_CPU_PCT | TASKQ_PREPOPULATE);
+                           max_ncpus, defclsyspri, max_ncpus, max_ncpus * 2,
+                           TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
                }
 
                cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
@@ -471,7 +471,7 @@ txg_wait_callbacks(dsl_pool_t *dp)
        tx_state_t *tx = &dp->dp_tx;
 
        if (tx->tx_commit_cb_taskq != NULL)
-               taskq_wait(tx->tx_commit_cb_taskq);
+               taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0);
 }
 
 static void
index a07dc00ae19a84ee787445176e7c5e38ac13a452..f8bdecdf57497c7e797bc71b2f21c7185fab1e49 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -40,10 +40,10 @@ uberblock_verify(uberblock_t *ub)
 }
 
 /*
- * Update the uberblock and return a boolean value indicating whether
- * anything changed in this transaction group.
+ * Update the uberblock and return TRUE if anything changed in this
+ * transaction group.
  */
-int
+boolean_t
 uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
 {
        ASSERT(ub->ub_txg < txg);
index 52198261e434ec74510453a3c7b7f32191407ab9..7aff5455b10b8a871c4869a3999146ae746333ce 100644 (file)
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -179,6 +179,27 @@ vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
        return (NULL);
 }
 
+static int
+vdev_count_leaves_impl(vdev_t *vd)
+{
+       int n = 0;
+       int c;
+
+       if (vd->vdev_ops->vdev_op_leaf)
+               return (1);
+
+       for (c = 0; c < vd->vdev_children; c++)
+               n += vdev_count_leaves_impl(vd->vdev_child[c]);
+
+       return (n);
+}
+
+int
+vdev_count_leaves(spa_t *spa)
+{
+       return (vdev_count_leaves_impl(spa->spa_root_vdev));
+}
+
 void
 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
 {
@@ -847,9 +868,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 
        /*
         * Compute the raidz-deflation ratio.  Note, we hard-code
-        * in 128k (1 << 17) because it is the current "typical" blocksize.
-        * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
-        * or we will inconsistently account for existing bp's.
+        * in 128k (1 << 17) because it is the "typical" blocksize.
+        * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
+        * otherwise it would inconsistently account for existing bp's.
         */
        vd->vdev_deflate_ratio = (1 << 17) /
            (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
@@ -1087,6 +1108,7 @@ vdev_open_child(void *arg)
        vd->vdev_open_thread = curthread;
        vd->vdev_open_error = vdev_open(vd);
        vd->vdev_open_thread = NULL;
+       vd->vdev_parent->vdev_nonrot &= vd->vdev_nonrot;
 }
 
 static boolean_t
@@ -1113,15 +1135,19 @@ vdev_open_children(vdev_t *vd)
        int children = vd->vdev_children;
        int c;
 
+       vd->vdev_nonrot = B_TRUE;
+
        /*
         * in order to handle pools on top of zvols, do the opens
         * in a single thread so that the same thread holds the
         * spa_namespace_lock
         */
        if (vdev_uses_zvols(vd)) {
-               for (c = 0; c < children; c++)
+               for (c = 0; c < children; c++) {
                        vd->vdev_child[c]->vdev_open_error =
                            vdev_open(vd->vdev_child[c]);
+                       vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
+               }
                return;
        }
        tq = taskq_create("vdev_open", children, minclsyspri,
@@ -1132,6 +1158,9 @@ vdev_open_children(vdev_t *vd)
                    TQ_SLEEP) != 0);
 
        taskq_destroy(tq);
+
+       for (c = 0; c < children; c++)
+               vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
 }
 
 /*
@@ -1315,6 +1344,17 @@ vdev_open(vdev_t *vd)
                return (error);
        }
 
+       /*
+        * Track the min and max ashift values for normal data devices.
+        */
+       if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+           !vd->vdev_islog && vd->vdev_aux == NULL) {
+               if (vd->vdev_ashift > spa->spa_max_ashift)
+                       spa->spa_max_ashift = vd->vdev_ashift;
+               if (vd->vdev_ashift < spa->spa_min_ashift)
+                       spa->spa_min_ashift = vd->vdev_ashift;
+       }
+
        /*
         * If a leaf vdev has a DTL, and seems healthy, then kick off a
         * resilver.  But don't do this if we are doing a reopen for a scrub,
index c860e7cb6db8a7b4ce9ed261152b02666c91469b..ebf0e8bfeb8747f600f80e89e04efec01145a909 100644 (file)
@@ -42,9 +42,9 @@ static void *zfs_vdev_holder = VDEV_HOLDER;
  */
 typedef struct dio_request {
        struct completion       dr_comp;        /* Completion for sync IO */
-       atomic_t                dr_ref;         /* References */
        zio_t                   *dr_zio;        /* Parent ZIO */
-       int                     dr_rw;          /* Read/Write */
+       atomic_t                dr_ref;         /* References */
+       int                     dr_wait;        /* Wait for IO */
        int                     dr_error;       /* Bio error */
        int                     dr_bio_count;   /* Count of bio's */
        struct bio              *dr_bio[0];     /* Attached bio's */
@@ -301,6 +301,9 @@ skip_open:
        /* Clear the nowritecache bit, causes vdev_reopen() to try again. */
        v->vdev_nowritecache = B_FALSE;
 
+       /* Inform the ZIO pipeline whether the device is non-rotational */
+       v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));
+
        /* Physical volume size in bytes */
        *psize = bdev_capacity(vd->vd_bdev);
 
@@ -366,27 +369,6 @@ vdev_disk_dio_free(dio_request_t *dr)
            sizeof (struct bio *) * dr->dr_bio_count);
 }
 
-static int
-vdev_disk_dio_is_sync(dio_request_t *dr)
-{
-#ifdef HAVE_BIO_RW_SYNC
-       /* BIO_RW_SYNC preferred interface from 2.6.12-2.6.29 */
-       return (dr->dr_rw & (1 << BIO_RW_SYNC));
-#else
-#ifdef HAVE_BIO_RW_SYNCIO
-       /* BIO_RW_SYNCIO preferred interface from 2.6.30-2.6.35 */
-       return (dr->dr_rw & (1 << BIO_RW_SYNCIO));
-#else
-#ifdef HAVE_REQ_SYNC
-       /* REQ_SYNC preferred interface from 2.6.36-2.6.xx */
-       return (dr->dr_rw & REQ_SYNC);
-#else
-#error "Unable to determine bio sync flag"
-#endif /* HAVE_REQ_SYNC */
-#endif /* HAVE_BIO_RW_SYNC */
-#endif /* HAVE_BIO_RW_SYNCIO */
-}
-
 static void
 vdev_disk_dio_get(dio_request_t *dr)
 {
@@ -425,6 +407,7 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
 {
        dio_request_t *dr = bio->bi_private;
        int rc;
+       int wait;
 
        if (dr->dr_error == 0) {
 #ifdef HAVE_1ARG_BIO_END_IO_T
@@ -437,11 +420,12 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
 #endif
        }
 
+       wait = dr->dr_wait;
        /* Drop reference acquired by __vdev_disk_physio */
        rc = vdev_disk_dio_put(dr);
 
        /* Wake up synchronous waiter if this is the last outstanding bio */
-       if ((rc == 1) && vdev_disk_dio_is_sync(dr))
+       if (wait && rc == 1)
                complete(&dr->dr_comp);
 }
 
@@ -491,14 +475,30 @@ bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
        return (bio_size);
 }
 
+static inline void
+vdev_submit_bio(int rw, struct bio *bio)
+{
+#ifdef HAVE_CURRENT_BIO_TAIL
+       struct bio **bio_tail = current->bio_tail;
+       current->bio_tail = NULL;
+       submit_bio(rw, bio);
+       current->bio_tail = bio_tail;
+#else
+       struct bio_list *bio_list = current->bio_list;
+       current->bio_list = NULL;
+       submit_bio(rw, bio);
+       current->bio_list = bio_list;
+#endif
+}
+
 static int
 __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
-    size_t kbuf_size, uint64_t kbuf_offset, int flags)
+    size_t kbuf_size, uint64_t kbuf_offset, int flags, int wait)
 {
        dio_request_t *dr;
        caddr_t bio_ptr;
        uint64_t bio_offset;
-       int bio_size, bio_count = 16;
+       int rw, bio_size, bio_count = 16;
        int i = 0, error = 0;
 
        ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size);
@@ -511,8 +511,9 @@ retry:
        if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
                bio_set_flags_failfast(bdev, &flags);
 
+       rw = flags;
        dr->dr_zio = zio;
-       dr->dr_rw = flags;
+       dr->dr_wait = wait;
 
        /*
         * When the IO size exceeds the maximum bio size for the request
@@ -541,9 +542,9 @@ retry:
                        goto retry;
                }
 
-               dr->dr_bio[i] = bio_alloc(GFP_NOIO,
-                   bio_nr_pages(bio_ptr, bio_size));
                /* bio_alloc() with __GFP_WAIT never returns NULL */
+               dr->dr_bio[i] = bio_alloc(GFP_NOIO,
+                   MIN(bio_nr_pages(bio_ptr, bio_size), BIO_MAX_PAGES));
                if (unlikely(dr->dr_bio[i] == NULL)) {
                        vdev_disk_dio_free(dr);
                        return (ENOMEM);
@@ -554,7 +555,7 @@ retry:
 
                dr->dr_bio[i]->bi_bdev = bdev;
                BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
-               dr->dr_bio[i]->bi_rw = dr->dr_rw;
+               dr->dr_bio[i]->bi_rw = rw;
                dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
                dr->dr_bio[i]->bi_private = dr;
 
@@ -566,7 +567,7 @@ retry:
                bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
        }
 
-       /* Extra reference to protect dio_request during submit_bio */
+       /* Extra reference to protect dio_request during vdev_submit_bio */
        vdev_disk_dio_get(dr);
        if (zio)
                zio->io_delay = jiffies_64;
@@ -574,7 +575,7 @@ retry:
        /* Submit all bio's associated with this dio */
        for (i = 0; i < dr->dr_bio_count; i++)
                if (dr->dr_bio[i])
-                       submit_bio(dr->dr_rw, dr->dr_bio[i]);
+                       vdev_submit_bio(rw, dr->dr_bio[i]);
 
        /*
         * On synchronous blocking requests we wait for all bio the completion
@@ -584,7 +585,7 @@ retry:
         * only synchronous consumer is vdev_disk_read_rootlabel() all other
         * IO originating from vdev_disk_io_start() is asynchronous.
         */
-       if (vdev_disk_dio_is_sync(dr)) {
+       if (wait) {
                wait_for_completion(&dr->dr_comp);
                error = dr->dr_error;
                ASSERT3S(atomic_read(&dr->dr_ref), ==, 1);
@@ -600,7 +601,7 @@ vdev_disk_physio(struct block_device *bdev, caddr_t kbuf,
     size_t size, uint64_t offset, int flags)
 {
        bio_set_flags_failfast(bdev, &flags);
-       return (__vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags));
+       return (__vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags, 1));
 }
 
 BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, rc)
@@ -641,17 +642,18 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
        bio->bi_private = zio;
        bio->bi_bdev = bdev;
        zio->io_delay = jiffies_64;
-       submit_bio(VDEV_WRITE_FLUSH_FUA, bio);
+       vdev_submit_bio(VDEV_WRITE_FLUSH_FUA, bio);
        invalidate_bdev(bdev);
 
        return (0);
 }
 
-static int
+static void
 vdev_disk_io_start(zio_t *zio)
 {
        vdev_t *v = zio->io_vd;
        vdev_disk_t *vd = v->vdev_tsd;
+       zio_priority_t pri = zio->io_priority;
        int flags, error;
 
        switch (zio->io_type) {
@@ -659,7 +661,8 @@ vdev_disk_io_start(zio_t *zio)
 
                if (!vdev_readable(v)) {
                        zio->io_error = SET_ERROR(ENXIO);
-                       return (ZIO_PIPELINE_CONTINUE);
+                       zio_interrupt(zio);
+                       return;
                }
 
                switch (zio->io_cmd) {
@@ -675,7 +678,7 @@ vdev_disk_io_start(zio_t *zio)
 
                        error = vdev_disk_io_flush(vd->vd_bdev, zio);
                        if (error == 0)
-                               return (ZIO_PIPELINE_STOP);
+                               return;
 
                        zio->io_error = error;
                        if (error == ENOTSUP)
@@ -687,29 +690,35 @@ vdev_disk_io_start(zio_t *zio)
                        zio->io_error = SET_ERROR(ENOTSUP);
                }
 
-               return (ZIO_PIPELINE_CONTINUE);
-
+               zio_execute(zio);
+               return;
        case ZIO_TYPE_WRITE:
-               flags = WRITE;
+               if ((pri == ZIO_PRIORITY_SYNC_WRITE) && (v->vdev_nonrot))
+                       flags = WRITE_SYNC;
+               else
+                       flags = WRITE;
                break;
 
        case ZIO_TYPE_READ:
-               flags = READ;
+               if ((pri == ZIO_PRIORITY_SYNC_READ) && (v->vdev_nonrot))
+                       flags = READ_SYNC;
+               else
+                       flags = READ;
                break;
 
        default:
                zio->io_error = SET_ERROR(ENOTSUP);
-               return (ZIO_PIPELINE_CONTINUE);
+               zio_interrupt(zio);
+               return;
        }
 
        error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
-           zio->io_size, zio->io_offset, flags);
+           zio->io_size, zio->io_offset, flags, 0);
        if (error) {
                zio->io_error = error;
-               return (ZIO_PIPELINE_CONTINUE);
+               zio_interrupt(zio);
+               return;
        }
-
-       return (ZIO_PIPELINE_STOP);
 }
 
 static void
index 7f43ad8001f4001589dd8bd22e7cb7f35af01bdb..a29ea7bf9515a2b5106417b4fd036e0cfc3fe616 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -36,8 +36,6 @@
  * Virtual device vector for files.
  */
 
-static taskq_t *vdev_file_taskq;
-
 static void
 vdev_file_hold(vdev_t *vd)
 {
@@ -59,6 +57,9 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
        vattr_t vattr;
        int error;
 
+       /* Rotational optimizations only make sense on block devices */
+       vd->vdev_nonrot = B_TRUE;
+
        /*
         * We must have a pathname, and it must be absolute.
         */
@@ -172,7 +173,7 @@ vdev_file_io_fsync(void *arg)
        zio_interrupt(zio);
 }
 
-static int
+static void
 vdev_file_io_start(zio_t *zio)
 {
        vdev_t *vd = zio->io_vd;
@@ -182,7 +183,8 @@ vdev_file_io_start(zio_t *zio)
                /* XXPOLICY */
                if (!vdev_readable(vd)) {
                        zio->io_error = SET_ERROR(ENXIO);
-                       return (ZIO_PIPELINE_CONTINUE);
+                       zio_interrupt(zio);
+                       return;
                }
 
                switch (zio->io_cmd) {
@@ -199,9 +201,9 @@ vdev_file_io_start(zio_t *zio)
                         * the sync must be dispatched to a different context.
                         */
                        if (spl_fstrans_check()) {
-                               VERIFY3U(taskq_dispatch(vdev_file_taskq,
+                               VERIFY3U(taskq_dispatch(system_taskq,
                                    vdev_file_io_fsync, zio, TQ_SLEEP), !=, 0);
-                               return (ZIO_PIPELINE_STOP);
+                               return;
                        }
 
                        zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
@@ -211,13 +213,12 @@ vdev_file_io_start(zio_t *zio)
                        zio->io_error = SET_ERROR(ENOTSUP);
                }
 
-               return (ZIO_PIPELINE_CONTINUE);
+               zio_execute(zio);
+               return;
        }
 
-       VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
+       VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, zio,
            TQ_SLEEP), !=, 0);
-
-       return (ZIO_PIPELINE_STOP);
 }
 
 /* ARGSUSED */
@@ -239,21 +240,6 @@ vdev_ops_t vdev_file_ops = {
        B_TRUE                  /* leaf vdev */
 };
 
-void
-vdev_file_init(void)
-{
-       vdev_file_taskq = taskq_create("vdev_file_taskq", 100, minclsyspri,
-           max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
-
-       VERIFY(vdev_file_taskq);
-}
-
-void
-vdev_file_fini(void)
-{
-       taskq_destroy(vdev_file_taskq);
-}
-
 /*
  * From userland we access disks just like files.
  */
index 77c3d8d385e9eb6c175b3ae960c1a1317bb65e41..6b699e883e37a8c90e1e3ee95a34db8b741cc45d 100644 (file)
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -327,7 +327,7 @@ vdev_mirror_child_select(zio_t *zio)
        return (-1);
 }
 
-static int
+static void
 vdev_mirror_io_start(zio_t *zio)
 {
        mirror_map_t *mm;
@@ -352,7 +352,8 @@ vdev_mirror_io_start(zio_t *zio)
                                    zio->io_type, zio->io_priority, 0,
                                    vdev_mirror_scrub_done, mc));
                        }
-                       return (ZIO_PIPELINE_CONTINUE);
+                       zio_execute(zio);
+                       return;
                }
                /*
                 * For normal reads just pick one child.
@@ -378,7 +379,7 @@ vdev_mirror_io_start(zio_t *zio)
                c++;
        }
 
-       return (ZIO_PIPELINE_CONTINUE);
+       zio_execute(zio);
 }
 
 static int
index b9eb99d18005e33c3ba6dbd6a7480a554f1cc181..228757334234d241f980058397438d3a80716dcf 100644 (file)
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 /*
@@ -66,11 +66,11 @@ vdev_missing_close(vdev_t *vd)
 }
 
 /* ARGSUSED */
-static int
+static void
 vdev_missing_io_start(zio_t *zio)
 {
        zio->io_error = SET_ERROR(ENOTSUP);
-       return (ZIO_PIPELINE_CONTINUE);
+       zio_execute(zio);
 }
 
 /* ARGSUSED */
index 3fa4219f260e7099369773d690c2a64a01be276b..4a4544f29ed2d3024ca4381700f7b82828b21474 100644 (file)
@@ -167,7 +167,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60;
  * we include spans of optional I/Os to aid aggregation at the disk even when
  * they aren't able to help us aggregate at this level.
  */
-int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
+int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
 int zfs_vdev_read_gap_limit = 32 << 10;
 int zfs_vdev_write_gap_limit = 4 << 10;
 
@@ -190,6 +190,22 @@ vdev_queue_offset_compare(const void *x1, const void *x2)
        return (0);
 }
 
+static inline avl_tree_t *
+vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
+{
+       return (&vq->vq_class[p].vqc_queued_tree);
+}
+
+static inline avl_tree_t *
+vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
+{
+       ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE);
+       if (t == ZIO_TYPE_READ)
+               return (&vq->vq_read_offset_tree);
+       else
+               return (&vq->vq_write_offset_tree);
+}
+
 int
 vdev_queue_timestamp_compare(const void *x1, const void *x2)
 {
@@ -303,7 +319,7 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
 
        /* find a queue that has not reached its minimum # outstanding i/os */
        for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
-               if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 &&
+               if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
                    vq->vq_class[p].vqc_active <
                    vdev_queue_class_min_active(p))
                        return (p);
@@ -314,7 +330,7 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
         * maximum # outstanding i/os.
         */
        for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
-               if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 &&
+               if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
                    vq->vq_class[p].vqc_active <
                    vdev_queue_class_max_active(spa, p))
                        return (p);
@@ -332,23 +348,31 @@ vdev_queue_init(vdev_t *vd)
 
        mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
        vq->vq_vdev = vd;
+       taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent);
 
        avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
            sizeof (zio_t), offsetof(struct zio, io_queue_node));
+       avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
+               vdev_queue_offset_compare, sizeof (zio_t),
+               offsetof(struct zio, io_offset_node));
+       avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
+               vdev_queue_offset_compare, sizeof (zio_t),
+               offsetof(struct zio, io_offset_node));
 
        for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+               int (*compfn) (const void *, const void *);
+
                /*
-                * The synchronous i/o queues are FIFO rather than LBA ordered.
-                * This provides more consistent latency for these i/os, and
-                * they tend to not be tightly clustered anyway so there is
-                * little to no throughput loss.
+                * The synchronous i/o queues are dispatched in FIFO rather
+                * than LBA order. This provides more consistent latency for
+                * these i/os.
                 */
-               boolean_t fifo = (p == ZIO_PRIORITY_SYNC_READ ||
-                   p == ZIO_PRIORITY_SYNC_WRITE);
-               avl_create(&vq->vq_class[p].vqc_queued_tree,
-                   fifo ? vdev_queue_timestamp_compare :
-                   vdev_queue_offset_compare,
-                   sizeof (zio_t), offsetof(struct zio, io_queue_node));
+               if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
+                       compfn = vdev_queue_timestamp_compare;
+               else
+                       compfn = vdev_queue_offset_compare;
+               avl_create(vdev_queue_class_tree(vq, p), compfn,
+                       sizeof (zio_t), offsetof(struct zio, io_queue_node));
        }
 }
 
@@ -359,8 +383,10 @@ vdev_queue_fini(vdev_t *vd)
        zio_priority_t p;
 
        for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
-               avl_destroy(&vq->vq_class[p].vqc_queued_tree);
+               avl_destroy(vdev_queue_class_tree(vq, p));
        avl_destroy(&vq->vq_active_tree);
+       avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
+       avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
 
        mutex_destroy(&vq->vq_lock);
 }
@@ -372,7 +398,8 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
        spa_stats_history_t *ssh = &spa->spa_stats.io_history;
 
        ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-       avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
+       avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+       avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
 
        if (ssh->kstat != NULL) {
                mutex_enter(&ssh->lock);
@@ -388,7 +415,8 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
        spa_stats_history_t *ssh = &spa->spa_stats.io_history;
 
        ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-       avl_remove(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
+       avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+       avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
 
        if (ssh->kstat != NULL) {
                mutex_enter(&ssh->lock);
@@ -472,8 +500,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
        uint64_t maxgap = 0;
        uint64_t size;
        boolean_t stretch = B_FALSE;
-       vdev_queue_class_t *vqc = &vq->vq_class[zio->io_priority];
-       avl_tree_t *t = &vqc->vqc_queued_tree;
+       avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
        enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
 
        if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
@@ -486,15 +513,6 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
        zfs_vdev_aggregation_limit =
            MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE);
 
-       /*
-        * The synchronous i/o queues are not sorted by LBA, so we can't
-        * find adjacent i/os.  These i/os tend to not be tightly clustered,
-        * or too large to aggregate, so this has little impact on performance.
-        */
-       if (zio->io_priority == ZIO_PRIORITY_SYNC_READ ||
-           zio->io_priority == ZIO_PRIORITY_SYNC_WRITE)
-               return (NULL);
-
        first = last = zio;
 
        if (zio->io_type == ZIO_TYPE_READ)
@@ -627,7 +645,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq)
        zio_t *zio, *aio;
        zio_priority_t p;
        avl_index_t idx;
-       vdev_queue_class_t *vqc;
+       avl_tree_t *tree;
 
 again:
        ASSERT(MUTEX_HELD(&vq->vq_lock));
@@ -645,14 +663,14 @@ again:
         *
         * For FIFO queues (sync), issue the i/o with the lowest timestamp.
         */
-       vqc = &vq->vq_class[p];
+       tree = vdev_queue_class_tree(vq, p);
        vq->vq_io_search.io_timestamp = 0;
        vq->vq_io_search.io_offset = vq->vq_last_offset + 1;
-       VERIFY3P(avl_find(&vqc->vqc_queued_tree, &vq->vq_io_search,
+       VERIFY3P(avl_find(tree, &vq->vq_io_search,
            &idx), ==, NULL);
-       zio = avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER);
+       zio = avl_nearest(tree, idx, AVL_AFTER);
        if (zio == NULL)
-               zio = avl_first(&vqc->vqc_queued_tree);
+               zio = avl_first(tree);
        ASSERT3U(zio->io_priority, ==, p);
 
        aio = vdev_queue_aggregate(vq, zio);
@@ -813,5 +831,5 @@ MODULE_PARM_DESC(zfs_vdev_sync_write_max_active,
 
 module_param(zfs_vdev_sync_write_min_active, int, 0644);
 MODULE_PARM_DESC(zfs_vdev_sync_write_min_active,
-       "Min active sync write I/Osper vdev");
+       "Min active sync write I/Os per vdev");
 #endif
index 493b332c440517229a87e39d98d4c0c28412298e..b9479092c8429d7d688ac22f67a01aa88d58148e 100644 (file)
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -1567,7 +1567,7 @@ vdev_raidz_child_done(zio_t *zio)
  *      vdevs have had errors, then create zio read operations to the parity
  *      columns' VDevs as well.
  */
-static int
+static void
 vdev_raidz_io_start(zio_t *zio)
 {
        vdev_t *vd = zio->io_vd;
@@ -1611,7 +1611,8 @@ vdev_raidz_io_start(zio_t *zio)
                            ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
                }
 
-               return (ZIO_PIPELINE_CONTINUE);
+               zio_execute(zio);
+               return;
        }
 
        ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -1651,7 +1652,7 @@ vdev_raidz_io_start(zio_t *zio)
                }
        }
 
-       return (ZIO_PIPELINE_CONTINUE);
+       zio_execute(zio);
 }
 
 
index 5ffa138a6b4bb6698dd7ecc9c7c4790b77eaaaf4..c5ea392b6a1d510637a0dc15bc1f5aed720f5fd9 100644 (file)
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 /*
@@ -50,9 +51,9 @@
 
 int fzap_default_block_shift = 14; /* 16k blocksize */
 
-static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
-static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
+extern inline zap_phys_t *zap_f_phys(zap_t *zap);
 
+static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
 
 void
 fzap_byteswap(void *vbuf, size_t size)
@@ -80,13 +81,12 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
        ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
        zap->zap_ismicro = FALSE;
 
-       (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
-           &zap->zap_f.zap_phys, zap_evict);
+       zap->zap_dbu.dbu_evict_func = zap_evict;
 
        mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
        zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
 
-       zp = zap->zap_f.zap_phys;
+       zp = zap_f_phys(zap);
        /*
         * explicitly zero it since it might be coming from an
         * initialized microzap
@@ -117,7 +117,6 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
 
        l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
        l->l_dbuf = db;
-       l->l_phys = db->db_data;
 
        zap_leaf_init(l, zp->zap_normflags != 0);
 
@@ -325,10 +324,10 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
         * If we are within 2 bits of running out, stop growing, since
         * this is already an aberrant condition.
         */
-       if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
+       if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
                return (SET_ERROR(ENOSPC));
 
-       if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+       if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
                /*
                 * We are outgrowing the "embedded" ptrtbl (the one
                 * stored in the header block).  Give it its own entire
@@ -338,9 +337,9 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
                dmu_buf_t *db_new;
                int err;
 
-               ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+               ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
                    ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
-               ASSERT0(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk);
+               ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
 
                newblk = zap_allocate_blocks(zap, 1);
                err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
@@ -353,17 +352,17 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
                    db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
                dmu_buf_rele(db_new, FTAG);
 
-               zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
-               zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
-               zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
+               zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
+               zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
+               zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
 
-               ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
-                   zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
+               ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
+                   zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
                    (FZAP_BLOCK_SHIFT(zap)-3));
 
                return (0);
        } else {
-               return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+               return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
                    zap_ptrtbl_transfer, tx));
        }
 }
@@ -373,8 +372,8 @@ zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
 {
        dmu_buf_will_dirty(zap->zap_dbuf, tx);
        mutex_enter(&zap->zap_f.zap_num_entries_mtx);
-       ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
-       zap->zap_f.zap_phys->zap_num_entries += delta;
+       ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
+       zap_f_phys(zap)->zap_num_entries += delta;
        mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 }
 
@@ -383,16 +382,25 @@ zap_allocate_blocks(zap_t *zap, int nblocks)
 {
        uint64_t newblk;
        ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-       newblk = zap->zap_f.zap_phys->zap_freeblk;
-       zap->zap_f.zap_phys->zap_freeblk += nblocks;
+       newblk = zap_f_phys(zap)->zap_freeblk;
+       zap_f_phys(zap)->zap_freeblk += nblocks;
        return (newblk);
 }
 
+static void
+zap_leaf_pageout(void *dbu)
+{
+       zap_leaf_t *l = dbu;
+
+       rw_destroy(&l->l_rwlock);
+       kmem_free(l, sizeof (zap_leaf_t));
+}
+
 static zap_leaf_t *
 zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
 {
        void *winner;
-       zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
+       zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 
        ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
@@ -400,18 +408,18 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
        rw_enter(&l->l_rwlock, RW_WRITER);
        l->l_blkid = zap_allocate_blocks(zap, 1);
        l->l_dbuf = NULL;
-       l->l_phys = NULL;
 
        VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
            l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
            DMU_READ_NO_PREFETCH));
-       winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
+       dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf);
+       winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu);
        ASSERT(winner == NULL);
        dmu_buf_will_dirty(l->l_dbuf, tx);
 
        zap_leaf_init(l, zap->zap_normflags != 0);
 
-       zap->zap_f.zap_phys->zap_num_leafs++;
+       zap_f_phys(zap)->zap_num_leafs++;
 
        return (l);
 }
@@ -421,7 +429,7 @@ fzap_count(zap_t *zap, uint64_t *count)
 {
        ASSERT(!zap->zap_ismicro);
        mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
-       *count = zap->zap_f.zap_phys->zap_num_entries;
+       *count = zap_f_phys(zap)->zap_num_entries;
        mutex_exit(&zap->zap_f.zap_num_entries_mtx);
        return (0);
 }
@@ -437,16 +445,6 @@ zap_put_leaf(zap_leaf_t *l)
        dmu_buf_rele(l->l_dbuf, NULL);
 }
 
-_NOTE(ARGSUSED(0))
-static void
-zap_leaf_pageout(dmu_buf_t *db, void *vl)
-{
-       zap_leaf_t *l = vl;
-
-       rw_destroy(&l->l_rwlock);
-       kmem_free(l, sizeof (zap_leaf_t));
-}
-
 static zap_leaf_t *
 zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
 {
@@ -454,20 +452,20 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
 
        ASSERT(blkid != 0);
 
-       l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
+       l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
        rw_init(&l->l_rwlock, NULL, RW_DEFAULT, NULL);
        rw_enter(&l->l_rwlock, RW_WRITER);
        l->l_blkid = blkid;
        l->l_bs = highbit64(db->db_size) - 1;
        l->l_dbuf = db;
-       l->l_phys = NULL;
 
-       winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
+       dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf);
+       winner = dmu_buf_set_user(db, &l->l_dbu);
 
        rw_exit(&l->l_rwlock);
        if (winner != NULL) {
                /* someone else set it first */
-               zap_leaf_pageout(NULL, l);
+               zap_leaf_pageout(&l->l_dbu);
                l = winner;
        }
 
@@ -476,7 +474,7 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
         * chain.  There should be no chained leafs (as we have removed
         * support for them).
         */
-       ASSERT0(l->l_phys->l_hdr.lh_pad1);
+       ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
 
        /*
         * There should be more hash entries than there can be
@@ -486,11 +484,11 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
 
        /* The chunks should begin at the end of the hash table */
        ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, (zap_leaf_chunk_t *)
-           &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
+           &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
 
        /* The chunks should end at the end of the block */
        ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
-           (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size);
+           (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);
 
        return (l);
 }
@@ -523,16 +521,15 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
 
        rw_enter(&l->l_rwlock, lt);
        /*
-        * Must lock before dirtying, otherwise l->l_phys could change,
+        * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
         * causing ASSERT below to fail.
         */
        if (lt == RW_WRITER)
                dmu_buf_will_dirty(db, tx);
        ASSERT3U(l->l_blkid, ==, blkid);
        ASSERT3P(l->l_dbuf, ==, db);
-       ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data);
-       ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF);
-       ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+       ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
+       ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
 
        *lp = l;
        return (0);
@@ -543,13 +540,13 @@ zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
 {
        ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
-       if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+       if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
                ASSERT3U(idx, <,
-                   (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift));
+                   (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
                *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
                return (0);
        } else {
-               return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+               return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
                    idx, valp));
        }
 }
@@ -560,11 +557,11 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
        ASSERT(tx != NULL);
        ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
-       if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) {
+       if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
                ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
                return (0);
        } else {
-               return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+               return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
                    idx, blk, tx));
        }
 }
@@ -576,16 +573,17 @@ zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
        int err;
 
        ASSERT(zap->zap_dbuf == NULL ||
-           zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
-       ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
-       idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+           zap_f_phys(zap) == zap->zap_dbuf->db_data);
+       ASSERT3U(zap_f_phys(zap)->zap_magic, ==, ZAP_MAGIC);
+       idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
        err = zap_idx_to_blk(zap, idx, &blk);
        if (err != 0)
                return (err);
        err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
 
-       ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) ==
-           (*lp)->l_phys->l_hdr.lh_prefix);
+       ASSERT(err ||
+           ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
+           zap_leaf_phys(*lp)->l_hdr.lh_prefix);
        return (err);
 }
 
@@ -597,16 +595,16 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
        zap_leaf_t *nl;
        int prefix_diff, i, err;
        uint64_t sibling;
-       int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
+       int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 
-       ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+       ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
        ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
        ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
-           l->l_phys->l_hdr.lh_prefix);
+           zap_leaf_phys(l)->l_hdr.lh_prefix);
 
        if (zap_tryupgradedir(zap, tx) == 0 ||
-           old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+           old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
                /* We failed to upgrade, or need to grow the pointer table */
                objset_t *os = zap->zap_objset;
                uint64_t object = zap->zap_object;
@@ -621,7 +619,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
                ASSERT(!zap->zap_ismicro);
 
                while (old_prefix_len ==
-                   zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+                   zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
                        err = zap_grow_ptrtbl(zap, tx);
                        if (err)
                                return (err);
@@ -631,18 +629,18 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
                if (err)
                        return (err);
 
-               if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) {
+               if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
                        /* it split while our locks were down */
                        *lp = l;
                        return (0);
                }
        }
        ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-       ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+       ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
        ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
-           l->l_phys->l_hdr.lh_prefix);
+           zap_leaf_phys(l)->l_hdr.lh_prefix);
 
-       prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
+       prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
            (old_prefix_len + 1);
        sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
 
@@ -664,7 +662,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
                ASSERT0(err); /* we checked for i/o errors above */
        }
 
-       if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) {
+       if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) {
                /* we want the sibling */
                zap_put_leaf(l);
                *lp = nl;
@@ -680,13 +678,13 @@ static void
 zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
 {
        zap_t *zap = zn->zn_zap;
-       int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
-       int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift &&
-           l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
+       int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+       int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
+           zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
 
        zap_put_leaf(l);
 
-       if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) {
+       if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) {
                int err;
 
                /*
@@ -706,7 +704,7 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
                }
 
                /* could have finished growing while our locks were down */
-               if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift)
+               if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
                        (void) zap_grow_ptrtbl(zap, tx);
        }
 }
@@ -937,7 +935,7 @@ fzap_prefetch(zap_name_t *zn)
        int bs;
 
        idx = ZAP_HASH_IDX(zn->zn_hash,
-           zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+           zap_f_phys(zap)->zap_ptrtbl.zt_shift);
        if (zap_idx_to_blk(zap, idx, &blk) != 0)
                return;
        bs = FZAP_BLOCK_SHIFT(zap);
@@ -1169,8 +1167,8 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
 
        if (zc->zc_leaf &&
            (ZAP_HASH_IDX(zc->zc_hash,
-           zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) !=
-           zc->zc_leaf->l_phys->l_hdr.lh_prefix)) {
+           zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
+           zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
                rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
                zap_put_leaf(zc->zc_leaf);
                zc->zc_leaf = NULL;
@@ -1191,10 +1189,11 @@ again:
 
        if (err == ENOENT) {
                uint64_t nocare =
-                   (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1;
+                   (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1;
                zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
                zc->zc_cd = 0;
-               if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) {
+               if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 ||
+                   zc->zc_hash == 0) {
                        zc->zc_hash = -1ULL;
                } else {
                        zap_put_leaf(zc->zc_leaf);
@@ -1252,31 +1251,6 @@ zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
        }
 }
 
-int
-fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn)
-{
-       int err;
-       zap_leaf_t *l;
-       zap_entry_handle_t zeh;
-
-       if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
-               return (SET_ERROR(ENAMETOOLONG));
-
-       err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l);
-       if (err != 0)
-               return (err);
-
-       err = zap_leaf_lookup(l, zn, &zeh);
-       if (err == 0) {
-               zc->zc_leaf = l;
-               zc->zc_hash = zeh.zeh_hash;
-               zc->zc_cd = zeh.zeh_cd;
-       }
-
-       rw_exit(&l->l_rwlock);
-       return (err);
-}
-
 void
 fzap_get_stats(zap_t *zap, zap_stats_t *zs)
 {
@@ -1286,25 +1260,25 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
        /*
         * Set zap_phys_t fields
         */
-       zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
-       zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
-       zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
-       zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type;
-       zs->zs_magic = zap->zap_f.zap_phys->zap_magic;
-       zs->zs_salt = zap->zap_f.zap_phys->zap_salt;
+       zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs;
+       zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries;
+       zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk;
+       zs->zs_block_type = zap_f_phys(zap)->zap_block_type;
+       zs->zs_magic = zap_f_phys(zap)->zap_magic;
+       zs->zs_salt = zap_f_phys(zap)->zap_salt;
 
        /*
         * Set zap_ptrtbl fields
         */
-       zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
-       zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk;
+       zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+       zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk;
        zs->zs_ptrtbl_blks_copied =
-           zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied;
-       zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk;
-       zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
-       zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+           zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied;
+       zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk;
+       zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
+       zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
 
-       if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+       if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
                /* the ptrtbl is entirely in the header block. */
                zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
                    1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
@@ -1312,16 +1286,16 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
                int b;
 
                dmu_prefetch(zap->zap_objset, zap->zap_object,
-                   zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs,
-                   zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs);
+                   zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
+                   zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs);
 
-               for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
+               for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
                    b++) {
                        dmu_buf_t *db;
                        int err;
 
                        err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-                           (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs,
+                           (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
                            FTAG, &db, DMU_READ_NO_PREFETCH);
                        if (err == 0) {
                                zap_stats_ptrtbl(zap, db->db_data,
@@ -1358,7 +1332,7 @@ fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
         *   could extend the table.
         */
        if (add) {
-               if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0)
+               if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0)
                        *towrite += zap->zap_dbuf->db_size;
                else
                        *towrite += (zap->zap_dbuf->db_size * 3);
index 9578048250e2f675c30525dc50d492fe58ac8212..3abc08cff476f09c634567a6681d236b75c82ec3 100644 (file)
@@ -48,10 +48,12 @@ static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
 
 #define        LEAF_HASH(l, h) \
        ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
-       ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len)))
+       ((h) >> \
+       (64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
 
-#define        LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
+#define        LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
 
+extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l);
 
 static void
 zap_memset(void *a, int c, size_t n)
@@ -107,8 +109,11 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
 {
        int i;
        zap_leaf_t l;
+       dmu_buf_t l_dbuf;
+
+       l_dbuf.db_data = buf;
        l.l_bs = highbit64(size) - 1;
-       l.l_phys = buf;
+       l.l_dbuf = &l_dbuf;
 
        buf->l_hdr.lh_block_type =      BSWAP_64(buf->l_hdr.lh_block_type);
        buf->l_hdr.lh_prefix =          BSWAP_64(buf->l_hdr.lh_prefix);
@@ -161,18 +166,20 @@ zap_leaf_init(zap_leaf_t *l, boolean_t sort)
        int i;
 
        l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
-       zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
-       zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+       zap_memset(&zap_leaf_phys(l)->l_hdr, 0,
+           sizeof (struct zap_leaf_header));
+       zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+           2*ZAP_LEAF_HASH_NUMENTRIES(l));
        for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
                ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
                ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
        }
        ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
-       l->l_phys->l_hdr.lh_block_type = ZBT_LEAF;
-       l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
-       l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
+       zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF;
+       zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
+       zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
        if (sort)
-               l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+               zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
 }
 
 /*
@@ -184,15 +191,16 @@ zap_leaf_chunk_alloc(zap_leaf_t *l)
 {
        int chunk;
 
-       ASSERT(l->l_phys->l_hdr.lh_nfree > 0);
+       ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
 
-       chunk = l->l_phys->l_hdr.lh_freelist;
+       chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
        ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
        ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
 
-       l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
+       zap_leaf_phys(l)->l_hdr.lh_freelist =
+           ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
 
-       l->l_phys->l_hdr.lh_nfree--;
+       zap_leaf_phys(l)->l_hdr.lh_nfree--;
 
        return (chunk);
 }
@@ -201,16 +209,16 @@ static void
 zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
 {
        struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
-       ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
+       ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
        ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
        ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
 
        zlf->lf_type = ZAP_CHUNK_FREE;
-       zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
+       zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist;
        bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
-       l->l_phys->l_hdr.lh_freelist = chunk;
+       zap_leaf_phys(l)->l_hdr.lh_freelist = chunk;
 
-       l->l_phys->l_hdr.lh_nfree++;
+       zap_leaf_phys(l)->l_hdr.lh_nfree++;
 }
 
 /*
@@ -396,7 +404,7 @@ zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
        uint16_t *chunkp;
        struct zap_leaf_entry *le;
 
-       ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+       ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
 
 again:
        for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
@@ -416,7 +424,7 @@ again:
                 * lowest-cd match for MT_FIRST.
                 */
                ASSERT(zn->zn_matchtype == MT_EXACT ||
-                   (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
+                   (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
                if (zap_leaf_array_match(l, zn, le->le_name_chunk,
                    le->le_name_numints)) {
                        zeh->zeh_num_integers = le->le_value_numints;
@@ -456,10 +464,10 @@ zap_leaf_lookup_closest(zap_leaf_t *l,
        uint16_t lh;
        struct zap_leaf_entry *le;
 
-       ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+       ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
 
        for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
-               for (chunk = l->l_phys->l_hash[lh];
+               for (chunk = zap_leaf_phys(l)->l_hash[lh];
                    chunk != CHAIN_END; chunk = le->le_next) {
                        le = ZAP_LEAF_ENTRY(l, chunk);
 
@@ -539,7 +547,7 @@ zap_entry_update(zap_entry_handle_t *zeh,
        delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
            ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
 
-       if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
+       if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks)
                return (SET_ERROR(EAGAIN));
 
        zap_leaf_array_free(l, &le->le_value_chunk);
@@ -569,7 +577,7 @@ zap_entry_remove(zap_entry_handle_t *zeh)
        *zeh->zeh_chunkp = le->le_next;
        zap_leaf_chunk_free(l, entry_chunk);
 
-       l->l_phys->l_hdr.lh_nentries--;
+       zap_leaf_phys(l)->l_hdr.lh_nentries--;
 }
 
 int
@@ -593,7 +601,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
 
        if (cd == ZAP_NEED_CD) {
                /* find the lowest unused cd */
-               if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
+               if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
                        cd = 0;
 
                        for (chunk = *LEAF_HASH_ENTPTR(l, h);
@@ -629,7 +637,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
                ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
        }
 
-       if (l->l_phys->l_hdr.lh_nfree < numchunks)
+       if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks)
                return (SET_ERROR(EAGAIN));
 
        /* make the entry */
@@ -650,7 +658,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
        /* XXX if we did the search above, we could just use that */
        chunkp = zap_leaf_rehash_entry(l, chunk);
 
-       l->l_phys->l_hdr.lh_nentries++;
+       zap_leaf_phys(l)->l_hdr.lh_nentries++;
 
        zeh->zeh_leaf = l;
        zeh->zeh_num_integers = num_integers;
@@ -784,8 +792,8 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
 
        zap_leaf_chunk_free(l, entry);
 
-       l->l_phys->l_hdr.lh_nentries--;
-       nl->l_phys->l_hdr.lh_nentries++;
+       zap_leaf_phys(l)->l_hdr.lh_nentries--;
+       zap_leaf_phys(nl)->l_hdr.lh_nentries++;
 }
 
 /*
@@ -795,19 +803,22 @@ void
 zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
 {
        int i;
-       int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len;
+       int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 
        /* set new prefix and prefix_len */
-       l->l_phys->l_hdr.lh_prefix <<= 1;
-       l->l_phys->l_hdr.lh_prefix_len++;
-       nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1;
-       nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
+       zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
+       zap_leaf_phys(l)->l_hdr.lh_prefix_len++;
+       zap_leaf_phys(nl)->l_hdr.lh_prefix =
+           zap_leaf_phys(l)->l_hdr.lh_prefix | 1;
+       zap_leaf_phys(nl)->l_hdr.lh_prefix_len =
+           zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 
        /* break existing hash chains */
-       zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+       zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+           2*ZAP_LEAF_HASH_NUMENTRIES(l));
 
        if (sort)
-               l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+               zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
 
        /*
         * Transfer entries whose hash bit 'bit' is set to nl; rehash
@@ -835,25 +846,25 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
 {
        int i, n;
 
-       n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
-           l->l_phys->l_hdr.lh_prefix_len;
+       n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
+           zap_leaf_phys(l)->l_hdr.lh_prefix_len;
        n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
        zs->zs_leafs_with_2n_pointers[n]++;
 
 
-       n = l->l_phys->l_hdr.lh_nentries/5;
+       n = zap_leaf_phys(l)->l_hdr.lh_nentries/5;
        n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
        zs->zs_blocks_with_n5_entries[n]++;
 
        n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
-           l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
+           zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
            (1<<FZAP_BLOCK_SHIFT(zap));
        n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
        zs->zs_blocks_n_tenths_full[n]++;
 
        for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
                int nentries = 0;
-               int chunk = l->l_phys->l_hash[i];
+               int chunk = zap_leaf_phys(l)->l_hash[i];
 
                while (chunk != CHAIN_END) {
                        struct zap_leaf_entry *le =
index dfa7c661565991c070998ad2fc143d0334d84afc..29406e660c5b733514dc417ea7d51b8bceacf34a 100644 (file)
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/zio.h>
 #include <sys/zap_leaf.h>
 #include <sys/avl.h>
 #include <sys/arc.h>
+#include <sys/dmu_objset.h>
 
 #ifdef _KERNEL
 #include <sys/sunddi.h>
 #endif
 
+extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
+
 static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
 
 uint64_t
@@ -45,7 +49,7 @@ zap_getflags(zap_t *zap)
 {
        if (zap->zap_ismicro)
                return (0);
-       return (zap->zap_u.zap_fat.zap_phys->zap_flags);
+       return (zap_f_phys(zap)->zap_flags);
 }
 
 int
@@ -384,7 +388,8 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
         * it, because zap_lockdir() checks zap_ismicro without the lock
         * held.
         */
-       winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict);
+       dmu_buf_init_user(&zap->zap_dbu, zap_evict, &zap->zap_dbuf);
+       winner = dmu_buf_set_user(db, &zap->zap_dbu);
 
        if (winner != NULL) {
                rw_exit(&zap->zap_rwlock);
@@ -396,15 +401,15 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
        }
 
        if (zap->zap_ismicro) {
-               zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
-               zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags;
+               zap->zap_salt = zap_m_phys(zap)->mz_salt;
+               zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
                zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
                avl_create(&zap->zap_m.zap_avl, mze_compare,
                    sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
 
                for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
                        mzap_ent_phys_t *mze =
-                           &zap->zap_m.zap_phys->mz_chunk[i];
+                           &zap_m_phys(zap)->mz_chunk[i];
                        if (mze->mze_name[0]) {
                                zap_name_t *zn;
 
@@ -416,8 +421,8 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
                        }
                }
        } else {
-               zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
-               zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags;
+               zap->zap_salt = zap_f_phys(zap)->zap_salt;
+               zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
 
                ASSERT3U(sizeof (struct zap_leaf_header), ==,
                    2*ZAP_LEAF_CHUNKSIZE);
@@ -427,7 +432,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
                 * other members.
                 */
                ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
-                   &zap->zap_f.zap_phys->zap_salt);
+                   &zap_f_phys(zap)->zap_salt);
 
                /*
                 * The embedded pointer table should end at the end of
@@ -435,7 +440,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
                 */
                ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
                    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
-                   (uintptr_t)zap->zap_f.zap_phys, ==,
+                   (uintptr_t)zap_f_phys(zap), ==,
                    zap->zap_dbuf->db_size);
        }
        rw_exit(&zap->zap_rwlock);
@@ -479,7 +484,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
                /* it was upgraded, now we only need reader */
                ASSERT(lt == RW_WRITER);
                ASSERT(RW_READER ==
-                   (!zap->zap_ismicro && fatreader) ? RW_READER : lti);
+                   ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
                rw_downgrade(&zap->zap_rwlock);
                lt = RW_READER;
        }
@@ -650,9 +655,9 @@ zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
        uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
 
        ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
-           leaf_blockshift <= SPA_MAXBLOCKSHIFT &&
+           leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
            indirect_blockshift >= SPA_MINBLOCKSHIFT &&
-           indirect_blockshift <= SPA_MAXBLOCKSHIFT);
+           indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
 
        VERIFY(dmu_object_set_blocksize(os, obj,
            1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
@@ -673,11 +678,10 @@ zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
        return (dmu_object_free(os, zapobj, tx));
 }
 
-_NOTE(ARGSUSED(0))
 void
-zap_evict(dmu_buf_t *db, void *vzap)
+zap_evict(void *dbu)
 {
-       zap_t *zap = vzap;
+       zap_t *zap = dbu;
 
        rw_destroy(&zap->zap_rwlock);
 
@@ -936,7 +940,7 @@ mzap_addent(zap_name_t *zn, uint64_t value)
 #ifdef ZFS_DEBUG
        for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
                ASSERTV(mzap_ent_phys_t *mze);
-               ASSERT(mze = &zap->zap_m.zap_phys->mz_chunk[i]);
+               ASSERT(mze = &zap_m_phys(zap)->mz_chunk[i]);
                ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
        }
 #endif
@@ -947,7 +951,7 @@ mzap_addent(zap_name_t *zn, uint64_t value)
 
 again:
        for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
-               mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+               mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
                if (mze->mze_name[0] == 0) {
                        mze->mze_value = value;
                        mze->mze_cd = cd;
@@ -1149,7 +1153,7 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
                        err = SET_ERROR(ENOENT);
                } else {
                        zap->zap_m.zap_num_entries--;
-                       bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
+                       bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid],
                            sizeof (mzap_ent_phys_t));
                        mze_remove(zap, mze);
                }
@@ -1314,46 +1318,6 @@ zap_cursor_advance(zap_cursor_t *zc)
        zc->zc_cd++;
 }
 
-int
-zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt)
-{
-       int err = 0;
-       mzap_ent_t *mze;
-       zap_name_t *zn;
-
-       if (zc->zc_zap == NULL) {
-               err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
-                   RW_READER, TRUE, FALSE, &zc->zc_zap);
-               if (err)
-                       return (err);
-       } else {
-               rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
-       }
-
-       zn = zap_name_alloc(zc->zc_zap, name, mt);
-       if (zn == NULL) {
-               rw_exit(&zc->zc_zap->zap_rwlock);
-               return (SET_ERROR(ENOTSUP));
-       }
-
-       if (!zc->zc_zap->zap_ismicro) {
-               err = fzap_cursor_move_to_key(zc, zn);
-       } else {
-               mze = mze_find(zn);
-               if (mze == NULL) {
-                       err = SET_ERROR(ENOENT);
-                       goto out;
-               }
-               zc->zc_hash = mze->mze_hash;
-               zc->zc_cd = mze->mze_cd;
-       }
-
-out:
-       zap_name_free(zn);
-       rw_exit(&zc->zc_zap->zap_rwlock);
-       return (err);
-}
-
 int
 zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
 {
@@ -1384,7 +1348,6 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
        zap_t *zap;
        int err = 0;
 
-
        /*
         * Since, we don't have a name, we cannot figure out which blocks will
         * be affected in this operation. So, account for the worst case :
@@ -1397,7 +1360,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
         * large microzap results in a promotion to fatzap.
         */
        if (name == NULL) {
-               *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+               *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
                return (err);
        }
 
@@ -1421,7 +1384,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
                        /*
                         * We treat this case as similar to (name == NULL)
                         */
-                       *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+                       *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
                }
        } else {
                /*
@@ -1440,12 +1403,12 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
                 *                      ptrtbl blocks
                 */
                if (dmu_buf_freeable(zap->zap_dbuf))
-                       *tooverwrite += SPA_MAXBLOCKSIZE;
+                       *tooverwrite += MZAP_MAX_BLKSZ;
                else
-                       *towrite += SPA_MAXBLOCKSIZE;
+                       *towrite += MZAP_MAX_BLKSZ;
 
                if (add) {
-                       *towrite += 4 * SPA_MAXBLOCKSIZE;
+                       *towrite += 4 * MZAP_MAX_BLKSZ;
                }
        }
 
@@ -1491,7 +1454,6 @@ EXPORT_SYMBOL(zap_cursor_fini);
 EXPORT_SYMBOL(zap_cursor_retrieve);
 EXPORT_SYMBOL(zap_cursor_advance);
 EXPORT_SYMBOL(zap_cursor_serialize);
-EXPORT_SYMBOL(zap_cursor_move_to_key);
 EXPORT_SYMBOL(zap_cursor_init_serialized);
 EXPORT_SYMBOL(zap_get_stats);
 #endif
index a901448220b6d9de99c8f0d44726c3bb0049cc45..609a72ab301a220e55aef68f53285b1dde433e0e 100644 (file)
@@ -56,7 +56,8 @@ valid_char(char c, boolean_t after_colon)
 {
        return ((c >= 'a' && c <= 'z') ||
            (c >= '0' && c <= '9') ||
-           c == (after_colon ? '_' : '.'));
+           (after_colon && c == '_') ||
+           (!after_colon && (c == '.' || c == '-')));
 }
 
 /*
@@ -215,8 +216,30 @@ zpool_feature_init(void)
            B_TRUE, B_FALSE, B_FALSE, bookmarks_deps);
        }
 
+       {
+       static const spa_feature_t filesystem_limits_deps[] = {
+           SPA_FEATURE_EXTENSIBLE_DATASET,
+           SPA_FEATURE_NONE
+       };
+       zfeature_register(SPA_FEATURE_FS_SS_LIMIT,
+           "com.joyent:filesystem_limits", "filesystem_limits",
+           "Filesystem and snapshot limits.", B_TRUE, B_FALSE, B_FALSE,
+           filesystem_limits_deps);
+       }
+
        zfeature_register(SPA_FEATURE_EMBEDDED_DATA,
            "com.delphix:embedded_data", "embedded_data",
            "Blocks which compress very well use even less space.",
            B_FALSE, B_TRUE, B_TRUE, NULL);
+
+       {
+       static const spa_feature_t large_blocks_deps[] = {
+               SPA_FEATURE_EXTENSIBLE_DATASET,
+               SPA_FEATURE_NONE
+       };
+       zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
+           "org.open-zfs:large_blocks", "large_blocks",
+           "Support for blocks larger than 128KB.", B_FALSE, B_FALSE, B_FALSE,
+           large_blocks_deps);
+       }
 }
index 59405de82b00f1831d4944e7f9de23f8fc29ee9d..b70eb66239489751383bd32b3f5aa852785d2290 100644 (file)
 #include <sys/zpl.h>
 #include "zfs_namecheck.h"
 
+/*
+ * Two AVL trees are maintained which contain all currently automounted
+ * snapshots.  Every automounted snapshot maps to a single zfs_snapentry_t
+ * entry which MUST:
+ *
+ *   - be attached to both trees, and
+ *   - be unique, no duplicate entries are allowed.
+ *
+ * The zfs_snapshots_by_name tree is indexed by the full dataset name
+ * while the zfs_snapshots_by_objsetid tree is indexed by the unique
+ * objsetid.  This allows for fast lookups either by name or objsetid.
+ */
+static avl_tree_t zfs_snapshots_by_name;
+static avl_tree_t zfs_snapshots_by_objsetid;
+static kmutex_t zfs_snapshot_lock;
+
 /*
  * Control Directory Tunables (.zfs)
  */
 int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT;
+int zfs_admin_snapshot = 0;
 
 /*
  * Dedicated task queue for unmounting snapshots.
  */
 static taskq_t *zfs_expire_taskq;
 
+typedef struct {
+       char            *se_name;       /* full snapshot name */
+       char            *se_path;       /* full mount path */
+       uint64_t        se_objsetid;    /* snapshot objset id */
+       struct dentry   *se_root_dentry; /* snapshot root dentry */
+       taskqid_t       se_taskqid;     /* scheduled unmount taskqid */
+       avl_node_t      se_node_name;   /* zfs_snapshots_by_name link */
+       avl_node_t      se_node_objsetid; /* zfs_snapshots_by_objsetid link */
+       refcount_t      se_refcount;    /* reference count */
+} zfs_snapentry_t;
+
+static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay);
+
+/*
+ * Allocate a new zfs_snapentry_t, being careful to make a copy of the
+ * snapshot name and provided mount point.  No reference is taken.
+ */
 static zfs_snapentry_t *
-zfsctl_sep_alloc(void)
+zfsctl_snapshot_alloc(char *full_name, char *full_path, uint64_t objsetid,
+    struct dentry *root_dentry)
 {
-       return (kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP));
+       zfs_snapentry_t *se;
+
+       se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP);
+
+       se->se_name = strdup(full_name);
+       se->se_path = strdup(full_path);
+       se->se_objsetid = objsetid;
+       se->se_root_dentry = root_dentry;
+       se->se_taskqid = -1;
+
+       refcount_create(&se->se_refcount);
+
+       return (se);
 }
 
-void
-zfsctl_sep_free(zfs_snapentry_t *sep)
+/*
+ * Free a zfs_snapentry_t; the caller must ensure there are no active
+ * references.
+ */
+static void
+zfsctl_snapshot_free(zfs_snapentry_t *se)
 {
-       kmem_free(sep->se_name, MAXNAMELEN);
-       kmem_free(sep->se_path, PATH_MAX);
-       kmem_free(sep, sizeof (zfs_snapentry_t));
+       refcount_destroy(&se->se_refcount);
+       strfree(se->se_name);
+       strfree(se->se_path);
+
+       kmem_free(se, sizeof (zfs_snapentry_t));
 }
 
 /*
- * Attempt to expire an automounted snapshot, unmounts are attempted every
- * 'zfs_expire_snapshot' seconds until they succeed.  The work request is
- * responsible for rescheduling itself and freeing the zfs_expire_snapshot_t.
+ * Hold a reference on the zfs_snapentry_t.
  */
 static void
-zfsctl_expire_snapshot(void *data)
+zfsctl_snapshot_hold(zfs_snapentry_t *se)
 {
-       zfs_snapentry_t *sep = (zfs_snapentry_t *)data;
-       zfs_sb_t *zsb = ITOZSB(sep->se_inode);
-       int error;
+       refcount_add(&se->se_refcount, NULL);
+}
 
-       error = zfsctl_unmount_snapshot(zsb, sep->se_name, MNT_EXPIRE);
-       if (error == EBUSY)
-               sep->se_taskqid = taskq_dispatch_delay(zfs_expire_taskq,
-                   zfsctl_expire_snapshot, sep, TQ_SLEEP,
-                   ddi_get_lbolt() + zfs_expire_snapshot * HZ);
+/*
+ * Release a reference on the zfs_snapentry_t.  When the number of
+ * references drops to zero the structure will be freed.
+ */
+static void
+zfsctl_snapshot_rele(zfs_snapentry_t *se)
+{
+       if (refcount_remove(&se->se_refcount, NULL) == 0)
+               zfsctl_snapshot_free(se);
 }
 
-int
-snapentry_compare(const void *a, const void *b)
+/*
+ * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and
+ * zfs_snapshots_by_objsetid trees.  While the zfs_snapentry_t is part
+ * of the trees a reference is held.
+ */
+static void
+zfsctl_snapshot_add(zfs_snapentry_t *se)
+{
+       ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
+       refcount_add(&se->se_refcount, NULL);
+       avl_add(&zfs_snapshots_by_name, se);
+       avl_add(&zfs_snapshots_by_objsetid, se);
+}
+
+/*
+ * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and
+ * zfs_snapshots_by_objsetid trees.  Upon removal a reference is dropped,
+ * this can result in the structure being freed if that was the last
+ * remaining reference.
+ */
+static void
+zfsctl_snapshot_remove(zfs_snapentry_t *se)
+{
+       ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
+       avl_remove(&zfs_snapshots_by_name, se);
+       avl_remove(&zfs_snapshots_by_objsetid, se);
+       zfsctl_snapshot_rele(se);
+}
+
+/*
+ * Snapshot name comparison function for the zfs_snapshots_by_name.
+ */
+static int
+snapentry_compare_by_name(const void *a, const void *b)
 {
-       const zfs_snapentry_t *sa = a;
-       const zfs_snapentry_t *sb = b;
-       int ret = strcmp(sa->se_name, sb->se_name);
+       const zfs_snapentry_t *se_a = a;
+       const zfs_snapentry_t *se_b = b;
+       int ret;
+
+       ret = strcmp(se_a->se_name, se_b->se_name);
 
        if (ret < 0)
                return (-1);
@@ -145,12 +233,199 @@ snapentry_compare(const void *a, const void *b)
                return (0);
 }
 
+/*
+ * Objset id comparison function for the zfs_snapshots_by_objsetid tree.
+ */
+static int
+snapentry_compare_by_objsetid(const void *a, const void *b)
+{
+       const zfs_snapentry_t *se_a = a;
+       const zfs_snapentry_t *se_b = b;
+
+       if (se_a->se_objsetid < se_b->se_objsetid)
+               return (-1);
+       else if (se_a->se_objsetid > se_b->se_objsetid)
+               return (1);
+       else
+               return (0);
+}
+
+/*
+ * Find a zfs_snapentry_t in zfs_snapshots_by_name.  If the snapname
+ * is found a pointer to the zfs_snapentry_t is returned and a reference
+ * taken on the structure.  The caller is responsible for dropping the
+ * reference with zfsctl_snapshot_rele().  If the snapname is not found
+ * NULL will be returned.
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_find_by_name(char *snapname)
+{
+       zfs_snapentry_t *se, search;
+
+       ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
+
+       search.se_name = snapname;
+       se = avl_find(&zfs_snapshots_by_name, &search, NULL);
+       if (se)
+               refcount_add(&se->se_refcount, NULL);
+
+       return (se);
+}
+
+/*
+ * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id
+ * rather than the snapname.  In all other respects it behaves the same
+ * as zfsctl_snapshot_find_by_name().
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_find_by_objsetid(uint64_t objsetid)
+{
+       zfs_snapentry_t *se, search;
+
+       ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
+
+       search.se_objsetid = objsetid;
+       se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL);
+       if (se)
+               refcount_add(&se->se_refcount, NULL);
+
+       return (se);
+}
+
+/*
+ * Rename a zfs_snapentry_t in the zfs_snapshots_by_name.  The structure is
+ * removed, renamed, and added back to the new correct location in the tree.
+ */
+static int
+zfsctl_snapshot_rename(char *old_snapname, char *new_snapname)
+{
+       zfs_snapentry_t *se;
+
+       ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
+
+       se = zfsctl_snapshot_find_by_name(old_snapname);
+       if (se == NULL)
+               return (ENOENT);
+
+       zfsctl_snapshot_remove(se);
+       strfree(se->se_name);
+       se->se_name = strdup(new_snapname);
+       zfsctl_snapshot_add(se);
+       zfsctl_snapshot_rele(se);
+
+       return (0);
+}
+
+/*
+ * Delayed task responsible for unmounting an expired automounted snapshot.
+ */
+static void
+snapentry_expire(void *data)
+{
+       zfs_snapentry_t *se = (zfs_snapentry_t *)data;
+       uint64_t objsetid = se->se_objsetid;
+
+       se->se_taskqid = -1;
+       (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE);
+       zfsctl_snapshot_rele(se);
+
+       /*
+        * Reschedule the unmount if the zfs_snapentry_t wasn't removed.
+        * This can occur when the snapshot is busy.
+        */
+       mutex_enter(&zfs_snapshot_lock);
+       if ((se = zfsctl_snapshot_find_by_objsetid(objsetid)) != NULL) {
+               zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
+               zfsctl_snapshot_rele(se);
+       }
+       mutex_exit(&zfs_snapshot_lock);
+}
+
+/*
+ * Cancel an automatic unmount of a snapname.  This function is responsible
+ * for dropping the reference on the zfs_snapentry_t which was taken
+ * during dispatch.
+ */
+static void
+zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
+{
+       ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
+
+       if (taskq_cancel_id(zfs_expire_taskq, se->se_taskqid) == 0) {
+               se->se_taskqid = -1;
+               zfsctl_snapshot_rele(se);
+       }
+}
+
+/*
+ * Dispatch the unmount task for delayed handling with a hold protecting it.
+ */
+static void
+zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
+{
+       ASSERT3S(se->se_taskqid, ==, -1);
+
+       se->se_taskqid = taskq_dispatch_delay(zfs_expire_taskq,
+           snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
+       zfsctl_snapshot_hold(se);
+}
+
+/*
+ * Schedule an automatic unmount of objset id to occur in delay seconds from
+ * now.  Any previous delayed unmount will be cancelled in favor of the
+ * updated deadline.  zfsctl_snapshot_unmount_delay_impl() takes a reference
+ * which is held until the outstanding task is handled or cancelled.
+ */
+int
+zfsctl_snapshot_unmount_delay(uint64_t objsetid, int delay)
+{
+       zfs_snapentry_t *se;
+       int error = ENOENT;
+
+       mutex_enter(&zfs_snapshot_lock);
+       if ((se = zfsctl_snapshot_find_by_objsetid(objsetid)) != NULL) {
+               zfsctl_snapshot_unmount_cancel(se);
+               zfsctl_snapshot_unmount_delay_impl(se, delay);
+               zfsctl_snapshot_rele(se);
+               error = 0;
+       }
+       mutex_exit(&zfs_snapshot_lock);
+
+       return (error);
+}
+
+/*
+ * Check if snapname is currently mounted.  Returns B_TRUE when mounted
+ * and B_FALSE when unmounted.
+ */
+static boolean_t
+zfsctl_snapshot_ismounted(char *snapname)
+{
+       zfs_snapentry_t *se;
+       boolean_t ismounted = B_FALSE;
+
+       mutex_enter(&zfs_snapshot_lock);
+       if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) {
+               zfsctl_snapshot_rele(se);
+               ismounted = B_TRUE;
+       }
+       mutex_exit(&zfs_snapshot_lock);
+
+       return (ismounted);
+}
+
+/*
+ * Check if the given inode is a part of the virtual .zfs directory.
+ */
 boolean_t
 zfsctl_is_node(struct inode *ip)
 {
        return (ITOZ(ip)->z_is_ctldir);
 }
 
+/*
+ * Check if the given inode is a .zfs/snapshot/snapname directory.
+ */
 boolean_t
 zfsctl_is_snapdir(struct inode *ip)
 {
@@ -249,24 +524,6 @@ zfsctl_inode_lookup(zfs_sb_t *zsb, uint64_t id,
        return (ip);
 }
 
-/*
- * Free zfsctl inode specific structures, currently there are none.
- */
-void
-zfsctl_inode_destroy(struct inode *ip)
-{
-}
-
-/*
- * An inode is being evicted from the cache.
- */
-void
-zfsctl_inode_inactive(struct inode *ip)
-{
-       if (zfsctl_is_snapdir(ip))
-               zfsctl_snapdir_inactive(ip);
-}
-
 /*
  * Create the '.zfs' directory.  This directory is cached as part of the VFS
  * structure.  This results in a hold on the zfs_sb_t.  The code in zfs_umount()
@@ -295,13 +552,27 @@ zfsctl_create(zfs_sb_t *zsb)
 }
 
 /*
- * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
+ * Destroy the '.zfs' directory or remove a snapshot from the snapshot trees.
+ * Only called when the filesystem is unmounted.
  */
 void
 zfsctl_destroy(zfs_sb_t *zsb)
 {
-       iput(zsb->z_ctldir);
-       zsb->z_ctldir = NULL;
+       if (zsb->z_issnap) {
+               zfs_snapentry_t *se;
+               uint64_t objsetid = dmu_objset_id(zsb->z_os);
+
+               mutex_enter(&zfs_snapshot_lock);
+               if ((se = zfsctl_snapshot_find_by_objsetid(objsetid)) != NULL) {
+                       zfsctl_snapshot_unmount_cancel(se);
+                       zfsctl_snapshot_remove(se);
+                       zfsctl_snapshot_rele(se);
+               }
+               mutex_exit(&zfs_snapshot_lock);
+       } else if (zsb->z_ctldir) {
+               iput(zsb->z_ctldir);
+               zsb->z_ctldir = NULL;
+       }
 }
 
 /*
@@ -315,8 +586,45 @@ zfsctl_root(znode_t *zp)
        igrab(ZTOZSB(zp)->z_ctldir);
        return (ZTOZSB(zp)->z_ctldir);
 }
+/*
+ * Generate a long fid which includes the root object and objset of a
+ * snapshot but not the generation number.  For the root object the
+ * generation number is ignored when zero to avoid needing to open
+ * the dataset when generating fids for the snapshot names.
+ */
+static int
+zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp)
+{
+       zfs_sb_t *zsb = ITOZSB(ip);
+       zfid_short_t *zfid = (zfid_short_t *)fidp;
+       zfid_long_t *zlfid = (zfid_long_t *)fidp;
+       uint32_t gen = 0;
+       uint64_t object;
+       uint64_t objsetid;
+       int i;
+
+       object = zsb->z_root;
+       objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino;
+       zfid->zf_len = LONG_FID_LEN;
+
+       for (i = 0; i < sizeof (zfid->zf_object); i++)
+               zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+       for (i = 0; i < sizeof (zfid->zf_gen); i++)
+               zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+       for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+               zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+
+       for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+               zlfid->zf_setgen[i] = 0;
 
-/*ARGSUSED*/
+       return (0);
+}
+
+/*
+ * Generate an appropriate fid for an entry in the .zfs directory.
+ */
 int
 zfsctl_fid(struct inode *ip, fid_t *fidp)
 {
@@ -334,6 +642,11 @@ zfsctl_fid(struct inode *ip, fid_t *fidp)
                return (SET_ERROR(ENOSPC));
        }
 
+       if (zfsctl_is_snapdir(ip)) {
+               ZFS_EXIT(zsb);
+               return (zfsctl_snapdir_fid(ip, fidp));
+       }
+
        zfid = (zfid_short_t *)fidp;
 
        zfid->zf_len = SHORT_FID_LEN;
@@ -349,31 +662,33 @@ zfsctl_fid(struct inode *ip, fid_t *fidp)
        return (0);
 }
 
+/*
+ * Construct a full dataset name in full_name: "pool/dataset@snap_name"
+ */
 static int
-zfsctl_snapshot_zname(struct inode *ip, const char *name, int len, char *zname)
+zfsctl_snapshot_name(zfs_sb_t *zsb, const char *snap_name, int len,
+    char *full_name)
 {
-       objset_t *os = ITOZSB(ip)->z_os;
+       objset_t *os = zsb->z_os;
 
-       if (zfs_component_namecheck(name, NULL, NULL) != 0)
+       if (zfs_component_namecheck(snap_name, NULL, NULL) != 0)
                return (SET_ERROR(EILSEQ));
 
-       dmu_objset_name(os, zname);
-       if ((strlen(zname) + 1 + strlen(name)) >= len)
+       dmu_objset_name(os, full_name);
+       if ((strlen(full_name) + 1 + strlen(snap_name)) >= len)
                return (SET_ERROR(ENAMETOOLONG));
 
-       (void) strcat(zname, "@");
-       (void) strcat(zname, name);
+       (void) strcat(full_name, "@");
+       (void) strcat(full_name, snap_name);
 
        return (0);
 }
 
 /*
- * Gets the full dataset name that corresponds to the given snapshot name
- * Example:
- *     zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1"
+ * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/"
  */
 static int
-zfsctl_snapshot_zpath(struct path *path, int len, char *zpath)
+zfsctl_snapshot_path(struct path *path, int len, char *full_path)
 {
        char *path_buffer, *path_ptr;
        int path_len, error = 0;
@@ -392,18 +707,59 @@ zfsctl_snapshot_zpath(struct path *path, int len, char *zpath)
                goto out;
        }
 
-       memcpy(zpath, path_ptr, path_len);
-       zpath[path_len] = '\0';
+       memcpy(full_path, path_ptr, path_len);
+       full_path[path_len] = '\0';
 out:
        kmem_free(path_buffer, len);
 
        return (error);
 }
 
+/*
+ * Sets full_path for objsetid: "<z_mntpoint>/.zfs/snapshot/<snap_name>"
+ */
+static int
+zfsctl_snapshot_path_objset(zfs_sb_t *zsb, uint64_t objsetid,
+    int path_len, char *full_path)
+{
+       objset_t *os = zsb->z_os;
+       fstrans_cookie_t cookie;
+       char *snapname;
+       boolean_t case_conflict;
+       uint64_t id, pos = 0;
+       int error = 0;
+
+       if (zsb->z_mntopts->z_mntpoint == NULL)
+               return (ENOENT);
+
+       cookie = spl_fstrans_mark();
+       snapname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+
+       while (error == 0) {
+               dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+               error = dmu_snapshot_list_next(zsb->z_os, MAXNAMELEN,
+                   snapname, &id, &pos, &case_conflict);
+               dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+               if (error)
+                       goto out;
+
+               if (id == objsetid)
+                       break;
+       }
+
+       memset(full_path, 0, path_len);
+       snprintf(full_path, path_len - 1, "%s/.zfs/snapshot/%s",
+           zsb->z_mntopts->z_mntpoint, snapname);
+out:
+       kmem_free(snapname, MAXNAMELEN);
+       spl_fstrans_unmark(cookie);
+
+       return (error);
+}
+
 /*
  * Special case the handling of "..".
  */
-/* ARGSUSED */
 int
 zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp,
     int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
@@ -438,7 +794,6 @@ zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp,
  * snapshot if it exist, creating the pseudo filesystem inode as necessary.
  * Perform a mount of the associated dataset on top of the inode.
  */
-/* ARGSUSED */
 int
 zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp,
     int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
@@ -457,52 +812,30 @@ zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp,
 
        *ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SNAPDIRS - id,
            &simple_dir_operations, &simple_dir_inode_operations);
-       if (*ipp) {
-#ifdef HAVE_AUTOMOUNT
-               (*ipp)->i_flags |= S_AUTOMOUNT;
-#endif /* HAVE_AUTOMOUNT */
-       } else {
+       if (*ipp == NULL)
                error = SET_ERROR(ENOENT);
-       }
 
        ZFS_EXIT(zsb);
 
        return (error);
 }
 
-static void
-zfsctl_rename_snap(zfs_sb_t *zsb, zfs_snapentry_t *sep, const char *name)
-{
-       avl_index_t where;
-
-       ASSERT(MUTEX_HELD(&zsb->z_ctldir_lock));
-       ASSERT(sep != NULL);
-
-       /*
-        * Change the name in the AVL tree.
-        */
-       avl_remove(&zsb->z_ctldir_snaps, sep);
-       (void) strcpy(sep->se_name, name);
-       VERIFY(avl_find(&zsb->z_ctldir_snaps, sep, &where) == NULL);
-       avl_insert(&zsb->z_ctldir_snaps, sep, where);
-}
-
 /*
  * Renaming a directory under '.zfs/snapshot' will automatically trigger
  * a rename of the snapshot to the new given name.  The rename is confined
  * to the '.zfs/snapshot' directory snapshots cannot be moved elsewhere.
  */
-/*ARGSUSED*/
 int
 zfsctl_snapdir_rename(struct inode *sdip, char *snm,
     struct inode *tdip, char *tnm, cred_t *cr, int flags)
 {
        zfs_sb_t *zsb = ITOZSB(sdip);
-       zfs_snapentry_t search, *sep;
-       avl_index_t where;
        char *to, *from, *real, *fsname;
        int error;
 
+       if (!zfs_admin_snapshot)
+               return (EACCES);
+
        ZFS_ENTER(zsb);
 
        to = kmem_alloc(MAXNAMELEN, KM_SLEEP);
@@ -522,9 +855,9 @@ zfsctl_snapdir_rename(struct inode *sdip, char *snm,
 
        dmu_objset_name(zsb->z_os, fsname);
 
-       error = zfsctl_snapshot_zname(sdip, snm, MAXNAMELEN, from);
+       error = zfsctl_snapshot_name(ITOZSB(sdip), snm, MAXNAMELEN, from);
        if (error == 0)
-               error = zfsctl_snapshot_zname(tdip, tnm, MAXNAMELEN, to);
+               error = zfsctl_snapshot_name(ITOZSB(tdip), tnm, MAXNAMELEN, to);
        if (error == 0)
                error = zfs_secpolicy_rename_perms(from, to, cr);
        if (error != 0)
@@ -546,19 +879,13 @@ zfsctl_snapdir_rename(struct inode *sdip, char *snm,
                goto out;
        }
 
-       mutex_enter(&zsb->z_ctldir_lock);
+       mutex_enter(&zfs_snapshot_lock);
 
        error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE);
-       if (error)
-               goto out_unlock;
-
-       search.se_name = (char *)snm;
-       sep = avl_find(&zsb->z_ctldir_snaps, &search, &where);
-       if (sep)
-               zfsctl_rename_snap(zsb, sep, tnm);
+       if (error == 0)
+               (void) zfsctl_snapshot_rename(snm, tnm);
 
-out_unlock:
-       mutex_exit(&zsb->z_ctldir_lock);
+       mutex_exit(&zfs_snapshot_lock);
 out:
        kmem_free(from, MAXNAMELEN);
        kmem_free(to, MAXNAMELEN);
@@ -574,7 +901,6 @@ out:
  * Removing a directory under '.zfs/snapshot' will automatically trigger
  * the removal of the snapshot with the given name.
  */
-/* ARGSUSED */
 int
 zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags)
 {
@@ -582,6 +908,9 @@ zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags)
        char *snapname, *real;
        int error;
 
+       if (!zfs_admin_snapshot)
+               return (EACCES);
+
        ZFS_ENTER(zsb);
 
        snapname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
@@ -597,13 +926,13 @@ zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags)
                }
        }
 
-       error = zfsctl_snapshot_zname(dip, name, MAXNAMELEN, snapname);
+       error = zfsctl_snapshot_name(ITOZSB(dip), name, MAXNAMELEN, snapname);
        if (error == 0)
                error = zfs_secpolicy_destroy_perms(snapname, cr);
        if (error != 0)
                goto out;
 
-       error = zfsctl_unmount_snapshot(zsb, name, MNT_FORCE);
+       error = zfsctl_snapshot_unmount(snapname, MNT_FORCE);
        if ((error == 0) || (error == ENOENT))
                error = dsl_destroy_snapshot(snapname, B_FALSE);
 out:
@@ -619,7 +948,6 @@ out:
  * Creating a directory under '.zfs/snapshot' will automatically trigger
  * the creation of a new snapshot with the given name.
  */
-/* ARGSUSED */
 int
 zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap,
        struct inode **ipp, cred_t *cr, int flags)
@@ -628,6 +956,9 @@ zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap,
        char *dsname;
        int error;
 
+       if (!zfs_admin_snapshot)
+               return (EACCES);
+
        dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
 
        if (zfs_component_namecheck(dirname, NULL, NULL) != 0) {
@@ -655,36 +986,6 @@ out:
        return (error);
 }
 
-/*
- * When a .zfs/snapshot/<snapshot> inode is evicted they must be removed
- * from the snapshot list.  This will normally happen as part of the auto
- * unmount, however in the case of a manual snapshot unmount this will be
- * the only notification we receive.
- */
-void
-zfsctl_snapdir_inactive(struct inode *ip)
-{
-       zfs_sb_t *zsb = ITOZSB(ip);
-       zfs_snapentry_t *sep, *next;
-
-       mutex_enter(&zsb->z_ctldir_lock);
-
-       sep = avl_first(&zsb->z_ctldir_snaps);
-       while (sep != NULL) {
-               next = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
-
-               if (sep->se_inode == ip) {
-                       avl_remove(&zsb->z_ctldir_snaps, sep);
-                       taskq_cancel_id(zfs_expire_taskq, sep->se_taskqid);
-                       zfsctl_sep_free(sep);
-                       break;
-               }
-               sep = next;
-       }
-
-       mutex_exit(&zsb->z_ctldir_lock);
-}
-
 /*
  * Attempt to unmount a snapshot by making a call to user space.
  * There is no assurance that this can or will succeed, is just a
@@ -697,18 +998,29 @@ zfsctl_snapdir_inactive(struct inode *ip)
        "     2>/dev/null; " \
        "umount -t zfs -n %s'%s'"
 
-static int
-__zfsctl_unmount_snapshot(zfs_snapentry_t *sep, int flags)
+int
+zfsctl_snapshot_unmount(char *snapname, int flags)
 {
        char *argv[] = { "/bin/sh", "-c", NULL, NULL };
        char *envp[] = { NULL };
+       zfs_snapentry_t *se;
        int error;
 
+       mutex_enter(&zfs_snapshot_lock);
+       if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) {
+               mutex_exit(&zfs_snapshot_lock);
+               return (ENOENT);
+       }
+       mutex_exit(&zfs_snapshot_lock);
+
        argv[2] = kmem_asprintf(SET_UNMOUNT_CMD,
-           flags & MNT_FORCE ? "-f " : "", sep->se_path);
+           flags & MNT_FORCE ? "-f " : "", se->se_path);
+       dprintf("unmount; path=%s\n", se->se_path);
+       zfsctl_snapshot_rele(se);       /* after last se->se_path use */
        error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
        strfree(argv[2]);
 
+
        /*
         * The umount system utility will return 256 on error.  We must
         * assume this error is because the file system is busy so it is
@@ -717,91 +1029,10 @@ __zfsctl_unmount_snapshot(zfs_snapentry_t *sep, int flags)
        if (error)
                error = SET_ERROR(EBUSY);
 
-       /*
-        * This was the result of a manual unmount, cancel the delayed work
-        * to prevent zfsctl_expire_snapshot() from attempting a unmount.
-        */
-       if ((error == 0) && !(flags & MNT_EXPIRE))
-               taskq_cancel_id(zfs_expire_taskq, sep->se_taskqid);
-
-
        return (error);
 }
 
-int
-zfsctl_unmount_snapshot(zfs_sb_t *zsb, char *name, int flags)
-{
-       zfs_snapentry_t search;
-       zfs_snapentry_t *sep;
-       int error = 0;
-
-       mutex_enter(&zsb->z_ctldir_lock);
-
-       search.se_name = name;
-       sep = avl_find(&zsb->z_ctldir_snaps, &search, NULL);
-       if (sep) {
-               avl_remove(&zsb->z_ctldir_snaps, sep);
-               mutex_exit(&zsb->z_ctldir_lock);
-
-               error = __zfsctl_unmount_snapshot(sep, flags);
-
-               mutex_enter(&zsb->z_ctldir_lock);
-               if (error == EBUSY)
-                       avl_add(&zsb->z_ctldir_snaps, sep);
-               else
-                       zfsctl_sep_free(sep);
-       } else {
-               error = SET_ERROR(ENOENT);
-       }
-
-       mutex_exit(&zsb->z_ctldir_lock);
-       ASSERT3S(error, >=, 0);
-
-       return (error);
-}
-
-/*
- * Traverse all mounted snapshots and attempt to unmount them.  This
- * is best effort, on failure EEXIST is returned and count will be set
- * to the number of file snapshots which could not be unmounted.
- */
-int
-zfsctl_unmount_snapshots(zfs_sb_t *zsb, int flags, int *count)
-{
-       zfs_snapentry_t *sep, *next;
-       int error = 0;
-
-       *count = 0;
-
-       ASSERT(zsb->z_ctldir != NULL);
-       mutex_enter(&zsb->z_ctldir_lock);
-
-       sep = avl_first(&zsb->z_ctldir_snaps);
-       while (sep != NULL) {
-               next = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
-               avl_remove(&zsb->z_ctldir_snaps, sep);
-               mutex_exit(&zsb->z_ctldir_lock);
-
-               error = __zfsctl_unmount_snapshot(sep, flags);
-
-               mutex_enter(&zsb->z_ctldir_lock);
-               if (error == EBUSY) {
-                       avl_add(&zsb->z_ctldir_snaps, sep);
-                       (*count)++;
-               } else {
-                       zfsctl_sep_free(sep);
-               }
-
-               sep = next;
-       }
-
-       mutex_exit(&zsb->z_ctldir_lock);
-
-       return ((*count > 0) ? EEXIST : 0);
-}
-
 #define        MOUNT_BUSY 0x80         /* Mount failed due to EBUSY (from mntent.h) */
-
 #define        SET_MOUNT_CMD \
        "exec 0</dev/null " \
        "     1>/dev/null " \
@@ -809,31 +1040,45 @@ zfsctl_unmount_snapshots(zfs_sb_t *zsb, int flags, int *count)
        "mount -t zfs -n '%s' '%s'"
 
 int
-zfsctl_mount_snapshot(struct path *path, int flags)
+zfsctl_snapshot_mount(struct path *path, int flags)
 {
        struct dentry *dentry = path->dentry;
        struct inode *ip = dentry->d_inode;
-       zfs_sb_t *zsb = ITOZSB(ip);
+       zfs_sb_t *zsb;
+       zfs_sb_t *snap_zsb;
+       zfs_snapentry_t *se;
        char *full_name, *full_path;
-       zfs_snapentry_t *sep;
-       zfs_snapentry_t search;
        char *argv[] = { "/bin/sh", "-c", NULL, NULL };
        char *envp[] = { NULL };
        int error;
 
+       if (ip == NULL)
+               return (EISDIR);
+
+       zsb = ITOZSB(ip);
        ZFS_ENTER(zsb);
 
        full_name = kmem_zalloc(MAXNAMELEN, KM_SLEEP);
-       full_path = kmem_zalloc(PATH_MAX, KM_SLEEP);
+       full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 
-       error = zfsctl_snapshot_zname(ip, dname(dentry), MAXNAMELEN, full_name);
+       error = zfsctl_snapshot_name(zsb, dname(dentry),
+           MAXNAMELEN, full_name);
        if (error)
                goto error;
 
-       error = zfsctl_snapshot_zpath(path, PATH_MAX, full_path);
+       error = zfsctl_snapshot_path(path, MAXPATHLEN, full_path);
        if (error)
                goto error;
 
+       /*
+        * Multiple concurrent automounts of a snapshot are never allowed.
+        * The snapshot may be manually mounted as many times as desired.
+        */
+       if (zfsctl_snapshot_ismounted(full_name)) {
+               error = SET_ERROR(EISDIR);
+               goto error;
+       }
+
        /*
         * Attempt to mount the snapshot from user space.  Normally this
         * would be done using the vfs_kern_mount() function, however that
@@ -846,48 +1091,38 @@ zfsctl_mount_snapshot(struct path *path, int flags)
         * Take note that if the program was executed successfully the return
         * value from call_usermodehelper() will be (exitcode << 8 + signal).
         */
+       dprintf("mount; name=%s path=%s\n", full_name, full_path);
        argv[2] = kmem_asprintf(SET_MOUNT_CMD, full_name, full_path);
        error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
        strfree(argv[2]);
        if (error && !(error & MOUNT_BUSY << 8)) {
-               printk("ZFS: Unable to automount %s at %s: %d\n",
-                   full_name, full_path, error);
+               cmn_err(CE_WARN, "Unable to automount %s/%s: %d",
+                   full_path, full_name, error);
                error = SET_ERROR(EISDIR);
                goto error;
        }
 
-       error = 0;
-       mutex_enter(&zsb->z_ctldir_lock);
-
        /*
-        * Ensure a previous entry does not exist, if it does safely remove
-        * it any cancel the outstanding expiration.  This can occur when a
-        * snapshot is manually unmounted and then an automount is triggered.
+        * Follow down into the mounted snapshot and set MNT_SHRINKABLE
+        * to identify this as an automounted filesystem.
         */
-       search.se_name = full_name;
-       sep = avl_find(&zsb->z_ctldir_snaps, &search, NULL);
-       if (sep) {
-               avl_remove(&zsb->z_ctldir_snaps, sep);
-               taskq_cancel_id(zfs_expire_taskq, sep->se_taskqid);
-               zfsctl_sep_free(sep);
-       }
-
-       sep = zfsctl_sep_alloc();
-       sep->se_name = full_name;
-       sep->se_path = full_path;
-       sep->se_inode = ip;
-       avl_add(&zsb->z_ctldir_snaps, sep);
-
-       sep->se_taskqid = taskq_dispatch_delay(zfs_expire_taskq,
-           zfsctl_expire_snapshot, sep, TQ_SLEEP,
-           ddi_get_lbolt() + zfs_expire_snapshot * HZ);
+       zpl_follow_down_one(path);
+       snap_zsb = ITOZSB(path->dentry->d_inode);
+       snap_zsb->z_parent = zsb;
+       dentry = path->dentry;
+       path->mnt->mnt_flags |= MNT_SHRINKABLE;
+       zpl_follow_up(path);
+       error = 0;
 
-       mutex_exit(&zsb->z_ctldir_lock);
+       mutex_enter(&zfs_snapshot_lock);
+       se = zfsctl_snapshot_alloc(full_name, full_path,
+           dmu_objset_id(snap_zsb->z_os), dentry);
+       zfsctl_snapshot_add(se);
+       zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
+       mutex_exit(&zfs_snapshot_lock);
 error:
-       if (error) {
-               kmem_free(full_name, MAXNAMELEN);
-               kmem_free(full_path, PATH_MAX);
-       }
+       kmem_free(full_name, MAXNAMELEN);
+       kmem_free(full_path, MAXPATHLEN);
 
        ZFS_EXIT(zsb);
 
@@ -895,82 +1130,71 @@ error:
 }
 
 /*
- * Check if this super block has a matching objset id.
+ * Given the objset id of the snapshot return its zfs_sb_t as zsbp.
  */
-static int
-zfsctl_test_super(struct super_block *sb, void *objsetidp)
-{
-       zfs_sb_t *zsb = sb->s_fs_info;
-       uint64_t objsetid = *(uint64_t *)objsetidp;
-
-       return (dmu_objset_id(zsb->z_os) == objsetid);
-}
-
-/*
- * Prevent a new super block from being allocated if an existing one
- * could not be located.  We only want to preform a lookup operation.
- */
-static int
-zfsctl_set_super(struct super_block *sb, void *objsetidp)
-{
-       return (-EEXIST);
-}
-
 int
 zfsctl_lookup_objset(struct super_block *sb, uint64_t objsetid, zfs_sb_t **zsbp)
 {
-       zfs_sb_t *zsb = sb->s_fs_info;
-       struct super_block *sbp;
-       zfs_snapentry_t *sep;
-       uint64_t id;
+       zfs_snapentry_t *se;
        int error;
 
-       ASSERT(zsb->z_ctldir != NULL);
-
-       mutex_enter(&zsb->z_ctldir_lock);
-
        /*
-        * Verify that the snapshot is mounted.
+        * Verify that the snapshot is mounted then lookup the mounted root
+        * rather than the covered mount point.  This may fail if the
+        * snapshot has just been unmounted by an unrelated user space
+        * process.  This race cannot occur to an expired mount point
+        * because we hold the zfs_snapshot_lock to prevent the race.
         */
-       sep = avl_first(&zsb->z_ctldir_snaps);
-       while (sep != NULL) {
-               error = dmu_snapshot_lookup(zsb->z_os, sep->se_name, &id);
-               if (error)
-                       goto out;
-
-               if (id == objsetid)
-                       break;
+       mutex_enter(&zfs_snapshot_lock);
+       if ((se = zfsctl_snapshot_find_by_objsetid(objsetid)) != NULL) {
+               zfs_sb_t *zsb;
+
+               zsb = ITOZSB(se->se_root_dentry->d_inode);
+               ASSERT3U(dmu_objset_id(zsb->z_os), ==, objsetid);
+
+               if (time_after(jiffies, zsb->z_snap_defer_time +
+                   MAX(zfs_expire_snapshot * HZ / 2, HZ))) {
+                       zsb->z_snap_defer_time = jiffies;
+                       zfsctl_snapshot_unmount_delay(objsetid,
+                           zfs_expire_snapshot);
+               }
 
-               sep = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
+               *zsbp = zsb;
+               zfsctl_snapshot_rele(se);
+               error = SET_ERROR(0);
+       } else {
+               error = SET_ERROR(ENOENT);
        }
+       mutex_exit(&zfs_snapshot_lock);
 
-       if (sep != NULL) {
-               /*
-                * Lookup the mounted root rather than the covered mount
-                * point.  This may fail if the snapshot has just been
-                * unmounted by an unrelated user space process.  This
-                * race cannot occur to an expired mount point because
-                * we hold the zsb->z_ctldir_lock to prevent the race.
-                */
-               sbp = zpl_sget(&zpl_fs_type, zfsctl_test_super,
-                   zfsctl_set_super, 0, &id);
-               if (IS_ERR(sbp)) {
-                       error = -PTR_ERR(sbp);
-               } else {
-                       *zsbp = sbp->s_fs_info;
-                       deactivate_super(sbp);
+       /*
+        * Automount the snapshot given the objset id by constructing the
+        * full mount point and performing a traversal.
+        */
+       if (error == ENOENT) {
+               struct path path;
+               char *mnt;
+
+               mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+               error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid,
+                   MAXPATHLEN, mnt);
+               if (error) {
+                       kmem_free(mnt, MAXPATHLEN);
+                       return (SET_ERROR(error));
                }
-       } else {
-               error = SET_ERROR(EINVAL);
+
+               error = kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path);
+               if (error == 0) {
+                       *zsbp = ITOZSB(path.dentry->d_inode);
+                       path_put(&path);
+               }
+
+               kmem_free(mnt, MAXPATHLEN);
        }
-out:
-       mutex_exit(&zsb->z_ctldir_lock);
-       ASSERT3S(error, >=, 0);
 
        return (error);
 }
 
-/* ARGSUSED */
 int
 zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
     int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
@@ -1009,7 +1233,15 @@ zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
 void
 zfsctl_init(void)
 {
-       zfs_expire_taskq = taskq_create("z_unmount", 1, maxclsyspri,
+       avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name,
+           sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
+           se_node_name));
+       avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid,
+           sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
+           se_node_objsetid));
+       mutex_init(&zfs_snapshot_lock, NULL, MUTEX_DEFAULT, NULL);
+
+       zfs_expire_taskq = taskq_create("z_unmount", 1, defclsyspri,
            1, 8, TASKQ_PREPOPULATE);
 }
 
@@ -1021,7 +1253,14 @@ void
 zfsctl_fini(void)
 {
        taskq_destroy(zfs_expire_taskq);
+
+       avl_destroy(&zfs_snapshots_by_name);
+       avl_destroy(&zfs_snapshots_by_objsetid);
+       mutex_destroy(&zfs_snapshot_lock);
 }
 
+module_param(zfs_admin_snapshot, int, 0644);
+MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot");
+
 module_param(zfs_expire_snapshot, int, 0644);
 MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot");
index e835397e9453bd15492fd84825afa05a4b641f7f..2770359c8b48e459d63c69faa003eb90b63aff04 100644 (file)
  */
 
 #include <sys/zfs_context.h>
+#include <sys/kstat.h>
 
 list_t zfs_dbgmsgs;
-int zfs_dbgmsg_size;
+int zfs_dbgmsg_size = 0;
 kmutex_t zfs_dbgmsgs_lock;
 int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
+kstat_t *zfs_dbgmsg_kstat;
+
+/*
+ * By default only enable the internal ZFS debug messages when running
+ * in userspace (ztest).  The kernel log must be manually enabled.
+ *
+ * # Enable the kernel debug message log.
+ * echo 1 > /sys/module/zfs/parameters/zfs_dbgmsg_enable
+ *
+ * # Clear the kernel debug message log.
+ * echo 0 >/proc/spl/kstat/zfs/dbgmsg
+ */
+#if defined(_KERNEL)
+int zfs_dbgmsg_enable = 0;
+#else
+int zfs_dbgmsg_enable = 1;
+#endif
+
+static int
+zfs_dbgmsg_headers(char *buf, size_t size)
+{
+       (void) snprintf(buf, size, "%-12s %-8s\n", "timestamp", "message");
+
+       return (0);
+}
+
+static int
+zfs_dbgmsg_data(char *buf, size_t size, void *data)
+{
+       zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)data;
+
+       (void) snprintf(buf, size, "%-12llu %-s\n",
+           (u_longlong_t) zdm->zdm_timestamp, zdm->zdm_msg);
+
+       return (0);
+}
+
+static void *
+zfs_dbgmsg_addr(kstat_t *ksp, loff_t n)
+{
+       zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)ksp->ks_private;
+
+       ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock));
+
+       if (n == 0)
+               ksp->ks_private = list_head(&zfs_dbgmsgs);
+       else if (zdm)
+               ksp->ks_private = list_next(&zfs_dbgmsgs, zdm);
+
+       return (ksp->ks_private);
+}
+
+static void
+zfs_dbgmsg_purge(int max_size)
+{
+       zfs_dbgmsg_t *zdm;
+       int size;
+
+       ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock));
+
+       while (zfs_dbgmsg_size > max_size) {
+               zdm = list_remove_head(&zfs_dbgmsgs);
+               if (zdm == NULL)
+                       return;
+
+               size = zdm->zdm_size;
+               kmem_free(zdm, size);
+               zfs_dbgmsg_size -= size;
+       }
+}
+
+static int
+zfs_dbgmsg_update(kstat_t *ksp, int rw)
+{
+       if (rw == KSTAT_WRITE)
+               zfs_dbgmsg_purge(0);
+
+       return (0);
+}
 
 void
 zfs_dbgmsg_init(void)
@@ -36,87 +116,124 @@ zfs_dbgmsg_init(void)
        list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t),
            offsetof(zfs_dbgmsg_t, zdm_node));
        mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL);
+
+       zfs_dbgmsg_kstat = kstat_create("zfs", 0, "dbgmsg", "misc",
+           KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+       if (zfs_dbgmsg_kstat) {
+               zfs_dbgmsg_kstat->ks_lock = &zfs_dbgmsgs_lock;
+               zfs_dbgmsg_kstat->ks_ndata = UINT32_MAX;
+               zfs_dbgmsg_kstat->ks_private = NULL;
+               zfs_dbgmsg_kstat->ks_update = zfs_dbgmsg_update;
+               kstat_set_raw_ops(zfs_dbgmsg_kstat, zfs_dbgmsg_headers,
+                   zfs_dbgmsg_data, zfs_dbgmsg_addr);
+               kstat_install(zfs_dbgmsg_kstat);
+       }
 }
 
 void
 zfs_dbgmsg_fini(void)
 {
-       zfs_dbgmsg_t *zdm;
+       if (zfs_dbgmsg_kstat)
+               kstat_delete(zfs_dbgmsg_kstat);
 
-       while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) {
-               int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg);
-               kmem_free(zdm, size);
-               zfs_dbgmsg_size -= size;
-       }
+       mutex_enter(&zfs_dbgmsgs_lock);
+       zfs_dbgmsg_purge(0);
+       mutex_exit(&zfs_dbgmsgs_lock);
        mutex_destroy(&zfs_dbgmsgs_lock);
-       ASSERT0(zfs_dbgmsg_size);
 }
 
-/*
- * To get this data enable the zfs__dbgmsg tracepoint as shown:
- *
- * # Enable zfs__dbgmsg tracepoint, clear the tracepoint ring buffer
- * $ echo 1 > /sys/kernel/debug/tracing/events/zfs/enable
- * $ echo 0 > /sys/kernel/debug/tracing/trace
- *
- * # Dump the ring buffer.
- * $ cat /sys/kernel/debug/tracing/trace
- */
 void
-zfs_dbgmsg(const char *fmt, ...)
+__zfs_dbgmsg(char *buf)
 {
+       zfs_dbgmsg_t *zdm;
        int size;
+
+       size = sizeof (zfs_dbgmsg_t) + strlen(buf);
+       zdm = kmem_zalloc(size, KM_SLEEP);
+       zdm->zdm_size = size;
+       zdm->zdm_timestamp = gethrestime_sec();
+       strcpy(zdm->zdm_msg, buf);
+
+       mutex_enter(&zfs_dbgmsgs_lock);
+       list_insert_tail(&zfs_dbgmsgs, zdm);
+       zfs_dbgmsg_size += size;
+       zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0));
+       mutex_exit(&zfs_dbgmsgs_lock);
+}
+
+#ifdef _KERNEL
+void
+__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
+{
+       const char *newfile;
        va_list adx;
+       size_t size;
+       char *buf;
        char *nl;
-       zfs_dbgmsg_t *zdm;
 
-       va_start(adx, fmt);
-       size = vsnprintf(NULL, 0, fmt, adx);
-       va_end(adx);
+       if (!zfs_dbgmsg_enable && !(zfs_flags & ZFS_DEBUG_DPRINTF))
+               return;
+
+       size = 1024;
+       buf = kmem_alloc(size, KM_SLEEP);
 
        /*
-        * There is one byte of string in sizeof (zfs_dbgmsg_t), used
-        * for the terminating null.
+        * Get rid of annoying prefix to filename.
         */
-       zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP);
-       zdm->zdm_timestamp = gethrestime_sec();
+       newfile = strrchr(file, '/');
+       if (newfile != NULL) {
+               newfile = newfile + 1; /* Get rid of leading / */
+       } else {
+               newfile = file;
+       }
 
        va_start(adx, fmt);
-       (void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx);
+       (void) vsnprintf(buf, size, fmt, adx);
        va_end(adx);
 
        /*
         * Get rid of trailing newline.
         */
-       nl = strrchr(zdm->zdm_msg, '\n');
+       nl = strrchr(buf, '\n');
        if (nl != NULL)
                *nl = '\0';
 
-       DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg);
+       /*
+        * To get this data enable the zfs__dprintf trace point as shown:
+        *
+        * # Enable zfs__dprintf tracepoint, clear the tracepoint ring buffer
+        * $ echo 1 > /sys/module/zfs/parameters/zfs_flags
+        * $ echo 1 > /sys/kernel/debug/tracing/events/zfs/enable
+        * $ echo 0 > /sys/kernel/debug/tracing/trace
+        *
+        * # Dump the ring buffer.
+        * $ cat /sys/kernel/debug/tracing/trace
+        */
+       if (zfs_flags & ZFS_DEBUG_DPRINTF)
+               DTRACE_PROBE4(zfs__dprintf,
+                   char *, newfile, char *, func, int, line, char *, buf);
 
-       mutex_enter(&zfs_dbgmsgs_lock);
-       list_insert_tail(&zfs_dbgmsgs, zdm);
-       zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size;
-       while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) {
-               zdm = list_remove_head(&zfs_dbgmsgs);
-               size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg);
-               kmem_free(zdm, size);
-               zfs_dbgmsg_size -= size;
-       }
-       mutex_exit(&zfs_dbgmsgs_lock);
+       /*
+        * To get this data enable the zfs debug log as shown:
+        *
+        * # Set zfs_dbgmsg enable, clear the log buffer
+        * $ echo 1 > /sys/module/zfs/parameters/zfs_dbgmsg_enable
+        * $ echo 0 > /proc/spl/kstat/zfs/dbgmsg
+        *
+        * # Dump the log buffer.
+        * $ cat /proc/spl/kstat/zfs/dbgmsg
+        */
+       if (zfs_dbgmsg_enable)
+               __zfs_dbgmsg(buf);
+
+       kmem_free(buf, size);
 }
+#endif /* _KERNEL */
 
-void
-zfs_dbgmsg_print(const char *tag)
-{
-#if !defined(_KERNEL)
-       zfs_dbgmsg_t *zdm;
+#ifdef _KERNEL
+module_param(zfs_dbgmsg_enable, int, 0644);
+MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log");
 
-       (void) printf("ZFS_DBGMSG(%s):\n", tag);
-       mutex_enter(&zfs_dbgmsgs_lock);
-       for (zdm = list_head(&zfs_dbgmsgs); zdm;
-           zdm = list_next(&zfs_dbgmsgs, zdm))
-               (void) printf("%s\n", zdm->zdm_msg);
-       mutex_exit(&zfs_dbgmsgs_lock);
-#endif /* !_KERNEL */
-}
+module_param(zfs_dbgmsg_maxsize, int, 0644);
+MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size");
+#endif
index f8190a519f1ffb0e15e31515034ca29d176ddfce..7ce19693e2d40fc596271dfddac407e1073df001 100644 (file)
@@ -25,8 +25,8 @@
  * Portions Copyright 2012 Pawel Jakub Dawidek <pawel@dawidek.net>
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
- * Copyright (c) 201i3 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
@@ -247,55 +247,6 @@ static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
 int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *);
 static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp);
 
-#if defined(HAVE_DECLARE_EVENT_CLASS)
-void
-__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
-{
-       const char *newfile;
-       size_t size = 4096;
-       char *buf = kmem_alloc(size, KM_SLEEP);
-       char *nl;
-       va_list adx;
-
-       /*
-        * Get rid of annoying prefix to filename.
-        */
-       newfile = strrchr(file, '/');
-       if (newfile != NULL) {
-               newfile = newfile + 1; /* Get rid of leading / */
-       } else {
-               newfile = file;
-       }
-
-       va_start(adx, fmt);
-       (void) vsnprintf(buf, size, fmt, adx);
-       va_end(adx);
-
-       /*
-        * Get rid of trailing newline.
-        */
-       nl = strrchr(buf, '\n');
-       if (nl != NULL)
-               *nl = '\0';
-
-       /*
-        * To get this data enable the zfs__dprintf trace point as shown:
-        *
-        * # Enable zfs__dprintf tracepoint, clear the tracepoint ring buffer
-        * $ echo 1 > /sys/module/zfs/parameters/zfs_flags
-        * $ echo 1 > /sys/kernel/debug/tracing/events/zfs/enable
-        * $ echo 0 > /sys/kernel/debug/tracing/trace
-        *
-        * # Dump the ring buffer.
-        * $ cat /sys/kernel/debug/tracing/trace
-        */
-       DTRACE_PROBE4(zfs__dprintf,
-           char *, newfile, char *, func, int, line, char *, buf);
-
-       kmem_free(buf, size);
-}
-#endif /* HAVE_DECLARE_EVENT_CLASS */
-
 static void
 history_str_free(char *buf)
 {
@@ -641,12 +592,14 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
                break;
 
        case ZFS_PROP_QUOTA:
+       case ZFS_PROP_FILESYSTEM_LIMIT:
+       case ZFS_PROP_SNAPSHOT_LIMIT:
                if (!INGLOBALZONE(curproc)) {
                        uint64_t zoned;
                        char setpoint[MAXNAMELEN];
                        /*
                         * Unprivileged users are allowed to modify the
-                        * quota on things *under* (ie. contained by)
+                        * limit on things *under* (ie. contained by)
                         * the thing they own.
                         */
                        if (dsl_prop_get_integer(dsname, "zoned", &zoned,
@@ -944,7 +897,7 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
                dd = clone->ds_dir;
 
                error = dsl_dataset_hold_obj(dd->dd_pool,
-                   dd->dd_phys->dd_origin_obj, FTAG, &origin);
+                   dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin);
                if (error != 0) {
                        dsl_dataset_rele(clone, FTAG);
                        dsl_pool_rele(dp, FTAG);
@@ -1332,7 +1285,7 @@ get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp)
        if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
            iflag)) != 0) {
                vmem_free(packed, size);
-               return (error);
+               return (SET_ERROR(EFAULT));
        }
 
        if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) {
@@ -1447,9 +1400,9 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
        int error = 0;
 
        if (get_zfs_sb(name, zsbp) != 0)
-               error = zfs_sb_create(name, zsbp);
+               error = zfs_sb_create(name, NULL, zsbp);
        if (error == 0) {
-               rrw_enter(&(*zsbp)->z_teardown_lock, (writer) ? RW_WRITER :
+               rrm_enter(&(*zsbp)->z_teardown_lock, (writer) ? RW_WRITER :
                    RW_READER, tag);
                if ((*zsbp)->z_unmounted) {
                        /*
@@ -1457,7 +1410,7 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
                         * thread should be just about to disassociate the
                         * objset from the zsb.
                         */
-                       rrw_exit(&(*zsbp)->z_teardown_lock, tag);
+                       rrm_exit(&(*zsbp)->z_teardown_lock, tag);
                        return (SET_ERROR(EBUSY));
                }
        }
@@ -1467,7 +1420,7 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
 static void
 zfs_sb_rele(zfs_sb_t *zsb, void *tag)
 {
-       rrw_exit(&zsb->z_teardown_lock, tag);
+       rrm_exit(&zsb->z_teardown_lock, tag);
 
        if (zsb->z_sb) {
                deactivate_super(zsb->z_sb);
@@ -2390,7 +2343,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
        const char *propname = nvpair_name(pair);
        zfs_prop_t prop = zfs_name_to_prop(propname);
        uint64_t intval;
-       int err;
+       int err = -1;
 
        if (prop == ZPROP_INVAL) {
                if (zfs_prop_userquota(propname))
@@ -2417,6 +2370,21 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
        case ZFS_PROP_REFQUOTA:
                err = dsl_dataset_set_refquota(dsname, source, intval);
                break;
+       case ZFS_PROP_FILESYSTEM_LIMIT:
+       case ZFS_PROP_SNAPSHOT_LIMIT:
+               if (intval == UINT64_MAX) {
+                       /* clearing the limit, just do it */
+                       err = 0;
+               } else {
+                       err = dsl_dir_activate_fs_ss_limit(dsname);
+               }
+               /*
+                * Set err to -1 to force the zfs_set_prop_nvlist code down the
+                * default path to set the value in the nvlist.
+                */
+               if (err == 0)
+                       err = -1;
+               break;
        case ZFS_PROP_RESERVATION:
                err = dsl_dir_set_reservation(dsname, source, intval);
                break;
@@ -3184,7 +3152,7 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
                        volblocksize = zfs_prop_default_numeric(
                            ZFS_PROP_VOLBLOCKSIZE);
 
-               if ((error = zvol_check_volblocksize(
+               if ((error = zvol_check_volblocksize(fsname,
                    volblocksize)) != 0 ||
                    (error = zvol_check_volsize(volsize,
                    volblocksize)) != 0)
@@ -3393,37 +3361,20 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
  * This function is best-effort.  Callers must deal gracefully if it
  * remains mounted (or is remounted after this call).
  *
- * XXX: This function should detect a failure to unmount a snapdir of a dataset
- * and return the appropriate error code when it is mounted. Its Illumos and
- * FreeBSD counterparts do this. We do not do this on Linux because there is no
- * clear way to access the mount information that FreeBSD and Illumos use to
- * distinguish between things with mounted snapshot directories, and things
- * without mounted snapshot directories, which include zvols. Returning a
- * failure for the latter causes `zfs destroy` to fail on zvol snapshots.
+ * Returns 0 if the argument is not a snapshot, or it is not currently a
+ * filesystem, or we were able to unmount it.  Returns error code otherwise.
  */
 int
 zfs_unmount_snap(const char *snapname)
 {
-       zfs_sb_t *zsb = NULL;
-       char *dsname;
-       char *fullname;
-       char *ptr;
+       int err;
 
-       if ((ptr = strchr(snapname, '@')) == NULL)
+       if (strchr(snapname, '@') == NULL)
                return (0);
 
-       dsname = kmem_alloc(ptr - snapname + 1, KM_SLEEP);
-       strlcpy(dsname, snapname, ptr - snapname + 1);
-       fullname = strdup(snapname);
-
-       if (zfs_sb_hold(dsname, FTAG, &zsb, B_FALSE) == 0) {
-               ASSERT(!dsl_pool_config_held(dmu_objset_pool(zsb->z_os)));
-               (void) zfsctl_unmount_snapshot(zsb, fullname, MNT_FORCE);
-               zfs_sb_rele(zsb, FTAG);
-       }
-
-       kmem_free(dsname, ptr - snapname + 1);
-       strfree(fullname);
+       err = zfsctl_snapshot_unmount((char *)snapname, MNT_FORCE);
+       if (err != 0 && err != ENOENT)
+               return (SET_ERROR(err));
 
        return (0);
 }
@@ -3773,8 +3724,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
                 * the SPA supports it. We ignore any errors here since
                 * we'll catch them later.
                 */
-               if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
-                   nvpair_value_uint64(pair, &intval) == 0) {
+               if (nvpair_value_uint64(pair, &intval) == 0) {
                        if (intval >= ZIO_COMPRESS_GZIP_1 &&
                            intval <= ZIO_COMPRESS_GZIP_9 &&
                            zfs_earlier_version(dsname,
@@ -3825,6 +3775,43 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
                        return (SET_ERROR(ENOTSUP));
                break;
 
+       case ZFS_PROP_VOLBLOCKSIZE:
+       case ZFS_PROP_RECORDSIZE:
+               /* Record sizes above 128k need the feature to be enabled */
+               if (nvpair_value_uint64(pair, &intval) == 0 &&
+                   intval > SPA_OLD_MAXBLOCKSIZE) {
+                       spa_t *spa;
+
+                       /*
+                        * If this is a bootable dataset then
+                        * we don't allow large (>128K) blocks,
+                        * because GRUB doesn't support them.
+                        */
+                       if (zfs_is_bootfs(dsname) &&
+                           intval > SPA_OLD_MAXBLOCKSIZE) {
+                               return (SET_ERROR(EDOM));
+                       }
+
+                       /*
+                        * We don't allow setting the property above 1MB,
+                        * unless the tunable has been changed.
+                        */
+                       if (intval > zfs_max_recordsize ||
+                           intval > SPA_MAXBLOCKSIZE)
+                               return (SET_ERROR(EDOM));
+
+                       if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+                               return (err);
+
+                       if (!spa_feature_is_enabled(spa,
+                           SPA_FEATURE_LARGE_BLOCKS)) {
+                               spa_close(spa, FTAG);
+                               return (SET_ERROR(ENOTSUP));
+                       }
+                       spa_close(spa, FTAG);
+               }
+               break;
+
        case ZFS_PROP_SHARESMB:
                if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
                        return (SET_ERROR(ENOTSUP));
@@ -4204,7 +4191,7 @@ out:
  * zc_fromobj  objsetid of incremental fromsnap (may be zero)
  * zc_guid     if set, estimate size of stream only.  zc_cookie is ignored.
  *             output size in zc_objset_type.
- * zc_flags    if =1, WRITE_EMBEDDED records are permitted
+ * zc_flags    lzc_send_flags
  *
  * outputs:
  * zc_objset_type      estimated size, if zc_guid is set
@@ -4216,6 +4203,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
        offset_t off;
        boolean_t estimate = (zc->zc_guid != 0);
        boolean_t embedok = (zc->zc_flags & 0x1);
+       boolean_t large_block_ok = (zc->zc_flags & 0x2);
 
        if (zc->zc_obj != 0) {
                dsl_pool_t *dp;
@@ -4232,7 +4220,8 @@ zfs_ioc_send(zfs_cmd_t *zc)
                }
 
                if (dsl_dir_is_clone(tosnap->ds_dir))
-                       zc->zc_fromobj = tosnap->ds_dir->dd_phys->dd_origin_obj;
+                       zc->zc_fromobj =
+                           dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj;
                dsl_dataset_rele(tosnap, FTAG);
                dsl_pool_rele(dp, FTAG);
        }
@@ -4276,7 +4265,8 @@ zfs_ioc_send(zfs_cmd_t *zc)
 
                off = fp->f_offset;
                error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
-                   zc->zc_fromobj, embedok, zc->zc_cookie, fp->f_vnode, &off);
+                   zc->zc_fromobj, embedok, large_block_ok,
+                   zc->zc_cookie, fp->f_vnode, &off);
 
                if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
                        fp->f_offset = off;
@@ -4669,8 +4659,7 @@ zfs_ioc_next_obj(zfs_cmd_t *zc)
        if (error != 0)
                return (error);
 
-       error = dmu_object_next(os, &zc->zc_obj, B_FALSE,
-           os->os_dsl_dataset->ds_phys->ds_prev_snap_txg);
+       error = dmu_object_next(os, &zc->zc_obj, B_FALSE, 0);
 
        dmu_objset_rele(os, FTAG);
        return (error);
@@ -5117,11 +5106,19 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
                return (error);
 
        error = dsl_dataset_hold(dp, lastsnap, FTAG, &new);
+       if (error == 0 && !new->ds_is_snapshot) {
+               dsl_dataset_rele(new, FTAG);
+               error = SET_ERROR(EINVAL);
+       }
        if (error != 0) {
                dsl_pool_rele(dp, FTAG);
                return (error);
        }
        error = dsl_dataset_hold(dp, firstsnap, FTAG, &old);
+       if (error == 0 && !old->ds_is_snapshot) {
+               dsl_dataset_rele(old, FTAG);
+               error = SET_ERROR(EINVAL);
+       }
        if (error != 0) {
                dsl_dataset_rele(new, FTAG);
                dsl_pool_rele(dp, FTAG);
@@ -5142,6 +5139,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
  * innvl: {
  *     "fd" -> file descriptor to write stream to (int32)
  *     (optional) "fromsnap" -> full snap name to send an incremental from
+ *     (optional) "largeblockok" -> (value ignored)
+ *         presence indicates that blocks > 128KB are permitted
  *     (optional) "embedok" -> (value ignored)
  *         presence indicates DRR_WRITE_EMBEDDED records are permitted
  * }
@@ -5157,6 +5156,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
        char *fromname = NULL;
        int fd;
        file_t *fp;
+       boolean_t largeblockok;
        boolean_t embedok;
 
        error = nvlist_lookup_int32(innvl, "fd", &fd);
@@ -5165,13 +5165,15 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
 
        (void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
 
+       largeblockok = nvlist_exists(innvl, "largeblockok");
        embedok = nvlist_exists(innvl, "embedok");
 
        if ((fp = getf(fd)) == NULL)
                return (SET_ERROR(EBADF));
 
        off = fp->f_offset;
-       error = dmu_send(snapname, fromname, embedok, fd, fp->f_vnode, &off);
+       error = dmu_send(snapname, fromname, embedok, largeblockok,
+           fd, fp->f_vnode, &off);
 
        if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
                fp->f_offset = off;
@@ -5185,7 +5187,8 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
  * of bytes that will be written to the fd supplied to zfs_ioc_send_new().
  *
  * innvl: {
- *     (optional) "fromsnap" -> full snap name to send an incremental from
+ *     (optional) "from" -> full snap or bookmark name to send an incremental
+ *                          from
  * }
  *
  * outnvl: {
@@ -5196,7 +5199,6 @@ static int
 zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
 {
        dsl_pool_t *dp;
-       dsl_dataset_t *fromsnap = NULL;
        dsl_dataset_t *tosnap;
        int error;
        char *fromname;
@@ -5212,27 +5214,55 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
                return (error);
        }
 
-       error = nvlist_lookup_string(innvl, "fromsnap", &fromname);
+       error = nvlist_lookup_string(innvl, "from", &fromname);
        if (error == 0) {
-               error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
-               if (error != 0) {
-                       dsl_dataset_rele(tosnap, FTAG);
-                       dsl_pool_rele(dp, FTAG);
-                       return (error);
+               if (strchr(fromname, '@') != NULL) {
+                       /*
+                        * If from is a snapshot, hold it and use the more
+                        * efficient dmu_send_estimate to estimate send space
+                        * size using deadlists.
+                        */
+                       dsl_dataset_t *fromsnap;
+                       error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
+                       if (error != 0)
+                               goto out;
+                       error = dmu_send_estimate(tosnap, fromsnap, &space);
+                       dsl_dataset_rele(fromsnap, FTAG);
+               } else if (strchr(fromname, '#') != NULL) {
+                       /*
+                        * If from is a bookmark, fetch the creation TXG of the
+                        * snapshot it was created from and use that to find
+                        * blocks that were born after it.
+                        */
+                       zfs_bookmark_phys_t frombm;
+
+                       error = dsl_bookmark_lookup(dp, fromname, tosnap,
+                           &frombm);
+                       if (error != 0)
+                               goto out;
+                       error = dmu_send_estimate_from_txg(tosnap,
+                           frombm.zbm_creation_txg, &space);
+               } else {
+                       /*
+                        * from is not properly formatted as a snapshot or
+                        * bookmark
+                        */
+                       error = SET_ERROR(EINVAL);
+                       goto out;
                }
+       } else {
+               /* For a full send estimate, use dmu_send_estimate. */
+               error = dmu_send_estimate(tosnap, NULL, &space);
        }
 
-       error = dmu_send_estimate(tosnap, fromsnap, &space);
        fnvlist_add_uint64(outnvl, "space", space);
 
-       if (fromsnap != NULL)
-               dsl_dataset_rele(fromsnap, FTAG);
+out:
        dsl_dataset_rele(tosnap, FTAG);
        dsl_pool_rele(dp, FTAG);
        return (error);
 }
 
-
 static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST];
 
 static void
@@ -5600,13 +5630,35 @@ zfsdev_get_state(minor_t minor, enum zfsdev_state_type which)
        return (ptr);
 }
 
-minor_t
-zfsdev_getminor(struct file *filp)
+int
+zfsdev_getminor(struct file *filp, minor_t *minorp)
 {
+       zfsdev_state_t *zs, *fpd;
+
        ASSERT(filp != NULL);
-       ASSERT(filp->private_data != NULL);
+       ASSERT(!MUTEX_HELD(&zfsdev_state_lock));
+
+       fpd = filp->private_data;
+       if (fpd == NULL)
+               return (EBADF);
+
+       mutex_enter(&zfsdev_state_lock);
+
+       for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+
+               if (zs->zs_minor == -1)
+                       continue;
+
+               if (fpd == zs) {
+                       *minorp = fpd->zs_minor;
+                       mutex_exit(&zfsdev_state_lock);
+                       return (0);
+               }
+       }
+
+       mutex_exit(&zfsdev_state_lock);
 
-       return (((zfsdev_state_t *)filp->private_data)->zs_minor);
+       return (EBADF);
 }
 
 /*
index cfce83138df2422933d86fe307b2d131b8df8efa..38d8de0ebf9764b1381e73882f17039d9aa7c725 100644 (file)
@@ -492,7 +492,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
                 * If the write would overflow the largest block then split it.
                 */
                if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
-                       len = SPA_MAXBLOCKSIZE >> 1;
+                       len = SPA_OLD_MAXBLOCKSIZE >> 1;
                else
                        len = resid;
 
index 18a0671a86f698fc6750febf040d259979673f86..bc3892645fe1071db24670e46a22483c62e86931 100644 (file)
@@ -126,13 +126,20 @@ zfs_onexit_fd_hold(int fd, minor_t *minorp)
 {
        file_t *fp;
        zfs_onexit_t *zo;
+       int error;
 
        fp = getf(fd);
        if (fp == NULL)
                return (SET_ERROR(EBADF));
 
-       *minorp = zfsdev_getminor(fp->f_file);
-       return (zfs_onexit_minor_to_state(*minorp, &zo));
+       error = zfsdev_getminor(fp->f_file, minorp);
+       if (error == 0)
+               error = zfs_onexit_minor_to_state(*minorp, &zo);
+
+       if (error)
+               zfs_onexit_fd_rele(fd);
+
+       return (error);
 }
 
 void
index 257ab4254bbdb618cda6310c6f9b1da46b8d7338..c9a9da7528d73b4a8e3d9df2a387b5a278cf4482 100644 (file)
@@ -22,8 +22,7 @@
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
-#include <sys/types.h>
-#include <sys/param.h>
+#include <sys/zfs_context.h>
 #include <sys/vnode.h>
 #include <sys/sa.h>
 #include <sys/zfs_acl.h>
index 0a8145f0913eee9d84828bbf49f7003f2b776279..f105d9aeda123d5b789e8785ff5e5cf77da07e45 100644 (file)
@@ -187,10 +187,9 @@ static void
 blksz_changed_cb(void *arg, uint64_t newval)
 {
        zfs_sb_t *zsb = arg;
-
-       if (newval < SPA_MINBLOCKSIZE ||
-           newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
-               newval = SPA_MAXBLOCKSIZE;
+       ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zsb->z_os)));
+       ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+       ASSERT(ISP2(newval));
 
        zsb->z_max_blksz = newval;
 }
@@ -263,11 +262,22 @@ zfs_register_callbacks(zfs_sb_t *zsb)
 {
        struct dsl_dataset *ds = NULL;
        objset_t *os = zsb->z_os;
-       boolean_t do_readonly = B_FALSE;
+       zfs_mntopts_t *zmo = zsb->z_mntopts;
        int error = 0;
 
-       if (zfs_is_readonly(zsb) || !spa_writeable(dmu_objset_spa(os)))
-               do_readonly = B_TRUE;
+       ASSERT(zsb);
+       ASSERT(zmo);
+
+       /*
+        * The act of registering our callbacks will destroy any mount
+        * options we may have.  In order to enable temporary overrides
+        * of mount options, we stash away the current values and
+        * restore them after we register the callbacks.
+        */
+       if (zfs_is_readonly(zsb) || !spa_writeable(dmu_objset_spa(os))) {
+               zmo->z_do_readonly = B_TRUE;
+               zmo->z_readonly = B_TRUE;
+       }
 
        /*
         * Register property callbacks.
@@ -308,8 +318,25 @@ zfs_register_callbacks(zfs_sb_t *zsb)
        if (error)
                goto unregister;
 
-       if (do_readonly)
-               readonly_changed_cb(zsb, B_TRUE);
+       /*
+        * Invoke our callbacks to restore temporary mount options.
+        */
+       if (zmo->z_do_readonly)
+               readonly_changed_cb(zsb, zmo->z_readonly);
+       if (zmo->z_do_setuid)
+               setuid_changed_cb(zsb, zmo->z_setuid);
+       if (zmo->z_do_exec)
+               exec_changed_cb(zsb, zmo->z_exec);
+       if (zmo->z_do_devices)
+               devices_changed_cb(zsb, zmo->z_devices);
+       if (zmo->z_do_xattr)
+               xattr_changed_cb(zsb, zmo->z_xattr);
+       if (zmo->z_do_atime)
+               atime_changed_cb(zsb, zmo->z_atime);
+       if (zmo->z_do_relatime)
+               relatime_changed_cb(zsb, zmo->z_relatime);
+       if (zmo->z_do_nbmand)
+               nbmand_changed_cb(zsb, zmo->z_nbmand);
 
        return (0);
 
@@ -643,8 +670,26 @@ zfs_owner_overquota(zfs_sb_t *zsb, znode_t *zp, boolean_t isgroup)
 }
 EXPORT_SYMBOL(zfs_owner_overquota);
 
+zfs_mntopts_t *
+zfs_mntopts_alloc(void)
+{
+       return (kmem_zalloc(sizeof (zfs_mntopts_t), KM_SLEEP));
+}
+
+void
+zfs_mntopts_free(zfs_mntopts_t *zmo)
+{
+       if (zmo->z_osname)
+               strfree(zmo->z_osname);
+
+       if (zmo->z_mntpoint)
+               strfree(zmo->z_mntpoint);
+
+       kmem_free(zmo, sizeof (zfs_mntopts_t));
+}
+
 int
-zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
+zfs_sb_create(const char *osname, zfs_mntopts_t *zmo, zfs_sb_t **zsbp)
 {
        objset_t *os;
        zfs_sb_t *zsb;
@@ -664,6 +709,11 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
                return (error);
        }
 
+       /*
+        * Optional temporary mount options, free'd in zfs_sb_free().
+        */
+       zsb->z_mntopts = (zmo ? zmo : zfs_mntopts_alloc());
+
        /*
         * Initialize the zfs-specific filesystem structure.
         * Should probably make this a kmem cache, shuffle fields,
@@ -671,7 +721,7 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
         */
        zsb->z_sb = NULL;
        zsb->z_parent = zsb;
-       zsb->z_max_blksz = SPA_MAXBLOCKSIZE;
+       zsb->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
        zsb->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
        zsb->z_os = os;
 
@@ -772,7 +822,7 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
        mutex_init(&zsb->z_lock, NULL, MUTEX_DEFAULT, NULL);
        list_create(&zsb->z_all_znodes, sizeof (znode_t),
            offsetof(znode_t, z_link_node));
-       rrw_init(&zsb->z_teardown_lock, B_FALSE);
+       rrm_init(&zsb->z_teardown_lock, B_FALSE);
        rw_init(&zsb->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
        rw_init(&zsb->z_fuid_lock, NULL, RW_DEFAULT, NULL);
 
@@ -781,10 +831,6 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
        for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
                mutex_init(&zsb->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
-       avl_create(&zsb->z_ctldir_snaps, snapentry_compare,
-           sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
-       mutex_init(&zsb->z_ctldir_lock, NULL, MUTEX_DEFAULT, NULL);
-
        *zsbp = zsb;
        return (0);
 
@@ -891,14 +937,13 @@ zfs_sb_free(zfs_sb_t *zsb)
        mutex_destroy(&zsb->z_znodes_lock);
        mutex_destroy(&zsb->z_lock);
        list_destroy(&zsb->z_all_znodes);
-       rrw_destroy(&zsb->z_teardown_lock);
+       rrm_destroy(&zsb->z_teardown_lock);
        rw_destroy(&zsb->z_teardown_inactive_lock);
        rw_destroy(&zsb->z_fuid_lock);
        for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
                mutex_destroy(&zsb->z_hold_mtx[i]);
        vmem_free(zsb->z_hold_mtx, sizeof (kmutex_t) * ZFS_OBJ_MTX_SZ);
-       mutex_destroy(&zsb->z_ctldir_lock);
-       avl_destroy(&zsb->z_ctldir_snaps);
+       zfs_mntopts_free(zsb->z_mntopts);
        kmem_free(zsb, sizeof (zfs_sb_t));
 }
 EXPORT_SYMBOL(zfs_sb_free);
@@ -1215,14 +1260,14 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
                 */
                int round = 0;
                while (zsb->z_nr_znodes > 0) {
-                       taskq_wait(dsl_pool_iput_taskq(dmu_objset_pool(
-                           zsb->z_os)));
+                       taskq_wait_outstanding(dsl_pool_iput_taskq(
+                           dmu_objset_pool(zsb->z_os)), 0);
                        if (++round > 1 && !unmounting)
                                break;
                }
        }
 
-       rrw_enter(&zsb->z_teardown_lock, RW_WRITER, FTAG);
+       rrm_enter(&zsb->z_teardown_lock, RW_WRITER, FTAG);
 
        if (!unmounting) {
                /*
@@ -1253,7 +1298,7 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
         */
        if (!unmounting && (zsb->z_unmounted || zsb->z_os == NULL)) {
                rw_exit(&zsb->z_teardown_inactive_lock);
-               rrw_exit(&zsb->z_teardown_lock, FTAG);
+               rrm_exit(&zsb->z_teardown_lock, FTAG);
                return (SET_ERROR(EIO));
        }
 
@@ -1281,7 +1326,7 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
         */
        if (unmounting) {
                zsb->z_unmounted = B_TRUE;
-               rrw_exit(&zsb->z_teardown_lock, FTAG);
+               rrm_exit(&zsb->z_teardown_lock, FTAG);
                rw_exit(&zsb->z_teardown_inactive_lock);
        }
 
@@ -1317,16 +1362,15 @@ atomic_long_t zfs_bdi_seq = ATOMIC_LONG_INIT(0);
 #endif
 
 int
-zfs_domount(struct super_block *sb, void *data, int silent)
+zfs_domount(struct super_block *sb, zfs_mntopts_t *zmo, int silent)
 {
-       zpl_mount_data_t *zmd = data;
-       const char *osname = zmd->z_osname;
+       const char *osname = zmo->z_osname;
        zfs_sb_t *zsb;
        struct inode *root_inode;
        uint64_t recordsize;
        int error;
 
-       error = zfs_sb_create(osname, &zsb);
+       error = zfs_sb_create(osname, zmo, &zsb);
        if (error)
                return (error);
 
@@ -1374,6 +1418,7 @@ zfs_domount(struct super_block *sb, void *data, int silent)
                acltype_changed_cb(zsb, pval);
                zsb->z_issnap = B_TRUE;
                zsb->z_os->os_sync = ZFS_SYNC_DISABLED;
+               zsb->z_snap_defer_time = jiffies;
 
                mutex_enter(&zsb->z_os->os_user_ptr_lock);
                dmu_objset_set_user(zsb->z_os, zsb);
@@ -1423,8 +1468,8 @@ zfs_preumount(struct super_block *sb)
 {
        zfs_sb_t *zsb = sb->s_fs_info;
 
-       if (zsb != NULL && zsb->z_ctldir != NULL)
-               zfsctl_destroy(zsb);
+       if (zsb)
+               zfsctl_destroy(sb->s_fs_info);
 }
 EXPORT_SYMBOL(zfs_preumount);
 
@@ -1468,13 +1513,15 @@ zfs_umount(struct super_block *sb)
 EXPORT_SYMBOL(zfs_umount);
 
 int
-zfs_remount(struct super_block *sb, int *flags, char *data)
+zfs_remount(struct super_block *sb, int *flags, zfs_mntopts_t *zmo)
 {
-       /*
-        * All namespace flags (MNT_*) and super block flags (MS_*) will
-        * be handled by the Linux VFS.  Only handle custom options here.
-        */
-       return (0);
+       zfs_sb_t *zsb = sb->s_fs_info;
+       int error;
+
+       zfs_unregister_callbacks(zsb);
+       error = zfs_register_callbacks(zsb);
+
+       return (error);
 }
 EXPORT_SYMBOL(zfs_remount);
 
@@ -1553,6 +1600,8 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
        zp_gen = zp_gen & gen_mask;
        if (zp_gen == 0)
                zp_gen = 1;
+       if ((fid_gen == 0) && (zsb->z_root == object))
+               fid_gen = zp_gen;
        if (zp->z_unlinked || zp_gen != fid_gen) {
                dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen,
                    fid_gen);
@@ -1600,7 +1649,7 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
        znode_t *zp;
        uint64_t sa_obj = 0;
 
-       ASSERT(RRW_WRITE_HELD(&zsb->z_teardown_lock));
+       ASSERT(RRM_WRITE_HELD(&zsb->z_teardown_lock));
        ASSERT(RW_WRITE_HELD(&zsb->z_teardown_inactive_lock));
 
        /*
@@ -1664,7 +1713,7 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
 bail:
        /* release the VFS ops */
        rw_exit(&zsb->z_teardown_inactive_lock);
-       rrw_exit(&zsb->z_teardown_lock, FTAG);
+       rrm_exit(&zsb->z_teardown_lock, FTAG);
 
        if (err) {
                /*
@@ -1803,7 +1852,7 @@ zfs_init(void)
 void
 zfs_fini(void)
 {
-       taskq_wait(system_taskq);
+       taskq_wait_outstanding(system_taskq, 0);
        unregister_filesystem(&zpl_fs_type);
        zfs_znode_fini();
        zfsctl_fini();
index 723d6210f26f1121845f17a06e495eb5a3e4c43d..944f0ad3ddb84c544c46392cd4d4f9613232b325 100644 (file)
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
@@ -376,7 +377,6 @@ mappedread(struct inode *ip, int nbytes, uio_t *uio)
        struct address_space *mp = ip->i_mapping;
        struct page *pp;
        znode_t *zp = ITOZ(ip);
-       objset_t *os = ITOZSB(ip)->z_os;
        int64_t start, off;
        uint64_t bytes;
        int len = nbytes;
@@ -403,7 +403,8 @@ mappedread(struct inode *ip, int nbytes, uio_t *uio)
                        unlock_page(pp);
                        page_cache_release(pp);
                } else {
-                       error = dmu_read_uio(os, zp->z_id, uio, bytes);
+                       error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+                           uio, bytes);
                }
 
                len -= bytes;
@@ -440,7 +441,6 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 {
        znode_t         *zp = ITOZ(ip);
        zfs_sb_t        *zsb = ITOZSB(ip);
-       objset_t        *os;
        ssize_t         n, nbytes;
        int             error = 0;
        rl_t            *rl;
@@ -450,7 +450,6 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 
        ZFS_ENTER(zsb);
        ZFS_VERIFY_ZP(zp);
-       os = zsb->z_os;
 
        if (zp->z_pflags & ZFS_AV_QUARANTINED) {
                ZFS_EXIT(zsb);
@@ -531,10 +530,12 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
                nbytes = MIN(n, zfs_read_chunk_size -
                    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 
-               if (zp->z_is_mapped && !(ioflag & O_DIRECT))
+               if (zp->z_is_mapped && !(ioflag & O_DIRECT)) {
                        error = mappedread(ip, nbytes, uio);
-               else
-                       error = dmu_read_uio(os, zp->z_id, uio, nbytes);
+               } else {
+                       error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+                           uio, nbytes);
+               }
 
                if (error) {
                        /* convert checksum errors into IO errors */
@@ -591,10 +592,10 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
        int             max_blksz = zsb->z_max_blksz;
        int             error = 0;
        arc_buf_t       *abuf;
-       iovec_t         *aiov = NULL;
+       const iovec_t   *aiov = NULL;
        xuio_t          *xuio = NULL;
        int             i_iov = 0;
-       iovec_t         *iovp = uio->uio_iov;
+       const iovec_t   *iovp = uio->uio_iov;
        int             write_eof;
        int             count = 0;
        sa_bulk_attr_t  bulk[4];
@@ -714,6 +715,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 
                if (xuio && abuf == NULL) {
                        ASSERT(i_iov < iovcnt);
+                       ASSERT3U(uio->uio_segflg, !=, UIO_BVEC);
                        aiov = &iovp[i_iov];
                        abuf = dmu_xuio_arcbuf(xuio, i_iov);
                        dmu_xuio_clear(xuio, i_iov);
@@ -771,8 +773,14 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
                        uint64_t new_blksz;
 
                        if (zp->z_blksz > max_blksz) {
+                               /*
+                                * File's blocksize is already larger than the
+                                * "recordsize" property.  Only let it grow to
+                                * the next power of 2.
+                                */
                                ASSERT(!ISP2(zp->z_blksz));
-                               new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
+                               new_blksz = MIN(end_size,
+                                   1 << highbit64(zp->z_blksz));
                        } else {
                                new_blksz = MIN(end_size, max_blksz);
                        }
@@ -891,6 +899,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
                        uio_prefaultpages(MIN(n, max_blksz), uio);
        }
 
+       zfs_inode_update(zp);
        zfs_range_unlock(rl);
 
        /*
@@ -906,7 +915,6 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
            zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zilog, zp->z_id);
 
-       zfs_inode_update(zp);
        ZFS_EXIT(zsb);
        return (0);
 }
@@ -2156,6 +2164,8 @@ zfs_fsync(struct inode *ip, int syncflag, cred_t *cr)
                zil_commit(zsb->z_log, zp->z_id);
                ZFS_EXIT(zsb);
        }
+       tsd_set(zfs_fsyncer_key, NULL);
+
        return (0);
 }
 EXPORT_SYMBOL(zfs_fsync);
@@ -2405,6 +2415,16 @@ zfs_getattr_fast(struct inode *ip, struct kstat *sp)
 
        mutex_exit(&zp->z_lock);
 
+       /*
+        * Required to prevent NFS client from detecting different inode
+        * numbers of snapshot root dentry before and after snapshot mount.
+        */
+       if (zsb->z_issnap) {
+               if (ip->i_sb->s_root->d_inode == ip)
+                       sp->ino = ZFSCTL_INO_SNAPDIRS -
+                               dmu_objset_id(zsb->z_os);
+       }
+
        ZFS_EXIT(zsb);
 
        return (0);
@@ -3865,6 +3885,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
        uint64_t        mtime[2], ctime[2];
        sa_bulk_attr_t  bulk[3];
        int             cnt = 0;
+       struct address_space *mapping;
 
        ZFS_ENTER(zsb);
        ZFS_VERIFY_ZP(zp);
@@ -3911,10 +3932,59 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
         * 2) Before setting or clearing write back on a page the range lock
         *    must be held in order to prevent a lock inversion with the
         *    zfs_free_range() function.
+        *
+        * This presents a problem because upon entering this function the
+        * page lock is already held.  To safely acquire the range lock the
+        * page lock must be dropped.  This creates a window where another
+        * process could truncate, invalidate, dirty, or write out the page.
+        *
+        * Therefore, after successfully reacquiring the range and page locks
+        * the current page state is checked.  In the common case everything
+        * will be as is expected and it can be written out.  However, if
+        * the page state has changed it must be handled accordingly.
         */
+       mapping = pp->mapping;
+       redirty_page_for_writepage(wbc, pp);
        unlock_page(pp);
+
        rl = zfs_range_lock(zp, pgoff, pglen, RL_WRITER);
+       lock_page(pp);
+
+       /* Page mapping changed or it was no longer dirty, we're done */
+       if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
+               unlock_page(pp);
+               zfs_range_unlock(rl);
+               ZFS_EXIT(zsb);
+               return (0);
+       }
+
+       /* Another process started writeback, block if required */
+       if (PageWriteback(pp)) {
+               unlock_page(pp);
+               zfs_range_unlock(rl);
+
+               if (wbc->sync_mode != WB_SYNC_NONE)
+                       wait_on_page_writeback(pp);
+
+               ZFS_EXIT(zsb);
+               return (0);
+       }
+
+       /* Clear the dirty flag now that the required locks are held */
+       if (!clear_page_dirty_for_io(pp)) {
+               unlock_page(pp);
+               zfs_range_unlock(rl);
+               ZFS_EXIT(zsb);
+               return (0);
+       }
+
+       /*
+        * Counterpart for redirty_page_for_writepage() above.  This page
+        * was in fact not skipped and should not be counted as if it were.
+        */
+       wbc->pages_skipped--;
        set_page_writeback(pp);
+       unlock_page(pp);
 
        tx = dmu_tx_create(zsb->z_os);
        dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
@@ -4036,15 +4106,16 @@ zfs_inactive(struct inode *ip)
        znode_t *zp = ITOZ(ip);
        zfs_sb_t *zsb = ITOZSB(ip);
        int error;
+       int need_unlock = 0;
 
-       if (zfsctl_is_node(ip)) {
-               zfsctl_inode_inactive(ip);
-               return;
+       /* Only read lock if we haven't already write locked, e.g. rollback */
+       if (!RW_WRITE_HELD(&zsb->z_teardown_inactive_lock)) {
+               need_unlock = 1;
+               rw_enter(&zsb->z_teardown_inactive_lock, RW_READER);
        }
-
-       rw_enter(&zsb->z_teardown_inactive_lock, RW_READER);
        if (zp->z_sa_hdl == NULL) {
-               rw_exit(&zsb->z_teardown_inactive_lock);
+               if (need_unlock)
+                       rw_exit(&zsb->z_teardown_inactive_lock);
                return;
        }
 
@@ -4067,7 +4138,8 @@ zfs_inactive(struct inode *ip)
        }
 
        zfs_zinactive(zp);
-       rw_exit(&zsb->z_teardown_inactive_lock);
+       if (need_unlock)
+               rw_exit(&zsb->z_teardown_inactive_lock);
 }
 EXPORT_SYMBOL(zfs_inactive);
 
index a3d64fe01b592be8b3944513235abc334994e515..d39743de943161d8366316c4909ca5a7eca2e8a3 100644 (file)
@@ -61,6 +61,7 @@
 #endif /* _KERNEL */
 
 #include <sys/dmu.h>
+#include <sys/dmu_objset.h>
 #include <sys/refcount.h>
 #include <sys/stat.h>
 #include <sys/zap.h>
@@ -273,9 +274,6 @@ zfs_inode_destroy(struct inode *ip)
        znode_t *zp = ITOZ(ip);
        zfs_sb_t *zsb = ZTOZSB(zp);
 
-       if (zfsctl_is_node(ip))
-               zfsctl_inode_destroy(ip);
-
        mutex_enter(&zsb->z_znodes_lock);
        if (list_link_active(&zp->z_link_node)) {
                list_remove(&zsb->z_all_znodes, zp);
@@ -328,8 +326,8 @@ zfs_inode_set_ops(zfs_sb_t *zsb, struct inode *ip)
         */
        case S_IFCHR:
        case S_IFBLK:
-               VERIFY(sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zsb),
-                   &rdev, sizeof (rdev)) == 0);
+               sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zsb), &rdev,
+                   sizeof (rdev));
                /*FALLTHROUGH*/
        case S_IFIFO:
        case S_IFSOCK:
@@ -338,8 +336,15 @@ zfs_inode_set_ops(zfs_sb_t *zsb, struct inode *ip)
                break;
 
        default:
-               printk("ZFS: Invalid mode: 0x%x\n", ip->i_mode);
-               VERIFY(0);
+               zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
+                   (u_longlong_t)ip->i_ino, ip->i_mode);
+
+               /* Assume the inode is a file and attempt to continue */
+               ip->i_mode = S_IFREG | 0644;
+               ip->i_op = &zpl_inode_operations;
+               ip->i_fop = &zpl_file_operations;
+               ip->i_mapping->a_ops = &zpl_address_space_operations;
+               break;
        }
 }
 
@@ -1304,8 +1309,13 @@ zfs_extend(znode_t *zp, uint64_t end)
                 * We are growing the file past the current block size.
                 */
                if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
+                       /*
+                        * File's blocksize is already larger than the
+                        * "recordsize" property.  Only let it grow to
+                        * the next power of 2.
+                        */
                        ASSERT(!ISP2(zp->z_blksz));
-                       newblksz = MIN(end, SPA_MAXBLOCKSIZE);
+                       newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
                } else {
                        newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
                }
index 15897b363de6f5cc318e893a2cc82fc688636984..289b23c7f488a44755be6c59f4929ac0b618ff36 100644 (file)
@@ -204,7 +204,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
     char **end)
 {
        enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
-       uint32_t aflags = ARC_WAIT;
+       arc_flags_t aflags = ARC_FLAG_WAIT;
        arc_buf_t *abuf = NULL;
        zbookmark_phys_t zb;
        int error;
@@ -243,6 +243,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
                            sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
                                error = SET_ERROR(ECKSUM);
                        } else {
+                               ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
                                bcopy(lr, dst, len);
                                *end = (char *)dst + len;
                                *nbp = zilc->zc_next_blk;
@@ -257,6 +258,8 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
                            (zilc->zc_nused > (size - sizeof (*zilc)))) {
                                error = SET_ERROR(ECKSUM);
                        } else {
+                               ASSERT3U(zilc->zc_nused, <=,
+                                   SPA_OLD_MAXBLOCKSIZE);
                                bcopy(lr, dst, zilc->zc_nused);
                                *end = (char *)dst + zilc->zc_nused;
                                *nbp = zilc->zc_next_blk;
@@ -277,7 +280,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
 {
        enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
        const blkptr_t *bp = &lr->lr_blkptr;
-       uint32_t aflags = ARC_WAIT;
+       arc_flags_t aflags = ARC_FLAG_WAIT;
        arc_buf_t *abuf = NULL;
        zbookmark_phys_t zb;
        int error;
@@ -342,7 +345,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
         * If the log has been claimed, stop if we encounter a sequence
         * number greater than the highest claimed sequence number.
         */
-       lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
+       lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
        zil_bp_tree_init(zilog);
 
        for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
@@ -389,7 +392,7 @@ done:
            (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
 
        zil_bp_tree_fini(zilog);
-       zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
+       zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
 
        return (error);
 }
@@ -497,7 +500,7 @@ zilog_dirty(zilog_t *zilog, uint64_t txg)
        dsl_pool_t *dp = zilog->zl_dmu_pool;
        dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
-       if (dsl_dataset_is_snapshot(ds))
+       if (ds->ds_is_snapshot)
                panic("dirtying snapshot!");
 
        if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
@@ -657,7 +660,7 @@ zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
 }
 
 int
-zil_claim(const char *osname, void *txarg)
+zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
 {
        dmu_tx_t *tx = txarg;
        uint64_t first_txg = dmu_tx_get_txg(tx);
@@ -666,15 +669,16 @@ zil_claim(const char *osname, void *txarg)
        objset_t *os;
        int error;
 
-       error = dmu_objset_own(osname, DMU_OST_ANY, B_FALSE, FTAG, &os);
+       error = dmu_objset_own_obj(dp, ds->ds_object,
+           DMU_OST_ANY, B_FALSE, FTAG, &os);
        if (error != 0) {
                /*
                 * EBUSY indicates that the objset is inconsistent, in which
                 * case it can not have a ZIL.
                 */
                if (error != EBUSY) {
-                       cmn_err(CE_WARN, "can't open objset for %s, error %u",
-                               osname, error);
+                       cmn_err(CE_WARN, "can't open objset for %llu, error %u",
+                           (unsigned long long)ds->ds_object, error);
                }
 
                return (0);
@@ -722,8 +726,9 @@ zil_claim(const char *osname, void *txarg)
  * Checksum errors are ok as they indicate the end of the chain.
  * Any other error (no device or read failure) returns an error.
  */
+/* ARGSUSED */
 int
-zil_check_log_chain(const char *osname, void *tx)
+zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
 {
        zilog_t *zilog;
        objset_t *os;
@@ -732,9 +737,10 @@ zil_check_log_chain(const char *osname, void *tx)
 
        ASSERT(tx == NULL);
 
-       error = dmu_objset_hold(osname, FTAG, &os);
+       error = dmu_objset_from_ds(ds, &os);
        if (error != 0) {
-               cmn_err(CE_WARN, "can't open objset for %s", osname);
+               cmn_err(CE_WARN, "can't open objset %llu, error %d",
+                   (unsigned long long)ds->ds_object, error);
                return (0);
        }
 
@@ -757,10 +763,8 @@ zil_check_log_chain(const char *osname, void *tx)
                        valid = vdev_log_state_valid(vd);
                spa_config_exit(os->os_spa, SCL_STATE, FTAG);
 
-               if (!valid) {
-                       dmu_objset_rele(os, FTAG);
+               if (!valid)
                        return (0);
-               }
        }
 
        /*
@@ -773,8 +777,6 @@ zil_check_log_chain(const char *osname, void *tx)
        error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
            zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
 
-       dmu_objset_rele(os, FTAG);
-
        return ((error == ECKSUM || error == ENOENT) ? 0 : error);
 }
 
@@ -941,7 +943,7 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
  *
  * These must be a multiple of 4KB. Note only the amount used (again
  * aligned to 4KB) actually gets written. However, we can't always just
- * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
+ * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
  */
 uint64_t zil_block_buckets[] = {
     4096,              /* non TX_WRITE */
@@ -1023,7 +1025,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
                continue;
        zil_blksz = zil_block_buckets[i];
        if (zil_blksz == UINT64_MAX)
-               zil_blksz = SPA_MAXBLOCKSIZE;
+               zil_blksz = SPA_OLD_MAXBLOCKSIZE;
        zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
        for (i = 0; i < ZIL_PREV_BLKS; i++)
                zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
@@ -1886,7 +1888,7 @@ zil_open(objset_t *os, zil_get_data_t *get_data)
        ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
        zilog->zl_get_data = get_data;
-       zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
+       zilog->zl_clean_taskq = taskq_create("zil_clean", 1, defclsyspri,
            2, 2, TASKQ_PREPOPULATE);
 
        return (zilog);
index 066f04f1864c6c8a6ccc59c70573ebecf38f28db..c378742eda0a435cf826504114da2a0962cff906 100644 (file)
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  */
 
+#include <sys/sysmacros.h>
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa.h>
@@ -59,6 +60,9 @@ kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 int zio_delay_max = ZIO_DELAY_MAX;
 
+#define        ZIO_PIPELINE_CONTINUE           0x100
+#define        ZIO_PIPELINE_STOP               0x101
+
 /*
  * The following actions directly effect the spa's sync-to-convergence logic.
  * The values below define the sync pass when we start performing the action.
@@ -104,9 +108,8 @@ zio_init(void)
 
        /*
         * For small buffers, we want a cache for each multiple of
-        * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
-        * for each quarter-power of 2.  For large buffers, we want
-        * a cache for each multiple of PAGESIZE.
+        * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
+        * for each quarter-power of 2.
         */
        for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
                size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
@@ -114,7 +117,16 @@ zio_init(void)
                size_t align = 0;
                size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
 
-               while (p2 & (p2 - 1))
+#ifdef _ILP32
+               /*
+                * Cache size limited to 1M on 32-bit platforms until ARC
+                * buffers no longer require virtual address space.
+                */
+               if (size > zfs_max_recordsize)
+                       break;
+#endif
+
+               while (!ISP2(p2))
                        p2 &= p2 - 1;
 
 #ifndef _KERNEL
@@ -129,10 +141,8 @@ zio_init(void)
 #endif
                if (size <= 4 * SPA_MINBLOCKSIZE) {
                        align = SPA_MINBLOCKSIZE;
-               } else if (IS_P2ALIGNED(size, PAGESIZE)) {
-                       align = PAGESIZE;
                } else if (IS_P2ALIGNED(size, p2 >> 2)) {
-                       align = p2 >> 2;
+                       align = MIN(p2 >> 2, PAGESIZE);
                }
 
                if (align != 0) {
@@ -171,6 +181,14 @@ zio_fini(void)
        kmem_cache_t *last_data_cache = NULL;
 
        for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+#ifdef _ILP32
+               /*
+                * Cache size limited to 1M on 32-bit platforms until ARC
+                * buffers no longer require virtual address space.
+                */
+               if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize)
+                       break;
+#endif
                if (zio_buf_cache[c] != last_cache) {
                        last_cache = zio_buf_cache[c];
                        kmem_cache_destroy(zio_buf_cache[c]);
@@ -209,7 +227,7 @@ zio_buf_alloc(size_t size)
 {
        size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
-       ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+       VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
        return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
 }
@@ -225,7 +243,7 @@ zio_data_buf_alloc(size_t size)
 {
        size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
-       ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+       VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
        return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
 }
@@ -235,7 +253,7 @@ zio_buf_free(void *buf, size_t size)
 {
        size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
-       ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+       VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
        kmem_cache_free(zio_buf_cache[c], buf);
 }
@@ -245,7 +263,7 @@ zio_data_buf_free(void *buf, size_t size)
 {
        size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
-       ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+       VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
        kmem_cache_free(zio_data_buf_cache[c], buf);
 }
@@ -593,6 +611,90 @@ zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
        return (zio_null(NULL, spa, NULL, done, private, flags));
 }
 
+void
+zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
+{
+       int i;
+
+       if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
+               zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
+                   bp, (longlong_t)BP_GET_TYPE(bp));
+       }
+       if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
+           BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
+               zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
+                   bp, (longlong_t)BP_GET_CHECKSUM(bp));
+       }
+       if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
+           BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
+               zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
+                   bp, (longlong_t)BP_GET_COMPRESS(bp));
+       }
+       if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
+               zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
+                   bp, (longlong_t)BP_GET_LSIZE(bp));
+       }
+       if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
+               zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
+                   bp, (longlong_t)BP_GET_PSIZE(bp));
+       }
+
+       if (BP_IS_EMBEDDED(bp)) {
+               if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
+                       zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
+                           bp, (longlong_t)BPE_GET_ETYPE(bp));
+               }
+       }
+
+       /*
+        * Pool-specific checks.
+        *
+        * Note: it would be nice to verify that the blk_birth and
+        * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
+        * allows the birth time of log blocks (and dmu_sync()-ed blocks
+        * that are in the log) to be arbitrarily large.
+        */
+       for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+               uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
+               vdev_t *vd;
+               uint64_t offset, asize;
+               if (vdevid >= spa->spa_root_vdev->vdev_children) {
+                       zfs_panic_recover("blkptr at %p DVA %u has invalid "
+                           "VDEV %llu",
+                           bp, i, (longlong_t)vdevid);
+               }
+               vd = spa->spa_root_vdev->vdev_child[vdevid];
+               if (vd == NULL) {
+                       zfs_panic_recover("blkptr at %p DVA %u has invalid "
+                           "VDEV %llu",
+                           bp, i, (longlong_t)vdevid);
+               }
+               if (vd->vdev_ops == &vdev_hole_ops) {
+                       zfs_panic_recover("blkptr at %p DVA %u has hole "
+                           "VDEV %llu",
+                           bp, i, (longlong_t)vdevid);
+
+               }
+               if (vd->vdev_ops == &vdev_missing_ops) {
+                       /*
+                        * "missing" vdevs are valid during import, but we
+                        * don't have their detailed info (e.g. asize), so
+                        * we can't perform any more checks on them.
+                        */
+                       continue;
+               }
+               offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
+               asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
+               if (BP_IS_GANG(bp))
+                       asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+               if (offset + asize > vd->vdev_asize) {
+                       zfs_panic_recover("blkptr at %p DVA %u has invalid "
+                           "OFFSET %llu",
+                           bp, i, (longlong_t)offset);
+               }
+       }
+}
+
 zio_t *
 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *private,
@@ -600,6 +702,8 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 {
        zio_t *zio;
 
+       zfs_blkptr_verify(spa, bp);
+
        zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
            data, size, done, private,
            ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
@@ -1091,19 +1195,26 @@ zio_write_bp_init(zio_t *zio)
                        return (ZIO_PIPELINE_CONTINUE);
                } else {
                        /*
-                        * Round up compressed size to MINBLOCKSIZE and
-                        * zero the tail.
+                        * Round compressed size up to the ashift
+                        * of the smallest-ashift device, and zero the tail.
+                        * This ensures that the compressed size of the BP
+                        * (and thus compressratio property) are correct,
+                        * in that we charge for the padding used to fill out
+                        * the last sector.
                         */
-                       size_t rounded =
-                           P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE);
-                       if (rounded > psize) {
-                               bzero((char *)cbuf + psize, rounded - psize);
-                               psize = rounded;
-                       }
-                       if (psize == lsize) {
+                       size_t rounded;
+
+                       ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
+
+                       rounded = (size_t)P2ROUNDUP(psize,
+                           1ULL << spa->spa_min_ashift);
+                       if (rounded >= lsize) {
                                compress = ZIO_COMPRESS_OFF;
                                zio_buf_free(cbuf, lsize);
+                               psize = lsize;
                        } else {
+                               bzero((char *)cbuf + psize, rounded - psize);
+                               psize = rounded;
                                zio_push_transform(zio, cbuf,
                                    psize, lsize, NULL);
                        }
@@ -2130,7 +2241,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 
                if (ddp->ddp_phys_birth != 0) {
                        arc_buf_t *abuf = NULL;
-                       uint32_t aflags = ARC_WAIT;
+                       arc_flags_t aflags = ARC_FLAG_WAIT;
                        blkptr_t blk = *zio->io_bp;
                        int error;
 
@@ -2526,6 +2637,18 @@ zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
  * Read and write to physical devices
  * ==========================================================================
  */
+
+
+/*
+ * Issue an I/O to the underlying vdev. Typically the issue pipeline
+ * stops after this stage and will resume upon I/O completion.
+ * However, there are instances where the vdev layer may need to
+ * continue the pipeline when an I/O was not issued. Since the I/O
+ * that was sent to the vdev layer might be different than the one
+ * currently active in the pipeline (see vdev_queue_io()), we explicitly
+ * force the underlying vdev layers to call either zio_execute() or
+ * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
+ */
 static int
 zio_vdev_io_start(zio_t *zio)
 {
@@ -2543,7 +2666,8 @@ zio_vdev_io_start(zio_t *zio)
                /*
                 * The mirror_ops handle multiple DVAs in a single BP.
                 */
-               return (vdev_mirror_ops.vdev_op_io_start(zio));
+               vdev_mirror_ops.vdev_op_io_start(zio);
+               return (ZIO_PIPELINE_STOP);
        }
 
        /*
@@ -2551,7 +2675,7 @@ zio_vdev_io_start(zio_t *zio)
         * can quickly react to certain workloads.  In particular, we care
         * about non-scrubbing, top-level reads and writes with the following
         * characteristics:
-        *      - synchronous writes of user data to non-slog devices
+        *      - synchronous writes of user data to non-slog devices
         *      - any reads of user data
         * When these conditions are met, adjust the timestamp of spa_last_io
         * which allows the scan thread to adjust its workload accordingly.
@@ -2637,7 +2761,8 @@ zio_vdev_io_start(zio_t *zio)
                }
        }
 
-       return (vd->vdev_ops->vdev_op_io_start(zio));
+       vd->vdev_ops->vdev_op_io_start(zio);
+       return (ZIO_PIPELINE_STOP);
 }
 
 static int
@@ -2862,7 +2987,8 @@ zio_checksum_verify(zio_t *zio)
 
        if ((error = zio_checksum_error(zio, &info)) != 0) {
                zio->io_error = error;
-               if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+               if (error == ECKSUM &&
+                   !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
                        zfs_ereport_start_checksum(zio->io_spa,
                            zio->io_vd, zio, zio->io_offset,
                            zio->io_size, NULL, &info);
index 07446234922b63e92d1214483b0251ef0230d417..6b8d6c39bd912b98e5e19f4e58bc43a15afb6447 100644 (file)
@@ -34,6 +34,7 @@
 #include <sys/zfs_context.h>
 #include <sys/compress.h>
 #include <sys/spa.h>
+#include <sys/zfeature.h>
 #include <sys/zio.h>
 #include <sys/zio_compress.h>
 
@@ -61,19 +62,27 @@ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
 };
 
 enum zio_compress
-zio_compress_select(enum zio_compress child, enum zio_compress parent)
+zio_compress_select(spa_t *spa, enum zio_compress child,
+    enum zio_compress parent)
 {
+       enum zio_compress result;
+
        ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
        ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
-       ASSERT(parent != ZIO_COMPRESS_INHERIT && parent != ZIO_COMPRESS_ON);
+       ASSERT(parent != ZIO_COMPRESS_INHERIT);
 
-       if (child == ZIO_COMPRESS_INHERIT)
-               return (parent);
+       result = child;
+       if (result == ZIO_COMPRESS_INHERIT)
+               result = parent;
 
-       if (child == ZIO_COMPRESS_ON)
-               return (ZIO_COMPRESS_ON_VALUE);
+       if (result == ZIO_COMPRESS_ON) {
+               if (spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS))
+                       result = ZIO_COMPRESS_LZ4_ON_VALUE;
+               else
+                       result = ZIO_COMPRESS_LEGACY_ON_VALUE;
+       }
 
-       return (child);
+       return (result);
 }
 
 size_t
index 5afb23c595aeddef4cb1a0f0ed3547dab81eb1d9..40b507a0b6d83f3803c2cc07dfafd943312ad1e5 100644 (file)
@@ -439,7 +439,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
         * fault injection isn't a performance critical path.
         */
        if (flags & ZINJECT_FLUSH_ARC)
-               arc_flush(NULL);
+               /*
+                * We must use FALSE to ensure arc_flush returns, since
+                * we're not preventing concurrent ARC insertions.
+                */
+               arc_flush(NULL, FALSE);
 
        return (0);
 }
index d93d900aa1e7f1b89340056fb105fba427f5526f..dd02e9e99465f2c663e7f9d34a39fe02946201eb 100644 (file)
@@ -160,19 +160,9 @@ const struct inode_operations zpl_ops_root = {
 static struct vfsmount *
 zpl_snapdir_automount(struct path *path)
 {
-       struct dentry *dentry = path->dentry;
        int error;
 
-       /*
-        * We must briefly disable automounts for this dentry because the
-        * user space mount utility will trigger another lookup on this
-        * directory.  That will result in zpl_snapdir_automount() being
-        * called repeatedly.  The DCACHE_NEED_AUTOMOUNT flag can be
-        * safely reset once the mount completes.
-        */
-       dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT;
-       error = -zfsctl_mount_snapshot(path, 0);
-       dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+       error = -zfsctl_snapshot_mount(path, 0);
        if (error)
                return (ERR_PTR(error));
 
@@ -188,8 +178,10 @@ zpl_snapdir_automount(struct path *path)
 #endif /* HAVE_AUTOMOUNT */
 
 /*
- * Revalidate any dentry in the snapshot directory on lookup, since a snapshot
- * having the same name have been created or destroyed since it was cached.
+ * Negative dentries must always be revalidated so newly created snapshots
+ * can be detected and automounted.  Normal dentries should be kept because
+ * as of the 3.18 kernel revalidating the mountpoint dentry will result in
+ * the snapshot being immediately unmounted.
  */
 static int
 #ifdef HAVE_D_REVALIDATE_NAMEIDATA
@@ -198,7 +190,7 @@ zpl_snapdir_revalidate(struct dentry *dentry, struct nameidata *i)
 zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags)
 #endif
 {
-       return (0);
+       return (!!dentry->d_inode);
 }
 
 dentry_operations_t zpl_dops_snapdirs = {
@@ -245,6 +237,9 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
        ASSERT(error == 0 || ip == NULL);
        d_clear_d_op(dentry);
        d_set_d_op(dentry, &zpl_dops_snapdirs);
+#ifdef HAVE_AUTOMOUNT
+       dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+#endif
 
        return (d_splice_alias(ip, dentry));
 }
@@ -373,7 +368,7 @@ zpl_snapdir_getattr(struct vfsmount *mnt, struct dentry *dentry,
 
        ZFS_ENTER(zsb);
        error = simple_getattr(mnt, dentry, stat);
-       stat->nlink = stat->size = avl_numnodes(&zsb->z_ctldir_snaps) + 2;
+       stat->nlink = stat->size = 2;
        stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zsb->z_os);
        stat->atime = CURRENT_TIME;
        ZFS_EXIT(zsb);
index 23d85cad90747f306e91c693acda09f4007aa3a1..6f051a0485a11208cd3f5e327205ccd4a3d6861d 100644 (file)
@@ -102,8 +102,21 @@ zpl_fh_to_dentry(struct super_block *sb, struct fid *fh,
        rc = zfs_vget(sb, &ip, fid);
        spl_fstrans_unmark(cookie);
 
-       if (rc != 0)
+       if (rc) {
+               /*
+                * If we see ENOENT it might mean that an NFSv4 client
+                * is using a cached inode value in a file handle and
+                * that the sought after file has had its inode changed
+                * by a third party.  So change the error to ESTALE
+                * which will trigger a full lookup by the client and
+                * will find the new filename/inode pair if it still
+                * exists.
+                */
+               if (rc == ENOENT)
+                       rc = ESTALE;
+
                return (ERR_PTR(-rc));
+       }
 
        ASSERT((ip != NULL) && !IS_ERR(ip));
 
@@ -139,6 +152,9 @@ zpl_commit_metadata(struct inode *inode)
        fstrans_cookie_t cookie;
        int error;
 
+       if (zfsctl_is_node(inode))
+               return (0);
+
        crhold(cr);
        cookie = spl_fstrans_mark();
        error = -zfs_fsync(inode, 0, cr);
index 5471140122ac299b35eeb90c28284cf87d4f7776..a23bc7d8dd41f3c5b3e365ebb27a116006ce7dd7 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  */
 
 
@@ -202,17 +203,18 @@ zpl_aio_fsync(struct kiocb *kiocb, int datasync)
 #error "Unsupported fops->fsync() implementation"
 #endif
 
-static inline ssize_t
+static ssize_t
 zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
-    unsigned long nr_segs, loff_t *ppos, uio_seg_t segment,
-    int flags, cred_t *cr)
+    unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags,
+    cred_t *cr, size_t skip)
 {
        ssize_t read;
        uio_t uio;
        int error;
        fstrans_cookie_t cookie;
 
-       uio.uio_iov = (struct iovec *)iovp;
+       uio.uio_iov = iovp;
+       uio.uio_skip = skip;
        uio.uio_resid = count;
        uio.uio_iovcnt = nr_segs;
        uio.uio_loffset = *ppos;
@@ -242,7 +244,7 @@ zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
        iov.iov_len = len;
 
        return (zpl_read_common_iovec(ip, &iov, len, 1, ppos, segment,
-           flags, cr));
+           flags, cr, 0));
 }
 
 static ssize_t
@@ -261,24 +263,17 @@ zpl_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 
 static ssize_t
 zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp,
-    unsigned long nr_segs, size_t count)
+    unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
 {
        cred_t *cr = CRED();
        struct file *filp = kiocb->ki_filp;
        ssize_t read;
-       size_t alloc_size = sizeof (struct iovec) * nr_segs;
-       struct iovec *iov_tmp = kmem_alloc(alloc_size, KM_SLEEP);
-       bcopy(iovp, iov_tmp, alloc_size);
-
-       ASSERT(iovp);
 
        crhold(cr);
-       read = zpl_read_common_iovec(filp->f_mapping->host, iov_tmp, count,
-           nr_segs, &kiocb->ki_pos, UIO_USERSPACE, filp->f_flags, cr);
+       read = zpl_read_common_iovec(filp->f_mapping->host, iovp, count,
+           nr_segs, &kiocb->ki_pos, seg, filp->f_flags, cr, skip);
        crfree(cr);
 
-       kmem_free(iov_tmp, alloc_size);
-
        return (read);
 }
 
@@ -286,22 +281,32 @@ zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp,
 static ssize_t
 zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
 {
-       return (zpl_iter_read_common(kiocb, to->iov, to->nr_segs,
-           iov_iter_count(to)));
+       ssize_t ret;
+       uio_seg_t seg = UIO_USERSPACE;
+       if (to->type & ITER_KVEC)
+               seg = UIO_SYSSPACE;
+       if (to->type & ITER_BVEC)
+               seg = UIO_BVEC;
+       ret = zpl_iter_read_common(kiocb, to->iov, to->nr_segs,
+           iov_iter_count(to), seg, to->iov_offset);
+       if (ret > 0)
+               iov_iter_advance(to, ret);
+       return (ret);
 }
 #else
 static ssize_t
 zpl_aio_read(struct kiocb *kiocb, const struct iovec *iovp,
     unsigned long nr_segs, loff_t pos)
 {
-       return (zpl_iter_read_common(kiocb, iovp, nr_segs, kiocb->ki_nbytes));
+       return (zpl_iter_read_common(kiocb, iovp, nr_segs, kiocb->ki_nbytes,
+           UIO_USERSPACE, 0));
 }
 #endif /* HAVE_VFS_RW_ITERATE */
 
-static inline ssize_t
+static ssize_t
 zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
-    unsigned long nr_segs, loff_t *ppos, uio_seg_t segment,
-    int flags, cred_t *cr)
+    unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags,
+    cred_t *cr, size_t skip)
 {
        ssize_t wrote;
        uio_t uio;
@@ -311,7 +316,8 @@ zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
        if (flags & O_APPEND)
                *ppos = i_size_read(ip);
 
-       uio.uio_iov = (struct iovec *)iovp;
+       uio.uio_iov = iovp;
+       uio.uio_skip = skip;
        uio.uio_resid = count;
        uio.uio_iovcnt = nr_segs;
        uio.uio_loffset = *ppos;
@@ -340,7 +346,7 @@ zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
        iov.iov_len = len;
 
        return (zpl_write_common_iovec(ip, &iov, len, 1, ppos, segment,
-           flags, cr));
+           flags, cr, 0));
 }
 
 static ssize_t
@@ -359,24 +365,17 @@ zpl_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 
 static ssize_t
 zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp,
-    unsigned long nr_segs, size_t count)
+    unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
 {
        cred_t *cr = CRED();
        struct file *filp = kiocb->ki_filp;
        ssize_t wrote;
-       size_t alloc_size = sizeof (struct iovec) * nr_segs;
-       struct iovec *iov_tmp = kmem_alloc(alloc_size, KM_SLEEP);
-       bcopy(iovp, iov_tmp, alloc_size);
-
-       ASSERT(iovp);
 
        crhold(cr);
-       wrote = zpl_write_common_iovec(filp->f_mapping->host, iov_tmp, count,
-           nr_segs, &kiocb->ki_pos, UIO_USERSPACE, filp->f_flags, cr);
+       wrote = zpl_write_common_iovec(filp->f_mapping->host, iovp, count,
+           nr_segs, &kiocb->ki_pos, seg, filp->f_flags, cr, skip);
        crfree(cr);
 
-       kmem_free(iov_tmp, alloc_size);
-
        return (wrote);
 }
 
@@ -384,15 +383,25 @@ zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp,
 static ssize_t
 zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
 {
-       return (zpl_iter_write_common(kiocb, from->iov, from->nr_segs,
-           iov_iter_count(from)));
+       ssize_t ret;
+       uio_seg_t seg = UIO_USERSPACE;
+       if (from->type & ITER_KVEC)
+               seg = UIO_SYSSPACE;
+       if (from->type & ITER_BVEC)
+               seg = UIO_BVEC;
+       ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs,
+           iov_iter_count(from), seg, from->iov_offset);
+       if (ret > 0)
+               iov_iter_advance(from, ret);
+       return (ret);
 }
 #else
 static ssize_t
 zpl_aio_write(struct kiocb *kiocb, const struct iovec *iovp,
     unsigned long nr_segs, loff_t pos)
 {
-       return (zpl_iter_write_common(kiocb, iovp, nr_segs, kiocb->ki_nbytes));
+       return (zpl_iter_write_common(kiocb, iovp, nr_segs, kiocb->ki_nbytes,
+           UIO_USERSPACE, 0));
 }
 #endif /* HAVE_VFS_RW_ITERATE */
 
index 7f999cd7194ff8e57516349dfee185ded0785f64..6475c72d710b19c9ff95f59d3369bc65a9824467 100644 (file)
  */
 /*
  * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  */
 
 
+#include <sys/zfs_ctldir.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_vnops.h>
 #include <sys/zfs_znode.h>
@@ -107,9 +109,14 @@ zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode,
        cookie = spl_fstrans_mark();
        error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL);
        if (error == 0) {
-               VERIFY0(zpl_xattr_security_init(ip, dir, &dentry->d_name));
-               VERIFY0(zpl_init_acl(ip, dir));
                d_instantiate(dentry, ip);
+
+               error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+               if (error == 0)
+                       error = zpl_init_acl(ip, dir);
+
+               if (error)
+                       (void) zfs_remove(dir, dname(dentry), cr);
        }
 
        spl_fstrans_unmark(cookie);
@@ -145,9 +152,14 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode,
        cookie = spl_fstrans_mark();
        error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL);
        if (error == 0) {
-               VERIFY0(zpl_xattr_security_init(ip, dir, &dentry->d_name));
-               VERIFY0(zpl_init_acl(ip, dir));
                d_instantiate(dentry, ip);
+
+               error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+               if (error == 0)
+                       error = zpl_init_acl(ip, dir);
+
+               if (error)
+                       (void) zfs_remove(dir, dname(dentry), cr);
        }
 
        spl_fstrans_unmark(cookie);
@@ -191,9 +203,14 @@ zpl_mkdir(struct inode *dir, struct dentry *dentry, zpl_umode_t mode)
        cookie = spl_fstrans_mark();
        error = -zfs_mkdir(dir, dname(dentry), vap, &ip, cr, 0, NULL);
        if (error == 0) {
-               VERIFY0(zpl_xattr_security_init(ip, dir, &dentry->d_name));
-               VERIFY0(zpl_init_acl(ip, dir));
                d_instantiate(dentry, ip);
+
+               error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+               if (error == 0)
+                       error = zpl_init_acl(ip, dir);
+
+               if (error)
+                       (void) zfs_rmdir(dir, dname(dentry), NULL, cr, 0);
        }
 
        spl_fstrans_unmark(cookie);
@@ -224,21 +241,9 @@ zpl_rmdir(struct inode * dir, struct dentry *dentry)
 static int
 zpl_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
-       boolean_t issnap = ITOZSB(dentry->d_inode)->z_issnap;
        int error;
        fstrans_cookie_t cookie;
 
-       /*
-        * Ensure MNT_SHRINKABLE is set on snapshots to ensure they are
-        * unmounted automatically with the parent file system.  This
-        * is done on the first getattr because it's not easy to get the
-        * vfsmount structure at mount time.  This call path is explicitly
-        * marked unlikely to avoid any performance impact.  FWIW, ext4
-        * resorts to a similar trick for sysadmin convenience.
-        */
-       if (unlikely(issnap && !(mnt->mnt_flags & MNT_SHRINKABLE)))
-               mnt->mnt_flags |= MNT_SHRINKABLE;
-
        cookie = spl_fstrans_mark();
        error = -zfs_getattr_fast(dentry->d_inode, stat);
        spl_fstrans_unmark(cookie);
@@ -318,8 +323,11 @@ zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name)
        cookie = spl_fstrans_mark();
        error = -zfs_symlink(dir, dname(dentry), vap, (char *)name, &ip, cr, 0);
        if (error == 0) {
-               VERIFY0(zpl_xattr_security_init(ip, dir, &dentry->d_name));
                d_instantiate(dentry, ip);
+
+               error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+               if (error)
+                       (void) zfs_remove(dir, dname(dentry), cr);
        }
 
        spl_fstrans_unmark(cookie);
@@ -353,6 +361,7 @@ zpl_follow_link(struct dentry *dentry, void **symlink_cookie)
 
        uio.uio_iov = &iov;
        uio.uio_iovcnt = 1;
+       uio.uio_skip = 0;
        uio.uio_resid = (MAXPATHLEN - 1);
        uio.uio_segflg = UIO_SYSSPACE;
 
@@ -483,6 +492,19 @@ zpl_revalidate(struct dentry *dentry, unsigned int flags)
        if (flags & LOOKUP_RCU)
                return (-ECHILD);
 
+       /*
+        * Automounted snapshots rely on periodic dentry revalidation
+        * to defer snapshots from being automatically unmounted.
+        */
+       if (zsb->z_issnap) {
+               if (time_after(jiffies, zsb->z_snap_defer_time +
+                   MAX(zfs_expire_snapshot * HZ / 2, HZ))) {
+                       zsb->z_snap_defer_time = jiffies;
+                       zfsctl_snapshot_unmount_delay(
+                           dmu_objset_id(zsb->z_os), zfs_expire_snapshot);
+               }
+       }
+
        /*
         * After a rollback negative dentries created before the rollback
         * time must be invalidated.  Otherwise they can obscure files which
index a8d26ec1c9a84ac627668278e112b310538b58b7..bcdbbd69e2808e56d8b6588f9ed77aab19b5b745 100644 (file)
@@ -184,49 +184,217 @@ zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
        return (error);
 }
 
+enum {
+       TOKEN_RO,
+       TOKEN_RW,
+       TOKEN_SETUID,
+       TOKEN_NOSETUID,
+       TOKEN_EXEC,
+       TOKEN_NOEXEC,
+       TOKEN_DEVICES,
+       TOKEN_NODEVICES,
+       TOKEN_DIRXATTR,
+       TOKEN_SAXATTR,
+       TOKEN_XATTR,
+       TOKEN_NOXATTR,
+       TOKEN_ATIME,
+       TOKEN_NOATIME,
+       TOKEN_RELATIME,
+       TOKEN_NORELATIME,
+       TOKEN_NBMAND,
+       TOKEN_NONBMAND,
+       TOKEN_MNTPOINT,
+       TOKEN_LAST,
+};
+
+static const match_table_t zpl_tokens = {
+       { TOKEN_RO,             MNTOPT_RO },
+       { TOKEN_RW,             MNTOPT_RW },
+       { TOKEN_SETUID,         MNTOPT_SETUID },
+       { TOKEN_NOSETUID,       MNTOPT_NOSETUID },
+       { TOKEN_EXEC,           MNTOPT_EXEC },
+       { TOKEN_NOEXEC,         MNTOPT_NOEXEC },
+       { TOKEN_DEVICES,        MNTOPT_DEVICES },
+       { TOKEN_NODEVICES,      MNTOPT_NODEVICES },
+       { TOKEN_DIRXATTR,       MNTOPT_DIRXATTR },
+       { TOKEN_SAXATTR,        MNTOPT_SAXATTR },
+       { TOKEN_XATTR,          MNTOPT_XATTR },
+       { TOKEN_NOXATTR,        MNTOPT_NOXATTR },
+       { TOKEN_ATIME,          MNTOPT_ATIME },
+       { TOKEN_NOATIME,        MNTOPT_NOATIME },
+       { TOKEN_RELATIME,       MNTOPT_RELATIME },
+       { TOKEN_NORELATIME,     MNTOPT_NORELATIME },
+       { TOKEN_NBMAND,         MNTOPT_NBMAND },
+       { TOKEN_NONBMAND,       MNTOPT_NONBMAND },
+       { TOKEN_MNTPOINT,       MNTOPT_MNTPOINT "=%s" },
+       { TOKEN_LAST,           NULL },
+};
+
+static int
+zpl_parse_option(char *option, int token, substring_t *args, zfs_mntopts_t *zmo)
+{
+       switch (token) {
+       case TOKEN_RO:
+               zmo->z_readonly = B_TRUE;
+               zmo->z_do_readonly = B_TRUE;
+               break;
+       case TOKEN_RW:
+               zmo->z_readonly = B_FALSE;
+               zmo->z_do_readonly = B_TRUE;
+               break;
+       case TOKEN_SETUID:
+               zmo->z_setuid = B_TRUE;
+               zmo->z_do_setuid = B_TRUE;
+               break;
+       case TOKEN_NOSETUID:
+               zmo->z_setuid = B_FALSE;
+               zmo->z_do_setuid = B_TRUE;
+               break;
+       case TOKEN_EXEC:
+               zmo->z_exec = B_TRUE;
+               zmo->z_do_exec = B_TRUE;
+               break;
+       case TOKEN_NOEXEC:
+               zmo->z_exec = B_FALSE;
+               zmo->z_do_exec = B_TRUE;
+               break;
+       case TOKEN_DEVICES:
+               zmo->z_devices = B_TRUE;
+               zmo->z_do_devices = B_TRUE;
+               break;
+       case TOKEN_NODEVICES:
+               zmo->z_devices = B_FALSE;
+               zmo->z_do_devices = B_TRUE;
+               break;
+       case TOKEN_DIRXATTR:
+               zmo->z_xattr = ZFS_XATTR_DIR;
+               zmo->z_do_xattr = B_TRUE;
+               break;
+       case TOKEN_SAXATTR:
+               zmo->z_xattr = ZFS_XATTR_SA;
+               zmo->z_do_xattr = B_TRUE;
+               break;
+       case TOKEN_XATTR:
+               zmo->z_xattr = ZFS_XATTR_DIR;
+               zmo->z_do_xattr = B_TRUE;
+               break;
+       case TOKEN_NOXATTR:
+               zmo->z_xattr = ZFS_XATTR_OFF;
+               zmo->z_do_xattr = B_TRUE;
+               break;
+       case TOKEN_ATIME:
+               zmo->z_atime = B_TRUE;
+               zmo->z_do_atime = B_TRUE;
+               break;
+       case TOKEN_NOATIME:
+               zmo->z_atime = B_FALSE;
+               zmo->z_do_atime = B_TRUE;
+               break;
+       case TOKEN_RELATIME:
+               zmo->z_relatime = B_TRUE;
+               zmo->z_do_relatime = B_TRUE;
+               break;
+       case TOKEN_NORELATIME:
+               zmo->z_relatime = B_FALSE;
+               zmo->z_do_relatime = B_TRUE;
+               break;
+       case TOKEN_NBMAND:
+               zmo->z_nbmand = B_TRUE;
+               zmo->z_do_nbmand = B_TRUE;
+               break;
+       case TOKEN_NONBMAND:
+               zmo->z_nbmand = B_FALSE;
+               zmo->z_do_nbmand = B_TRUE;
+               break;
+       case TOKEN_MNTPOINT:
+               zmo->z_mntpoint = match_strdup(&args[0]);
+               if (zmo->z_mntpoint == NULL)
+                       return (-ENOMEM);
+
+               break;
+       default:
+               break;
+       }
+
+       return (0);
+}
+
+/*
+ * Parse the mntopts string, storing the results in the provided zmo argument.
+ * If an error occurs the zmo argument will not be modified.  The caller
+ * needs to set isremount when recycling an existing zfs_mntopts_t.
+ */
+static int
+zpl_parse_options(char *osname, char *mntopts, zfs_mntopts_t *zmo,
+    boolean_t isremount)
+{
+       zfs_mntopts_t *tmp_zmo;
+       int error;
+
+       tmp_zmo = zfs_mntopts_alloc();
+       tmp_zmo->z_osname = strdup(osname);
+
+       if (mntopts) {
+               substring_t args[MAX_OPT_ARGS];
+               char *tmp_mntopts, *p;
+               int token;
+
+               tmp_mntopts = strdup(mntopts);
+
+               while ((p = strsep(&tmp_mntopts, ",")) != NULL) {
+                       if (!*p)
+                               continue;
+
+                       args[0].to = args[0].from = NULL;
+                       token = match_token(p, zpl_tokens, args);
+                       error = zpl_parse_option(p, token, args, tmp_zmo);
+                       if (error) {
+                               zfs_mntopts_free(tmp_zmo);
+                               strfree(tmp_mntopts);
+                               return (error);
+                       }
+               }
+
+               strfree(tmp_mntopts);
+       }
+
+       if (isremount == B_TRUE) {
+               if (zmo->z_osname)
+                       strfree(zmo->z_osname);
+
+               if (zmo->z_mntpoint)
+                       strfree(zmo->z_mntpoint);
+       } else {
+               ASSERT3P(zmo->z_osname, ==, NULL);
+               ASSERT3P(zmo->z_mntpoint, ==, NULL);
+       }
+
+       memcpy(zmo, tmp_zmo, sizeof (zfs_mntopts_t));
+       kmem_free(tmp_zmo, sizeof (zfs_mntopts_t));
+
+       return (0);
+}
+
 static int
 zpl_remount_fs(struct super_block *sb, int *flags, char *data)
 {
+       zfs_sb_t *zsb = sb->s_fs_info;
        fstrans_cookie_t cookie;
        int error;
 
+       error = zpl_parse_options(zsb->z_mntopts->z_osname, data,
+           zsb->z_mntopts, B_TRUE);
+       if (error)
+               return (error);
+
        cookie = spl_fstrans_mark();
-       error = -zfs_remount(sb, flags, data);
+       error = -zfs_remount(sb, flags, zsb->z_mntopts);
        spl_fstrans_unmark(cookie);
        ASSERT3S(error, <=, 0);
 
        return (error);
 }
 
-static void
-zpl_umount_begin(struct super_block *sb)
-{
-       zfs_sb_t *zsb = sb->s_fs_info;
-       int count;
-
-       /*
-        * Best effort to unmount snapshots in .zfs/snapshot/.  Normally this
-        * isn't required because snapshots have the MNT_SHRINKABLE flag set.
-        */
-       if (zsb->z_ctldir)
-               (void) zfsctl_unmount_snapshots(zsb, MNT_FORCE, &count);
-}
-
-/*
- * ZFS specific features must be explicitly handled here, the VFS will
- * automatically handled the following generic functionality.
- *
- *   MNT_NOSUID,
- *   MNT_NODEV,
- *   MNT_NOEXEC,
- *   MNT_NOATIME,
- *   MNT_NODIRATIME,
- *   MNT_READONLY,
- *   MNT_STRICTATIME,
- *   MS_SYNCHRONOUS,
- *   MS_DIRSYNC,
- *   MS_MANDLOCK.
- */
 static int
 __zpl_show_options(struct seq_file *seq, zfs_sb_t *zsb)
 {
@@ -263,11 +431,12 @@ zpl_show_options(struct seq_file *seq, struct vfsmount *vfsp)
 static int
 zpl_fill_super(struct super_block *sb, void *data, int silent)
 {
+       zfs_mntopts_t *zmo = (zfs_mntopts_t *)data;
        fstrans_cookie_t cookie;
        int error;
 
        cookie = spl_fstrans_mark();
-       error = -zfs_domount(sb, data, silent);
+       error = -zfs_domount(sb, zmo, silent);
        spl_fstrans_unmark(cookie);
        ASSERT3S(error, <=, 0);
 
@@ -279,18 +448,32 @@ static struct dentry *
 zpl_mount(struct file_system_type *fs_type, int flags,
     const char *osname, void *data)
 {
-       zpl_mount_data_t zmd = { osname, data };
+       zfs_mntopts_t *zmo = zfs_mntopts_alloc();
+       int error;
+
+       error = zpl_parse_options((char *)osname, (char *)data, zmo, B_FALSE);
+       if (error) {
+               zfs_mntopts_free(zmo);
+               return (ERR_PTR(error));
+       }
 
-       return (mount_nodev(fs_type, flags, &zmd, zpl_fill_super));
+       return (mount_nodev(fs_type, flags, zmo, zpl_fill_super));
 }
 #else
 static int
 zpl_get_sb(struct file_system_type *fs_type, int flags,
     const char *osname, void *data, struct vfsmount *mnt)
 {
-       zpl_mount_data_t zmd = { osname, data };
+       zfs_mntopts_t *zmo = zfs_mntopts_alloc();
+       int error;
+
+       error = zpl_parse_options((char *)osname, (char *)data, zmo, B_FALSE);
+       if (error) {
+               zfs_mntopts_free(zmo);
+               return (error);
+       }
 
-       return (get_sb_nodev(fs_type, flags, &zmd, zpl_fill_super, mnt));
+       return (get_sb_nodev(fs_type, flags, zmo, zpl_fill_super, mnt));
 }
 #endif /* HAVE_MOUNT_NODEV */
 
@@ -318,24 +501,11 @@ zpl_prune_sb(int64_t nr_to_scan, void *arg)
 static int
 zpl_nr_cached_objects(struct super_block *sb)
 {
-       zfs_sb_t *zsb = sb->s_fs_info;
-       int nr;
-
-       mutex_enter(&zsb->z_znodes_lock);
-       nr = zsb->z_nr_znodes;
-       mutex_exit(&zsb->z_znodes_lock);
-
-       return (nr);
+       return (0);
 }
 #endif /* HAVE_NR_CACHED_OBJECTS */
 
 #ifdef HAVE_FREE_CACHED_OBJECTS
-/*
- * Attempt to evict some meta data from the cache.  The ARC operates in
- * terms of bytes while the Linux VFS uses objects.  Now because this is
- * just a best effort eviction and the exact values aren't critical so we
- * extrapolate from an object count to a byte size using the znode_t size.
- */
 static void
 zpl_free_cached_objects(struct super_block *sb, int nr_to_scan)
 {
@@ -359,7 +529,6 @@ const struct super_operations zpl_super_operations = {
        .sync_fs                = zpl_sync_fs,
        .statfs                 = zpl_statfs,
        .remount_fs             = zpl_remount_fs,
-       .umount_begin           = zpl_umount_begin,
        .show_options           = zpl_show_options,
        .show_stats             = NULL,
 #ifdef HAVE_NR_CACHED_OBJECTS
index 144e72eb46d6593ccf6f6fa1067d65747a369c38..52f9ee83930e8dc847d2a0191888bc3e47ac8fab 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014 by Delphix. All rights reserved.
  */
 
 /*
@@ -43,7 +44,7 @@
  * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is
  * treated as zero references.
  */
-#define        ZRL_LOCKED      ((uint32_t)-1)
+#define        ZRL_LOCKED      -1
 #define        ZRL_DESTROYED   -2
 
 void
@@ -61,7 +62,7 @@ zrl_init(zrlock_t *zrl)
 void
 zrl_destroy(zrlock_t *zrl)
 {
-       ASSERT(zrl->zr_refcount == 0);
+       ASSERT0(zrl->zr_refcount);
 
        mutex_destroy(&zrl->zr_mtx);
        zrl->zr_refcount = ZRL_DESTROYED;
@@ -81,7 +82,7 @@ zrl_add(zrlock_t *zrl)
                uint32_t cas = atomic_cas_32(
                    (uint32_t *)&zrl->zr_refcount, n, n + 1);
                if (cas == n) {
-                       ASSERT((int32_t)n >= 0);
+                       ASSERT3S((int32_t)n, >=, 0);
 #ifdef ZFS_DEBUG
                        if (zrl->zr_owner == curthread) {
                                DTRACE_PROBE2(zrlock__reentry,
@@ -99,7 +100,7 @@ zrl_add(zrlock_t *zrl)
        while (zrl->zr_refcount == ZRL_LOCKED) {
                cv_wait(&zrl->zr_cv, &zrl->zr_mtx);
        }
-       ASSERT(zrl->zr_refcount >= 0);
+       ASSERT3S(zrl->zr_refcount, >=, 0);
        zrl->zr_refcount++;
 #ifdef ZFS_DEBUG
        zrl->zr_owner = curthread;
@@ -113,14 +114,14 @@ zrl_remove(zrlock_t *zrl)
 {
        uint32_t n;
 
-       n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
-       ASSERT((int32_t)n >= 0);
 #ifdef ZFS_DEBUG
        if (zrl->zr_owner == curthread) {
                zrl->zr_owner = NULL;
                zrl->zr_caller = NULL;
        }
 #endif
+       n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
+       ASSERT3S((int32_t)n, >=, 0);
 }
 
 int
@@ -133,14 +134,14 @@ zrl_tryenter(zrlock_t *zrl)
                    (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED);
                if (cas == 0) {
 #ifdef ZFS_DEBUG
-                       ASSERT(zrl->zr_owner == NULL);
+                       ASSERT3P(zrl->zr_owner, ==, NULL);
                        zrl->zr_owner = curthread;
 #endif
                        return (1);
                }
        }
 
-       ASSERT((int32_t)n > ZRL_DESTROYED);
+       ASSERT3S((int32_t)n, >, ZRL_DESTROYED);
 
        return (0);
 }
@@ -148,11 +149,11 @@ zrl_tryenter(zrlock_t *zrl)
 void
 zrl_exit(zrlock_t *zrl)
 {
-       ASSERT(zrl->zr_refcount == ZRL_LOCKED);
+       ASSERT3S(zrl->zr_refcount, ==, ZRL_LOCKED);
 
        mutex_enter(&zrl->zr_mtx);
 #ifdef ZFS_DEBUG
-       ASSERT(zrl->zr_owner == curthread);
+       ASSERT3P(zrl->zr_owner, ==, curthread);
        zrl->zr_owner = NULL;
        membar_producer();      /* make sure the owner store happens first */
 #endif
@@ -166,7 +167,7 @@ zrl_refcount(zrlock_t *zrl)
 {
        int n;
 
-       ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+       ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
 
        n = (int)zrl->zr_refcount;
        return (n <= 0 ? 0 : n);
@@ -175,7 +176,7 @@ zrl_refcount(zrlock_t *zrl)
 int
 zrl_is_zero(zrlock_t *zrl)
 {
-       ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+       ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
 
        return (zrl->zr_refcount <= 0);
 }
@@ -183,7 +184,7 @@ zrl_is_zero(zrlock_t *zrl)
 int
 zrl_is_locked(zrlock_t *zrl)
 {
-       ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+       ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
 
        return (zrl->zr_refcount == ZRL_LOCKED);
 }
index d180b5b5b76f468ba67aa6b71e5f35d094727c11..c81f02a3907b9fc77644e8eb34427c09d58b0ac0 100644 (file)
@@ -40,6 +40,7 @@
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/zap.h>
+#include <sys/zfeature.h>
 #include <sys/zil_impl.h>
 #include <sys/zio.h>
 #include <sys/zfs_rlock.h>
 
 unsigned int zvol_inhibit_dev = 0;
 unsigned int zvol_major = ZVOL_MAJOR;
-unsigned int zvol_threads = 32;
+unsigned int zvol_prefetch_bytes = (128 * 1024);
 unsigned long zvol_max_discard_blocks = 16384;
 
-static taskq_t *zvol_taskq;
 static kmutex_t zvol_state_lock;
 static list_t zvol_state_list;
 static char *zvol_tag = "zvol_tag";
@@ -380,8 +380,31 @@ out:
  * Sanity check volume block size.
  */
 int
-zvol_check_volblocksize(uint64_t volblocksize)
+zvol_check_volblocksize(const char *name, uint64_t volblocksize)
 {
+       /* Record sizes above 128k need the feature to be enabled */
+       if (volblocksize > SPA_OLD_MAXBLOCKSIZE) {
+               spa_t *spa;
+               int error;
+
+               if ((error = spa_open(name, &spa, FTAG)) != 0)
+                       return (error);
+
+               if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
+                       spa_close(spa, FTAG);
+                       return (SET_ERROR(ENOTSUP));
+               }
+
+               /*
+                * We don't allow setting the property above 1MB,
+                * unless the tunable has been changed.
+                */
+               if (volblocksize > zfs_max_recordsize)
+                       return (SET_ERROR(EDOM));
+
+               spa_close(spa, FTAG);
+       }
+
        if (volblocksize < SPA_MINBLOCKSIZE ||
            volblocksize > SPA_MAXBLOCKSIZE ||
            !ISP2(volblocksize))
@@ -566,34 +589,24 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
        }
 }
 
-/*
- * Common write path running under the zvol taskq context.  This function
- * is responsible for copying the request structure data in to the DMU and
- * signaling the request queue with the result of the copy.
- */
-static void
-zvol_write(void *arg)
+static int
+zvol_write(struct bio *bio)
 {
-       struct request *req = (struct request *)arg;
-       struct request_queue *q = req->q;
-       zvol_state_t *zv = q->queuedata;
-       fstrans_cookie_t cookie = spl_fstrans_mark();
-       uint64_t offset = blk_rq_pos(req) << 9;
-       uint64_t size = blk_rq_bytes(req);
+       zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
+       uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+       uint64_t size = BIO_BI_SIZE(bio);
        int error = 0;
        dmu_tx_t *tx;
        rl_t *rl;
 
-       if (req->cmd_flags & VDEV_REQ_FLUSH)
+       if (bio->bi_rw & VDEV_REQ_FLUSH)
                zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
        /*
         * Some requests are just for flush and nothing else.
         */
-       if (size == 0) {
-               error = 0;
+       if (size == 0)
                goto out;
-       }
 
        rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
 
@@ -608,96 +621,83 @@ zvol_write(void *arg)
                goto out;
        }
 
-       error = dmu_write_req(zv->zv_objset, ZVOL_OBJ, req, tx);
+       error = dmu_write_bio(zv->zv_objset, ZVOL_OBJ, bio, tx);
        if (error == 0)
                zvol_log_write(zv, tx, offset, size,
-                   req->cmd_flags & VDEV_REQ_FUA);
+                   !!(bio->bi_rw & VDEV_REQ_FUA));
 
        dmu_tx_commit(tx);
        zfs_range_unlock(rl);
 
-       if ((req->cmd_flags & VDEV_REQ_FUA) ||
+       if ((bio->bi_rw & VDEV_REQ_FUA) ||
            zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
 out:
-       blk_end_request(req, -error, size);
-       spl_fstrans_unmark(cookie);
+       return (error);
 }
 
-#ifdef HAVE_BLK_QUEUE_DISCARD
-static void
-zvol_discard(void *arg)
+static int
+zvol_discard(struct bio *bio)
 {
-       struct request *req = (struct request *)arg;
-       struct request_queue *q = req->q;
-       zvol_state_t *zv = q->queuedata;
-       fstrans_cookie_t cookie = spl_fstrans_mark();
-       uint64_t start = blk_rq_pos(req) << 9;
-       uint64_t end = start + blk_rq_bytes(req);
+       zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
+       uint64_t start = BIO_BI_SECTOR(bio) << 9;
+       uint64_t size = BIO_BI_SIZE(bio);
+       uint64_t end = start + size;
        int error;
        rl_t *rl;
 
-       if (end > zv->zv_volsize) {
-               error = EIO;
-               goto out;
-       }
+       if (end > zv->zv_volsize)
+               return (SET_ERROR(EIO));
 
        /*
-        * Align the request to volume block boundaries. If we don't,
-        * then this will force dnode_free_range() to zero out the
-        * unaligned parts, which is slow (read-modify-write) and
-        * useless since we are not freeing any space by doing so.
+        * Align the request to volume block boundaries when REQ_SECURE is
+        * available, but not requested. If we don't, then this will force
+        * dnode_free_range() to zero out the unaligned parts, which is slow
+        * (read-modify-write) and useless since we are not freeing any space
+        * by doing so. Kernels that do not support REQ_SECURE (2.6.32 through
+        * 2.6.35) will not receive this optimization.
         */
-       start = P2ROUNDUP(start, zv->zv_volblocksize);
-       end = P2ALIGN(end, zv->zv_volblocksize);
-
-       if (start >= end) {
-               error = 0;
-               goto out;
+#ifdef REQ_SECURE
+       if (!(bio->bi_rw & REQ_SECURE)) {
+               start = P2ROUNDUP(start, zv->zv_volblocksize);
+               end = P2ALIGN(end, zv->zv_volblocksize);
+               size = end - start;
        }
+#endif
 
-       rl = zfs_range_lock(&zv->zv_znode, start, end - start, RL_WRITER);
+       if (start >= end)
+               return (0);
+
+       rl = zfs_range_lock(&zv->zv_znode, start, size, RL_WRITER);
 
-       error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, end-start);
+       error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size);
 
        /*
         * TODO: maybe we should add the operation to the log.
         */
 
        zfs_range_unlock(rl);
-out:
-       blk_end_request(req, -error, blk_rq_bytes(req));
-       spl_fstrans_unmark(cookie);
+
+       return (error);
 }
-#endif /* HAVE_BLK_QUEUE_DISCARD */
 
-/*
- * Common read path running under the zvol taskq context.  This function
- * is responsible for copying the requested data out of the DMU and in to
- * a linux request structure.  It then must signal the request queue with
- * an error code describing the result of the copy.
- */
-static void
-zvol_read(void *arg)
+static int
+zvol_read(struct bio *bio)
 {
-       struct request *req = (struct request *)arg;
-       struct request_queue *q = req->q;
-       zvol_state_t *zv = q->queuedata;
-       fstrans_cookie_t cookie = spl_fstrans_mark();
-       uint64_t offset = blk_rq_pos(req) << 9;
-       uint64_t size = blk_rq_bytes(req);
+       zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
+       uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+       uint64_t len = BIO_BI_SIZE(bio);
        int error;
        rl_t *rl;
 
-       if (size == 0) {
-               error = 0;
-               goto out;
-       }
+       if (len == 0)
+               return (0);
 
-       rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
 
-       error = dmu_read_req(zv->zv_objset, ZVOL_OBJ, req);
+       rl = zfs_range_lock(&zv->zv_znode, offset, len, RL_READER);
+
+       error = dmu_read_bio(zv->zv_objset, ZVOL_OBJ, bio);
 
        zfs_range_unlock(rl);
 
@@ -705,91 +705,58 @@ zvol_read(void *arg)
        if (error == ECKSUM)
                error = SET_ERROR(EIO);
 
-out:
-       blk_end_request(req, -error, size);
-       spl_fstrans_unmark(cookie);
-}
-
-/*
- * Request will be added back to the request queue and retried if
- * it cannot be immediately dispatched to the taskq for handling
- */
-static inline void
-zvol_dispatch(task_func_t func, struct request *req)
-{
-       if (!taskq_dispatch(zvol_taskq, func, (void *)req, TQ_NOSLEEP))
-               blk_requeue_request(req->q, req);
+       return (error);
 }
 
-/*
- * Common request path.  Rather than registering a custom make_request()
- * function we use the generic Linux version.  This is done because it allows
- * us to easily merge read requests which would otherwise we performed
- * synchronously by the DMU.  This is less critical in write case where the
- * DMU will perform the correct merging within a transaction group.  Using
- * the generic make_request() also let's use leverage the fact that the
- * elevator with ensure correct ordering in regards to barrior IOs.  On
- * the downside it means that in the write case we end up doing request
- * merging twice once in the elevator and once in the DMU.
- *
- * The request handler is called under a spin lock so all the real work
- * is handed off to be done in the context of the zvol taskq.  This function
- * simply performs basic request sanity checking and hands off the request.
- */
-static void
-zvol_request(struct request_queue *q)
+static MAKE_REQUEST_FN_RET
+zvol_request(struct request_queue *q, struct bio *bio)
 {
        zvol_state_t *zv = q->queuedata;
-       struct request *req;
-       unsigned int size;
-
-       while ((req = blk_fetch_request(q)) != NULL) {
-               size = blk_rq_bytes(req);
-
-               if (size != 0 && blk_rq_pos(req) + blk_rq_sectors(req) >
-                   get_capacity(zv->zv_disk)) {
-                       printk(KERN_INFO
-                           "%s: bad access: block=%llu, count=%lu\n",
-                           req->rq_disk->disk_name,
-                           (long long unsigned)blk_rq_pos(req),
-                           (long unsigned)blk_rq_sectors(req));
-                       __blk_end_request(req, -EIO, size);
-                       continue;
-               }
+       fstrans_cookie_t cookie = spl_fstrans_mark();
+       uint64_t offset = BIO_BI_SECTOR(bio);
+       unsigned int sectors = bio_sectors(bio);
+       int rw = bio_data_dir(bio);
+#ifdef HAVE_GENERIC_IO_ACCT
+       unsigned long start = jiffies;
+#endif
+       int error = 0;
 
-               if (!blk_fs_request(req)) {
-                       printk(KERN_INFO "%s: non-fs cmd\n",
-                           req->rq_disk->disk_name);
-                       __blk_end_request(req, -EIO, size);
-                       continue;
-               }
+       if (bio_has_data(bio) && offset + sectors >
+           get_capacity(zv->zv_disk)) {
+               printk(KERN_INFO
+                   "%s: bad access: block=%llu, count=%lu\n",
+                   zv->zv_disk->disk_name,
+                   (long long unsigned)offset,
+                   (long unsigned)sectors);
+               error = SET_ERROR(EIO);
+               goto out1;
+       }
 
-               switch (rq_data_dir(req)) {
-               case READ:
-                       zvol_dispatch(zvol_read, req);
-                       break;
-               case WRITE:
-                       if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
-                               __blk_end_request(req, -EROFS, size);
-                               break;
-                       }
+       generic_start_io_acct(rw, sectors, &zv->zv_disk->part0);
 
-#ifdef HAVE_BLK_QUEUE_DISCARD
-                       if (req->cmd_flags & VDEV_REQ_DISCARD) {
-                               zvol_dispatch(zvol_discard, req);
-                               break;
-                       }
-#endif /* HAVE_BLK_QUEUE_DISCARD */
+       if (rw == WRITE) {
+               if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
+                       error = SET_ERROR(EROFS);
+                       goto out2;
+               }
 
-                       zvol_dispatch(zvol_write, req);
-                       break;
-               default:
-                       printk(KERN_INFO "%s: unknown cmd: %d\n",
-                           req->rq_disk->disk_name, (int)rq_data_dir(req));
-                       __blk_end_request(req, -EIO, size);
-                       break;
+               if (bio->bi_rw & VDEV_REQ_DISCARD) {
+                       error = zvol_discard(bio);
+                       goto out2;
                }
-       }
+
+               error = zvol_write(bio);
+       } else
+               error = zvol_read(bio);
+
+out2:
+       generic_end_io_acct(rw, &zv->zv_disk->part0, start);
+out1:
+       BIO_END_IO(bio, -error);
+       spl_fstrans_unmark(cookie);
+#ifdef HAVE_MAKE_REQUEST_FN_RET_INT
+       return (0);
+#endif
 }
 
 static void
@@ -1235,25 +1202,17 @@ static zvol_state_t *
 zvol_alloc(dev_t dev, const char *name)
 {
        zvol_state_t *zv;
-       int error = 0;
 
        zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
 
        spin_lock_init(&zv->zv_lock);
        list_link_init(&zv->zv_next);
 
-       zv->zv_queue = blk_init_queue(zvol_request, &zv->zv_lock);
+       zv->zv_queue = blk_alloc_queue(GFP_ATOMIC);
        if (zv->zv_queue == NULL)
                goto out_kmem;
 
-#ifdef HAVE_ELEVATOR_CHANGE
-       error = elevator_change(zv->zv_queue, "noop");
-#endif /* HAVE_ELEVATOR_CHANGE */
-       if (error) {
-               printk("ZFS: Unable to set \"%s\" scheduler for zvol %s: %d\n",
-                   "noop", name, error);
-               goto out_queue;
-       }
+       blk_queue_make_request(zv->zv_queue, zvol_request);
 
 #ifdef HAVE_BLK_QUEUE_FLUSH
        blk_queue_flush(zv->zv_queue, VDEV_REQ_FLUSH | VDEV_REQ_FUA);
@@ -1339,6 +1298,7 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev)
        objset_t *os;
        dmu_object_info_t *doi;
        uint64_t volsize;
+       uint64_t len;
        unsigned minor = 0;
        int error = 0;
 
@@ -1389,20 +1349,21 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev)
 
        set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
 
-       blk_queue_max_hw_sectors(zv->zv_queue, DMU_MAX_ACCESS / 512);
+       blk_queue_max_hw_sectors(zv->zv_queue, (DMU_MAX_ACCESS / 4) >> 9);
        blk_queue_max_segments(zv->zv_queue, UINT16_MAX);
        blk_queue_max_segment_size(zv->zv_queue, UINT_MAX);
        blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize);
        blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize);
-#ifdef HAVE_BLK_QUEUE_DISCARD
        blk_queue_max_discard_sectors(zv->zv_queue,
            (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
        blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize);
        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue);
-#endif
-#ifdef HAVE_BLK_QUEUE_NONROT
+#ifdef QUEUE_FLAG_NONROT
        queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue);
 #endif
+#ifdef QUEUE_FLAG_ADD_RANDOM
+       queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zv->zv_queue);
+#endif
 
        if (spa_writeable(dmu_objset_spa(os))) {
                if (zil_replay_disable)
@@ -1411,6 +1372,18 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev)
                        zil_replay(os, zv, zvol_replay_vector);
        }
 
+       /*
+        * When udev detects the addition of the device it will immediately
+        * invoke blkid(8) to determine the type of content on the device.
+        * Prefetching the blocks commonly scanned by blkid(8) will speed
+        * up this process.
+        */
+       len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
+       if (len > 0) {
+               dmu_prefetch(os, ZVOL_OBJ, 0, len);
+               dmu_prefetch(os, ZVOL_OBJ, volsize - len, len);
+       }
+
        zv->zv_objset = NULL;
 out_dmu_objset_disown:
        dmu_objset_disown(os, zvol_tag);
@@ -1631,18 +1604,10 @@ zvol_init(void)
 
        mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
 
-       zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_threads, maxclsyspri,
-           zvol_threads, INT_MAX, TASKQ_PREPOPULATE);
-       if (zvol_taskq == NULL) {
-               printk(KERN_INFO "ZFS: taskq_create() failed\n");
-               error = -ENOMEM;
-               goto out1;
-       }
-
        error = register_blkdev(zvol_major, ZVOL_DRIVER);
        if (error) {
                printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
-               goto out2;
+               goto out;
        }
 
        blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
@@ -1650,9 +1615,7 @@ zvol_init(void)
 
        return (0);
 
-out2:
-       taskq_destroy(zvol_taskq);
-out1:
+out:
        mutex_destroy(&zvol_state_lock);
        list_destroy(&zvol_state_list);
 
@@ -1665,7 +1628,6 @@ zvol_fini(void)
        zvol_remove_minors(NULL);
        blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
        unregister_blkdev(zvol_major, ZVOL_DRIVER);
-       taskq_destroy(zvol_taskq);
        mutex_destroy(&zvol_state_lock);
        list_destroy(&zvol_state_list);
 }
@@ -1676,8 +1638,8 @@ MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
 module_param(zvol_major, uint, 0444);
 MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
 
-module_param(zvol_threads, uint, 0444);
-MODULE_PARM_DESC(zvol_threads, "Number of threads for zvol device");
-
 module_param(zvol_max_discard_blocks, ulong, 0444);
 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
+
+module_param(zvol_prefetch_bytes, uint, 0644);
+MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
index 437bf22876370b5ff8a1ecb479f0e28e8b5039b4..10a101da4dc0e68ae248b715a864a0becbfbf8e4 100644 (file)
@@ -1,7 +1,10 @@
+src = @abs_top_srcdir@/module/zpios
+obj = @abs_builddir@
+
 MODULE := zpios
 
 EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@
 
 obj-$(CONFIG_ZFS) := $(MODULE).o
 
-$(MODULE)-objs += @top_srcdir@/module/zpios/pios.o
+$(MODULE)-objs += pios.o
index 043f275045821051e6de32ce273f29a51783cef5..5fb5ce8b31d88dd24236736d8913cfdbfc76d83a 100644 (file)
@@ -3,15 +3,18 @@
 /* Define to 1 to enabled dmu tx validation */
 #undef DEBUG_DMU_TX
 
+/* bio_end_io_t wants 1 arg */
+#undef HAVE_1ARG_BIO_END_IO_T
+
 /* invalidate_bdev() wants 1 arg */
 #undef HAVE_1ARG_INVALIDATE_BDEV
 
+/* kmap_atomic wants 1 arg */
+#undef HAVE_1ARG_KMAP_ATOMIC
+
 /* bdi_setup_and_register() wants 2 args */
 #undef HAVE_2ARGS_BDI_SETUP_AND_REGISTER
 
-/* bio_end_io_t wants 2 args */
-#undef HAVE_2ARGS_BIO_END_IO_T
-
 /* bdi_setup_and_register() wants 3 args */
 #undef HAVE_3ARGS_BDI_SETUP_AND_REGISTER
 
 /* bio has bi_iter */
 #undef HAVE_BIO_BVEC_ITER
 
-/* REQ_FAILFAST_MASK is defined */
-#undef HAVE_BIO_REQ_FAILFAST_MASK
+/* BIO_RW_BARRIER is defined */
+#undef HAVE_BIO_RW_BARRIER
 
-/* BIO_RW_FAILFAST is defined */
-#undef HAVE_BIO_RW_FAILFAST
+/* BIO_RW_DISCARD is defined */
+#undef HAVE_BIO_RW_DISCARD
 
 /* BIO_RW_FAILFAST_* are defined */
 #undef HAVE_BIO_RW_FAILFAST_DTD
 
-/* BIO_RW_SYNC is defined */
-#undef HAVE_BIO_RW_SYNC
-
-/* BIO_RW_SYNCIO is defined */
-#undef HAVE_BIO_RW_SYNCIO
-
 /* blkdev_get_by_path() is available */
 #undef HAVE_BLKDEV_GET_BY_PATH
 
-/* blk_end_request() is available */
-#undef HAVE_BLK_END_REQUEST
-
-/* blk_end_request() is GPL-only */
-#undef HAVE_BLK_END_REQUEST_GPL_ONLY
-
-/* blk_fetch_request() is available */
-#undef HAVE_BLK_FETCH_REQUEST
-
-/* blk_queue_discard() is available */
-#undef HAVE_BLK_QUEUE_DISCARD
-
 /* blk_queue_flush() is available */
 #undef HAVE_BLK_QUEUE_FLUSH
 
 /* blk_queue_flush() is GPL-only */
 #undef HAVE_BLK_QUEUE_FLUSH_GPL_ONLY
 
-/* blk_queue_io_opt() is available */
-#undef HAVE_BLK_QUEUE_IO_OPT
-
 /* blk_queue_max_hw_sectors() is available */
 #undef HAVE_BLK_QUEUE_MAX_HW_SECTORS
 
 /* blk_queue_max_segments() is available */
 #undef HAVE_BLK_QUEUE_MAX_SEGMENTS
 
-/* blk_queue_nonrot() is available */
-#undef HAVE_BLK_QUEUE_NONROT
-
-/* blk_queue_physical_block_size() is available */
-#undef HAVE_BLK_QUEUE_PHYSICAL_BLOCK_SIZE
-
-/* blk_requeue_request() is available */
-#undef HAVE_BLK_REQUEUE_REQUEST
-
-/* blk_rq_bytes() is available */
-#undef HAVE_BLK_RQ_BYTES
-
-/* blk_rq_bytes() is GPL-only */
-#undef HAVE_BLK_RQ_BYTES_GPL_ONLY
-
-/* blk_rq_pos() is available */
-#undef HAVE_BLK_RQ_POS
-
-/* blk_rq_sectors() is available */
-#undef HAVE_BLK_RQ_SECTORS
-
 /* struct block_device_operations.release returns void */
 #undef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
 
 /* iops->create() passes nameidata */
 #undef HAVE_CREATE_NAMEIDATA
 
+/* current->bio_list exists */
+#undef HAVE_CURRENT_BIO_LIST
+
+/* current->bio_tail exists */
+#undef HAVE_CURRENT_BIO_TAIL
+
 /* current_umask() exists */
 #undef HAVE_CURRENT_UMASK
 
 /* kernel defines fmode_t */
 #undef HAVE_FMODE_T
 
+/* follow_down_one() is available */
+#undef HAVE_FOLLOW_DOWN_ONE
+
 /* iops->follow_link() nameidata */
 #undef HAVE_FOLLOW_LINK_NAMEIDATA
 
 /* fops->fsync() with dentry */
 #undef HAVE_FSYNC_WITH_DENTRY
 
+/* generic_start_io_acct()/generic_end_io_acct() are available */
+#undef HAVE_GENERIC_IO_ACCT
+
 /* iops->get_acl() exists */
 #undef HAVE_GET_ACL
 
 /* lseek_execute() is available */
 #undef HAVE_LSEEK_EXECUTE
 
+/* make_request_fn() returns int */
+#undef HAVE_MAKE_REQUEST_FN_RET_INT
+
 /* Define to 1 if you have the <memory.h> header file. */
 #undef HAVE_MEMORY_H
 
 /* iops->put_link() nameidata */
 #undef HAVE_PUT_LINK_NAMEIDATA
 
-/* REQ_SYNC is defined */
-#undef HAVE_REQ_SYNC
-
-/* rq_for_each_segment() is available */
-#undef HAVE_RQ_FOR_EACH_SEGMENT
-
-/* rq_for_each_segment() wants bio_vec */
-#undef HAVE_RQ_FOR_EACH_SEGMENT_BV
-
-/* rq_for_each_segment() wants bio_vec * */
-#undef HAVE_RQ_FOR_EACH_SEGMENT_BVP
-
-/* rq_is_sync() is available */
-#undef HAVE_RQ_IS_SYNC
+/* REQ_FAILFAST_MASK is defined */
+#undef HAVE_REQ_FAILFAST_MASK
 
 /* set_nlink() is available */
 #undef HAVE_SET_NLINK
    */
 #undef LT_OBJDIR
 
+/* Return type of make_request_fn() (int or void) */
+#undef MAKE_REQUEST_FN_RET
+
 /* Name of package */
 #undef PACKAGE